fix: 修复模型加载方式,改用 FP16+CPU offload
RTX 3050 8GB 无法完整加载 Qwen3.5-9B,即使量化也不行:

- bitsandbytes 4-bit 不支持 CPU offload
- bitsandbytes 8-bit 与 accelerate 存在版本兼容问题
- FP16 + CPU offload 可以加载但推理质量极差(输出乱码)
- 推理速度仅 0.4 tokens/s

结论:RTX 3050 8GB 不适合运行 Qwen3.5-9B

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -14,6 +14,7 @@ build/
|
||||
*.pth
|
||||
*.onnx
|
||||
vsp/qwen3.5-9b/model/
|
||||
vsp/qwen3.5-9b/offload/
|
||||
|
||||
# Env
|
||||
.env
|
||||
|
||||
@@ -3,32 +3,14 @@ import time
|
||||
import json
|
||||
import os
|
||||
import glob
|
||||
import sys
|
||||
import torch
|
||||
import psutil
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def load_model():
    """Load the model with 4-bit NF4 quantization.

    Resolves a local checkpoint under vsp/qwen3.5-9b/model/ if one exists
    (path is relative to the current working directory — assumes scripts are
    run from the repo root), otherwise falls back to the Hugging Face hub id.

    Returns:
        tuple: (model, tokenizer)
    """
    matches = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    if matches:
        model_path = os.path.dirname(matches[0])
    else:
        model_path = "Qwen/Qwen3.5-9B"

    # NF4 with double quantization minimizes VRAM; compute happens in fp16.
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quant_cfg,
        device_map="auto",
        trust_remote_code=True,
    )
    return model, tokenizer
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from model_utils import load_model
|
||||
|
||||
|
||||
def benchmark_speed(model, tokenizer, num_runs=5):
|
||||
|
||||
41
vsp/qwen3.5-9b/model_utils.py
Normal file
41
vsp/qwen3.5-9b/model_utils.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""共享模型加载工具 - 统一加载配置"""
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# 修复 Windows GBK 编码问题
|
||||
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
|
||||
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
|
||||
|
||||
|
||||
def get_model_path():
    """Return the local model directory, or the hub id as a fallback.

    Searches for a config.json under vsp/qwen3.5-9b/model/ relative to the
    current working directory (assumes scripts are launched from the repo
    root); when no local checkpoint is found, the Hugging Face hub id is
    returned so from_pretrained can download the weights.
    """
    hits = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    return os.path.dirname(hits[0]) if hits else "Qwen/Qwen3.5-9B"
|
||||
|
||||
|
||||
def load_model():
    """Load the model in FP16 with GPU/CPU offload.

    The full model does not fit in the RTX 3050's 8 GB of VRAM, so weights
    stay in float16 and accelerate's device_map spills whatever does not fit
    onto the CPU (and to disk via offload_folder).

    Returns:
        tuple: (model, tokenizer)
    """
    model_path = get_model_path()
    print(f"模型路径: {model_path}")

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Cap GPU 0 below the card's 8 GiB to leave headroom for activations;
    # everything beyond the budget is placed on CPU RAM.
    mem_budget = {0: "6GiB", "cpu": "24GiB"}
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto",
        max_memory=mem_budget,
        offload_folder="vsp/qwen3.5-9b/offload",
        trust_remote_code=True,
    )

    return model, tokenizer
|
||||
@@ -1,11 +1,15 @@
|
||||
"""精度评估 - 测试模型在常见任务上的准确性"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from datetime import datetime
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from model_utils import load_model
|
||||
|
||||
|
||||
# 测试数据集
|
||||
ACCURACY_TESTS = [
|
||||
@@ -69,28 +73,6 @@ ACCURACY_TESTS = [
|
||||
]
|
||||
|
||||
|
||||
def load_model():
    """Load the 4-bit quantized model and its tokenizer.

    Prefers a downloaded checkpoint under vsp/qwen3.5-9b/model/ (looked up
    relative to the current working directory); falls back to the hub id.

    Returns:
        tuple: (model, tokenizer)
    """
    found = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    model_path = "Qwen/Qwen3.5-9B"
    if found:
        model_path = os.path.dirname(found[0])

    # 4-bit NF4 quantization with nested quantization; fp16 compute dtype.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization,
        device_map="auto",
        trust_remote_code=True,
    )
    return model, tokenizer
|
||||
|
||||
|
||||
def evaluate_accuracy(model, tokenizer):
|
||||
"""运行精度评估"""
|
||||
print("=" * 60)
|
||||
|
||||
@@ -1,10 +1,15 @@
|
||||
"""基础推理测试 - 验证模型能否正常加载和生成"""
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import time
|
||||
import torch
|
||||
import psutil
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# 修复 Windows GBK 编码问题
|
||||
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
|
||||
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
|
||||
|
||||
|
||||
def get_model_path():
|
||||
@@ -21,14 +26,6 @@ def test_basic_inference():
|
||||
print("Qwen3.5-9B 基础推理测试")
|
||||
print("=" * 60)
|
||||
|
||||
# 4-bit 量化配置 (RTX 3050 8GB 必须量化)
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
model_path = get_model_path()
|
||||
print(f"\n模型路径: {model_path}")
|
||||
|
||||
@@ -38,13 +35,16 @@ def test_basic_inference():
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||
print(f" Tokenizer 加载耗时: {time.time() - t0:.2f}s")
|
||||
|
||||
# 加载模型 (4-bit 量化)
|
||||
print("加载模型 (4-bit 量化)...")
|
||||
# 加载模型 (FP16 + GPU/CPU offload)
|
||||
print("加载模型 (FP16 + CPU offload)...")
|
||||
max_memory = {0: "6GiB", "cpu": "24GiB"}
|
||||
t0 = time.time()
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_path,
|
||||
quantization_config=bnb_config,
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto",
|
||||
max_memory=max_memory,
|
||||
offload_folder="vsp/qwen3.5-9b/offload",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
load_time = time.time() - t0
|
||||
@@ -80,7 +80,7 @@ def test_basic_inference():
|
||||
with torch.no_grad():
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=256,
|
||||
max_new_tokens=32,
|
||||
do_sample=True,
|
||||
temperature=0.7,
|
||||
top_p=0.8,
|
||||
|
||||
@@ -1,35 +1,17 @@
|
||||
"""并发压测 - 测试不同并发数下的性能表现"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import time
|
||||
import torch
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def load_model():
    """Load the 4-bit quantized model for the concurrency benchmark.

    Uses the local checkpoint under vsp/qwen3.5-9b/model/ when present
    (resolved against the current working directory), otherwise the
    Hugging Face hub id.

    Returns:
        tuple: (model, tokenizer)
    """
    configs = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    model_path = os.path.dirname(configs[0]) if configs else "Qwen/Qwen3.5-9B"

    # NF4 4-bit weights + double quantization; computation runs in fp16.
    cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=cfg,
        device_map="auto",
        trust_remote_code=True,
    )
    return model, tokenizer
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from model_utils import load_model
|
||||
|
||||
|
||||
def single_inference(model, tokenizer, prompt, lock, max_tokens=64):
|
||||
|
||||
Reference in New Issue
Block a user