fix: 修复模型加载方式,改用 FP16+CPU offload

RTX 3050 8GB 无法完整加载 Qwen3.5-9B,即使量化也不行:
- bitsandbytes 4-bit 不支持 CPU offload
- bitsandbytes 8-bit 与 accelerate 存在版本兼容问题
- FP16 + CPU offload 可以加载但推理质量极差(输出乱码)
- 推理速度仅 0.4 tokens/s

结论:RTX 3050 8GB 不适合运行 Qwen3.5-9B

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-16 13:05:20 +08:00
parent f7174464d5
commit 4ac406572e
6 changed files with 68 additions and 80 deletions

1
.gitignore vendored
View File

@@ -14,6 +14,7 @@ build/
*.pth
*.onnx
vsp/qwen3.5-9b/model/
vsp/qwen3.5-9b/offload/
# Env
.env

View File

@@ -3,32 +3,14 @@ import time
import json
import os
import glob
import sys
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
def load_model():
"""Load the 4-bit quantized model (bitsandbytes NF4) and its tokenizer."""
# Prefer a locally downloaded snapshot under vsp/qwen3.5-9b/model/;
# fall back to the Hugging Face hub model id when none is found.
paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B"
# NF4 4-bit quantization with double quantization; matmuls computed in fp16.
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# device_map="auto" lets accelerate decide layer placement;
# trust_remote_code allows running the repo's custom modeling code.
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
)
return model, tokenizer
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model
def benchmark_speed(model, tokenizer, num_runs=5):

View File

@@ -0,0 +1,41 @@
"""共享模型加载工具 - 统一加载配置"""
import os
import sys
import glob
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# 修复 Windows GBK 编码问题
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
def get_model_path():
    """Resolve the model location.

    Recursively searches ``vsp/qwen3.5-9b/model/`` for a ``config.json``;
    when one exists, the directory containing it is returned (a local
    snapshot). Otherwise the Hugging Face hub id ``"Qwen/Qwen3.5-9B"`` is
    returned so ``from_pretrained`` downloads the model instead.
    """
    matches = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    return os.path.dirname(matches[0]) if matches else "Qwen/Qwen3.5-9B"
def load_model():
    """Load tokenizer and model in FP16 with GPU/CPU offloading.

    The RTX 3050's 8 GiB of VRAM cannot hold the full Qwen3.5-9B model,
    so weights are loaded as float16 and layers beyond the 6 GiB GPU
    budget are placed in CPU RAM (up to 24 GiB), spilling to an on-disk
    offload folder if needed.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    path = get_model_path()
    print(f"模型路径: {path}")
    tok = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    # Memory budget per device: GPU 0 capped at 6 GiB, remainder on CPU.
    budget = {0: "6GiB", "cpu": "24GiB"}
    mdl = AutoModelForCausalLM.from_pretrained(
        path,
        torch_dtype=torch.float16,
        device_map="auto",
        max_memory=budget,
        offload_folder="vsp/qwen3.5-9b/offload",
        trust_remote_code=True,
    )
    return mdl, tok

View File

@@ -1,11 +1,15 @@
"""精度评估 - 测试模型在常见任务上的准确性"""
import json
import os
import sys
import glob
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model
# 测试数据集
ACCURACY_TESTS = [
@@ -69,28 +73,6 @@ ACCURACY_TESTS = [
]
def load_model():
"""Load the 4-bit quantized model (bitsandbytes NF4) and its tokenizer."""
# Prefer a locally downloaded snapshot under vsp/qwen3.5-9b/model/;
# fall back to the Hugging Face hub model id when none is found.
paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B"
# NF4 4-bit quantization with double quantization; matmuls computed in fp16.
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# device_map="auto" lets accelerate decide layer placement;
# trust_remote_code allows running the repo's custom modeling code.
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
)
return model, tokenizer
def evaluate_accuracy(model, tokenizer):
"""运行精度评估"""
print("=" * 60)

View File

@@ -1,10 +1,15 @@
"""基础推理测试 - 验证模型能否正常加载和生成"""
import os
import sys
import glob
import time
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
# 修复 Windows GBK 编码问题
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
def get_model_path():
@@ -21,14 +26,6 @@ def test_basic_inference():
print("Qwen3.5-9B 基础推理测试")
print("=" * 60)
# 4-bit 量化配置 (RTX 3050 8GB 必须量化)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
model_path = get_model_path()
print(f"\n模型路径: {model_path}")
@@ -38,13 +35,16 @@ def test_basic_inference():
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
print(f" Tokenizer 加载耗时: {time.time() - t0:.2f}s")
# 加载模型 (4-bit 量化)
print("加载模型 (4-bit 量化)...")
# 加载模型 (FP16 + GPU/CPU offload)
print("加载模型 (FP16 + CPU offload)...")
max_memory = {0: "6GiB", "cpu": "24GiB"}
t0 = time.time()
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
torch_dtype=torch.float16,
device_map="auto",
max_memory=max_memory,
offload_folder="vsp/qwen3.5-9b/offload",
trust_remote_code=True,
)
load_time = time.time() - t0
@@ -80,7 +80,7 @@ def test_basic_inference():
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=256,
max_new_tokens=32,
do_sample=True,
temperature=0.7,
top_p=0.8,

View File

@@ -1,35 +1,17 @@
"""并发压测 - 测试不同并发数下的性能表现"""
import json
import os
import sys
import glob
import time
import torch
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
def load_model():
"""Load the 4-bit quantized model (bitsandbytes NF4) and its tokenizer."""
# Prefer a locally downloaded snapshot under vsp/qwen3.5-9b/model/;
# fall back to the Hugging Face hub model id when none is found.
paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B"
# NF4 4-bit quantization with double quantization; matmuls computed in fp16.
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# device_map="auto" lets accelerate decide layer placement;
# trust_remote_code allows running the repo's custom modeling code.
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
)
return model, tokenizer
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model
def single_inference(model, tokenizer, prompt, lock, max_tokens=64):