fix: 修复模型加载方式,改用 FP16+CPU offload

RTX 3050 8GB 无法完整加载 Qwen3.5-9B,量化与 CPU offload 方案均不可行:
- bitsandbytes 4-bit 不支持 CPU offload
- bitsandbytes 8-bit 与 accelerate 存在版本兼容问题
- FP16 + CPU offload 可以加载但推理质量极差(输出乱码)
- FP16 + CPU offload 下的推理速度仅 0.4 tokens/s

结论:RTX 3050 8GB 不适合运行 Qwen3.5-9B

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-16 13:05:20 +08:00
parent f7174464d5
commit 4ac406572e
6 changed files with 68 additions and 80 deletions

View File

@@ -1,10 +1,15 @@
"""基础推理测试 - 验证模型能否正常加载和生成"""
import os
import sys
import glob
import time
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
# 修复 Windows GBK 编码问题
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
def get_model_path():
@@ -21,14 +26,6 @@ def test_basic_inference():
print("Qwen3.5-9B 基础推理测试")
print("=" * 60)
# 4-bit 量化配置 (RTX 3050 8GB 必须量化)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
model_path = get_model_path()
print(f"\n模型路径: {model_path}")
@@ -38,13 +35,16 @@ def test_basic_inference():
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
print(f" Tokenizer 加载耗时: {time.time() - t0:.2f}s")
# 加载模型 (4-bit 量化)
print("加载模型 (4-bit 量化)...")
# 加载模型 (FP16 + GPU/CPU offload)
print("加载模型 (FP16 + CPU offload)...")
max_memory = {0: "6GiB", "cpu": "24GiB"}
t0 = time.time()
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
torch_dtype=torch.float16,
device_map="auto",
max_memory=max_memory,
offload_folder="vsp/qwen3.5-9b/offload",
trust_remote_code=True,
)
load_time = time.time() - t0
@@ -80,7 +80,7 @@ def test_basic_inference():
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=256,
max_new_tokens=32,
do_sample=True,
temperature=0.7,
top_p=0.8,