fix: 修复模型加载方式,改用 FP16+CPU offload

RTX 3050 8GB 无法完整加载 Qwen3.5-9B,即使量化也不行:
- bitsandbytes 4-bit 不支持 CPU offload
- bitsandbytes 8-bit 与 accelerate 存在版本兼容问题
- FP16 + CPU offload 可以加载但推理质量极差(输出乱码)
- 推理速度仅 0.4 tokens/s

结论:RTX 3050 8GB 不适合运行 Qwen3.5-9B

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-16 13:05:20 +08:00
parent f7174464d5
commit 4ac406572e
6 changed files with 68 additions and 80 deletions

1
.gitignore vendored
View File

@@ -14,6 +14,7 @@ build/
*.pth
*.onnx
vsp/qwen3.5-9b/model/
vsp/qwen3.5-9b/offload/
# Env
.env

View File

@@ -3,32 +3,14 @@ import time
import json
import os
import glob
import sys
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
def load_model():
"""Load the 4-bit quantized model (bitsandbytes NF4) and its tokenizer."""
# Prefer a locally downloaded snapshot under vsp/qwen3.5-9b/model/;
# fall back to the Hugging Face hub model id when none is found.
paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B"
# NF4 4-bit quantization with double quantization; matmuls computed in fp16.
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# device_map="auto" lets accelerate decide layer placement;
# trust_remote_code allows running the repo's custom modeling code.
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
)
return model, tokenizer
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model
def benchmark_speed(model, tokenizer, num_runs=5):

View File

@@ -0,0 +1,41 @@
"""共享模型加载工具 - 统一加载配置"""
import os
import sys
import glob
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# 修复 Windows GBK 编码问题
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
def get_model_path():
    """Resolve the model location.

    Recursively searches ``vsp/qwen3.5-9b/model/`` for a ``config.json``;
    when one exists, the directory containing it is returned (a local
    snapshot). Otherwise the Hugging Face hub id ``"Qwen/Qwen3.5-9B"`` is
    returned so ``from_pretrained`` downloads the model instead.
    """
    matches = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    return os.path.dirname(matches[0]) if matches else "Qwen/Qwen3.5-9B"
def load_model():
    """Load tokenizer and model in FP16 with GPU/CPU offloading.

    The RTX 3050's 8 GiB of VRAM cannot hold the full Qwen3.5-9B model,
    so weights are loaded as float16 and layers beyond the 6 GiB GPU
    budget are placed in CPU RAM (up to 24 GiB), spilling to an on-disk
    offload folder if needed.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    path = get_model_path()
    print(f"模型路径: {path}")
    tok = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    # Memory budget per device: GPU 0 capped at 6 GiB, remainder on CPU.
    budget = {0: "6GiB", "cpu": "24GiB"}
    mdl = AutoModelForCausalLM.from_pretrained(
        path,
        torch_dtype=torch.float16,
        device_map="auto",
        max_memory=budget,
        offload_folder="vsp/qwen3.5-9b/offload",
        trust_remote_code=True,
    )
    return mdl, tok

View File

@@ -1,11 +1,15 @@
"""精度评估 - 测试模型在常见任务上的准确性"""
import json
import os
import sys
import glob
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model
# 测试数据集
ACCURACY_TESTS = [
@@ -69,28 +73,6 @@ ACCURACY_TESTS = [
]
def load_model():
"""Load the 4-bit quantized model (bitsandbytes NF4) and its tokenizer."""
# Prefer a locally downloaded snapshot under vsp/qwen3.5-9b/model/;
# fall back to the Hugging Face hub model id when none is found.
paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B"
# NF4 4-bit quantization with double quantization; matmuls computed in fp16.
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# device_map="auto" lets accelerate decide layer placement;
# trust_remote_code allows running the repo's custom modeling code.
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
)
return model, tokenizer
def evaluate_accuracy(model, tokenizer):
"""运行精度评估"""
print("=" * 60)

View File

@@ -1,10 +1,15 @@
"""基础推理测试 - 验证模型能否正常加载和生成"""
import os
import sys
import glob
import time
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
# 修复 Windows GBK 编码问题
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
def get_model_path():
@@ -21,14 +26,6 @@ def test_basic_inference():
print("Qwen3.5-9B 基础推理测试")
print("=" * 60)
# 4-bit 量化配置 (RTX 3050 8GB 必须量化)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
model_path = get_model_path()
print(f"\n模型路径: {model_path}")
@@ -38,13 +35,16 @@ def test_basic_inference():
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
print(f" Tokenizer 加载耗时: {time.time() - t0:.2f}s")
# 加载模型 (4-bit 量化)
print("加载模型 (4-bit 量化)...")
# 加载模型 (FP16 + GPU/CPU offload)
print("加载模型 (FP16 + CPU offload)...")
max_memory = {0: "6GiB", "cpu": "24GiB"}
t0 = time.time()
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
torch_dtype=torch.float16,
device_map="auto",
max_memory=max_memory,
offload_folder="vsp/qwen3.5-9b/offload",
trust_remote_code=True,
)
load_time = time.time() - t0
@@ -80,7 +80,7 @@ def test_basic_inference():
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=256,
max_new_tokens=32,
do_sample=True,
temperature=0.7,
top_p=0.8,

View File

@@ -1,35 +1,17 @@
"""并发压测 - 测试不同并发数下的性能表现"""
import json
import os
import sys
import glob
import time
import torch
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
def load_model():
"""Load the 4-bit quantized model (bitsandbytes NF4) and its tokenizer."""
# Prefer a locally downloaded snapshot under vsp/qwen3.5-9b/model/;
# fall back to the Hugging Face hub model id when none is found.
paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B"
# NF4 4-bit quantization with double quantization; matmuls computed in fp16.
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# device_map="auto" lets accelerate decide layer placement;
# trust_remote_code allows running the repo's custom modeling code.
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
)
return model, tokenizer
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model
def single_inference(model, tokenizer, prompt, lock, max_tokens=64):