feat: 改用 4-bit NF4 纯 GPU 推理,关闭 thinking 模式

- 模型加载改为 bitsandbytes 4-bit NF4 量化,device_map={"":0} 纯 GPU
- 关闭 Qwen3.5 thinking 模式 (enable_thinking=False)
- 精度从 60% 提升到 90%,推理速度 1-2 tokens/s
- GPU 显存 7.13GB/8GB,输出质量正常
- 更新所有测试结果和综合报告

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-16 17:38:33 +08:00
parent 42db2b0ca9
commit 682063abf1
12 changed files with 356 additions and 104 deletions

View File

@@ -1,52 +1,28 @@
"""基础推理测试 - 验证模型能否正常加载和生成"""
import os
import sys
import glob
import time
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model, apply_chat
# 修复 Windows GBK 编码问题
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
def get_model_path():
    """Return the local model directory if one exists, else the Hugging Face model id.

    Recursively searches vsp/qwen3.5-9b/model/ for a config.json; the directory
    containing the first match is treated as the local model path. Falls back to
    the hub identifier "Qwen/Qwen3.5-9B" when nothing is found.
    """
    matches = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    if not matches:
        return "Qwen/Qwen3.5-9B"
    return os.path.dirname(matches[0])
def test_basic_inference():
"""基础推理测试"""
print("=" * 60)
print("Qwen3.5-9B 基础推理测试")
print("Qwen3.5-9B 基础推理测试 (4-bit NF4 量化, 纯GPU)")
print("=" * 60)
model_path = get_model_path()
print(f"\n模型路径: {model_path}")
# 加载 tokenizer
print("加载 tokenizer...")
# 加载模型
print("\n加载模型...")
t0 = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
print(f" Tokenizer 加载耗时: {time.time() - t0:.2f}s")
# 加载模型 (FP16 + GPU/CPU offload)
print("加载模型 (FP16 + CPU offload)...")
max_memory = {0: "6GiB", "cpu": "24GiB"}
t0 = time.time()
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.float16,
device_map="auto",
max_memory=max_memory,
offload_folder="vsp/qwen3.5-9b/offload",
trust_remote_code=True,
)
model, tokenizer = load_model()
load_time = time.time() - t0
print(f" 模型加载耗时: {load_time:.2f}s")
@@ -72,7 +48,7 @@ def test_basic_inference():
for i, prompt in enumerate(test_prompts):
print(f"\n--- 测试 {i+1}: {prompt[:30]}... ---")
messages = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
text = apply_chat(tokenizer, messages)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
input_len = inputs["input_ids"].shape[1]