feat: 改用 4-bit NF4 纯 GPU 推理,关闭 thinking 模式

- 模型加载改为 bitsandbytes 4-bit NF4 量化,device_map={"":0} 纯 GPU
- 关闭 Qwen3.5 thinking 模式 (enable_thinking=False)
- 精度从 60% 提升到 90%,推理速度 1-2 tokens/s
- GPU 显存 7.13GB/8GB,输出质量正常
- 更新所有测试结果和综合报告

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-16 17:38:33 +08:00
parent 42db2b0ca9
commit 682063abf1
12 changed files with 356 additions and 104 deletions

View File

@@ -1,52 +1,28 @@
"""基础推理测试 - 验证模型能否正常加载和生成"""
import os
import sys
import glob
import time
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model, apply_chat
# 修复 Windows GBK 编码问题
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
def get_model_path():
    """Return the local model directory if one exists, else the Hugging Face model id.

    Recursively searches vsp/qwen3.5-9b/model/ for a config.json; the directory
    containing the first match is treated as the local model path. Falls back to
    the hub identifier "Qwen/Qwen3.5-9B" when nothing is found.
    """
    matches = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    if not matches:
        return "Qwen/Qwen3.5-9B"
    return os.path.dirname(matches[0])
def test_basic_inference():
"""基础推理测试"""
print("=" * 60)
print("Qwen3.5-9B 基础推理测试")
print("Qwen3.5-9B 基础推理测试 (4-bit NF4 量化, 纯GPU)")
print("=" * 60)
model_path = get_model_path()
print(f"\n模型路径: {model_path}")
# 加载 tokenizer
print("加载 tokenizer...")
# 加载模型
print("\n加载模型...")
t0 = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
print(f" Tokenizer 加载耗时: {time.time() - t0:.2f}s")
# 加载模型 (FP16 + GPU/CPU offload)
print("加载模型 (FP16 + CPU offload)...")
max_memory = {0: "6GiB", "cpu": "24GiB"}
t0 = time.time()
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.float16,
device_map="auto",
max_memory=max_memory,
offload_folder="vsp/qwen3.5-9b/offload",
trust_remote_code=True,
)
model, tokenizer = load_model()
load_time = time.time() - t0
print(f" 模型加载耗时: {load_time:.2f}s")
@@ -72,7 +48,7 @@ def test_basic_inference():
for i, prompt in enumerate(test_prompts):
print(f"\n--- 测试 {i+1}: {prompt[:30]}... ---")
messages = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
text = apply_chat(tokenizer, messages)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
input_len = inputs["input_ids"].shape[1]