- 模型加载改为 bitsandbytes 4-bit NF4 量化,device_map={"":0} 纯 GPU
- 关闭 Qwen3.5 thinking 模式 (enable_thinking=False)
- 精度从 60% 提升到 90%,推理速度 1-2 tokens/s
- GPU 显存 7.13GB/8GB,输出质量正常
- 更新所有测试结果和综合报告
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
55 lines
1.5 KiB
Python
"""Shared model-loading utilities - unified loading configuration."""

import glob
import os
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Work around the Windows GBK console encoding: force UTF-8 on both
# streams so non-ASCII (Chinese) log output does not raise
# UnicodeEncodeError; unencodable characters are replaced.
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
||
def get_model_path():
    """Return the local model directory if one exists, else the Hub model id.

    Recursively searches ``vsp/qwen3.5-9b/model/`` for a ``config.json``;
    the directory containing the first match is treated as the model root.

    Returns:
        str: path of the local model directory, or the ``"Qwen/Qwen3.5-9B"``
        Hugging Face Hub identifier when no local copy is found.
    """
    configs = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    if configs:
        # glob order is filesystem-dependent; the first hit is assumed to
        # be the (single) downloaded snapshot.
        return os.path.dirname(configs[0])
    return "Qwen/Qwen3.5-9B"
||
|
||
def load_model():
    """Load the model and tokenizer (4-bit NF4 quantization, GPU-only).

    Uses bitsandbytes 4-bit quantization; the quantized model is roughly
    5 GB and is placed entirely on GPU 0 (an RTX 3050 with 8 GB VRAM is
    just enough — per the original notes, ~7.13 GB is used).

    Returns:
        tuple: ``(model, tokenizer)`` ready for generation.
    """
    model_path = get_model_path()
    print(f"模型路径: {model_path}")

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # NF4 with double quantization: best quality/size trade-off for 4-bit;
    # fp16 compute keeps matmuls fast on consumer GPUs.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    # device_map={"": 0} pins every module to GPU 0 — no CPU offload,
    # which would otherwise cripple throughput.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization_config,
        device_map={"": 0},
        trust_remote_code=True,
    )

    return model, tokenizer
def apply_chat(tokenizer, messages):
    """Render chat messages through the tokenizer's template, thinking off.

    Args:
        tokenizer: object exposing ``apply_chat_template()`` (a Hugging
            Face tokenizer in normal use).
        messages: list of ``{"role": ..., "content": ...}`` dicts.

    Returns:
        The untokenized prompt string (``tokenize=False``) with the
        generation prompt appended and Qwen's "thinking" mode disabled
        (``enable_thinking=False``) for faster, direct answers.
    """
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True,
        enable_thinking=False,
    )