Files
qwen-test/vsp/qwen3.5-9b/model_utils.py
16337 682063abf1 feat: 改用 4-bit NF4 纯 GPU 推理,关闭 thinking 模式
- 模型加载改为 bitsandbytes 4-bit NF4 量化,device_map={"":0} 纯 GPU
- 关闭 Qwen3.5 thinking 模式 (enable_thinking=False)
- 精度从 60% 提升到 90%,推理速度 1-2 tokens/s
- GPU 显存 7.13GB/8GB,输出质量正常
- 更新所有测试结果和综合报告

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 17:38:33 +08:00

55 lines
1.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""共享模型加载工具 - 统一加载配置"""
import os
import sys
import glob
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# 修复 Windows GBK 编码问题
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
def get_model_path():
    """Return the local model directory if one exists, else the Hub model ID.

    Searches recursively for a ``config.json`` under ``vsp/qwen3.5-9b/model/``.
    Falls back to the Hugging Face Hub ID ``"Qwen/Qwen3.5-9B"`` so the model
    can still be downloaded when no local snapshot is present.

    Returns:
        str: directory containing ``config.json``, or the Hub model ID.
    """
    paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    if paths:
        # glob order is unspecified on some platforms; sort so the same
        # snapshot directory is picked deterministically on every run.
        return os.path.dirname(sorted(paths)[0])
    return "Qwen/Qwen3.5-9B"
def load_model():
    """Load the model with 4-bit NF4 quantization, fully resident on the GPU.

    bitsandbytes 4-bit quantization brings the weights down to roughly 5 GB,
    which fits on an RTX 3050 with 8 GB of VRAM (per the commit notes).

    Returns:
        tuple: ``(model, tokenizer)`` ready for generation.
    """
    model_path = get_model_path()
    print(f"模型路径: {model_path}")

    # NF4 with double quantization; compute in fp16 to match consumer GPUs.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map={"": 0},  # pin every module to GPU 0 — no CPU offload
        trust_remote_code=True,
    )
    return model, tokenizer
def apply_chat(tokenizer, messages):
    """Render *messages* through the tokenizer's chat template, thinking off.

    ``tokenize=False`` requests the rendered prompt rather than token IDs,
    ``add_generation_prompt=True`` appends the assistant turn marker, and
    ``enable_thinking=False`` suppresses Qwen's reasoning ("thinking") block.
    """
    template_options = {
        "tokenize": False,
        "add_generation_prompt": True,
        "enable_thinking": False,
    }
    return tokenizer.apply_chat_template(messages, **template_options)