"""Shared model-loading utilities that provide one unified loading configuration."""

import os
import sys
import glob

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def _force_utf8_stdio():
    """Switch stdout/stderr to UTF-8 to work around the Windows GBK console encoding.

    Streams that are ``None`` or that do not offer ``reconfigure`` (for
    example when they have been replaced by a non-``TextIOWrapper`` object)
    are left untouched instead of raising at import time.
    """
    for stream in (sys.stdout, sys.stderr):
        reconfigure = getattr(stream, "reconfigure", None)
        if callable(reconfigure):
            reconfigure(encoding='utf-8', errors='replace')


# Fix the Windows GBK encoding problem (runs once, at import time).
_force_utf8_stdio()


def get_model_path() -> str:
    """Return the local model directory, or the hub model id as a fallback.

    Searches recursively below ``vsp/qwen3.5-9b/model`` (relative to the
    current working directory) for a ``config.json`` file.

    Returns:
        The directory containing the first matching ``config.json`` in sorted
        order, or ``"Qwen/Qwen3.5-9B"`` when no local copy is found.
    """
    # glob does not guarantee any ordering; sort so the selected directory is
    # stable across runs when several config.json files match.
    paths = sorted(glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True))
    if paths:
        return os.path.dirname(paths[0])
    return "Qwen/Qwen3.5-9B"


def load_model():
    """Load the model with 4-bit NF4 quantization, running purely on the GPU.

    Uses bitsandbytes 4-bit quantization. Per the original author's note the
    quantized model is about 5 GB and is placed entirely on the GPU, which
    just fits the 8 GB of VRAM on an RTX 3050.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_path = get_model_path()
    print(f"模型路径: {model_path}")

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization_config,
        # The empty-string key addresses the whole model: everything goes to device 0.
        device_map={"": 0},
        trust_remote_code=True,
    )

    return model, tokenizer


def apply_chat(tokenizer, messages):
    """Apply the tokenizer's chat template with thinking mode turned off.

    Args:
        tokenizer: Object exposing ``apply_chat_template``.
        messages: Chat messages to render; passed through unchanged.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for
        ``tokenize=False`` with a generation prompt appended.
    """
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )