RTX 3050 8GB 无法完整加载 Qwen3.5-9B,即使量化也不行:
- bitsandbytes 4-bit 不支持 CPU offload
- bitsandbytes 8-bit 与 accelerate 存在版本兼容问题
- FP16 + CPU offload 可以加载但推理质量极差(输出乱码)
- 推理速度仅 0.4 tokens/s

结论:RTX 3050 8GB 不适合运行 Qwen3.5-9B

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
"""共享模型加载工具 - 统一加载配置"""
|
|
import os
|
|
import sys
|
|
import glob
|
|
import torch
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
# 修复 Windows GBK 编码问题
|
|
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
|
|
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
|
|
|
|
|
|
def get_model_path():
    """Return the location the model should be loaded from.

    Recursively searches ``vsp/qwen3.5-9b/model`` (relative to the current
    working directory) for ``config.json`` files and returns the directory
    containing the chosen one. When several matches exist, the shallowest
    path wins and ties are broken alphabetically, so the result does not
    depend on the arbitrary order in which ``glob`` lists files.

    Returns:
        The directory of the selected ``config.json``, or the fallback
        identifier ``"Qwen/Qwen3.5-9B"`` when no local copy is found
        (presumably resolved by the caller as a remote model ID).
    """
    candidates = glob.glob(
        "vsp/qwen3.5-9b/model/**/config.json", recursive=True
    )
    if not candidates:
        return "Qwen/Qwen3.5-9B"

    def depth_then_name(path):
        # Normalise separators first so depth is counted consistently.
        normalized = os.path.normpath(path)
        return normalized.count(os.sep), normalized

    return os.path.dirname(min(candidates, key=depth_then_name))
|
|
|
|
|
|
def load_model(*, max_memory=None, offload_folder="vsp/qwen3.5-9b/offload"):
    """Load the model and tokenizer in FP16 with GPU/CPU offload.

    An RTX 3050 with 8GB of VRAM cannot hold the complete model, so the
    weights are loaded in FP16 and ``device_map="auto"`` distributes the
    layers across devices within the ``max_memory`` budget, offloading part
    of them to the CPU.

    Args:
        max_memory: Per-device memory budget forwarded to
            ``from_pretrained``. Defaults to ``{0: "6GiB", "cpu": "24GiB"}``
            (device 0 plus CPU RAM), the values tuned for the 8GB card.
        offload_folder: Directory forwarded to ``from_pretrained`` as
            ``offload_folder``. Defaults to ``"vsp/qwen3.5-9b/offload"``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    if max_memory is None:
        # Built per call so callers never share (and mutate) one default dict.
        max_memory = {0: "6GiB", "cpu": "24GiB"}

    model_path = get_model_path()
    print(f"模型路径: {model_path}")

    # NOTE: trust_remote_code=True lets the model repository run its own
    # Python code during loading; only point this at sources you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto",
        max_memory=max_memory,
        offload_folder=offload_folder,
        trust_remote_code=True,
    )

    return model, tokenizer
|