fix: rework model loading to use FP16 + CPU offload

An RTX 3050 8GB cannot fully load Qwen3.5-9B, even with quantization:
- bitsandbytes 4-bit does not support CPU offload
- bitsandbytes 8-bit has version-compatibility problems with accelerate
- FP16 + CPU offload loads, but inference quality is very poor (garbled output); see the loading sketch below
- inference runs at only 0.4 tokens/s

Conclusion: an RTX 3050 8GB is not suitable for running Qwen3.5-9B.
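
For reference, a minimal sketch of the FP16 + CPU offload loading path this commit switches to. Illustrative only: the max_memory split is an assumption, the fallback Hub ID comes from the old inline loader, and the actual loader now lives in model_utils.load_model (see the diff below).

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Hypothetical standalone version of the loader; the committed code
    # imports load_model from model_utils instead of defining it inline.
    def load_model(model_path="Qwen/Qwen3.5-9B"):
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,               # FP16 weights, no bitsandbytes quantization
            device_map="auto",                       # accelerate places layers on GPU, overflows to CPU
            max_memory={0: "7GiB", "cpu": "24GiB"},  # assumed split: keep headroom on the 8 GB card
            trust_remote_code=True,
        )
        return model, tokenizer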

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 13:05:20 +08:00
parent f7174464d5
commit 4ac406572e
6 changed files with 68 additions and 80 deletions


@@ -1,35 +1,17 @@
"""并发压测 - 测试不同并发数下的性能表现"""
import json
import os
import sys
import glob
import time
import torch
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
def load_model():
"""加载 4-bit 量化模型"""
paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B"
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
)
return model, tokenizer
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model
def single_inference(model, tokenizer, prompt, lock, max_tokens=64):
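
The body of single_inference sits outside this hunk. Given the lock parameter and the ThreadPoolExecutor import, a plausible shape would serialize the generate call; this is a sketch under those assumptions, not the repo's actual code:

    import time

    def single_inference(model, tokenizer, prompt, lock, max_tokens=64):
        """Run one generation and time it; the lock serializes access to the
        shared model, since generate() on a single instance is not thread-safe."""
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        start = time.time()
        with lock:  # threads from ThreadPoolExecutor contend here
            output = model.generate(**inputs, max_new_tokens=max_tokens)
        elapsed = time.time() - start
        new_tokens = output[0][inputs["input_ids"].shape[1]:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True), elapsed

If generation is fully serialized like this, adding concurrency cannot raise throughput past the single-stream rate, which is consistent with the 0.4 tokens/s figure above.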