Files
qwen-test/vsp/qwen3.5-9b/test_basic_inference.py
16337 4ac406572e fix: 修复模型加载方式,改用 FP16+CPU offload
RTX 3050 8GB 无法完整加载 Qwen3.5-9B,即使量化也不行:
- bitsandbytes 4-bit 不支持 CPU offload
- bitsandbytes 8-bit 与 accelerate 存在版本兼容问题
- FP16 + CPU offload 可以加载但推理质量极差(输出乱码)
- 推理速度仅 0.4 tokens/s

结论:RTX 3050 8GB 不适合运行 Qwen3.5-9B

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 13:05:20 +08:00

121 lines
3.9 KiB
Python

"""基础推理测试 - 验证模型能否正常加载和生成"""
import os
import sys
import glob
import time
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer
# Work around Windows consoles that default to GBK: switch both standard
# streams to UTF-8 and substitute unencodable characters instead of raising.
# This is best-effort: a stream that is None or that has no reconfigure()
# method (e.g. a replacement object installed by the host environment) is
# left untouched rather than aborting the import with AttributeError.
for _stream in (sys.stdout, sys.stderr):
    _reconfigure = getattr(_stream, "reconfigure", None)
    if callable(_reconfigure):
        _reconfigure(encoding='utf-8', errors='replace')
def get_model_path(model_root="vsp/qwen3.5-9b/model", fallback="Qwen/Qwen3.5-9B"):
    """Return the directory of a local model checkpoint, or a fallback value.

    Searches ``model_root`` recursively for ``config.json`` files and returns
    the directory that contains one of them. When several are found, the
    shallowest one wins and ties are broken lexicographically, so the result
    does not depend on the order in which the filesystem lists entries.

    Args:
        model_root: Directory to search. A relative path is resolved against
            the current working directory.
        fallback: Value returned when no ``config.json`` exists under
            ``model_root`` (presumably a model hub identifier that the
            caller passes to ``from_pretrained`` -- confirm against caller).

    Returns:
        The directory holding the selected ``config.json``, or ``fallback``.
    """
    # Escape the root so glob metacharacters in a directory name stay literal.
    pattern = f"{glob.escape(model_root)}/**/config.json"
    candidates = glob.glob(pattern, recursive=True)
    if not candidates:
        return fallback

    def _rank(path):
        # Fewer path components first, then plain string order.
        return (len(os.path.normpath(path).split(os.sep)), path)

    return os.path.dirname(min(candidates, key=_rank))
def _cuda_memory_gb():
    """Return (allocated, reserved) CUDA memory in bytes / 1024**3, or None.

    None is returned when ``torch.cuda.is_available()`` is false, so callers
    can skip GPU reporting on machines without CUDA.
    """
    if not torch.cuda.is_available():
        return None
    scale = 1024**3
    return torch.cuda.memory_allocated() / scale, torch.cuda.memory_reserved() / scale


def _generate_reply(model, tokenizer, prompt, max_new_tokens=32):
    """Run one sampled generation for ``prompt``.

    Returns:
        A ``(reply_text, new_token_count, elapsed_seconds)`` tuple, where the
        token count is the generated sequence length minus the prompt length.
    """
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    input_len = inputs["input_ids"].shape[1]

    # perf_counter is monotonic, unlike time.time(), so a clock adjustment
    # during a slow generation cannot distort the measured interval.
    started = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.8,
        )
    elapsed = time.perf_counter() - started

    output_len = outputs.shape[1] - input_len
    # Decode only the newly generated tail, not the echoed prompt tokens.
    response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
    return response, output_len, elapsed


def test_basic_inference():
    """Load the model, generate replies for a few prompts, and print timings.

    The tokenizer and model are loaded from ``get_model_path()`` in FP16 with
    ``device_map="auto"``, a 6GiB / 24GiB GPU/CPU memory budget and an offload
    folder. Four fixed prompts are then generated with sampling, and load
    time, per-prompt speed, GPU memory and process RSS are printed.

    Returns:
        A list with one dict per prompt, holding the keys ``prompt``,
        ``output_tokens``, ``time_s`` and ``tokens_per_sec``.
    """
    rule = "=" * 60
    print(rule)
    print("Qwen3.5-9B 基础推理测试")
    print(rule)

    model_path = get_model_path()
    print(f"\n模型路径: {model_path}")

    # Load the tokenizer.
    print("加载 tokenizer...")
    started = time.perf_counter()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    print(f" Tokenizer 加载耗时: {time.perf_counter() - started:.2f}s")

    # Load the model in FP16, letting weights spill from GPU 0 to CPU memory
    # and then to the offload folder.
    # NOTE(review): the accompanying commit notes report garbled output with
    # this FP16 + offload setup; the cause is not established in this file.
    print("加载模型 (FP16 + CPU offload)...")
    max_memory = {0: "6GiB", "cpu": "24GiB"}
    started = time.perf_counter()
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto",
        max_memory=max_memory,
        offload_folder="vsp/qwen3.5-9b/offload",
        trust_remote_code=True,
    )
    load_time = time.perf_counter() - started
    print(f" 模型加载耗时: {load_time:.2f}s")

    # GPU memory right after loading (skipped without CUDA).
    gpu_mem = _cuda_memory_gb()
    if gpu_mem is not None:
        mem_used, mem_reserved = gpu_mem
        print(f" GPU 显存占用: {mem_used:.2f} GB (已分配) / {mem_reserved:.2f} GB (已预留)")

    # Inference prompts.
    test_prompts = [
        "你好,请介绍一下你自己。",
        "What is the capital of France?",
        "请用Python写一个快速排序算法。",
        "解释一下什么是机器学习。",
    ]
    print(f"\n{rule}")
    print("推理测试")
    print(rule)

    results = []
    for number, prompt in enumerate(test_prompts, start=1):
        print(f"\n--- 测试 {number}: {prompt[:30]}... ---")
        response, output_len, gen_time = _generate_reply(model, tokenizer, prompt)
        tokens_per_sec = output_len / gen_time if gen_time > 0 else 0
        print(f" 输出 tokens: {output_len}")
        print(f" 生成耗时: {gen_time:.2f}s")
        print(f" 速度: {tokens_per_sec:.1f} tokens/s")
        print(f" 回复: {response[:100]}...")
        results.append({
            "prompt": prompt,
            "output_tokens": output_len,
            "time_s": gen_time,
            "tokens_per_sec": tokens_per_sec,
        })

    # Summary.
    print(f"\n{rule}")
    print("基础测试汇总")
    print(rule)
    print(f" 模型加载耗时: {load_time:.2f}s")
    # Mean of the per-prompt rates (not total tokens over total time).
    avg_speed = sum(r["tokens_per_sec"] for r in results) / len(results)
    print(f" 平均生成速度: {avg_speed:.1f} tokens/s")
    # Guarded like the post-load report above, so CUDA is never queried on a
    # machine where it is unavailable.
    gpu_mem = _cuda_memory_gb()
    if gpu_mem is not None:
        print(f" GPU 显存占用: {gpu_mem[0]:.2f} GB")
    print(f" 系统内存占用: {psutil.Process().memory_info().rss / 1024**3:.2f} GB")
    return results
if __name__ == "__main__":
    # Switch to the directory two levels above this file before running, so
    # the relative "vsp/..." paths used by the test resolve from there.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(os.path.join(script_dir, os.pardir, os.pardir))
    test_basic_inference()