- 模型加载改为 bitsandbytes 4-bit NF4 量化,device_map={"":0} 纯 GPU
- 关闭 Qwen3.5 thinking 模式 (enable_thinking=False)
- 精度从 60% 提升到 90%,推理速度 1-2 tokens/s
- GPU 显存 7.13GB/8GB,输出质量正常
- 更新所有测试结果和综合报告
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
97 lines
3.1 KiB
Python
97 lines
3.1 KiB
Python
"""基础推理测试 - 验证模型能否正常加载和生成"""
|
|
import os
|
|
import sys
|
|
import time
|
|
import torch
|
|
import psutil
|
|
|
|
# Put this script's own directory at the front of sys.path; presumably this is
# what lets the `model_utils` import below resolve — confirm against repo layout.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
from model_utils import load_model, apply_chat
|
|
|
|
# Work around the Windows GBK console encoding problem: emit UTF-8 on both
# standard streams and substitute any character that cannot be encoded.
for _stream in (sys.stdout, sys.stderr):
    _stream.reconfigure(encoding="utf-8", errors="replace")
|
|
|
|
|
|
def test_basic_inference():
    """Run a basic end-to-end inference smoke test and print a summary.

    Loads the model and tokenizer through ``load_model()``, reports GPU
    memory when CUDA is available, generates a short sampled completion for
    each of four fixed prompts, and prints per-prompt and aggregate
    statistics to stdout.

    Returns:
        list[dict]: One entry per prompt with the keys ``"prompt"``,
        ``"output_tokens"``, ``"time_s"`` and ``"tokens_per_sec"``.
    """
    banner = "=" * 60
    has_cuda = torch.cuda.is_available()

    print(banner)
    print("Qwen3.5-9B 基础推理测试 (4-bit NF4 量化, 纯GPU)")
    print(banner)

    # Load the model. perf_counter() is monotonic, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    print("\n加载模型...")
    t0 = time.perf_counter()
    model, tokenizer = load_model()
    load_time = time.perf_counter() - t0
    print(f" 模型加载耗时: {load_time:.2f}s")

    # GPU memory right after loading (bytes -> GiB).
    if has_cuda:
        mem_used = torch.cuda.memory_allocated() / 1024**3
        mem_reserved = torch.cuda.memory_reserved() / 1024**3
        print(f" GPU 显存占用: {mem_used:.2f} GB (已分配) / {mem_reserved:.2f} GB (已预留)")

    # Fixed prompts exercised by the test.
    test_prompts = [
        "你好,请介绍一下你自己。",
        "What is the capital of France?",
        "请用Python写一个快速排序算法。",
        "解释一下什么是机器学习。",
    ]

    print(f"\n{banner}")
    print("推理测试")
    print(banner)

    results = []
    for i, prompt in enumerate(test_prompts):
        print(f"\n--- 测试 {i+1}: {prompt[:30]}... ---")
        messages = [{"role": "user", "content": prompt}]
        text = apply_chat(tokenizer, messages)
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        # Prompt length in tokens; used below to separate the newly generated
        # tokens from the echoed prompt in the output sequence.
        input_len = inputs["input_ids"].shape[1]

        t0 = time.perf_counter()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=32,
                do_sample=True,
                temperature=0.7,
                top_p=0.8,
            )
        gen_time = time.perf_counter() - t0

        output_len = outputs.shape[1] - input_len
        tokens_per_sec = output_len / gen_time if gen_time > 0 else 0

        # Decode only the tokens that follow the prompt.
        response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
        print(f" 输出 tokens: {output_len}")
        print(f" 生成耗时: {gen_time:.2f}s")
        print(f" 速度: {tokens_per_sec:.1f} tokens/s")
        print(f" 回复: {response[:100]}...")

        results.append({
            "prompt": prompt,
            "output_tokens": output_len,
            "time_s": gen_time,
            "tokens_per_sec": tokens_per_sec,
        })

    # Summary
    print(f"\n{banner}")
    print("基础测试汇总")
    print(banner)
    print(f" 模型加载耗时: {load_time:.2f}s")
    # Guard the mean so an empty prompt list cannot raise ZeroDivisionError.
    avg_speed = (
        sum(r["tokens_per_sec"] for r in results) / len(results) if results else 0.0
    )
    print(f" 平均生成速度: {avg_speed:.1f} tokens/s")
    # Same CUDA guard as the post-load report above, so a CPU-only run does
    # not print a GPU figure.
    if has_cuda:
        print(f" GPU 显存占用: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f" 系统内存占用: {psutil.Process().memory_info().rss / 1024**3:.2f} GB")

    return results
|
|
|
|
|
|
if __name__ == "__main__":
    # Switch the working directory to two levels above this script's own
    # directory before running the test.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(os.path.join(script_dir, os.pardir, os.pardir))
    test_basic_inference()
|