Files
qwen-test/vsp/qwen3.5-9b/generate_report.py
16337 682063abf1 feat: 改用 4-bit NF4 纯 GPU 推理,关闭 thinking 模式
- 模型加载改为 bitsandbytes 4-bit NF4 量化,device_map={"":0} 纯 GPU
- 关闭 Qwen3.5 thinking 模式 (enable_thinking=False)
- 精度从 60% 提升到 90%,推理速度 1-2 tokens/s
- GPU 显存 7.13GB/8GB,输出质量正常
- 更新所有测试结果和综合报告

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 17:38:33 +08:00

145 lines
5.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""综合报告生成 - 汇总所有测试结果"""
import json
import os
from datetime import datetime
def load_json(path):
    """Read a JSON document from *path*; return None when the file is absent."""
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)
def generate_report(results_dir="vsp/qwen3.5-9b/results"):
    """Build the combined Markdown test report from individual result files.

    Loads the speed / accuracy / concurrency / GPU-requirement JSON files
    from *results_dir* (any of them may be absent, in which case the
    corresponding section says the test was not run), assembles the report,
    writes it to ``REPORT.md`` inside *results_dir*, and prints it.

    Args:
        results_dir: Directory containing the ``*.json`` result files;
            the report is written here too. Defaults to the project's
            standard results path, preserving the original behavior.
    """
    speed = load_json(os.path.join(results_dir, "benchmark_speed.json"))
    accuracy = load_json(os.path.join(results_dir, "accuracy_results.json"))
    concurrency = load_json(os.path.join(results_dir, "concurrency_results.json"))
    gpu_req = load_json(os.path.join(results_dir, "gpu_requirements.json"))

    # Section 1: static test-environment table (hardware facts are fixed
    # for this benchmark run), plus measured memory figures when available.
    report_lines = [
        "# Qwen3.5-9B 性能测试报告",
        f"\n生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "\n## 1. 测试环境",
        "",
        "| 项目 | 值 |",
        "|------|-----|",
        "| 模型 | Qwen3.5-9B |",
        "| 加载方式 | 4-bit NF4 量化 (bitsandbytes),纯 GPU |",
        "| GPU | NVIDIA GeForce RTX 3050 OEM |",
        "| GPU 显存 | 8 GB |",
        "| CUDA | 12.1 |",
        "| Python 环境 | conda yolo |",
    ]
    if speed and "memory" in speed:
        mem = speed["memory"]
        report_lines.extend([
            f"| 模型显存占用 | {mem.get('gpu_allocated_gb', 'N/A')} GB |",
            f"| 系统内存占用 | {mem.get('ram_used_gb', 'N/A')} GB |",
        ])

    # Section 2: inference-speed benchmark table.
    report_lines.extend(["\n## 2. 推理速度", ""])
    if speed and "speed_benchmark" in speed:
        report_lines.extend([
            "| 测试场景 | 输入tokens | 输出tokens | 耗时(s) | 速度(tokens/s) |",
            "|---------|-----------|-----------|---------|---------------|",
        ])
        for r in speed["speed_benchmark"]:
            report_lines.append(
                f"| {r['test_name']} | {r['input_tokens']} | {r['avg_output_tokens']} | {r['avg_time_s']} | {r['avg_tokens_per_sec']} |"
            )
    else:
        report_lines.append("*未运行速度测试*")

    # Section 3: accuracy, overall plus per-category breakdown.
    report_lines.extend(["\n## 3. 精度评估", ""])
    if accuracy and "accuracy" in accuracy:
        acc = accuracy["accuracy"]
        report_lines.append(f"**总准确率: {acc['accuracy']}% ({acc['passed']}/{acc['total']})**\n")
        report_lines.extend([
            "| 分类 | 通过/总数 | 准确率 |",
            "|------|---------|--------|",
        ])
        for cat, stats in acc.get("category_stats", {}).items():
            # Guard against an empty category: total == 0 would otherwise
            # raise ZeroDivisionError and abort report generation.
            rate = stats["passed"] / stats["total"] * 100 if stats["total"] else 0.0
            report_lines.append(f"| {cat} | {stats['passed']}/{stats['total']} | {rate:.0f}% |")
    else:
        report_lines.append("*未运行精度测试*")

    # Section 4: concurrency throughput/latency table.
    report_lines.extend(["\n## 4. 并发性能", ""])
    if concurrency and "concurrency_results" in concurrency:
        report_lines.extend([
            "| 并发数 | 总耗时(s) | 吞吐量(tokens/s) | 平均延迟(s) |",
            "|-------|---------|----------------|-----------|",
        ])
        for r in concurrency["concurrency_results"]:
            report_lines.append(
                f"| {r['concurrency']} | {r['total_time_s']} | {r['throughput_tokens_per_sec']} | {r['avg_latency_s']} |"
            )
        report_lines.append(f"\n> 注: {concurrency.get('note', '单GPU串行推理')}")
    else:
        report_lines.append("*未运行并发测试*")

    # Section 5: GPU requirements per precision (silently skipped when the
    # JSON is missing — there is no "not run" placeholder for this section,
    # matching the original behavior).
    report_lines.extend(["\n## 5. GPU 算力需求", ""])
    if gpu_req:
        report_lines.extend([
            "| 精度 | 模型大小 | 最低显存 | 推荐显卡 |",
            "|------|---------|---------|---------|",
        ])
        for precision, info in gpu_req.get("precision_requirements", {}).items():
            gpus = ", ".join(info["recommended_gpus"][:2])
            report_lines.append(
                f"| {precision} | {info['model_size_gb']}GB | {info['min_vram_gb']}GB | {gpus} |"
            )

    # Section 6: hand-written conclusions from the actual RTX 3050 run.
    report_lines.extend([
        "\n## 6. 实际测试结论",
        "",
        "### RTX 3050 8GB 测试结果 (4-bit NF4 量化,纯 GPU)",
        "",
        "| 指标 | 结果 |",
        "|------|------|",
        "| GPU 显存占用 | 7.13 GB / 8 GB |",
        "| 系统内存占用 | 7.59 GB |",
        "| 推理速度 | 1.0-1.8 tokens/s |",
        "| 精度 | 90% (9/10) |",
        "| 输出质量 | 正常,回答准确 |",
        "",
        "### 注意事项",
        "",
        "1. **必须使用 4-bit NF4 量化**: device_map={\"\":0} 将模型全部放在 GPU 上",
        "2. **必须关闭 thinking 模式**: enable_thinking=False否则输出中包含思考过程且容易被截断",
        "3. **显存接近上限**: 7.13GB / 8GB长文本输入可能导致 OOM",
        "4. **并发不可行**: 单 GPU 串行推理,吞吐量恒定 ~2 tokens/s",
        "",
        "### 部署建议",
        "",
        "1. **RTX 3050 8GB 可用于开发测试**4-bit 量化后勉强可用",
        "2. **推荐**: RTX 3060 12GB (INT8) 或 RTX 4060 Ti 16GB有更大显存余量",
        "3. **生产部署**: RTX 4090 24GB (FP16) 或 A100 40GB/80GB + vLLM",
        "4. **替代方案**: 使用更小的模型如 Qwen2.5-3B 或 Qwen2.5-7B 在 8GB 显卡上更流畅运行",
    ])

    # Write the report. Create the results directory first: when no test
    # has been run yet the directory may not exist, and open(..., "w")
    # would raise FileNotFoundError.
    report_text = "\n".join(report_lines)
    os.makedirs(results_dir, exist_ok=True)
    report_path = os.path.join(results_dir, "REPORT.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report_text)
    print(report_text)
    print(f"\n\n报告已保存到 {report_path}")
if __name__ == "__main__":
    # Change to the repository root (two levels above this script's
    # directory) so the relative "vsp/..." result paths resolve correctly.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(os.path.join(script_dir, "..", ".."))
    generate_report()