Files
qwen-test/vsp/qwen3.5-9b/generate_report.py
2026-03-16 11:45:51 +08:00

125 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""综合报告生成 - 汇总所有测试结果"""
import json
import os
from datetime import datetime
def load_json(path):
    """Read a JSON file and return its parsed content, or ``None`` if the
    file does not exist.

    Args:
        path: Filesystem path of the JSON document.

    Returns:
        The deserialized object, or ``None`` when *path* is missing.
    """
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
def _env_section(speed):
    """Section 1: environment table (model, quantization, GPU/RAM usage)."""
    lines = [
        "# Qwen3.5-9B 性能测试报告",
        f"\n生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "\n## 1. 测试环境",
        "",
        "| 项目 | 值 |",
        "|------|-----|",
        "| 模型 | Qwen3.5-9B |",
        "| 量化方式 | 4-bit NF4 (bitsandbytes) |",
    ]
    if speed and "memory" in speed:
        mem = speed["memory"]
        lines.extend([
            f"| GPU | {mem.get('gpu_name', 'N/A')} |",
            f"| GPU 显存 | {mem.get('gpu_total_gb', 'N/A')} GB |",
            f"| 模型显存占用 | {mem.get('gpu_allocated_gb', 'N/A')} GB |",
            f"| 系统内存占用 | {mem.get('ram_used_gb', 'N/A')} GB |",
        ])
    return lines


def _speed_section(speed):
    """Section 2: per-scenario inference speed table."""
    lines = ["\n## 2. 推理速度", ""]
    if speed and "speed_benchmark" in speed:
        lines.extend([
            "| 测试场景 | 输入tokens | 输出tokens | 耗时(s) | 速度(tokens/s) |",
            "|---------|-----------|-----------|---------|---------------|",
        ])
        for r in speed["speed_benchmark"]:
            lines.append(
                f"| {r['test_name']} | {r['input_tokens']} | {r['avg_output_tokens']} | {r['avg_time_s']} | {r['avg_tokens_per_sec']} |"
            )
    else:
        lines.append("*未运行速度测试*")
    return lines


def _accuracy_section(accuracy):
    """Section 3: overall and per-category accuracy."""
    lines = ["\n## 3. 精度评估", ""]
    if accuracy and "accuracy" in accuracy:
        acc = accuracy["accuracy"]
        lines.append(f"**总准确率: {acc['accuracy']}% ({acc['passed']}/{acc['total']})**\n")
        lines.extend([
            "| 分类 | 通过/总数 | 准确率 |",
            "|------|---------|--------|",
        ])
        for cat, stats in acc.get("category_stats", {}).items():
            # Guard: a category with zero samples must not raise ZeroDivisionError.
            rate = stats["passed"] / stats["total"] * 100 if stats["total"] else 0.0
            lines.append(f"| {cat} | {stats['passed']}/{stats['total']} | {rate:.0f}% |")
    else:
        lines.append("*未运行精度测试*")
    return lines


def _concurrency_section(concurrency):
    """Section 4: concurrency throughput/latency table."""
    lines = ["\n## 4. 并发性能", ""]
    if concurrency and "concurrency_results" in concurrency:
        lines.extend([
            "| 并发数 | 总耗时(s) | 吞吐量(tokens/s) | 平均延迟(s) |",
            "|-------|---------|----------------|-----------|",
        ])
        for r in concurrency["concurrency_results"]:
            lines.append(
                f"| {r['concurrency']} | {r['total_time_s']} | {r['throughput_tokens_per_sec']} | {r['avg_latency_s']} |"
            )
        lines.append(f"\n> 注: {concurrency.get('note', '单GPU串行推理')}")
    else:
        lines.append("*未运行并发测试*")
    return lines


def _gpu_section(gpu_req):
    """Section 5: VRAM requirement table per precision."""
    lines = ["\n## 5. GPU 算力需求", ""]
    if gpu_req:
        lines.extend([
            "| 精度 | 模型大小 | 最低显存 | 推荐显卡 |",
            "|------|---------|---------|---------|",
        ])
        for precision, info in gpu_req.get("precision_requirements", {}).items():
            gpus = ", ".join(info["recommended_gpus"][:2])
            lines.append(
                f"| {precision} | {info['model_size_gb']}GB | {info['min_vram_gb']}GB | {gpus} |"
            )
    return lines


def _conclusion_section():
    """Section 6: fixed conclusions and deployment recommendations."""
    return [
        "\n## 6. 结论与建议",
        "",
        "1. **RTX 3050 8GB 可以运行 Qwen3.5-9B**,但必须使用 4-bit 量化",
        # Fixed typo: a comma was missing between "5GB" and "留有一定余量".
        "2. 4-bit 量化后显存占用约 5GB,留有一定余量",
        "3. 单卡推理速度适合开发测试,不适合高并发生产环境",
        "4. 生产部署建议使用 RTX 4090 (FP16) 或 A100 (FP16/BF16) + vLLM",
        "5. 4-bit 量化对简单任务精度影响较小,复杂推理任务可能有一定损失",
    ]


def generate_report():
    """Build the combined Markdown test report and save it as REPORT.md.

    Aggregates four optional result files (speed, accuracy, concurrency,
    GPU requirements) from ``results_dir``; a missing file renders as a
    "not run" note in its section. The finished report is printed to
    stdout and written to ``<results_dir>/REPORT.md``.
    """
    results_dir = "vsp/qwen3.5-9b/results"
    speed = load_json(os.path.join(results_dir, "benchmark_speed.json"))
    accuracy = load_json(os.path.join(results_dir, "accuracy_results.json"))
    concurrency = load_json(os.path.join(results_dir, "concurrency_results.json"))
    gpu_req = load_json(os.path.join(results_dir, "gpu_requirements.json"))

    report_lines = (
        _env_section(speed)
        + _speed_section(speed)
        + _accuracy_section(accuracy)
        + _concurrency_section(concurrency)
        + _gpu_section(gpu_req)
        + _conclusion_section()
    )

    report_text = "\n".join(report_lines)
    # Ensure the output directory exists even when no benchmark has run yet,
    # so writing REPORT.md cannot fail with FileNotFoundError.
    os.makedirs(results_dir, exist_ok=True)
    report_path = os.path.join(results_dir, "REPORT.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report_text)
    print(report_text)
    print(f"\n\n报告已保存到 {report_path}")
if __name__ == "__main__":
    # Run from the repository root (two levels above this script) so the
    # relative "vsp/qwen3.5-9b/results" paths inside generate_report()
    # resolve correctly. os.path.join replaces the original fragile
    # "+ '/../..'" string concatenation, which hard-coded the POSIX
    # separator and broke portability.
    os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".."))
    generate_report()