feat: 添加 GPU 需求分析和综合报告生成脚本
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
124
vsp/qwen3.5-9b/generate_report.py
Normal file
124
vsp/qwen3.5-9b/generate_report.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""综合报告生成 - 汇总所有测试结果"""
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def load_json(path):
    """Load and parse a UTF-8 JSON file.

    Args:
        path: Filesystem path to the JSON file.

    Returns:
        The parsed JSON value, or None if the file does not exist.
    """
    # EAFP: open directly instead of an exists() pre-check, which is
    # racy if the file disappears between the check and the open.
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
|
||||
|
||||
|
||||
def _load_result(path):
    """Return the parsed JSON at *path*, or None if the file is missing."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None


def _environment_section(speed):
    """Section 1: test environment table (GPU/RAM rows only if measured)."""
    lines = [
        "\n## 1. 测试环境",
        "",
        "| 项目 | 值 |",
        "|------|-----|",
        "| 模型 | Qwen3.5-9B |",
        "| 量化方式 | 4-bit NF4 (bitsandbytes) |",
    ]
    if speed and "memory" in speed:
        mem = speed["memory"]
        lines.extend([
            f"| GPU | {mem.get('gpu_name', 'N/A')} |",
            f"| GPU 显存 | {mem.get('gpu_total_gb', 'N/A')} GB |",
            f"| 模型显存占用 | {mem.get('gpu_allocated_gb', 'N/A')} GB |",
            f"| 系统内存占用 | {mem.get('ram_used_gb', 'N/A')} GB |",
        ])
    return lines


def _speed_section(speed):
    """Section 2: inference-speed benchmark table, or a placeholder."""
    lines = ["\n## 2. 推理速度", ""]
    if speed and "speed_benchmark" in speed:
        lines.extend([
            "| 测试场景 | 输入tokens | 输出tokens | 耗时(s) | 速度(tokens/s) |",
            "|---------|-----------|-----------|---------|---------------|",
        ])
        for r in speed["speed_benchmark"]:
            lines.append(
                f"| {r['test_name']} | {r['input_tokens']} | {r['avg_output_tokens']} | {r['avg_time_s']} | {r['avg_tokens_per_sec']} |"
            )
    else:
        lines.append("*未运行速度测试*")
    return lines


def _accuracy_section(accuracy):
    """Section 3: overall accuracy plus per-category breakdown."""
    lines = ["\n## 3. 精度评估", ""]
    if accuracy and "accuracy" in accuracy:
        acc = accuracy["accuracy"]
        lines.append(f"**总准确率: {acc['accuracy']}% ({acc['passed']}/{acc['total']})**\n")
        lines.extend([
            "| 分类 | 通过/总数 | 准确率 |",
            "|------|---------|--------|",
        ])
        for cat, stats in acc.get("category_stats", {}).items():
            # Guard against an empty category (total == 0) to avoid
            # ZeroDivisionError; report it as 0%.
            rate = stats["passed"] / stats["total"] * 100 if stats["total"] else 0.0
            lines.append(f"| {cat} | {stats['passed']}/{stats['total']} | {rate:.0f}% |")
    else:
        lines.append("*未运行精度测试*")
    return lines


def _concurrency_section(concurrency):
    """Section 4: concurrency throughput/latency table, or a placeholder."""
    lines = ["\n## 4. 并发性能", ""]
    if concurrency and "concurrency_results" in concurrency:
        lines.extend([
            "| 并发数 | 总耗时(s) | 吞吐量(tokens/s) | 平均延迟(s) |",
            "|-------|---------|----------------|-----------|",
        ])
        for r in concurrency["concurrency_results"]:
            lines.append(
                f"| {r['concurrency']} | {r['total_time_s']} | {r['throughput_tokens_per_sec']} | {r['avg_latency_s']} |"
            )
        lines.append(f"\n> 注: {concurrency.get('note', '单GPU串行推理')}")
    else:
        lines.append("*未运行并发测试*")
    return lines


def _gpu_section(gpu_req):
    """Section 5: per-precision GPU requirement table (omitted if no data)."""
    lines = ["\n## 5. GPU 算力需求", ""]
    if gpu_req:
        lines.extend([
            "| 精度 | 模型大小 | 最低显存 | 推荐显卡 |",
            "|------|---------|---------|---------|",
        ])
        for precision, info in gpu_req.get("precision_requirements", {}).items():
            # Only show the top two recommendations to keep the table compact.
            gpus = ", ".join(info["recommended_gpus"][:2])
            lines.append(
                f"| {precision} | {info['model_size_gb']}GB | {info['min_vram_gb']}GB | {gpus} |"
            )
    return lines


def generate_report():
    """Aggregate all test results into a Markdown report.

    Reads the speed/accuracy/concurrency/GPU-requirement JSON files from
    the results directory (missing files are tolerated and reported as
    "not run"), renders a Markdown summary, writes it to REPORT.md in the
    same directory, and echoes the report to stdout.

    Note: paths are relative to the current working directory — the
    ``__main__`` guard chdirs to the repository root before calling this.
    """
    results_dir = "vsp/qwen3.5-9b/results"

    speed = _load_result(os.path.join(results_dir, "benchmark_speed.json"))
    accuracy = _load_result(os.path.join(results_dir, "accuracy_results.json"))
    concurrency = _load_result(os.path.join(results_dir, "concurrency_results.json"))
    gpu_req = _load_result(os.path.join(results_dir, "gpu_requirements.json"))

    report_lines = [
        "# Qwen3.5-9B 性能测试报告",
        f"\n生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    ]
    report_lines.extend(_environment_section(speed))
    report_lines.extend(_speed_section(speed))
    report_lines.extend(_accuracy_section(accuracy))
    report_lines.extend(_concurrency_section(concurrency))
    report_lines.extend(_gpu_section(gpu_req))

    # Section 6: static conclusions/recommendations.
    report_lines.extend([
        "\n## 6. 结论与建议",
        "",
        "1. **RTX 3050 8GB 可以运行 Qwen3.5-9B**,但必须使用 4-bit 量化",
        "2. 4-bit 量化后显存占用约 5GB,留有一定余量",
        "3. 单卡推理速度适合开发测试,不适合高并发生产环境",
        "4. 生产部署建议使用 RTX 4090 (FP16) 或 A100 (FP16/BF16) + vLLM",
        "5. 4-bit 量化对简单任务精度影响较小,复杂推理任务可能有一定损失",
    ])

    # Save the report. Create the results directory first: if no test has
    # run yet it does not exist, and open(..., "w") would otherwise raise
    # FileNotFoundError.
    report_text = "\n".join(report_lines)
    os.makedirs(results_dir, exist_ok=True)
    report_path = os.path.join(results_dir, "REPORT.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report_text)

    print(report_text)
    print(f"\n\n报告已保存到 {report_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run from the repository root so the relative results paths resolve
    # regardless of where the script is invoked from. Use os.path.join /
    # normpath instead of string concatenation for portability.
    repo_root = os.path.normpath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..")
    )
    os.chdir(repo_root)
    generate_report()
|
||||
90
vsp/qwen3.5-9b/gpu_requirements.py
Normal file
90
vsp/qwen3.5-9b/gpu_requirements.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""GPU 算力需求分析"""
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
# Estimated VRAM requirements for Qwen3.5-9B at different precisions.
# NOTE(review): figures look like bytes-per-parameter x 9B with extra
# headroom baked into min_vram_gb (presumably for activations/KV cache)
# — confirm against actual measurements before relying on them.
GPU_REQUIREMENTS = {
    "model": "Qwen3.5-9B",
    "parameters": "9B",
    # Per precision: approximate weight size, minimum usable VRAM, and
    # example GPUs that meet it.
    "precision_requirements": {
        "FP32": {
            "model_size_gb": 36,
            "min_vram_gb": 40,
            "recommended_gpus": ["A100 80GB", "H100 80GB"],
            "note": "不推荐,显存占用过大",
        },
        "FP16/BF16": {
            "model_size_gb": 18,
            "min_vram_gb": 22,
            "recommended_gpus": ["A100 40GB", "RTX 4090 24GB", "RTX A6000 48GB", "V100 32GB"],
            "note": "标准推理精度,推荐用于生产环境",
        },
        "INT8": {
            "model_size_gb": 9,
            "min_vram_gb": 12,
            "recommended_gpus": ["RTX 4070 Ti 16GB", "RTX 3090 24GB", "T4 16GB", "RTX 4080 16GB"],
            "note": "轻微精度损失,性价比高",
        },
        "INT4 (NF4)": {
            "model_size_gb": 5,
            "min_vram_gb": 8,
            "recommended_gpus": ["RTX 3050 8GB", "RTX 4060 8GB", "RTX 3060 12GB", "RTX 3070 8GB"],
            "note": "适合显存有限的消费级显卡,有一定精度损失",
        },
    },
    # Deployment scenarios keyed by use case (dev/test, small-scale,
    # production); values are free-form fields printed verbatim.
    "deployment_recommendations": {
        "开发测试": {
            "gpu": "RTX 3050/4060 (8GB)",
            "precision": "INT4",
            "concurrent": 1,
            "cost_estimate": "~2000-3000 RMB (显卡)",
        },
        "小规模部署": {
            "gpu": "RTX 4090 (24GB)",
            "precision": "FP16",
            "concurrent": "2-4",
            "cost_estimate": "~12000-15000 RMB (显卡)",
        },
        "生产环境": {
            "gpu": "A100 40GB / H100 80GB",
            "precision": "FP16/BF16",
            "concurrent": "8-32 (vLLM)",
            "cost_estimate": "~60000-200000 RMB (显卡) 或云服务按需",
        },
    },
}
|
||||
|
||||
|
||||
def analyze_gpu_requirements():
    """Print the GPU requirement analysis and persist it as JSON.

    Walks GPU_REQUIREMENTS, printing the per-precision VRAM table and the
    deployment-scenario recommendations, then writes the whole structure
    to results/gpu_requirements.json (creating the directory if needed).
    """
    divider = "=" * 60

    print(divider)
    print("Qwen3.5-9B GPU 算力需求分析")
    print(divider)

    # Per-precision VRAM requirement table.
    for precision, details in GPU_REQUIREMENTS["precision_requirements"].items():
        gpu_list = ", ".join(details["recommended_gpus"])
        print(f"\n{precision}:")
        print(f" 模型大小: ~{details['model_size_gb']} GB")
        print(f" 最低显存: {details['min_vram_gb']} GB")
        print(f" 推荐显卡: {gpu_list}")
        print(f" 备注: {details['note']}")

    print("\n" + divider)
    print("部署方案推荐")
    print(divider)
    # Deployment scenarios: dump every field/value pair as-is.
    for scenario, details in GPU_REQUIREMENTS["deployment_recommendations"].items():
        print(f"\n{scenario}:")
        for field, value in details.items():
            print(f" {field}: {value}")

    # Persist the table so other tooling (e.g. the report generator)
    # can pick it up from the results directory.
    output_dir = "vsp/qwen3.5-9b/results"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "gpu_requirements.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(GPU_REQUIREMENTS, f, ensure_ascii=False, indent=2)
    print(f"\n结果已保存到 {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: print the analysis and write the results JSON
    # (paths are relative to the current working directory).
    analyze_gpu_requirements()
|
||||
Reference in New Issue
Block a user