diff --git a/vsp/qwen3.5-9b/generate_report.py b/vsp/qwen3.5-9b/generate_report.py
new file mode 100644
index 0000000..cb915fc
--- /dev/null
+++ b/vsp/qwen3.5-9b/generate_report.py
@@ -0,0 +1,127 @@
+"""Aggregate all benchmark results into a single Markdown report."""
+import json
+import os
+from datetime import datetime
+
+
+def load_json(path):
+    """Load a JSON file; return None when the file does not exist."""
+    if os.path.exists(path):
+        with open(path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    return None
+
+
+def generate_report():
+    """Build the combined test report and write it to results/REPORT.md.
+
+    Missing result files are tolerated: each section falls back to a
+    "not run" placeholder when its JSON input is absent.
+    """
+    results_dir = "vsp/qwen3.5-9b/results"
+
+    speed = load_json(os.path.join(results_dir, "benchmark_speed.json"))
+    accuracy = load_json(os.path.join(results_dir, "accuracy_results.json"))
+    concurrency = load_json(os.path.join(results_dir, "concurrency_results.json"))
+    gpu_req = load_json(os.path.join(results_dir, "gpu_requirements.json"))
+
+    report_lines = [
+        "# Qwen3.5-9B 性能测试报告",
+        f"\n生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+        "\n## 1. 测试环境",
+        "",
+        "| 项目 | 值 |",
+        "|------|-----|",
+        "| 模型 | Qwen3.5-9B |",
+        "| 量化方式 | 4-bit NF4 (bitsandbytes) |",
+    ]
+
+    if speed and "memory" in speed:
+        mem = speed["memory"]
+        report_lines.extend([
+            f"| GPU | {mem.get('gpu_name', 'N/A')} |",
+            f"| GPU 显存 | {mem.get('gpu_total_gb', 'N/A')} GB |",
+            f"| 模型显存占用 | {mem.get('gpu_allocated_gb', 'N/A')} GB |",
+            f"| 系统内存占用 | {mem.get('ram_used_gb', 'N/A')} GB |",
+        ])
+
+    # Inference speed
+    report_lines.extend(["\n## 2. 推理速度", ""])
+    if speed and "speed_benchmark" in speed:
+        report_lines.extend([
+            "| 测试场景 | 输入tokens | 输出tokens | 耗时(s) | 速度(tokens/s) |",
+            "|---------|-----------|-----------|---------|---------------|",
+        ])
+        for r in speed["speed_benchmark"]:
+            report_lines.append(
+                f"| {r['test_name']} | {r['input_tokens']} | {r['avg_output_tokens']} | {r['avg_time_s']} | {r['avg_tokens_per_sec']} |"
+            )
+    else:
+        report_lines.append("*未运行速度测试*")
+
+    # Accuracy
+    report_lines.extend(["\n## 3. 精度评估", ""])
+    if accuracy and "accuracy" in accuracy:
+        acc = accuracy["accuracy"]
+        report_lines.append(f"**总准确率: {acc['accuracy']}% ({acc['passed']}/{acc['total']})**\n")
+        report_lines.extend([
+            "| 分类 | 通过/总数 | 准确率 |",
+            "|------|---------|--------|",
+        ])
+        for cat, stats in acc.get("category_stats", {}).items():
+            # Guard against an empty category (total == 0) to avoid ZeroDivisionError.
+            rate = stats["passed"] / stats["total"] * 100 if stats["total"] else 0.0
+            report_lines.append(f"| {cat} | {stats['passed']}/{stats['total']} | {rate:.0f}% |")
+    else:
+        report_lines.append("*未运行精度测试*")
+
+    # Concurrency
+    report_lines.extend(["\n## 4. 并发性能", ""])
+    if concurrency and "concurrency_results" in concurrency:
+        report_lines.extend([
+            "| 并发数 | 总耗时(s) | 吞吐量(tokens/s) | 平均延迟(s) |",
+            "|-------|---------|----------------|-----------|",
+        ])
+        for r in concurrency["concurrency_results"]:
+            report_lines.append(
+                f"| {r['concurrency']} | {r['total_time_s']} | {r['throughput_tokens_per_sec']} | {r['avg_latency_s']} |"
+            )
+        report_lines.append(f"\n> 注: {concurrency.get('note', '单GPU串行推理')}")
+    else:
+        report_lines.append("*未运行并发测试*")
+
+    # GPU requirements
+    report_lines.extend(["\n## 5. GPU 算力需求", ""])
+    if gpu_req:
+        report_lines.extend([
+            "| 精度 | 模型大小 | 最低显存 | 推荐显卡 |",
+            "|------|---------|---------|---------|",
+        ])
+        for precision, info in gpu_req.get("precision_requirements", {}).items():
+            gpus = ", ".join(info["recommended_gpus"][:2])
+            report_lines.append(
+                f"| {precision} | {info['model_size_gb']}GB | {info['min_vram_gb']}GB | {gpus} |"
+            )
+
+    # Conclusions
+    report_lines.extend([
+        "\n## 6. 结论与建议",
+        "",
+        "1. **RTX 3050 8GB 可以运行 Qwen3.5-9B**,但必须使用 4-bit 量化",
+        "2. 4-bit 量化后显存占用约 5GB,留有一定余量",
+        "3. 单卡推理速度适合开发测试,不适合高并发生产环境",
+        "4. 生产部署建议使用 RTX 4090 (FP16) 或 A100 (FP16/BF16) + vLLM",
+        "5. 4-bit 量化对简单任务精度影响较小,复杂推理任务可能有一定损失",
+    ])
+
+    # Save the report. The results dir may not exist yet when no test
+    # produced output before this script runs, so create it first
+    # (consistent with gpu_requirements.py).
+    os.makedirs(results_dir, exist_ok=True)
+    report_text = "\n".join(report_lines)
+    report_path = os.path.join(results_dir, "REPORT.md")
+    with open(report_path, "w", encoding="utf-8") as f:
+        f.write(report_text)
+
+    print(report_text)
+    print(f"\n\n报告已保存到 {report_path}")
+
+
+if __name__ == "__main__":
+    # Run from the repository root so the relative results path resolves.
+    os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".."))
+    generate_report()
diff --git a/vsp/qwen3.5-9b/gpu_requirements.py b/vsp/qwen3.5-9b/gpu_requirements.py
new file mode 100644
index 0000000..38a5269
--- /dev/null
+++ b/vsp/qwen3.5-9b/gpu_requirements.py
@@ -0,0 +1,90 @@
+"""GPU compute requirement analysis for Qwen3.5-9B."""
+import json
+import os
+
+
+# Estimated VRAM requirements for Qwen3.5-9B at different precisions.
+GPU_REQUIREMENTS = {
+    "model": "Qwen3.5-9B",
+    "parameters": "9B",
+    "precision_requirements": {
+        "FP32": {
+            "model_size_gb": 36,
+            "min_vram_gb": 40,
+            "recommended_gpus": ["A100 80GB", "H100 80GB"],
+            "note": "不推荐,显存占用过大",
+        },
+        "FP16/BF16": {
+            "model_size_gb": 18,
+            "min_vram_gb": 22,
+            "recommended_gpus": ["A100 40GB", "RTX 4090 24GB", "RTX A6000 48GB", "V100 32GB"],
+            "note": "标准推理精度,推荐用于生产环境",
+        },
+        "INT8": {
+            "model_size_gb": 9,
+            "min_vram_gb": 12,
+            "recommended_gpus": ["RTX 4070 Ti 16GB", "RTX 3090 24GB", "T4 16GB", "RTX 4080 16GB"],
+            "note": "轻微精度损失,性价比高",
+        },
+        "INT4 (NF4)": {
+            "model_size_gb": 5,
+            "min_vram_gb": 8,
+            "recommended_gpus": ["RTX 3050 8GB", "RTX 4060 8GB", "RTX 3060 12GB", "RTX 3070 8GB"],
+            "note": "适合显存有限的消费级显卡,有一定精度损失",
+        },
+    },
+    "deployment_recommendations": {
+        "开发测试": {
+            "gpu": "RTX 3050/4060 (8GB)",
+            "precision": "INT4",
+            "concurrent": 1,
+            "cost_estimate": "~2000-3000 RMB (显卡)",
+        },
+        "小规模部署": {
+            "gpu": "RTX 4090 (24GB)",
+            "precision": "FP16",
+            "concurrent": "2-4",
+            "cost_estimate": "~12000-15000 RMB (显卡)",
+        },
+        "生产环境": {
+            "gpu": "A100 40GB / H100 80GB",
+            "precision": "FP16/BF16",
+            "concurrent": "8-32 (vLLM)",
+            "cost_estimate": "~60000-200000 RMB (显卡) 或云服务按需",
+        },
+    },
+}
+
+
+def analyze_gpu_requirements():
+    """Print the GPU requirement analysis and save it as JSON."""
+    print("=" * 60)
+    print("Qwen3.5-9B GPU 算力需求分析")
+    print("=" * 60)
+
+    for precision, info in GPU_REQUIREMENTS["precision_requirements"].items():
+        print(f"\n{precision}:")
+        print(f"  模型大小: ~{info['model_size_gb']} GB")
+        print(f"  最低显存: {info['min_vram_gb']} GB")
+        print(f"  推荐显卡: {', '.join(info['recommended_gpus'])}")
+        print(f"  备注: {info['note']}")
+
+    print(f"\n{'='*60}")
+    print("部署方案推荐")
+    print(f"{'='*60}")
+    for scenario, info in GPU_REQUIREMENTS["deployment_recommendations"].items():
+        print(f"\n{scenario}:")
+        for k, v in info.items():
+            print(f"  {k}: {v}")
+
+    # Persist the analysis so generate_report.py can pick it up.
+    output_dir = "vsp/qwen3.5-9b/results"
+    os.makedirs(output_dir, exist_ok=True)
+    path = os.path.join(output_dir, "gpu_requirements.json")
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(GPU_REQUIREMENTS, f, ensure_ascii=False, indent=2)
+    print(f"\n结果已保存到 {path}")
+
+
+if __name__ == "__main__":
+    analyze_gpu_requirements()