"""GPU 算力需求分析""" import json import os # Qwen3.5-9B 不同精度下的显存需求估算 GPU_REQUIREMENTS = { "model": "Qwen3.5-9B", "parameters": "9B", "precision_requirements": { "FP32": { "model_size_gb": 36, "min_vram_gb": 40, "recommended_gpus": ["A100 80GB", "H100 80GB"], "note": "不推荐,显存占用过大", }, "FP16/BF16": { "model_size_gb": 18, "min_vram_gb": 22, "recommended_gpus": ["A100 40GB", "RTX 4090 24GB", "RTX A6000 48GB", "V100 32GB"], "note": "标准推理精度,推荐用于生产环境", }, "INT8": { "model_size_gb": 9, "min_vram_gb": 12, "recommended_gpus": ["RTX 4070 Ti 16GB", "RTX 3090 24GB", "T4 16GB", "RTX 4080 16GB"], "note": "轻微精度损失,性价比高", }, "INT4 (NF4)": { "model_size_gb": 5, "min_vram_gb": 8, "recommended_gpus": ["RTX 4060 8GB", "RTX 3060 12GB", "RTX 3070 8GB"], "note": "理论可行但 bitsandbytes 在 Windows 上兼容性差,不推荐", }, }, "actual_test_results": { "gpu": "NVIDIA GeForce RTX 3050 OEM 8GB", "method": "FP16 + CPU offload (accelerate device_map=auto)", "gpu_vram_used_gb": 3.91, "ram_used_gb": 13.60, "inference_speed_tokens_per_sec": 0.4, "output_quality": "极差(乱码/重复输出)", "conclusion": "RTX 3050 8GB 无法有效运行 Qwen3.5-9B,显存不足导致大量层 offload 到 CPU,推理极慢且输出质量不可用", "issues": [ "bitsandbytes 4-bit 量化不支持 CPU offload,8GB 显存装不下完整 4-bit 模型", "bitsandbytes INT8 与 accelerate 版本不兼容(Windows)", "FP16 + CPU offload 虽可加载但速度仅 0.4 tokens/s,输出为乱码", ], }, "deployment_recommendations": { "开发测试": { "gpu": "RTX 3060 12GB / RTX 4060 Ti 16GB", "precision": "INT8 或 INT4", "concurrent": 1, "cost_estimate": "~2500-4000 RMB (显卡)", }, "小规模部署": { "gpu": "RTX 4090 (24GB)", "precision": "FP16", "concurrent": "2-4", "cost_estimate": "~12000-15000 RMB (显卡)", }, "生产环境": { "gpu": "A100 40GB / H100 80GB", "precision": "FP16/BF16", "concurrent": "8-32 (vLLM)", "cost_estimate": "~60000-200000 RMB (显卡) 或云服务按需", }, }, } def analyze_gpu_requirements(): """输出 GPU 需求分析""" print("=" * 60) print("Qwen3.5-9B GPU 算力需求分析") print("=" * 60) for precision, info in GPU_REQUIREMENTS["precision_requirements"].items(): print(f"\n{precision}:") print(f" 模型大小: ~{info['model_size_gb']} GB") print(f" 最低显存: {info['min_vram_gb']} GB") print(f" 推荐显卡: {', '.join(info['recommended_gpus'])}") print(f" 备注: {info['note']}") print(f"\n{'='*60}") print("部署方案推荐") print(f"{'='*60}") for scenario, info in GPU_REQUIREMENTS["deployment_recommendations"].items(): print(f"\n{scenario}:") for k, v in info.items(): print(f" {k}: {v}") # 保存 output_dir = "vsp/qwen3.5-9b/results" os.makedirs(output_dir, exist_ok=True) path = os.path.join(output_dir, "gpu_requirements.json") with open(path, "w", encoding="utf-8") as f: json.dump(GPU_REQUIREMENTS, f, ensure_ascii=False, indent=2) print(f"\n结果已保存到 {path}") if __name__ == "__main__": analyze_gpu_requirements()