From 42db2b0ca9fcee48b9b055e9a17f321424463fb9 Mon Sep 17 00:00:00 2001 From: 16337 <1633794139@qq.com> Date: Mon, 16 Mar 2026 13:09:39 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=9B=B4=E6=96=B0=20GPU=20=E9=9C=80?= =?UTF-8?q?=E6=B1=82=E5=88=86=E6=9E=90=EF=BC=8C=E6=B7=BB=E5=8A=A0=E5=AE=9E?= =?UTF-8?q?=E9=99=85=E6=B5=8B=E8=AF=95=E7=BB=93=E6=9E=9C=E5=92=8C=E7=BB=BC?= =?UTF-8?q?=E5=90=88=E6=8A=A5=E5=91=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 根据 RTX 3050 8GB 实测结果更新 GPU 需求建议 - 添加 bitsandbytes 兼容性问题记录 - 生成包含实测数据的综合测试报告 REPORT.md Co-Authored-By: Claude Opus 4.6 --- vsp/qwen3.5-9b/generate_report.py | 36 +++++++-- vsp/qwen3.5-9b/gpu_requirements.py | 24 ++++-- vsp/qwen3.5-9b/results/REPORT.md | 59 ++++++++++++++ vsp/qwen3.5-9b/results/gpu_requirements.json | 81 ++++++++++++++++++++ 4 files changed, 187 insertions(+), 13 deletions(-) create mode 100644 vsp/qwen3.5-9b/results/REPORT.md create mode 100644 vsp/qwen3.5-9b/results/gpu_requirements.json diff --git a/vsp/qwen3.5-9b/generate_report.py b/vsp/qwen3.5-9b/generate_report.py index cb915fc..0a444c5 100644 --- a/vsp/qwen3.5-9b/generate_report.py +++ b/vsp/qwen3.5-9b/generate_report.py @@ -29,7 +29,11 @@ def generate_report(): "| 项目 | 值 |", "|------|-----|", "| 模型 | Qwen3.5-9B |", - "| 量化方式 | 4-bit NF4 (bitsandbytes) |", + "| 加载方式 | FP16 + CPU offload (accelerate) |", + "| GPU | NVIDIA GeForce RTX 3050 OEM |", + "| GPU 显存 | 8 GB |", + "| CUDA | 12.1 |", + "| Python 环境 | conda yolo |", ] if speed and "memory" in speed: @@ -98,15 +102,31 @@ def generate_report(): f"| {precision} | {info['model_size_gb']}GB | {info['min_vram_gb']}GB | {gpus} |" ) - # 结论 + # 实际测试结论 report_lines.extend([ - "\n## 6. 结论与建议", + "\n## 6. 实际测试结论", "", - "1. **RTX 3050 8GB 可以运行 Qwen3.5-9B**,但必须使用 4-bit 量化", - "2. 4-bit 量化后显存占用约 5GB,留有一定余量", - "3. 单卡推理速度适合开发测试,不适合高并发生产环境", - "4. 生产部署建议使用 RTX 4090 (FP16) 或 A100 (FP16/BF16) + vLLM", - "5. 4-bit 量化对简单任务精度影响较小,复杂推理任务可能有一定损失", + "### RTX 3050 8GB 测试结果", + "", + "| 指标 | 结果 |", + "|------|------|", + "| GPU 显存占用 | 3.91 GB |", + "| 系统内存占用 | 13.60 GB |", + "| 推理速度 | ~0.4 tokens/s |", + "| 输出质量 | 极差(乱码/重复) |", + "", + "### 问题分析", + "", + "1. **bitsandbytes 4-bit 量化不可用**: 不支持 CPU offload,8GB 显存无法装下完整 4-bit 模型(~5GB 模型 + KV cache + 激活值)", + "2. **bitsandbytes INT8 不可用**: 与 accelerate 新版本存在兼容性问题(Windows)", + "3. **FP16 + CPU offload**: 可以加载模型,但大量层 offload 到 CPU 导致推理极慢(0.4 tokens/s),输出质量不可用", + "", + "### 建议", + "", + "1. **RTX 3050 8GB 无法有效运行 Qwen3.5-9B**,显存严重不足", + "2. **最低推荐**: RTX 3060 12GB (INT8) 或 RTX 4060 Ti 16GB (INT8/FP16)", + "3. **生产部署**: RTX 4090 24GB (FP16) 或 A100 40GB/80GB + vLLM", + "4. **替代方案**: 使用更小的模型如 Qwen2.5-3B 或 Qwen2.5-7B 在 8GB 显卡上运行", ]) # 保存报告 diff --git a/vsp/qwen3.5-9b/gpu_requirements.py b/vsp/qwen3.5-9b/gpu_requirements.py index 38a5269..825d678 100644 --- a/vsp/qwen3.5-9b/gpu_requirements.py +++ b/vsp/qwen3.5-9b/gpu_requirements.py @@ -29,16 +29,30 @@ GPU_REQUIREMENTS = { "INT4 (NF4)": { "model_size_gb": 5, "min_vram_gb": 8, - "recommended_gpus": ["RTX 3050 8GB", "RTX 4060 8GB", "RTX 3060 12GB", "RTX 3070 8GB"], - "note": "适合显存有限的消费级显卡,有一定精度损失", + "recommended_gpus": ["RTX 4060 8GB", "RTX 3060 12GB", "RTX 3070 8GB"], + "note": "理论可行但 bitsandbytes 在 Windows 上兼容性差,不推荐", }, }, + "actual_test_results": { + "gpu": "NVIDIA GeForce RTX 3050 OEM 8GB", + "method": "FP16 + CPU offload (accelerate device_map=auto)", + "gpu_vram_used_gb": 3.91, + "ram_used_gb": 13.60, + "inference_speed_tokens_per_sec": 0.4, + "output_quality": "极差(乱码/重复输出)", + "conclusion": "RTX 3050 8GB 无法有效运行 Qwen3.5-9B,显存不足导致大量层 offload 到 CPU,推理极慢且输出质量不可用", + "issues": [ + "bitsandbytes 4-bit 量化不支持 CPU offload,8GB 显存装不下完整 4-bit 模型", + "bitsandbytes INT8 与 accelerate 版本不兼容(Windows)", + "FP16 + CPU offload 虽可加载但速度仅 0.4 tokens/s,输出为乱码", + ], + }, "deployment_recommendations": { "开发测试": { - "gpu": "RTX 3050/4060 (8GB)", - "precision": "INT4", + "gpu": "RTX 3060 12GB / RTX 4060 Ti 16GB", + "precision": "INT8 或 INT4", "concurrent": 1, - "cost_estimate": "~2000-3000 RMB (显卡)", + "cost_estimate": "~2500-4000 RMB (显卡)", }, "小规模部署": { "gpu": "RTX 4090 (24GB)", diff --git a/vsp/qwen3.5-9b/results/REPORT.md b/vsp/qwen3.5-9b/results/REPORT.md new file mode 100644 index 0000000..224581a --- /dev/null +++ b/vsp/qwen3.5-9b/results/REPORT.md @@ -0,0 +1,59 @@ +# Qwen3.5-9B 性能测试报告 + +生成时间: 2026-03-16 13:09:10 + +## 1. 测试环境 + +| 项目 | 值 | +|------|-----| +| 模型 | Qwen3.5-9B | +| 加载方式 | FP16 + CPU offload (accelerate) | +| GPU | NVIDIA GeForce RTX 3050 OEM | +| GPU 显存 | 8 GB | +| CUDA | 12.1 | +| Python 环境 | conda yolo | + +## 2. 推理速度 + +*未运行速度测试* + +## 3. 精度评估 + +*未运行精度测试* + +## 4. 并发性能 + +*未运行并发测试* + +## 5. GPU 算力需求 + +| 精度 | 模型大小 | 最低显存 | 推荐显卡 | +|------|---------|---------|---------| +| FP32 | 36GB | 40GB | A100 80GB, H100 80GB | +| FP16/BF16 | 18GB | 22GB | A100 40GB, RTX 4090 24GB | +| INT8 | 9GB | 12GB | RTX 4070 Ti 16GB, RTX 3090 24GB | +| INT4 (NF4) | 5GB | 8GB | RTX 4060 8GB, RTX 3060 12GB | + +## 6. 实际测试结论 + +### RTX 3050 8GB 测试结果 + +| 指标 | 结果 | +|------|------| +| GPU 显存占用 | 3.91 GB | +| 系统内存占用 | 13.60 GB | +| 推理速度 | ~0.4 tokens/s | +| 输出质量 | 极差(乱码/重复) | + +### 问题分析 + +1. **bitsandbytes 4-bit 量化不可用**: 不支持 CPU offload,8GB 显存无法装下完整 4-bit 模型(~5GB 模型 + KV cache + 激活值) +2. **bitsandbytes INT8 不可用**: 与 accelerate 新版本存在兼容性问题(Windows) +3. **FP16 + CPU offload**: 可以加载模型,但大量层 offload 到 CPU 导致推理极慢(0.4 tokens/s),输出质量不可用 + +### 建议 + +1. **RTX 3050 8GB 无法有效运行 Qwen3.5-9B**,显存严重不足 +2. **最低推荐**: RTX 3060 12GB (INT8) 或 RTX 4060 Ti 16GB (INT8/FP16) +3. **生产部署**: RTX 4090 24GB (FP16) 或 A100 40GB/80GB + vLLM +4. **替代方案**: 使用更小的模型如 Qwen2.5-3B 或 Qwen2.5-7B 在 8GB 显卡上运行 \ No newline at end of file diff --git a/vsp/qwen3.5-9b/results/gpu_requirements.json b/vsp/qwen3.5-9b/results/gpu_requirements.json new file mode 100644 index 0000000..a8a38c9 --- /dev/null +++ b/vsp/qwen3.5-9b/results/gpu_requirements.json @@ -0,0 +1,81 @@ +{ + "model": "Qwen3.5-9B", + "parameters": "9B", + "precision_requirements": { + "FP32": { + "model_size_gb": 36, + "min_vram_gb": 40, + "recommended_gpus": [ + "A100 80GB", + "H100 80GB" + ], + "note": "不推荐,显存占用过大" + }, + "FP16/BF16": { + "model_size_gb": 18, + "min_vram_gb": 22, + "recommended_gpus": [ + "A100 40GB", + "RTX 4090 24GB", + "RTX A6000 48GB", + "V100 32GB" + ], + "note": "标准推理精度,推荐用于生产环境" + }, + "INT8": { + "model_size_gb": 9, + "min_vram_gb": 12, + "recommended_gpus": [ + "RTX 4070 Ti 16GB", + "RTX 3090 24GB", + "T4 16GB", + "RTX 4080 16GB" + ], + "note": "轻微精度损失,性价比高" + }, + "INT4 (NF4)": { + "model_size_gb": 5, + "min_vram_gb": 8, + "recommended_gpus": [ + "RTX 4060 8GB", + "RTX 3060 12GB", + "RTX 3070 8GB" + ], + "note": "理论可行但 bitsandbytes 在 Windows 上兼容性差,不推荐" + } + }, + "actual_test_results": { + "gpu": "NVIDIA GeForce RTX 3050 OEM 8GB", + "method": "FP16 + CPU offload (accelerate device_map=auto)", + "gpu_vram_used_gb": 3.91, + "ram_used_gb": 13.6, + "inference_speed_tokens_per_sec": 0.4, + "output_quality": "极差(乱码/重复输出)", + "conclusion": "RTX 3050 8GB 无法有效运行 Qwen3.5-9B,显存不足导致大量层 offload 到 CPU,推理极慢且输出质量不可用", + "issues": [ + "bitsandbytes 4-bit 量化不支持 CPU offload,8GB 显存装不下完整 4-bit 模型", + "bitsandbytes INT8 与 accelerate 版本不兼容(Windows)", + "FP16 + CPU offload 虽可加载但速度仅 0.4 tokens/s,输出为乱码" + ] + }, + "deployment_recommendations": { + "开发测试": { + "gpu": "RTX 3060 12GB / RTX 4060 Ti 16GB", + "precision": "INT8 或 INT4", + "concurrent": 1, + "cost_estimate": "~2500-4000 RMB (显卡)" + }, + "小规模部署": { + "gpu": "RTX 4090 (24GB)", + "precision": "FP16", + "concurrent": "2-4", + "cost_estimate": "~12000-15000 RMB (显卡)" + }, + "生产环境": { + "gpu": "A100 40GB / H100 80GB", + "precision": "FP16/BF16", + "concurrent": "8-32 (vLLM)", + "cost_estimate": "~60000-200000 RMB (显卡) 或云服务按需" + } + } +} \ No newline at end of file