Files
qwen-test/vsp/qwen3.5-9b/results/gpu_requirements.json
16337 682063abf1 feat: 改用 4-bit NF4 纯 GPU 推理,关闭 thinking 模式
- 模型加载改为 bitsandbytes 4-bit NF4 量化,device_map={"":0} 纯 GPU
- 关闭 Qwen3.5 thinking 模式 (enable_thinking=False)
- 精度从 60% 提升到 90%,推理速度 1-2 tokens/s
- GPU 显存 7.13GB/8GB,输出质量正常
- 更新所有测试结果和综合报告

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 17:38:33 +08:00

82 lines
2.3 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"model": "Qwen3.5-9B",
"parameters": "9B",
"precision_requirements": {
"FP32": {
"model_size_gb": 36,
"min_vram_gb": 40,
"recommended_gpus": [
"A100 80GB",
"H100 80GB"
],
"note": "不推荐,显存占用过大"
},
"FP16/BF16": {
"model_size_gb": 18,
"min_vram_gb": 22,
"recommended_gpus": [
"A100 40GB",
"RTX 4090 24GB",
"RTX A6000 48GB",
"V100 32GB"
],
"note": "标准推理精度,推荐用于生产环境"
},
"INT8": {
"model_size_gb": 9,
"min_vram_gb": 12,
"recommended_gpus": [
"RTX 4070 Ti 16GB",
"RTX 3090 24GB",
"T4 16GB",
"RTX 4080 16GB"
],
"note": "轻微精度损失,性价比高"
},
"INT4 (NF4)": {
"model_size_gb": 5,
"min_vram_gb": 8,
"recommended_gpus": [
"RTX 4060 8GB",
"RTX 3060 12GB",
"RTX 3070 8GB"
],
"note": "bitsandbytes 在 Windows 上兼容性较差,但实测 4-bit NF4 可在 8GB 显存上纯 GPU 运行 (见 actual_test_results);显存余量小、速度慢,仅建议用于开发测试"
}
},
"actual_test_results": {
"gpu": "NVIDIA GeForce RTX 3050 OEM 8GB",
"method": "4-bit NF4 量化 (bitsandbytes),纯 GPU 运行,关闭 thinking 模式",
"gpu_vram_used_gb": 7.13,
"ram_used_gb": 7.59,
"inference_speed_tokens_per_sec": "1.0-1.8",
"accuracy": "90% (9/10)",
"output_quality": "正常,回答准确",
"conclusion": "RTX 3050 8GB 可以运行 Qwen3.5-9B 4-bit 量化版本,显存占用 7.13GB,推理速度 1-2 tokens/s,适合开发测试",
"issues": [
"显存占用 7.13GB,接近 8GB 上限,长文本可能 OOM",
"推理速度较慢 (1-2 tokens/s),不适合生产环境",
"需关闭 thinking 模式才能正常输出"
]
},
"deployment_recommendations": {
"开发测试": {
"gpu": "RTX 3060 12GB / RTX 4060 Ti 16GB",
"precision": "INT8 或 INT4",
"concurrent": 1,
"cost_estimate": "~2500-4000 RMB (显卡)"
},
"小规模部署": {
"gpu": "RTX 4090 (24GB)",
"precision": "FP16",
"concurrent": "2-4",
"cost_estimate": "~12000-15000 RMB (显卡)"
},
"生产环境": {
"gpu": "A100 40GB / H100 80GB",
"precision": "FP16/BF16",
"concurrent": "8-32 (vLLM)",
"cost_estimate": "~60000-200000 RMB (显卡) 或云服务按需"
}
}
}