- 模型加载改为 bitsandbytes 4-bit NF4 量化,device_map={"":0} 纯 GPU
- 关闭 Qwen3.5 thinking 模式 (enable_thinking=False)
- 精度从 60% 提升到 90%,推理速度 1-2 tokens/s
- GPU 显存 7.13GB/8GB,输出质量正常
- 更新所有测试结果和综合报告
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
40 lines
985 B
JSON
{
  "timestamp": "2026-03-16T17:34:50.981411",
  "model": "Qwen3.5-9B",
  "quantization": "4-bit NF4",
  "note": "单GPU串行推理,并发测试主要体现请求排队效果",
  "concurrency_results": [
    {
      "concurrency": 1,
      "total_time_s": 33.29,
      "total_tokens": 64,
      "throughput_tokens_per_sec": 1.9,
      "avg_latency_s": 33.18,
      "requests_completed": 1
    },
    {
      "concurrency": 2,
      "total_time_s": 65.01,
      "total_tokens": 128,
      "throughput_tokens_per_sec": 2.0,
      "avg_latency_s": 49.14,
      "requests_completed": 2
    },
    {
      "concurrency": 4,
      "total_time_s": 128.55,
      "total_tokens": 256,
      "throughput_tokens_per_sec": 2.0,
      "avg_latency_s": 80.09,
      "requests_completed": 4
    },
    {
      "concurrency": 8,
      "total_time_s": 275.44,
      "total_tokens": 512,
      "throughput_tokens_per_sec": 1.9,
      "avg_latency_s": 148.94,
      "requests_completed": 8
    }
  ]
}