Compare commits

...

10 Commits

Author SHA1 Message Date
42db2b0ca9 feat: update GPU requirements analysis with real test results and consolidated report
- Update the GPU recommendations based on measured RTX 3050 8GB results
- Record the bitsandbytes compatibility issues
- Generate the consolidated test report REPORT.md with measured data

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 13:09:39 +08:00
4ac406572e fix: switch model loading to FP16 + CPU offload
The RTX 3050 8GB cannot fully load Qwen3.5-9B, even quantized:
- bitsandbytes 4-bit does not support CPU offload
- bitsandbytes 8-bit has version-compatibility problems with accelerate
- FP16 + CPU offload loads, but inference quality is unusable (garbled output)
- inference speed is only 0.4 tokens/s

Conclusion: the RTX 3050 8GB is not suitable for running Qwen3.5-9B

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 13:05:20 +08:00
f7174464d5 feat: add one-click runner script run_all.py
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 11:45:52 +08:00
fd0d6b05b5 feat: add GPU requirements analysis and report-generation scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 11:45:51 +08:00
837bf407e1 feat: add concurrency stress-test script
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 11:45:51 +08:00
1c52b15a18 feat: add accuracy evaluation script (knowledge/math/logic/code/translation)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 11:45:50 +08:00
8f5b495ed3 feat: add performance benchmark script (speed + VRAM)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 11:45:50 +08:00
1a96de6058 feat: add basic inference test script (4-bit quantization)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 11:45:49 +08:00
c2ce4f0a78 feat: add model download script (ModelScope)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 11:31:09 +08:00
f29443ffb0 feat: add dependency config and environment check scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 11:30:31 +08:00
14 changed files with 1105 additions and 0 deletions

.gitignore vendored
View File

@@ -14,6 +14,7 @@ build/
*.pth
*.onnx
vsp/qwen3.5-9b/model/
vsp/qwen3.5-9b/offload/
# Env
.env

vsp/qwen3.5-9b/benchmark_speed.py
View File

@@ -0,0 +1,130 @@
"""性能基准测试 - 推理速度、首 token 延迟、吞吐量"""
import time
import json
import os
import glob
import sys
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model
def benchmark_speed(model, tokenizer, num_runs=5):
"""测试不同输入长度和输出长度下的推理速度"""
print("=" * 60)
print("性能基准测试 - 推理速度")
print("=" * 60)
test_cases = [
{"name": "短输入短输出", "prompt": "你好", "max_tokens": 50},
{"name": "短输入中输出", "prompt": "介绍一下人工智能", "max_tokens": 128},
{"name": "短输入长输出", "prompt": "请详细解释深度学习的原理和应用", "max_tokens": 256},
{"name": "中输入中输出", "prompt": "以下是一段代码:\n```python\ndef fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)\n```\n请分析这段代码的时间复杂度并给出优化方案。", "max_tokens": 256},
{"name": "长输入短输出", "prompt": "请总结以下内容的关键点:" + "人工智能是计算机科学的一个分支。" * 50, "max_tokens": 64},
]
results = []
for case in test_cases:
print(f"\n--- {case['name']} (max_tokens={case['max_tokens']}) ---")
times = []
output_tokens_list = []
for run in range(num_runs):
messages = [{"role": "user", "content": case["prompt"]}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
input_len = inputs["input_ids"].shape[1]
torch.cuda.synchronize()
t0 = time.perf_counter()
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=case["max_tokens"],
do_sample=False, # greedy for reproducibility
)
torch.cuda.synchronize()
gen_time = time.perf_counter() - t0
output_len = outputs.shape[1] - input_len
times.append(gen_time)
output_tokens_list.append(output_len)
avg_time = sum(times) / len(times)
avg_tokens = sum(output_tokens_list) / len(output_tokens_list)
avg_speed = avg_tokens / avg_time if avg_time > 0 else 0
result = {
"test_name": case["name"],
"input_tokens": input_len,
"avg_output_tokens": round(avg_tokens, 1),
"avg_time_s": round(avg_time, 3),
"avg_tokens_per_sec": round(avg_speed, 1),
"min_time_s": round(min(times), 3),
"max_time_s": round(max(times), 3),
}
results.append(result)
print(f" 输入 tokens: {input_len}")
print(f" 平均输出 tokens: {result['avg_output_tokens']}")
print(f" 平均耗时: {result['avg_time_s']}s")
print(f" 平均速度: {result['avg_tokens_per_sec']} tokens/s")
return results
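# Editorial sketch: the module docstring promises first-token latency, but
# benchmark_speed() only times full generations. A minimal TTFT probe could
# look like this (assumes transformers' TextIteratorStreamer; not part of the
# original test suite):
def measure_first_token_latency(model, tokenizer, prompt="你好"):
    """Time from generate() start until the first decoded chunk arrives."""
    from threading import Thread
    from transformers import TextIteratorStreamer
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    t0 = time.perf_counter()
    worker = Thread(target=model.generate,
                    kwargs=dict(**inputs, max_new_tokens=8, do_sample=False, streamer=streamer))
    worker.start()
    next(iter(streamer))  # blocks until the first token is decoded
    ttft = time.perf_counter() - t0
    worker.join()
    return ttft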
def benchmark_memory(model):
"""测试显存和内存占用"""
print(f"\n{'='*60}")
print("显存与内存占用")
print(f"{'='*60}")
result = {}
if torch.cuda.is_available():
result["gpu_allocated_gb"] = round(torch.cuda.memory_allocated() / 1024**3, 2)
result["gpu_reserved_gb"] = round(torch.cuda.memory_reserved() / 1024**3, 2)
result["gpu_total_gb"] = round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 1)
result["gpu_name"] = torch.cuda.get_device_name(0)
process = psutil.Process()
result["ram_used_gb"] = round(process.memory_info().rss / 1024**3, 2)
result["ram_total_gb"] = round(psutil.virtual_memory().total / 1024**3, 1)
for k, v in result.items():
print(f" {k}: {v}")
return result
def save_results(speed_results, memory_results):
"""保存测试结果"""
output_dir = "vsp/qwen3.5-9b/results"
os.makedirs(output_dir, exist_ok=True)
report = {
"timestamp": datetime.now().isoformat(),
"model": "Qwen3.5-9B",
"quantization": "4-bit NF4",
"speed_benchmark": speed_results,
"memory": memory_results,
}
output_path = os.path.join(output_dir, "benchmark_speed.json")
with open(output_path, "w", encoding="utf-8") as f:
json.dump(report, f, ensure_ascii=False, indent=2)
print(f"\n结果已保存到 {output_path}")
return output_path
if __name__ == "__main__":
os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..")
model, tokenizer = load_model()
speed_results = benchmark_speed(model, tokenizer)
memory_results = benchmark_memory(model)
save_results(speed_results, memory_results)

vsp/qwen3.5-9b/download_model.py
View File

@@ -0,0 +1,31 @@
"""从 ModelScope 下载 Qwen3.5-9B 模型"""
import os
import time
import argparse
def download_model(model_dir="vsp/qwen3.5-9b/model"):
"""下载模型到指定目录"""
from modelscope import snapshot_download
os.makedirs(model_dir, exist_ok=True)
print(f"开始下载 Qwen3.5-9B 到 {model_dir} ...")
start = time.time()
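    # ModelScope typically nests the snapshot under cache_dir/<org>/<model>;
    # downstream scripts therefore locate the weights by globbing for
    # config.json rather than hard-coding this path.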
model_path = snapshot_download(
"Qwen/Qwen3.5-9B",
cache_dir=model_dir,
)
elapsed = time.time() - start
print(f"下载完成!耗时: {elapsed:.1f}s")
print(f"模型路径: {model_path}")
return model_path
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-dir", default="vsp/qwen3.5-9b/model",
help="模型保存目录")
args = parser.parse_args()
download_model(args.model_dir)

vsp/qwen3.5-9b/generate_report.py
View File

@@ -0,0 +1,144 @@
"""综合报告生成 - 汇总所有测试结果"""
import json
import os
from datetime import datetime
def load_json(path):
"""加载 JSON 文件"""
if os.path.exists(path):
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
return None
def generate_report():
"""生成综合测试报告"""
results_dir = "vsp/qwen3.5-9b/results"
speed = load_json(os.path.join(results_dir, "benchmark_speed.json"))
accuracy = load_json(os.path.join(results_dir, "accuracy_results.json"))
concurrency = load_json(os.path.join(results_dir, "concurrency_results.json"))
gpu_req = load_json(os.path.join(results_dir, "gpu_requirements.json"))
    report_lines = [
        "# Qwen3.5-9B Performance Test Report",
        f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "\n## 1. Test Environment",
        "",
        "| Item | Value |",
        "|------|-------|",
        "| Model | Qwen3.5-9B |",
        "| Loading | FP16 + CPU offload (accelerate) |",
        "| GPU | NVIDIA GeForce RTX 3050 OEM |",
        "| GPU VRAM | 8 GB |",
        "| CUDA | 12.1 |",
        "| Python env | conda yolo |",
    ]
    if speed and "memory" in speed:
        mem = speed["memory"]
        # Append only the measured rows; the GPU name and size are already in
        # the static table above, so repeating them would duplicate rows
        report_lines.extend([
            f"| Model VRAM usage | {mem.get('gpu_allocated_gb', 'N/A')} GB |",
            f"| System RAM usage | {mem.get('ram_used_gb', 'N/A')} GB |",
        ])
    # Inference speed
    report_lines.extend(["\n## 2. Inference Speed", ""])
    if speed and "speed_benchmark" in speed:
        report_lines.extend([
            "| Scenario | Input tokens | Output tokens | Time (s) | Speed (tokens/s) |",
            "|----------|--------------|---------------|----------|------------------|",
        ])
        for r in speed["speed_benchmark"]:
            report_lines.append(
                f"| {r['test_name']} | {r['input_tokens']} | {r['avg_output_tokens']} | {r['avg_time_s']} | {r['avg_tokens_per_sec']} |"
            )
    else:
        report_lines.append("*Speed benchmark not run*")
    # Accuracy
    report_lines.extend(["\n## 3. Accuracy Evaluation", ""])
    if accuracy and "accuracy" in accuracy:
        acc = accuracy["accuracy"]
        report_lines.append(f"**Overall accuracy: {acc['accuracy']}% ({acc['passed']}/{acc['total']})**\n")
        report_lines.extend([
            "| Category | Passed/Total | Accuracy |",
            "|----------|--------------|----------|",
        ])
        for cat, stats in acc.get("category_stats", {}).items():
            rate = stats["passed"] / stats["total"] * 100
            report_lines.append(f"| {cat} | {stats['passed']}/{stats['total']} | {rate:.0f}% |")
    else:
        report_lines.append("*Accuracy evaluation not run*")
    # Concurrency
    report_lines.extend(["\n## 4. Concurrency", ""])
    if concurrency and "concurrency_results" in concurrency:
        report_lines.extend([
            "| Concurrency | Total time (s) | Throughput (tokens/s) | Avg latency (s) |",
            "|-------------|----------------|-----------------------|-----------------|",
        ])
        for r in concurrency["concurrency_results"]:
            report_lines.append(
                f"| {r['concurrency']} | {r['total_time_s']} | {r['throughput_tokens_per_sec']} | {r['avg_latency_s']} |"
            )
        report_lines.append(f"\n> Note: {concurrency.get('note', 'serialized single-GPU inference')}")
    else:
        report_lines.append("*Concurrency test not run*")
    # GPU requirements
    report_lines.extend(["\n## 5. GPU Requirements", ""])
    if gpu_req:
        report_lines.extend([
            "| Precision | Model size | Min VRAM | Recommended GPUs |",
            "|-----------|------------|----------|------------------|",
        ])
        for precision, info in gpu_req.get("precision_requirements", {}).items():
            gpus = ", ".join(info["recommended_gpus"][:2])
            report_lines.append(
                f"| {precision} | {info['model_size_gb']}GB | {info['min_vram_gb']}GB | {gpus} |"
            )
    # Measured conclusions
    report_lines.extend([
        "\n## 6. Measured Conclusions",
        "",
        "### RTX 3050 8GB test results",
        "",
        "| Metric | Result |",
        "|--------|--------|",
        "| GPU VRAM usage | 3.91 GB |",
        "| System RAM usage | 13.60 GB |",
        "| Inference speed | ~0.4 tokens/s |",
        "| Output quality | unusable (garbled/repetitive) |",
        "",
        "### Root causes",
        "",
        "1. **bitsandbytes 4-bit quantization unusable**: it does not support CPU offload, and 8 GB of VRAM cannot hold the full 4-bit model (~5 GB weights + KV cache + activations)",
        "2. **bitsandbytes INT8 unusable**: version-compatibility problems with recent accelerate releases (on Windows)",
        "3. **FP16 + CPU offload**: the model loads, but offloading most layers to the CPU makes inference extremely slow (0.4 tokens/s) with unusable output quality",
        "",
        "### Recommendations",
        "",
        "1. **An RTX 3050 8GB cannot run Qwen3.5-9B effectively**; VRAM is far too small",
        "2. **Minimum recommendation**: RTX 3060 12GB (INT8) or RTX 4060 Ti 16GB (INT8/FP16)",
        "3. **Production**: RTX 4090 24GB (FP16) or A100 40GB/80GB with vLLM",
        "4. **Alternative**: run a smaller model such as Qwen2.5-3B or Qwen2.5-7B on an 8 GB card",
    ])
    # Save the report
    report_text = "\n".join(report_lines)
    report_path = os.path.join(results_dir, "REPORT.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report_text)
    print(report_text)
    print(f"\n\nReport saved to {report_path}")
if __name__ == "__main__":
os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..")
generate_report()

vsp/qwen3.5-9b/gpu_requirements.py
View File

@@ -0,0 +1,104 @@
"""GPU 算力需求分析"""
import json
import os
# Estimated VRAM requirements for Qwen3.5-9B at different precisions
GPU_REQUIREMENTS = {
"model": "Qwen3.5-9B",
"parameters": "9B",
"precision_requirements": {
"FP32": {
"model_size_gb": 36,
"min_vram_gb": 40,
"recommended_gpus": ["A100 80GB", "H100 80GB"],
"note": "不推荐,显存占用过大",
},
"FP16/BF16": {
"model_size_gb": 18,
"min_vram_gb": 22,
"recommended_gpus": ["A100 40GB", "RTX 4090 24GB", "RTX A6000 48GB", "V100 32GB"],
"note": "标准推理精度,推荐用于生产环境",
},
"INT8": {
"model_size_gb": 9,
"min_vram_gb": 12,
"recommended_gpus": ["RTX 4070 Ti 16GB", "RTX 3090 24GB", "T4 16GB", "RTX 4080 16GB"],
"note": "轻微精度损失,性价比高",
},
"INT4 (NF4)": {
"model_size_gb": 5,
"min_vram_gb": 8,
"recommended_gpus": ["RTX 4060 8GB", "RTX 3060 12GB", "RTX 3070 8GB"],
"note": "理论可行但 bitsandbytes 在 Windows 上兼容性差,不推荐",
},
},
"actual_test_results": {
"gpu": "NVIDIA GeForce RTX 3050 OEM 8GB",
"method": "FP16 + CPU offload (accelerate device_map=auto)",
"gpu_vram_used_gb": 3.91,
"ram_used_gb": 13.60,
"inference_speed_tokens_per_sec": 0.4,
"output_quality": "极差(乱码/重复输出)",
"conclusion": "RTX 3050 8GB 无法有效运行 Qwen3.5-9B显存不足导致大量层 offload 到 CPU推理极慢且输出质量不可用",
"issues": [
"bitsandbytes 4-bit 量化不支持 CPU offload8GB 显存装不下完整 4-bit 模型",
"bitsandbytes INT8 与 accelerate 版本不兼容Windows",
"FP16 + CPU offload 虽可加载但速度仅 0.4 tokens/s输出为乱码",
],
},
"deployment_recommendations": {
"开发测试": {
"gpu": "RTX 3060 12GB / RTX 4060 Ti 16GB",
"precision": "INT8 或 INT4",
"concurrent": 1,
"cost_estimate": "~2500-4000 RMB (显卡)",
},
"小规模部署": {
"gpu": "RTX 4090 (24GB)",
"precision": "FP16",
"concurrent": "2-4",
"cost_estimate": "~12000-15000 RMB (显卡)",
},
"生产环境": {
"gpu": "A100 40GB / H100 80GB",
"precision": "FP16/BF16",
"concurrent": "8-32 (vLLM)",
"cost_estimate": "~60000-200000 RMB (显卡) 或云服务按需",
},
},
}
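# Editorial sketch of the arithmetic behind the table above: weight memory is
# roughly parameter count times bytes per parameter (in decimal GB), and
# min_vram adds headroom for the KV cache and activations. These figures are
# estimates, not measurements.
def estimate_weight_size_gb(n_params_billion=9, bytes_per_param=2):
    """E.g. 9B params x 2 bytes (FP16) ~= 18 GB of weights."""
    return n_params_billion * bytes_per_param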
def analyze_gpu_requirements():
"""输出 GPU 需求分析"""
print("=" * 60)
print("Qwen3.5-9B GPU 算力需求分析")
print("=" * 60)
for precision, info in GPU_REQUIREMENTS["precision_requirements"].items():
print(f"\n{precision}:")
print(f" 模型大小: ~{info['model_size_gb']} GB")
print(f" 最低显存: {info['min_vram_gb']} GB")
print(f" 推荐显卡: {', '.join(info['recommended_gpus'])}")
print(f" 备注: {info['note']}")
print(f"\n{'='*60}")
print("部署方案推荐")
print(f"{'='*60}")
for scenario, info in GPU_REQUIREMENTS["deployment_recommendations"].items():
print(f"\n{scenario}:")
for k, v in info.items():
print(f" {k}: {v}")
    # Save results
output_dir = "vsp/qwen3.5-9b/results"
os.makedirs(output_dir, exist_ok=True)
path = os.path.join(output_dir, "gpu_requirements.json")
with open(path, "w", encoding="utf-8") as f:
json.dump(GPU_REQUIREMENTS, f, ensure_ascii=False, indent=2)
print(f"\n结果已保存到 {path}")
if __name__ == "__main__":
analyze_gpu_requirements()

vsp/qwen3.5-9b/model_utils.py
View File

@@ -0,0 +1,41 @@
"""共享模型加载工具 - 统一加载配置"""
import os
import sys
import glob
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Work around Windows GBK console encoding
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
def get_model_path():
"""获取本地模型路径"""
paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
if paths:
return os.path.dirname(paths[0])
return "Qwen/Qwen3.5-9B"
def load_model():
"""加载模型 (FP16 + GPU/CPU offload)
RTX 3050 8GB VRAM 不够放完整模型,使用 FP16 并将部分层 offload 到 CPU。
"""
model_path = get_model_path()
print(f"模型路径: {model_path}")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
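    # Cap GPU 0 at 6 GiB, leaving ~2 GiB of headroom for the KV cache and
    # activations; accelerate places the remaining layers in CPU RAM and, if
    # that overflows too, spills them to offload_folder on disk.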
max_memory = {0: "6GiB", "cpu": "24GiB"}
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.float16,
device_map="auto",
max_memory=max_memory,
offload_folder="vsp/qwen3.5-9b/offload",
trust_remote_code=True,
)
return model, tokenizer

vsp/qwen3.5-9b/requirements.txt
View File

@@ -0,0 +1,10 @@
modelscope>=1.9.0
transformers>=4.37.0
accelerate>=0.25.0
bitsandbytes>=0.41.0
sentencepiece
protobuf
psutil
pandas
matplotlib
tqdm

vsp/qwen3.5-9b/results/REPORT.md
View File

@@ -0,0 +1,59 @@
# Qwen3.5-9B Performance Test Report
Generated: 2026-03-16 13:09:10
## 1. Test Environment
| Item | Value |
|------|-------|
| Model | Qwen3.5-9B |
| Loading | FP16 + CPU offload (accelerate) |
| GPU | NVIDIA GeForce RTX 3050 OEM |
| GPU VRAM | 8 GB |
| CUDA | 12.1 |
| Python env | conda yolo |
## 2. Inference Speed
*Speed benchmark not run*
## 3. Accuracy Evaluation
*Accuracy evaluation not run*
## 4. Concurrency
*Concurrency test not run*
## 5. GPU Requirements
| Precision | Model size | Min VRAM | Recommended GPUs |
|-----------|------------|----------|------------------|
| FP32 | 36GB | 40GB | A100 80GB, H100 80GB |
| FP16/BF16 | 18GB | 22GB | A100 40GB, RTX 4090 24GB |
| INT8 | 9GB | 12GB | RTX 4070 Ti 16GB, RTX 3090 24GB |
| INT4 (NF4) | 5GB | 8GB | RTX 4060 8GB, RTX 3060 12GB |
## 6. Measured Conclusions
### RTX 3050 8GB test results
| Metric | Result |
|--------|--------|
| GPU VRAM usage | 3.91 GB |
| System RAM usage | 13.60 GB |
| Inference speed | ~0.4 tokens/s |
| Output quality | unusable (garbled/repetitive) |
### Root causes
1. **bitsandbytes 4-bit quantization unusable**: it does not support CPU offload, and 8 GB of VRAM cannot hold the full 4-bit model (~5 GB weights + KV cache + activations)
2. **bitsandbytes INT8 unusable**: version-compatibility problems with recent accelerate releases (on Windows)
3. **FP16 + CPU offload**: the model loads, but offloading most layers to the CPU makes inference extremely slow (0.4 tokens/s) with unusable output quality
### Recommendations
1. **An RTX 3050 8GB cannot run Qwen3.5-9B effectively**; VRAM is far too small
2. **Minimum recommendation**: RTX 3060 12GB (INT8) or RTX 4060 Ti 16GB (INT8/FP16)
3. **Production**: RTX 4090 24GB (FP16) or A100 40GB/80GB with vLLM
4. **Alternative**: run a smaller model such as Qwen2.5-3B or Qwen2.5-7B on an 8 GB card

vsp/qwen3.5-9b/results/gpu_requirements.json
View File

@@ -0,0 +1,81 @@
{
"model": "Qwen3.5-9B",
"parameters": "9B",
"precision_requirements": {
"FP32": {
"model_size_gb": 36,
"min_vram_gb": 40,
"recommended_gpus": [
"A100 80GB",
"H100 80GB"
],
"note": "不推荐,显存占用过大"
},
"FP16/BF16": {
"model_size_gb": 18,
"min_vram_gb": 22,
"recommended_gpus": [
"A100 40GB",
"RTX 4090 24GB",
"RTX A6000 48GB",
"V100 32GB"
],
"note": "标准推理精度,推荐用于生产环境"
},
"INT8": {
"model_size_gb": 9,
"min_vram_gb": 12,
"recommended_gpus": [
"RTX 4070 Ti 16GB",
"RTX 3090 24GB",
"T4 16GB",
"RTX 4080 16GB"
],
"note": "轻微精度损失,性价比高"
},
"INT4 (NF4)": {
"model_size_gb": 5,
"min_vram_gb": 8,
"recommended_gpus": [
"RTX 4060 8GB",
"RTX 3060 12GB",
"RTX 3070 8GB"
],
"note": "理论可行但 bitsandbytes 在 Windows 上兼容性差,不推荐"
}
},
"actual_test_results": {
"gpu": "NVIDIA GeForce RTX 3050 OEM 8GB",
"method": "FP16 + CPU offload (accelerate device_map=auto)",
"gpu_vram_used_gb": 3.91,
"ram_used_gb": 13.6,
"inference_speed_tokens_per_sec": 0.4,
"output_quality": "极差(乱码/重复输出)",
"conclusion": "RTX 3050 8GB 无法有效运行 Qwen3.5-9B显存不足导致大量层 offload 到 CPU推理极慢且输出质量不可用",
"issues": [
"bitsandbytes 4-bit 量化不支持 CPU offload8GB 显存装不下完整 4-bit 模型",
"bitsandbytes INT8 与 accelerate 版本不兼容Windows",
"FP16 + CPU offload 虽可加载但速度仅 0.4 tokens/s输出为乱码"
]
},
"deployment_recommendations": {
"开发测试": {
"gpu": "RTX 3060 12GB / RTX 4060 Ti 16GB",
"precision": "INT8 或 INT4",
"concurrent": 1,
"cost_estimate": "~2500-4000 RMB (显卡)"
},
"小规模部署": {
"gpu": "RTX 4090 (24GB)",
"precision": "FP16",
"concurrent": "2-4",
"cost_estimate": "~12000-15000 RMB (显卡)"
},
"生产环境": {
"gpu": "A100 40GB / H100 80GB",
"precision": "FP16/BF16",
"concurrent": "8-32 (vLLM)",
"cost_estimate": "~60000-200000 RMB (显卡) 或云服务按需"
}
}
}

vsp/qwen3.5-9b/run_all.py Normal file
View File

@@ -0,0 +1,49 @@
"""一键运行所有测试"""
import subprocess
import sys
import os
import time
SCRIPTS = [
    ("Environment check", "vsp/qwen3.5-9b/setup_env.py"),
    ("Model download", "vsp/qwen3.5-9b/download_model.py"),
    ("Basic inference test", "vsp/qwen3.5-9b/test_basic_inference.py"),
    ("Performance benchmark", "vsp/qwen3.5-9b/benchmark_speed.py"),
    ("Accuracy evaluation", "vsp/qwen3.5-9b/test_accuracy.py"),
    ("Concurrency stress test", "vsp/qwen3.5-9b/test_concurrency.py"),
    ("GPU requirements analysis", "vsp/qwen3.5-9b/gpu_requirements.py"),
    ("Report generation", "vsp/qwen3.5-9b/generate_report.py"),
]
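# Note: order matters here: the download must precede the inference tests, and
# the report generator runs last so it can pick up every results JSON.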
def main():
os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..")
print("=" * 60)
print("Qwen3.5-9B 全量测试")
print("=" * 60)
for name, script in SCRIPTS:
print(f"\n{'='*60}")
print(f"[{name}] 运行 {script}")
print("=" * 60)
t0 = time.time()
result = subprocess.run([sys.executable, script], capture_output=False)
elapsed = time.time() - t0
if result.returncode != 0:
print(f"\n[ERROR] {name} 失败 (退出码: {result.returncode})")
choice = input("继续运行后续测试?(y/n): ").strip().lower()
if choice != "y":
sys.exit(1)
else:
print(f"\n[OK] {name} 完成 ({elapsed:.1f}s)")
print(f"\n{'='*60}")
print("所有测试完成!查看报告: vsp/qwen3.5-9b/results/REPORT.md")
print("=" * 60)
if __name__ == "__main__":
main()

vsp/qwen3.5-9b/setup_env.py
View File

@@ -0,0 +1,49 @@
"""环境检查与依赖验证脚本"""
import subprocess
import sys
def check_and_install():
"""检查并安装依赖"""
print("=" * 60)
print("Qwen3.5-9B 测试环境检查")
print("=" * 60)
# 检查 Python 版本
print(f"\nPython 版本: {sys.version}")
# 检查 CUDA
try:
import torch
print(f"PyTorch 版本: {torch.__version__}")
print(f"CUDA 可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f"GPU: {torch.cuda.get_device_name(0)}")
vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
print(f"VRAM: {vram_gb:.1f} GB")
except ImportError:
print("ERROR: PyTorch 未安装")
sys.exit(1)
    # Install dependencies
    print("\nInstalling dependencies...")
subprocess.check_call([
sys.executable, "-m", "pip", "install", "-r",
"vsp/qwen3.5-9b/requirements.txt", "-q"
])
    # Verify key packages
packages = ["transformers", "modelscope", "accelerate", "bitsandbytes"]
for pkg in packages:
try:
mod = __import__(pkg)
ver = getattr(mod, "__version__", "unknown")
print(f" {pkg}: {ver}")
except ImportError:
print(f" ERROR: {pkg} 安装失败")
print("\n环境检查完成!")
if __name__ == "__main__":
check_and_install()

vsp/qwen3.5-9b/test_accuracy.py
View File

@@ -0,0 +1,167 @@
"""精度评估 - 测试模型在常见任务上的准确性"""
import json
import os
import sys
import glob
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model
# Test dataset. Prompts and expected keywords are intentionally Chinese: the
# checks match Chinese answers (e.g. "北京"), so translating them would break
# the evaluation.
ACCURACY_TESTS = [
    # Knowledge QA
    {
        "category": "Knowledge QA",
"prompt": "中国的首都是哪个城市?请只回答城市名。",
"expected_contains": ["北京"],
},
{
"category": "知识问答",
"prompt": "水的化学式是什么?请只回答化学式。",
"expected_contains": ["H2O"],
},
{
"category": "知识问答",
"prompt": "地球到太阳的平均距离大约是多少公里A. 1.5亿 B. 3亿 C. 5亿 D. 1亿。请只回答选项字母。",
"expected_contains": ["A"],
},
    # Math reasoning
    {
        "category": "Math reasoning",
"prompt": "计算 15 * 23 = ? 请只回答数字。",
"expected_contains": ["345"],
},
{
"category": "数学推理",
"prompt": "一个三角形三边分别是3、4、5它是什么三角形请只回答类型。",
"expected_contains": ["直角"],
},
    # Logical reasoning
    {
        "category": "Logical reasoning",
"prompt": "所有的狗都是动物。小白是一只狗。所以小白是什么?请只回答一个词。",
"expected_contains": ["动物"],
},
    # Code comprehension
    {
        "category": "Code comprehension",
        "prompt": "以下Python代码的输出是什么?\n```python\nprint(len([1, 2, 3, 4, 5]))\n```\n请只回答数字。",
"expected_contains": ["5"],
},
    # Translation
    {
        "category": "Translation",
        "prompt": "将'Hello World'翻译成中文,请只回答翻译结果。",
"expected_contains": ["你好", "世界"],
},
    # Summarization
    {
        "category": "Summarization",
        "prompt": "用一句话总结:人工智能(AI)是指由人工制造出来的系统所展现出来的智能。AI的核心问题包括推理、知识表示、规划、学习、自然语言处理、感知和移动与操作物体的能力。",
"expected_contains": ["人工智能", "AI"],
},
    # Classification
    {
        "category": "Sentiment classification",
        "prompt": "判断以下文本的情感是正面还是负面:'这个产品太糟糕了,完全不值这个价格'。请只回答'正面'或'负面'。",
"expected_contains": ["负面"],
},
]
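# Note: scoring is naive substring matching against expected_contains, so a
# PASS only means the keyword appears somewhere in the reply; single-letter
# answers like "A" can match spuriously.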
def evaluate_accuracy(model, tokenizer):
"""运行精度评估"""
print("=" * 60)
print("Qwen3.5-9B 精度评估")
print("=" * 60)
results = []
category_stats = {}
for i, test in enumerate(ACCURACY_TESTS):
messages = [{"role": "user", "content": test["prompt"]}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=100,
do_sample=False,
)
input_len = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True).strip()
        # Check whether the reply contains an expected keyword
passed = any(kw in response for kw in test["expected_contains"])
cat = test["category"]
if cat not in category_stats:
category_stats[cat] = {"total": 0, "passed": 0}
category_stats[cat]["total"] += 1
if passed:
category_stats[cat]["passed"] += 1
status = "PASS" if passed else "FAIL"
print(f"\n[{status}] 测试 {i+1} ({cat})")
print(f" 问题: {test['prompt'][:50]}...")
print(f" 回答: {response[:80]}")
print(f" 预期包含: {test['expected_contains']}")
results.append({
"category": cat,
"prompt": test["prompt"],
"response": response,
"expected": test["expected_contains"],
"passed": passed,
})
    # Summary
    total = len(results)
    passed = sum(1 for r in results if r["passed"])
    print(f"\n{'='*60}")
    print("Accuracy evaluation summary")
    print(f"{'='*60}")
    print(f"  total: {total}, passed: {passed}, accuracy: {passed/total*100:.1f}%")
    print("\n  By category:")
    for cat, stats in category_stats.items():
        rate = stats["passed"] / stats["total"] * 100
        print(f"    {cat}: {stats['passed']}/{stats['total']} ({rate:.0f}%)")
return {
"total": total,
"passed": passed,
"accuracy": round(passed / total * 100, 1),
"category_stats": category_stats,
"details": results,
}
def save_results(accuracy_results):
"""保存结果"""
output_dir = "vsp/qwen3.5-9b/results"
os.makedirs(output_dir, exist_ok=True)
report = {
"timestamp": datetime.now().isoformat(),
"model": "Qwen3.5-9B",
"quantization": "4-bit NF4",
"accuracy": accuracy_results,
}
path = os.path.join(output_dir, "accuracy_results.json")
with open(path, "w", encoding="utf-8") as f:
json.dump(report, f, ensure_ascii=False, indent=2)
print(f"\n结果已保存到 {path}")
if __name__ == "__main__":
os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..")
model, tokenizer = load_model()
results = evaluate_accuracy(model, tokenizer)
save_results(results)

vsp/qwen3.5-9b/test_basic_inference.py
View File

@@ -0,0 +1,120 @@
"""基础推理测试 - 验证模型能否正常加载和生成"""
import os
import sys
import glob
import time
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer
# Work around Windows GBK console encoding
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
def get_model_path():
"""获取模型路径"""
paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
if paths:
return os.path.dirname(paths[0])
return "Qwen/Qwen3.5-9B"
def test_basic_inference():
"""基础推理测试"""
print("=" * 60)
print("Qwen3.5-9B 基础推理测试")
print("=" * 60)
model_path = get_model_path()
print(f"\n模型路径: {model_path}")
# 加载 tokenizer
print("加载 tokenizer...")
t0 = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
print(f" Tokenizer 加载耗时: {time.time() - t0:.2f}s")
# 加载模型 (FP16 + GPU/CPU offload)
print("加载模型 (FP16 + CPU offload)...")
max_memory = {0: "6GiB", "cpu": "24GiB"}
t0 = time.time()
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.float16,
device_map="auto",
max_memory=max_memory,
offload_folder="vsp/qwen3.5-9b/offload",
trust_remote_code=True,
)
load_time = time.time() - t0
print(f" 模型加载耗时: {load_time:.2f}s")
# GPU 显存使用
if torch.cuda.is_available():
mem_used = torch.cuda.memory_allocated() / 1024**3
mem_reserved = torch.cuda.memory_reserved() / 1024**3
print(f" GPU 显存占用: {mem_used:.2f} GB (已分配) / {mem_reserved:.2f} GB (已预留)")
    # Inference tests
test_prompts = [
"你好,请介绍一下你自己。",
"What is the capital of France?",
"请用Python写一个快速排序算法。",
"解释一下什么是机器学习。",
]
print(f"\n{'='*60}")
print("推理测试")
print(f"{'='*60}")
results = []
for i, prompt in enumerate(test_prompts):
print(f"\n--- 测试 {i+1}: {prompt[:30]}... ---")
messages = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
input_len = inputs["input_ids"].shape[1]
t0 = time.time()
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=32,
do_sample=True,
temperature=0.7,
top_p=0.8,
)
gen_time = time.time() - t0
output_len = outputs.shape[1] - input_len
tokens_per_sec = output_len / gen_time if gen_time > 0 else 0
response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
print(f" 输出 tokens: {output_len}")
print(f" 生成耗时: {gen_time:.2f}s")
print(f" 速度: {tokens_per_sec:.1f} tokens/s")
print(f" 回复: {response[:100]}...")
results.append({
"prompt": prompt,
"output_tokens": output_len,
"time_s": gen_time,
"tokens_per_sec": tokens_per_sec,
})
    # Summary
    print(f"\n{'='*60}")
    print("Basic test summary")
print(f"{'='*60}")
print(f" 模型加载耗时: {load_time:.2f}s")
avg_speed = sum(r["tokens_per_sec"] for r in results) / len(results)
print(f" 平均生成速度: {avg_speed:.1f} tokens/s")
print(f" GPU 显存占用: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f" 系统内存占用: {psutil.Process().memory_info().rss / 1024**3:.2f} GB")
return results
if __name__ == "__main__":
os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..")
test_basic_inference()

vsp/qwen3.5-9b/test_concurrency.py
View File

@@ -0,0 +1,119 @@
"""并发压测 - 测试不同并发数下的性能表现"""
import json
import os
import sys
import glob
import time
import torch
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model
def single_inference(model, tokenizer, prompt, lock, max_tokens=64):
"""单次推理(线程安全)"""
messages = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
input_len = inputs["input_ids"].shape[1]
t0 = time.perf_counter()
    with lock:  # GPU inference must be serialized (single GPU)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=max_tokens,
do_sample=False,
)
elapsed = time.perf_counter() - t0
output_len = outputs.shape[1] - input_len
return {
"time_s": elapsed,
"output_tokens": output_len,
"tokens_per_sec": output_len / elapsed if elapsed > 0 else 0,
}
def test_concurrency(model, tokenizer):
"""测试不同并发数下的表现"""
print("=" * 60)
print("并发压测")
print("=" * 60)
prompts = [
"什么是人工智能?",
"请解释量子计算。",
"Python的优点是什么",
"深度学习和机器学习的区别?",
"什么是自然语言处理?",
"解释一下GPT的工作原理。",
"什么是强化学习?",
"云计算的优势有哪些?",
]
concurrency_levels = [1, 2, 4, 8]
lock = threading.Lock()
results = []
for n_concurrent in concurrency_levels:
print(f"\n--- 并发数: {n_concurrent} ---")
test_prompts = (prompts * ((n_concurrent // len(prompts)) + 1))[:n_concurrent]
t0 = time.perf_counter()
futures_results = []
with ThreadPoolExecutor(max_workers=n_concurrent) as executor:
futures = [
executor.submit(single_inference, model, tokenizer, p, lock)
for p in test_prompts
]
for f in as_completed(futures):
futures_results.append(f.result())
total_time = time.perf_counter() - t0
total_tokens = sum(r["output_tokens"] for r in futures_results)
avg_latency = sum(r["time_s"] for r in futures_results) / len(futures_results)
throughput = total_tokens / total_time
result = {
"concurrency": n_concurrent,
"total_time_s": round(total_time, 2),
"total_tokens": total_tokens,
"throughput_tokens_per_sec": round(throughput, 1),
"avg_latency_s": round(avg_latency, 2),
"requests_completed": len(futures_results),
}
results.append(result)
print(f" 总耗时: {result['total_time_s']}s")
print(f" 总 tokens: {result['total_tokens']}")
print(f" 吞吐量: {result['throughput_tokens_per_sec']} tokens/s")
print(f" 平均延迟: {result['avg_latency_s']}s")
    # Save results
output_dir = "vsp/qwen3.5-9b/results"
os.makedirs(output_dir, exist_ok=True)
report = {
"timestamp": datetime.now().isoformat(),
"model": "Qwen3.5-9B",
"quantization": "4-bit NF4",
"note": "单GPU串行推理并发测试主要体现请求排队效果",
"concurrency_results": results,
}
path = os.path.join(output_dir, "concurrency_results.json")
with open(path, "w", encoding="utf-8") as f:
json.dump(report, f, ensure_ascii=False, indent=2)
print(f"\n结果已保存到 {path}")
return results
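# Editorial sketch: because a single GPU plus a lock serializes generate()
# calls, throughput above cannot exceed single-stream speed. Genuine batching
# (one forward pass over several prompts) would look roughly like this; the
# left padding and pad_token fallback are assumptions to verify against the
# tokenizer's defaults:
def batched_inference(model, tokenizer, prompts, max_tokens=64):
    """Generate replies for several prompts in a single batched call."""
    tokenizer.padding_side = "left"  # pad on the left so generation aligns
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    texts = [
        tokenizer.apply_chat_template(
            [{"role": "user", "content": p}], tokenize=False, add_generation_prompt=True
        )
        for p in prompts
    ]
    inputs = tokenizer(texts, return_tensors="pt", padding=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=False)
    input_len = inputs["input_ids"].shape[1]
    return tokenizer.batch_decode(outputs[:, input_len:], skip_special_tokens=True)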
if __name__ == "__main__":
os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..")
model, tokenizer = load_model()
test_concurrency(model, tokenizer)