From 837bf407e1a2664dc080aa4e92fe80635f531e88 Mon Sep 17 00:00:00 2001
From: 16337 <1633794139@qq.com>
Date: Mon, 16 Mar 2026 11:45:51 +0800
Subject: [PATCH] feat: add concurrency stress-test script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.6
---
 vsp/qwen3.5-9b/test_concurrency.py | 137 +++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 vsp/qwen3.5-9b/test_concurrency.py

diff --git a/vsp/qwen3.5-9b/test_concurrency.py b/vsp/qwen3.5-9b/test_concurrency.py
new file mode 100644
index 0000000..86b935d
--- /dev/null
+++ b/vsp/qwen3.5-9b/test_concurrency.py
@@ -0,0 +1,137 @@
+"""Concurrency stress test: measure performance at different concurrency levels."""
+import json
+import os
+import glob
+import time
+import torch
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from datetime import datetime
+
+
+def load_model():
+    """Load the model with 4-bit NF4 quantization."""
+    paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
+    model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B"  # fall back to the Hub ID
+
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_use_double_quant=True,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    return model, tokenizer
+
+
+def single_inference(model, tokenizer, prompt, lock, max_tokens=64):
+    """Run one request; GPU access is serialized through the shared lock."""
+    messages = [{"role": "user", "content": prompt}]
+    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(text, return_tensors="pt").to(model.device)
+    input_len = inputs["input_ids"].shape[1]
+
+    t0 = time.perf_counter()  # latency includes time spent waiting for the lock
+    with lock:  # a single GPU runs one generate() at a time
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
+                do_sample=False,  # greedy decoding for reproducible timing
+            )
+    elapsed = time.perf_counter() - t0
+    output_len = outputs.shape[1] - input_len  # generate() returns prompt + new tokens
+
+    return {
+        "time_s": elapsed,
+        "output_tokens": output_len,
+        "tokens_per_sec": output_len / elapsed if elapsed > 0 else 0,
+    }
+
+
+def test_concurrency(model, tokenizer):
+    """Measure throughput and latency at each concurrency level."""
+    print("=" * 60)
+    print("Concurrency stress test")
+    print("=" * 60)
+
+    prompts = [
+        "What is artificial intelligence?",
+        "Please explain quantum computing.",
+        "What are the advantages of Python?",
+        "How do deep learning and machine learning differ?",
+        "What is natural language processing?",
+        "Explain how GPT works.",
+        "What is reinforcement learning?",
+        "What are the benefits of cloud computing?",
+    ]
+
+    concurrency_levels = [1, 2, 4, 8]
+    lock = threading.Lock()
+    results = []
+
+    for n_concurrent in concurrency_levels:
+        print(f"\n--- Concurrency: {n_concurrent} ---")
+        test_prompts = (prompts * ((n_concurrent // len(prompts)) + 1))[:n_concurrent]  # one prompt per request
+
+        t0 = time.perf_counter()
+        futures_results = []
+
+        with ThreadPoolExecutor(max_workers=n_concurrent) as executor:
+            futures = [
+                executor.submit(single_inference, model, tokenizer, p, lock)
+                for p in test_prompts
+            ]
+            for f in as_completed(futures):
+                futures_results.append(f.result())
+
+        total_time = time.perf_counter() - t0
+        total_tokens = sum(r["output_tokens"] for r in futures_results)
+        avg_latency = sum(r["time_s"] for r in futures_results) / len(futures_results)
+        throughput = total_tokens / total_time
+
+        result = {
+            "concurrency": n_concurrent,
+            "total_time_s": round(total_time, 2),
+            "total_tokens": total_tokens,
+            "throughput_tokens_per_sec": round(throughput, 1),
+            "avg_latency_s": round(avg_latency, 2),
+            "requests_completed": len(futures_results),
+        }
+        results.append(result)
+
+        print(f"  Total time: {result['total_time_s']}s")
+        print(f"  Total tokens: {result['total_tokens']}")
+        print(f"  Throughput: {result['throughput_tokens_per_sec']} tokens/s")
+        print(f"  Avg latency: {result['avg_latency_s']}s")
+
+    # Save the report
+    output_dir = "vsp/qwen3.5-9b/results"
+    os.makedirs(output_dir, exist_ok=True)
+    report = {
+        "timestamp": datetime.now().isoformat(),
+        "model": "Qwen3.5-9B",
+        "quantization": "4-bit NF4",
+        "note": "Single-GPU serial inference; this test mainly measures request queueing",
+        "concurrency_results": results,
+    }
+    path = os.path.join(output_dir, "concurrency_results.json")
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(report, f, ensure_ascii=False, indent=2)
+    print(f"\nResults saved to {path}")
+
+    return results
+
+
+if __name__ == "__main__":
+    os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".."))  # run from the repo root
+    model, tokenizer = load_model()
+    test_concurrency(model, tokenizer)