commit e522242ad4e01c23bda96da54298f7638e11a2b1 Author: 16337 <1633794139@qq.com> Date: Mon Mar 16 11:27:17 2026 +0800 init: 项目初始化,添加 .gitignore 和 README Co-Authored-By: Claude Opus 4.6 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7bd34c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +dist/ +build/ +.eggs/ + +# Model files (太大不提交) +*.bin +*.safetensors +*.gguf +*.pt +*.pth +*.onnx +vsp/qwen3.5-9b/model/ + +# Env +.env +*.log + +# IDE +.vscode/ +.idea/ + +# OS +.DS_Store +Thumbs.db diff --git a/README.md b/README.md new file mode 100644 index 0000000..727a26d --- /dev/null +++ b/README.md @@ -0,0 +1,12 @@ +# Qwen3.5-9B 性能测试 + +对 Qwen/Qwen3.5-9B 模型进行全面性能评估,包括推理速度、精度、并发能力和算力需求分析。 + +## 目录结构 +- `vsp/qwen3.5-9b/` - 测试代码和结果 +- `docs/plans/` - 实施计划 + +## 运行环境 +- conda env: yolo +- Python 3.10, PyTorch 2.5.1+cu121 +- GPU: NVIDIA RTX 3050 OEM 8GB diff --git a/docs/plans/2026-03-16-qwen3.5-9b-benchmark.md b/docs/plans/2026-03-16-qwen3.5-9b-benchmark.md new file mode 100644 index 0000000..836c125 --- /dev/null +++ b/docs/plans/2026-03-16-qwen3.5-9b-benchmark.md @@ -0,0 +1,1275 @@ +# Qwen3.5-9B 性能测试 Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. 
+ +**Goal:** 在 `vsp/qwen3.5-9b/` 目录下搭建完整的 Qwen3.5-9B 模型测试框架,测试推理速度、精度、并发性能和算力需求。 + +**Architecture:** 分模块构建:环境搭建 → 模型下载 → 基础推理测试 → 性能基准测试 → 精度评估 → 并发压测 → 算力需求分析 → 报告生成。每个模块独立脚本,统一由 `run_all.py` 调度。 + +**Tech Stack:** Python 3.10, PyTorch 2.5.1+cu121, transformers, modelscope, bitsandbytes (4-bit量化), accelerate, psutil + +**硬件环境:** RTX 3050 OEM 8GB VRAM — Qwen3.5-9B FP16 需 ~18GB,必须使用 4-bit 量化(~5GB VRAM)才能在此卡上运行。 + +--- + +## Task 1: Git 仓库初始化与远程配置 + +**Files:** +- Create: `.gitignore` +- Create: `README.md` + +**Step 1: 初始化 git 仓库** + +```bash +cd /c/workspace/qwen-test +git init +git remote add origin http://124.222.218.198:3000/XW-AIOT/qwen-test.git +``` + +**Step 2: 创建 .gitignore** + +```gitignore +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +dist/ +build/ +.eggs/ + +# Model files (太大不提交) +*.bin +*.safetensors +*.gguf +*.pt +*.pth +*.onnx +vsp/qwen3.5-9b/model/ + +# Env +.env +*.log + +# IDE +.vscode/ +.idea/ + +# OS +.DS_Store +Thumbs.db +``` + +**Step 3: 创建 README.md** + +```markdown +# Qwen3.5-9B 性能测试 + +对 Qwen/Qwen3.5-9B 模型进行全面性能评估,包括推理速度、精度、并发能力和算力需求分析。 + +## 目录结构 +- `vsp/qwen3.5-9b/` - 测试代码和结果 +- `docs/plans/` - 实施计划 + +## 运行环境 +- conda env: yolo +- Python 3.10, PyTorch 2.5.1+cu121 +- GPU: NVIDIA RTX 3050 OEM 8GB +``` + +**Step 4: 清空远程 master 分支并推送初始提交** + +```bash +git add .gitignore README.md docs/ +git commit -m "init: 项目初始化,添加 .gitignore 和 README" +git push origin --delete master 2>/dev/null || true +git branch -M master +git push -u origin master --force +``` + +--- + +## Task 2: 环境依赖安装 + +**Files:** +- Create: `vsp/qwen3.5-9b/requirements.txt` +- Create: `vsp/qwen3.5-9b/setup_env.py` + +**Step 1: 创建 requirements.txt** + +```txt +modelscope>=1.9.0 +transformers>=4.37.0 +accelerate>=0.25.0 +bitsandbytes>=0.41.0 +sentencepiece +protobuf +psutil +pandas +matplotlib +tqdm +``` + +**Step 2: 创建环境检查脚本 setup_env.py** + +```python +"""环境检查与依赖验证脚本""" +import subprocess +import sys + + +def check_and_install(): + """检查并安装依赖""" + print("=" * 60) + 
print("Qwen3.5-9B 测试环境检查") + print("=" * 60) + + # 检查 Python 版本 + print(f"\nPython 版本: {sys.version}") + + # 检查 CUDA + try: + import torch + print(f"PyTorch 版本: {torch.__version__}") + print(f"CUDA 可用: {torch.cuda.is_available()}") + if torch.cuda.is_available(): + print(f"GPU: {torch.cuda.get_device_name(0)}") + vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3 + print(f"VRAM: {vram_gb:.1f} GB") + except ImportError: + print("ERROR: PyTorch 未安装") + sys.exit(1) + + # 安装依赖 + print("\n安装依赖包...") + subprocess.check_call([ + sys.executable, "-m", "pip", "install", "-r", + "vsp/qwen3.5-9b/requirements.txt", "-q" + ]) + + # 验证关键包 + packages = ["transformers", "modelscope", "accelerate", "bitsandbytes"] + for pkg in packages: + try: + mod = __import__(pkg) + ver = getattr(mod, "__version__", "unknown") + print(f" {pkg}: {ver}") + except ImportError: + print(f" ERROR: {pkg} 安装失败") + + print("\n环境检查完成!") + + +if __name__ == "__main__": + check_and_install() +``` + +**Step 3: 运行环境安装** + +```bash +conda activate yolo +python vsp/qwen3.5-9b/setup_env.py +``` + +**Step 4: 提交** + +```bash +git add vsp/qwen3.5-9b/requirements.txt vsp/qwen3.5-9b/setup_env.py +git commit -m "feat: 添加依赖配置和环境检查脚本" +``` + +--- + +## Task 3: 模型下载脚本 + +**Files:** +- Create: `vsp/qwen3.5-9b/download_model.py` + +**Step 1: 创建模型下载脚本** + +```python +"""从 ModelScope 下载 Qwen3.5-9B 模型""" +import os +import time +import argparse + + +def download_model(model_dir="vsp/qwen3.5-9b/model"): + """下载模型到指定目录""" + from modelscope import snapshot_download + + os.makedirs(model_dir, exist_ok=True) + print(f"开始下载 Qwen3.5-9B 到 {model_dir} ...") + start = time.time() + + model_path = snapshot_download( + "Qwen/Qwen3.5-9B", + cache_dir=model_dir, + ) + + elapsed = time.time() - start + print(f"下载完成!耗时: {elapsed:.1f}s") + print(f"模型路径: {model_path}") + return model_path + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-dir", default="vsp/qwen3.5-9b/model", + 
help="模型保存目录") + args = parser.parse_args() + download_model(args.model_dir) +``` + +**Step 2: 运行下载** + +```bash +conda activate yolo +python vsp/qwen3.5-9b/download_model.py +``` + +**Step 3: 提交** + +```bash +git add vsp/qwen3.5-9b/download_model.py +git commit -m "feat: 添加模型下载脚本(ModelScope)" +``` + +--- + +## Task 4: 基础推理测试 + +**Files:** +- Create: `vsp/qwen3.5-9b/test_basic_inference.py` + +**Step 1: 创建基础推理测试脚本** + +```python +"""基础推理测试 - 验证模型能否正常加载和生成""" +import time +import torch +import psutil +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + + +def get_model_path(): + """获取模型路径""" + import glob + paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True) + if paths: + return os.path.dirname(paths[0]) + return "Qwen/Qwen3.5-9B" + + +def test_basic_inference(): + """基础推理测试""" + print("=" * 60) + print("Qwen3.5-9B 基础推理测试") + print("=" * 60) + + # 4-bit 量化配置 (RTX 3050 8GB 必须量化) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + ) + + model_path = get_model_path() + print(f"\n模型路径: {model_path}") + + # 加载 tokenizer + print("加载 tokenizer...") + t0 = time.time() + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + print(f" Tokenizer 加载耗时: {time.time() - t0:.2f}s") + + # 加载模型 (4-bit 量化) + print("加载模型 (4-bit 量化)...") + t0 = time.time() + model = AutoModelForCausalLM.from_pretrained( + model_path, + quantization_config=bnb_config, + device_map="auto", + trust_remote_code=True, + ) + load_time = time.time() - t0 + print(f" 模型加载耗时: {load_time:.2f}s") + + # GPU 显存使用 + if torch.cuda.is_available(): + mem_used = torch.cuda.memory_allocated() / 1024**3 + mem_reserved = torch.cuda.memory_reserved() / 1024**3 + print(f" GPU 显存占用: {mem_used:.2f} GB (已分配) / {mem_reserved:.2f} GB (已预留)") + + # 测试推理 + test_prompts = [ + "你好,请介绍一下你自己。", + "What is the capital of France?", + "请用Python写一个快速排序算法。", + 
"解释一下什么是机器学习。", + ] + + print(f"\n{'='*60}") + print("推理测试") + print(f"{'='*60}") + + results = [] + for i, prompt in enumerate(test_prompts): + print(f"\n--- 测试 {i+1}: {prompt[:30]}... ---") + messages = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = tokenizer(text, return_tensors="pt").to(model.device) + input_len = inputs["input_ids"].shape[1] + + t0 = time.time() + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=256, + do_sample=True, + temperature=0.7, + top_p=0.8, + ) + gen_time = time.time() - t0 + output_len = outputs.shape[1] - input_len + tokens_per_sec = output_len / gen_time if gen_time > 0 else 0 + + response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True) + print(f" 输出 tokens: {output_len}") + print(f" 生成耗时: {gen_time:.2f}s") + print(f" 速度: {tokens_per_sec:.1f} tokens/s") + print(f" 回复: {response[:100]}...") + + results.append({ + "prompt": prompt, + "output_tokens": output_len, + "time_s": gen_time, + "tokens_per_sec": tokens_per_sec, + }) + + # 汇总 + print(f"\n{'='*60}") + print("基础测试汇总") + print(f"{'='*60}") + print(f" 模型加载耗时: {load_time:.2f}s") + avg_speed = sum(r["tokens_per_sec"] for r in results) / len(results) + print(f" 平均生成速度: {avg_speed:.1f} tokens/s") + print(f" GPU 显存占用: {torch.cuda.memory_allocated() / 1024**3:.2f} GB") + print(f" 系统内存占用: {psutil.Process().memory_info().rss / 1024**3:.2f} GB") + + return results + + +if __name__ == "__main__": + import os + os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..") + test_basic_inference() +``` + +**Step 2: 运行基础推理测试** + +```bash +conda activate yolo +cd /c/workspace/qwen-test +python vsp/qwen3.5-9b/test_basic_inference.py +``` + +**Step 3: 提交** + +```bash +git add vsp/qwen3.5-9b/test_basic_inference.py +git commit -m "feat: 添加基础推理测试脚本(4-bit 量化)" +``` + +--- + +## Task 5: 性能基准测试(推理速度 + 吞吐量) + +**Files:** +- Create: 
`vsp/qwen3.5-9b/benchmark_speed.py` + +**Step 1: 创建性能基准测试脚本** + +```python +"""性能基准测试 - 推理速度、首 token 延迟、吞吐量""" +import time +import json +import os +import torch +import psutil +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from datetime import datetime + + +def load_model(): + """加载 4-bit 量化模型""" + import glob + paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True) + model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B" + + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + model_path, + quantization_config=bnb_config, + device_map="auto", + trust_remote_code=True, + ) + return model, tokenizer + + +def benchmark_speed(model, tokenizer, num_runs=5): + """测试不同输入长度和输出长度下的推理速度""" + print("=" * 60) + print("性能基准测试 - 推理速度") + print("=" * 60) + + test_cases = [ + {"name": "短输入短输出", "prompt": "你好", "max_tokens": 50}, + {"name": "短输入中输出", "prompt": "介绍一下人工智能", "max_tokens": 128}, + {"name": "短输入长输出", "prompt": "请详细解释深度学习的原理和应用", "max_tokens": 256}, + {"name": "中输入中输出", "prompt": "以下是一段代码:\n```python\ndef fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)\n```\n请分析这段代码的时间复杂度并给出优化方案。", "max_tokens": 256}, + {"name": "长输入短输出", "prompt": "请总结以下内容的关键点:" + "人工智能是计算机科学的一个分支。" * 50, "max_tokens": 64}, + ] + + results = [] + for case in test_cases: + print(f"\n--- {case['name']} (max_tokens={case['max_tokens']}) ---") + times = [] + first_token_times = [] + output_tokens_list = [] + + for run in range(num_runs): + messages = [{"role": "user", "content": case["prompt"]}] + text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = tokenizer(text, return_tensors="pt").to(model.device) + input_len = 
inputs["input_ids"].shape[1] + + torch.cuda.synchronize() + t0 = time.perf_counter() + + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=case["max_tokens"], + do_sample=False, # greedy for reproducibility + ) + + torch.cuda.synchronize() + gen_time = time.perf_counter() - t0 + output_len = outputs.shape[1] - input_len + + times.append(gen_time) + output_tokens_list.append(output_len) + + avg_time = sum(times) / len(times) + avg_tokens = sum(output_tokens_list) / len(output_tokens_list) + avg_speed = avg_tokens / avg_time if avg_time > 0 else 0 + + result = { + "test_name": case["name"], + "input_tokens": input_len, + "avg_output_tokens": round(avg_tokens, 1), + "avg_time_s": round(avg_time, 3), + "avg_tokens_per_sec": round(avg_speed, 1), + "min_time_s": round(min(times), 3), + "max_time_s": round(max(times), 3), + } + results.append(result) + print(f" 输入 tokens: {input_len}") + print(f" 平均输出 tokens: {result['avg_output_tokens']}") + print(f" 平均耗时: {result['avg_time_s']}s") + print(f" 平均速度: {result['avg_tokens_per_sec']} tokens/s") + + return results + + +def benchmark_memory(model): + """测试显存和内存占用""" + print(f"\n{'='*60}") + print("显存与内存占用") + print(f"{'='*60}") + + result = {} + if torch.cuda.is_available(): + result["gpu_allocated_gb"] = round(torch.cuda.memory_allocated() / 1024**3, 2) + result["gpu_reserved_gb"] = round(torch.cuda.memory_reserved() / 1024**3, 2) + result["gpu_total_gb"] = round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 1) + result["gpu_name"] = torch.cuda.get_device_name(0) + + process = psutil.Process() + result["ram_used_gb"] = round(process.memory_info().rss / 1024**3, 2) + result["ram_total_gb"] = round(psutil.virtual_memory().total / 1024**3, 1) + + for k, v in result.items(): + print(f" {k}: {v}") + + return result + + +def save_results(speed_results, memory_results): + """保存测试结果""" + output_dir = "vsp/qwen3.5-9b/results" + os.makedirs(output_dir, exist_ok=True) + + report = { + 
"timestamp": datetime.now().isoformat(), + "model": "Qwen3.5-9B", + "quantization": "4-bit NF4", + "speed_benchmark": speed_results, + "memory": memory_results, + } + + output_path = os.path.join(output_dir, "benchmark_speed.json") + with open(output_path, "w", encoding="utf-8") as f: + json.dump(report, f, ensure_ascii=False, indent=2) + + print(f"\n结果已保存到 {output_path}") + return output_path + + +if __name__ == "__main__": + os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..") + model, tokenizer = load_model() + speed_results = benchmark_speed(model, tokenizer) + memory_results = benchmark_memory(model) + save_results(speed_results, memory_results) +``` + +**Step 2: 运行性能基准测试** + +```bash +conda activate yolo +cd /c/workspace/qwen-test +python vsp/qwen3.5-9b/benchmark_speed.py +``` + +**Step 3: 提交** + +```bash +git add vsp/qwen3.5-9b/benchmark_speed.py +git commit -m "feat: 添加性能基准测试脚本(速度+显存)" +``` + +--- + +## Task 6: 精度评估测试 + +**Files:** +- Create: `vsp/qwen3.5-9b/test_accuracy.py` + +**Step 1: 创建精度评估脚本** + +```python +"""精度评估 - 测试模型在常见任务上的准确性""" +import json +import os +import time +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from datetime import datetime + + +# 测试数据集 +ACCURACY_TESTS = [ + # 知识问答 + { + "category": "知识问答", + "prompt": "中国的首都是哪个城市?请只回答城市名。", + "expected_contains": ["北京"], + }, + { + "category": "知识问答", + "prompt": "水的化学式是什么?请只回答化学式。", + "expected_contains": ["H2O"], + }, + { + "category": "知识问答", + "prompt": "地球到太阳的平均距离大约是多少公里?A. 1.5亿 B. 3亿 C. 5亿 D. 1亿。请只回答选项字母。", + "expected_contains": ["A"], + }, + # 数学推理 + { + "category": "数学推理", + "prompt": "计算 15 * 23 = ? 
请只回答数字。", + "expected_contains": ["345"], + }, + { + "category": "数学推理", + "prompt": "一个三角形三边分别是3、4、5,它是什么三角形?请只回答类型。", + "expected_contains": ["直角"], + }, + # 逻辑推理 + { + "category": "逻辑推理", + "prompt": "所有的狗都是动物。小白是一只狗。所以小白是什么?请只回答一个词。", + "expected_contains": ["动物"], + }, + # 代码理解 + { + "category": "代码理解", + "prompt": "以下Python代码的输出是什么?\n```python\nprint(len([1, 2, 3, 4, 5]))\n```\n请只回答数字。", + "expected_contains": ["5"], + }, + # 翻译 + { + "category": "翻译", + "prompt": "将'Hello World'翻译成中文,请只回答翻译结果。", + "expected_contains": ["你好", "世界"], + }, + # 摘要能力 + { + "category": "摘要", + "prompt": "用一句话总结:人工智能(AI)是指由人工制造出来的系统所展现出来的智能。AI的核心问题包括推理、知识表示、规划、学习、自然语言处理、感知和移动与操作物体的能力。", + "expected_contains": ["人工智能", "AI"], + }, + # 分类 + { + "category": "情感分类", + "prompt": "判断以下文本的情感是正面还是负面:'这个产品太糟糕了,完全不值这个价格'。请只回答'正面'或'负面'。", + "expected_contains": ["负面"], + }, +] + + +def load_model(): + """加载 4-bit 量化模型""" + import glob + paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True) + model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B" + + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + model_path, + quantization_config=bnb_config, + device_map="auto", + trust_remote_code=True, + ) + return model, tokenizer + + +def evaluate_accuracy(model, tokenizer): + """运行精度评估""" + print("=" * 60) + print("Qwen3.5-9B 精度评估") + print("=" * 60) + + results = [] + category_stats = {} + + for i, test in enumerate(ACCURACY_TESTS): + messages = [{"role": "user", "content": test["prompt"]}] + text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = tokenizer(text, return_tensors="pt").to(model.device) + + with torch.no_grad(): + outputs = model.generate( + **inputs, + 
max_new_tokens=100, + do_sample=False, + ) + + input_len = inputs["input_ids"].shape[1] + response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True).strip() + + # 检查是否包含预期关键词 + passed = any(kw in response for kw in test["expected_contains"]) + + cat = test["category"] + if cat not in category_stats: + category_stats[cat] = {"total": 0, "passed": 0} + category_stats[cat]["total"] += 1 + if passed: + category_stats[cat]["passed"] += 1 + + status = "PASS" if passed else "FAIL" + print(f"\n[{status}] 测试 {i+1} ({cat})") + print(f" 问题: {test['prompt'][:50]}...") + print(f" 回答: {response[:80]}") + print(f" 预期包含: {test['expected_contains']}") + + results.append({ + "category": cat, + "prompt": test["prompt"], + "response": response, + "expected": test["expected_contains"], + "passed": passed, + }) + + # 汇总 + total = len(results) + passed = sum(1 for r in results if r["passed"]) + print(f"\n{'='*60}") + print(f"精度评估汇总") + print(f"{'='*60}") + print(f" 总计: {total} 题, 通过: {passed} 题, 准确率: {passed/total*100:.1f}%") + print(f"\n 分类统计:") + for cat, stats in category_stats.items(): + rate = stats["passed"] / stats["total"] * 100 + print(f" {cat}: {stats['passed']}/{stats['total']} ({rate:.0f}%)") + + return { + "total": total, + "passed": passed, + "accuracy": round(passed / total * 100, 1), + "category_stats": category_stats, + "details": results, + } + + +def save_results(accuracy_results): + """保存结果""" + output_dir = "vsp/qwen3.5-9b/results" + os.makedirs(output_dir, exist_ok=True) + + report = { + "timestamp": datetime.now().isoformat(), + "model": "Qwen3.5-9B", + "quantization": "4-bit NF4", + "accuracy": accuracy_results, + } + + path = os.path.join(output_dir, "accuracy_results.json") + with open(path, "w", encoding="utf-8") as f: + json.dump(report, f, ensure_ascii=False, indent=2) + print(f"\n结果已保存到 {path}") + + +if __name__ == "__main__": + os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..") + model, tokenizer = load_model() + results = 
evaluate_accuracy(model, tokenizer) + save_results(results) +``` + +**Step 2: 运行精度评估** + +```bash +conda activate yolo +cd /c/workspace/qwen-test +python vsp/qwen3.5-9b/test_accuracy.py +``` + +**Step 3: 提交** + +```bash +git add vsp/qwen3.5-9b/test_accuracy.py +git commit -m "feat: 添加精度评估脚本(知识/数学/逻辑/代码/翻译)" +``` + +--- + +## Task 7: 并发压测 + +**Files:** +- Create: `vsp/qwen3.5-9b/test_concurrency.py` + +**Step 1: 创建并发测试脚本** + +```python +"""并发压测 - 测试不同并发数下的性能表现""" +import json +import os +import time +import torch +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from datetime import datetime + + +def load_model(): + """加载 4-bit 量化模型""" + import glob + paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True) + model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B" + + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + model_path, + quantization_config=bnb_config, + device_map="auto", + trust_remote_code=True, + ) + return model, tokenizer + + +def single_inference(model, tokenizer, prompt, lock, max_tokens=64): + """单次推理(线程安全)""" + messages = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = tokenizer(text, return_tensors="pt").to(model.device) + input_len = inputs["input_ids"].shape[1] + + t0 = time.perf_counter() + with lock: # GPU 推理需要串行(单 GPU) + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=max_tokens, + do_sample=False, + ) + elapsed = time.perf_counter() - t0 + output_len = outputs.shape[1] - input_len + + return { + "time_s": elapsed, + "output_tokens": 
output_len, + "tokens_per_sec": output_len / elapsed if elapsed > 0 else 0, + } + + +def test_concurrency(model, tokenizer): + """测试不同并发数下的表现""" + print("=" * 60) + print("并发压测") + print("=" * 60) + + prompts = [ + "什么是人工智能?", + "请解释量子计算。", + "Python的优点是什么?", + "深度学习和机器学习的区别?", + "什么是自然语言处理?", + "解释一下GPT的工作原理。", + "什么是强化学习?", + "云计算的优势有哪些?", + ] + + concurrency_levels = [1, 2, 4, 8] + lock = threading.Lock() + results = [] + + for n_concurrent in concurrency_levels: + print(f"\n--- 并发数: {n_concurrent} ---") + test_prompts = (prompts * ((n_concurrent // len(prompts)) + 1))[:n_concurrent] + + t0 = time.perf_counter() + futures_results = [] + + with ThreadPoolExecutor(max_workers=n_concurrent) as executor: + futures = [ + executor.submit(single_inference, model, tokenizer, p, lock) + for p in test_prompts + ] + for f in as_completed(futures): + futures_results.append(f.result()) + + total_time = time.perf_counter() - t0 + total_tokens = sum(r["output_tokens"] for r in futures_results) + avg_latency = sum(r["time_s"] for r in futures_results) / len(futures_results) + throughput = total_tokens / total_time + + result = { + "concurrency": n_concurrent, + "total_time_s": round(total_time, 2), + "total_tokens": total_tokens, + "throughput_tokens_per_sec": round(throughput, 1), + "avg_latency_s": round(avg_latency, 2), + "requests_completed": len(futures_results), + } + results.append(result) + + print(f" 总耗时: {result['total_time_s']}s") + print(f" 总 tokens: {result['total_tokens']}") + print(f" 吞吐量: {result['throughput_tokens_per_sec']} tokens/s") + print(f" 平均延迟: {result['avg_latency_s']}s") + + # 保存 + output_dir = "vsp/qwen3.5-9b/results" + os.makedirs(output_dir, exist_ok=True) + report = { + "timestamp": datetime.now().isoformat(), + "model": "Qwen3.5-9B", + "quantization": "4-bit NF4", + "note": "单GPU串行推理,并发测试主要体现请求排队效果", + "concurrency_results": results, + } + path = os.path.join(output_dir, "concurrency_results.json") + with open(path, "w", encoding="utf-8") as f: + 
json.dump(report, f, ensure_ascii=False, indent=2) + print(f"\n结果已保存到 {path}") + + return results + + +if __name__ == "__main__": + os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..") + model, tokenizer = load_model() + test_concurrency(model, tokenizer) +``` + +**Step 2: 运行并发测试** + +```bash +conda activate yolo +cd /c/workspace/qwen-test +python vsp/qwen3.5-9b/test_concurrency.py +``` + +**Step 3: 提交** + +```bash +git add vsp/qwen3.5-9b/test_concurrency.py +git commit -m "feat: 添加并发压测脚本" +``` + +--- + +## Task 8: 算力需求分析与综合报告 + +**Files:** +- Create: `vsp/qwen3.5-9b/generate_report.py` +- Create: `vsp/qwen3.5-9b/gpu_requirements.py` + +**Step 1: 创建 GPU 需求分析脚本** + +```python +"""GPU 算力需求分析""" +import json +import os + + +# Qwen3.5-9B 不同精度下的显存需求估算 +GPU_REQUIREMENTS = { + "model": "Qwen3.5-9B", + "parameters": "9B", + "precision_requirements": { + "FP32": { + "model_size_gb": 36, + "min_vram_gb": 40, + "recommended_gpus": ["A100 80GB", "H100 80GB"], + "note": "不推荐,显存占用过大", + }, + "FP16/BF16": { + "model_size_gb": 18, + "min_vram_gb": 22, + "recommended_gpus": ["A100 40GB", "RTX 4090 24GB", "RTX A6000 48GB", "V100 32GB"], + "note": "标准推理精度,推荐用于生产环境", + }, + "INT8": { + "model_size_gb": 9, + "min_vram_gb": 12, + "recommended_gpus": ["RTX 4070 Ti 16GB", "RTX 3090 24GB", "T4 16GB", "RTX 4080 16GB"], + "note": "轻微精度损失,性价比高", + }, + "INT4 (NF4)": { + "model_size_gb": 5, + "min_vram_gb": 8, + "recommended_gpus": ["RTX 3050 8GB", "RTX 4060 8GB", "RTX 3060 12GB", "RTX 3070 8GB"], + "note": "适合显存有限的消费级显卡,有一定精度损失", + }, + }, + "deployment_recommendations": { + "开发测试": { + "gpu": "RTX 3050/4060 (8GB)", + "precision": "INT4", + "concurrent": 1, + "cost_estimate": "~2000-3000 RMB (显卡)", + }, + "小规模部署": { + "gpu": "RTX 4090 (24GB)", + "precision": "FP16", + "concurrent": "2-4", + "cost_estimate": "~12000-15000 RMB (显卡)", + }, + "生产环境": { + "gpu": "A100 40GB / H100 80GB", + "precision": "FP16/BF16", + "concurrent": "8-32 (vLLM)", + "cost_estimate": "~60000-200000 RMB (显卡) 
或云服务按需", + }, + }, +} + + +def analyze_gpu_requirements(): + """输出 GPU 需求分析""" + print("=" * 60) + print("Qwen3.5-9B GPU 算力需求分析") + print("=" * 60) + + for precision, info in GPU_REQUIREMENTS["precision_requirements"].items(): + print(f"\n{precision}:") + print(f" 模型大小: ~{info['model_size_gb']} GB") + print(f" 最低显存: {info['min_vram_gb']} GB") + print(f" 推荐显卡: {', '.join(info['recommended_gpus'])}") + print(f" 备注: {info['note']}") + + print(f"\n{'='*60}") + print("部署方案推荐") + print(f"{'='*60}") + for scenario, info in GPU_REQUIREMENTS["deployment_recommendations"].items(): + print(f"\n{scenario}:") + for k, v in info.items(): + print(f" {k}: {v}") + + # 保存 + output_dir = "vsp/qwen3.5-9b/results" + os.makedirs(output_dir, exist_ok=True) + path = os.path.join(output_dir, "gpu_requirements.json") + with open(path, "w", encoding="utf-8") as f: + json.dump(GPU_REQUIREMENTS, f, ensure_ascii=False, indent=2) + print(f"\n结果已保存到 {path}") + + +if __name__ == "__main__": + analyze_gpu_requirements() +``` + +**Step 2: 创建综合报告生成脚本** + +```python +"""综合报告生成 - 汇总所有测试结果""" +import json +import os +from datetime import datetime + + +def load_json(path): + """加载 JSON 文件""" + if os.path.exists(path): + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + return None + + +def generate_report(): + """生成综合测试报告""" + results_dir = "vsp/qwen3.5-9b/results" + + speed = load_json(os.path.join(results_dir, "benchmark_speed.json")) + accuracy = load_json(os.path.join(results_dir, "accuracy_results.json")) + concurrency = load_json(os.path.join(results_dir, "concurrency_results.json")) + gpu_req = load_json(os.path.join(results_dir, "gpu_requirements.json")) + + report_lines = [ + "# Qwen3.5-9B 性能测试报告", + f"\n生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", + "\n## 1. 
测试环境", + "", + "| 项目 | 值 |", + "|------|-----|", + "| 模型 | Qwen3.5-9B |", + "| 量化方式 | 4-bit NF4 (bitsandbytes) |", + ] + + if speed and "memory" in speed: + mem = speed["memory"] + report_lines.extend([ + f"| GPU | {mem.get('gpu_name', 'N/A')} |", + f"| GPU 显存 | {mem.get('gpu_total_gb', 'N/A')} GB |", + f"| 模型显存占用 | {mem.get('gpu_allocated_gb', 'N/A')} GB |", + f"| 系统内存占用 | {mem.get('ram_used_gb', 'N/A')} GB |", + ]) + + # 推理速度 + report_lines.extend(["\n## 2. 推理速度", ""]) + if speed and "speed_benchmark" in speed: + report_lines.extend([ + "| 测试场景 | 输入tokens | 输出tokens | 耗时(s) | 速度(tokens/s) |", + "|---------|-----------|-----------|---------|---------------|", + ]) + for r in speed["speed_benchmark"]: + report_lines.append( + f"| {r['test_name']} | {r['input_tokens']} | {r['avg_output_tokens']} | {r['avg_time_s']} | {r['avg_tokens_per_sec']} |" + ) + else: + report_lines.append("*未运行速度测试*") + + # 精度 + report_lines.extend(["\n## 3. 精度评估", ""]) + if accuracy and "accuracy" in accuracy: + acc = accuracy["accuracy"] + report_lines.append(f"**总准确率: {acc['accuracy']}% ({acc['passed']}/{acc['total']})**\n") + report_lines.extend([ + "| 分类 | 通过/总数 | 准确率 |", + "|------|---------|--------|", + ]) + for cat, stats in acc.get("category_stats", {}).items(): + rate = stats["passed"] / stats["total"] * 100 + report_lines.append(f"| {cat} | {stats['passed']}/{stats['total']} | {rate:.0f}% |") + else: + report_lines.append("*未运行精度测试*") + + # 并发 + report_lines.extend(["\n## 4. 
并发性能", ""]) + if concurrency and "concurrency_results" in concurrency: + report_lines.extend([ + "| 并发数 | 总耗时(s) | 吞吐量(tokens/s) | 平均延迟(s) |", + "|-------|---------|----------------|-----------|", + ]) + for r in concurrency["concurrency_results"]: + report_lines.append( + f"| {r['concurrency']} | {r['total_time_s']} | {r['throughput_tokens_per_sec']} | {r['avg_latency_s']} |" + ) + report_lines.append(f"\n> 注: {concurrency.get('note', '单GPU串行推理')}") + else: + report_lines.append("*未运行并发测试*") + + # GPU 需求 + report_lines.extend(["\n## 5. GPU 算力需求", ""]) + if gpu_req: + report_lines.extend([ + "| 精度 | 模型大小 | 最低显存 | 推荐显卡 |", + "|------|---------|---------|---------|", + ]) + for precision, info in gpu_req.get("precision_requirements", {}).items(): + gpus = ", ".join(info["recommended_gpus"][:2]) + report_lines.append( + f"| {precision} | {info['model_size_gb']}GB | {info['min_vram_gb']}GB | {gpus} |" + ) + + # 结论 + report_lines.extend([ + "\n## 6. 结论与建议", + "", + "1. **RTX 3050 8GB 可以运行 Qwen3.5-9B**,但必须使用 4-bit 量化", + "2. 4-bit 量化后显存占用约 5GB,留有一定余量", + "3. 单卡推理速度适合开发测试,不适合高并发生产环境", + "4. 生产部署建议使用 RTX 4090 (FP16) 或 A100 (FP16/BF16) + vLLM", + "5. 
4-bit 量化对简单任务精度影响较小,复杂推理任务可能有一定损失", + ]) + + # 保存报告 + report_text = "\n".join(report_lines) + report_path = os.path.join(results_dir, "REPORT.md") + with open(report_path, "w", encoding="utf-8") as f: + f.write(report_text) + + print(report_text) + print(f"\n\n报告已保存到 {report_path}") + + +if __name__ == "__main__": + os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..") + generate_report() +``` + +**Step 3: 运行报告生成** + +```bash +conda activate yolo +cd /c/workspace/qwen-test +python vsp/qwen3.5-9b/gpu_requirements.py +python vsp/qwen3.5-9b/generate_report.py +``` + +**Step 4: 提交** + +```bash +git add vsp/qwen3.5-9b/gpu_requirements.py vsp/qwen3.5-9b/generate_report.py +git commit -m "feat: 添加 GPU 需求分析和综合报告生成脚本" +``` + +--- + +## Task 9: 主运行脚本与最终推送 + +**Files:** +- Create: `vsp/qwen3.5-9b/run_all.py` + +**Step 1: 创建一键运行脚本** + +```python +"""一键运行所有测试""" +import subprocess +import sys +import os +import time + + +SCRIPTS = [ + ("环境检查", "vsp/qwen3.5-9b/setup_env.py"), + ("模型下载", "vsp/qwen3.5-9b/download_model.py"), + ("基础推理测试", "vsp/qwen3.5-9b/test_basic_inference.py"), + ("性能基准测试", "vsp/qwen3.5-9b/benchmark_speed.py"), + ("精度评估", "vsp/qwen3.5-9b/test_accuracy.py"), + ("并发压测", "vsp/qwen3.5-9b/test_concurrency.py"), + ("GPU需求分析", "vsp/qwen3.5-9b/gpu_requirements.py"), + ("生成报告", "vsp/qwen3.5-9b/generate_report.py"), +] + + +def main(): + os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..") + print("=" * 60) + print("Qwen3.5-9B 全量测试") + print("=" * 60) + + for name, script in SCRIPTS: + print(f"\n{'='*60}") + print(f"[{name}] 运行 {script}") + print("=" * 60) + + t0 = time.time() + result = subprocess.run([sys.executable, script], capture_output=False) + elapsed = time.time() - t0 + + if result.returncode != 0: + print(f"\n[ERROR] {name} 失败 (退出码: {result.returncode})") + choice = input("继续运行后续测试?(y/n): ").strip().lower() + if choice != "y": + sys.exit(1) + else: + print(f"\n[OK] {name} 完成 ({elapsed:.1f}s)") + + print(f"\n{'='*60}") + print("所有测试完成!查看报告: 
vsp/qwen3.5-9b/results/REPORT.md") + print("=" * 60) + + +if __name__ == "__main__": + main() +``` + +**Step 2: 提交并推送** + +```bash +git add vsp/qwen3.5-9b/run_all.py +git commit -m "feat: 添加一键运行脚本 run_all.py" + +# 提交测试结果(如果有) +git add vsp/qwen3.5-9b/results/ 2>/dev/null +git commit -m "docs: 添加测试结果数据" 2>/dev/null || true + +# 推送到远程 +git push origin master +``` + +--- + +## 提交计划汇总 + +| 提交序号 | 提交信息 | 包含文件 | +|---------|---------|---------| +| 1 | `init: 项目初始化,添加 .gitignore 和 README` | .gitignore, README.md, docs/ | +| 2 | `feat: 添加依赖配置和环境检查脚本` | requirements.txt, setup_env.py | +| 3 | `feat: 添加模型下载脚本(ModelScope)` | download_model.py | +| 4 | `feat: 添加基础推理测试脚本(4-bit 量化)` | test_basic_inference.py | +| 5 | `feat: 添加性能基准测试脚本(速度+显存)` | benchmark_speed.py | +| 6 | `feat: 添加精度评估脚本(知识/数学/逻辑/代码/翻译)` | test_accuracy.py | +| 7 | `feat: 添加并发压测脚本` | test_concurrency.py | +| 8 | `feat: 添加 GPU 需求分析和综合报告生成脚本` | gpu_requirements.py, generate_report.py | +| 9 | `feat: 添加一键运行脚本 run_all.py` | run_all.py | +| 10 | `docs: 添加测试结果数据` | results/*.json, REPORT.md |