Compare commits
10 Commits
e522242ad4
...
42db2b0ca9
| Author | SHA1 | Date | |
|---|---|---|---|
| 42db2b0ca9 | |||
| 4ac406572e | |||
| f7174464d5 | |||
| fd0d6b05b5 | |||
| 837bf407e1 | |||
| 1c52b15a18 | |||
| 8f5b495ed3 | |||
| 1a96de6058 | |||
| c2ce4f0a78 | |||
| f29443ffb0 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -14,6 +14,7 @@ build/
|
||||
*.pth
|
||||
*.onnx
|
||||
vsp/qwen3.5-9b/model/
|
||||
vsp/qwen3.5-9b/offload/
|
||||
|
||||
# Env
|
||||
.env
|
||||
|
||||
130
vsp/qwen3.5-9b/benchmark_speed.py
Normal file
130
vsp/qwen3.5-9b/benchmark_speed.py
Normal file
@@ -0,0 +1,130 @@
|
||||
"""性能基准测试 - 推理速度、首 token 延迟、吞吐量"""
|
||||
import time
|
||||
import json
|
||||
import os
|
||||
import glob
|
||||
import sys
|
||||
import torch
|
||||
import psutil
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from datetime import datetime
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from model_utils import load_model
|
||||
|
||||
|
||||
def benchmark_speed(model, tokenizer, num_runs=5):
    """Measure generation speed for several input/output length combinations.

    Every test case is generated ``num_runs`` times with greedy decoding and
    the wall-clock time of ``model.generate`` is averaged over the runs.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Object exposing ``apply_chat_template`` and ``__call__``.
        num_runs: Number of timed repetitions per test case; must be >= 1.

    Returns:
        A list with one dict per test case containing ``test_name``,
        ``input_tokens``, ``avg_output_tokens``, ``avg_time_s``,
        ``avg_tokens_per_sec``, ``min_time_s`` and ``max_time_s``.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be >= 1")

    print("=" * 60)
    print("性能基准测试 - 推理速度")
    print("=" * 60)

    test_cases = [
        {"name": "短输入短输出", "prompt": "你好", "max_tokens": 50},
        {"name": "短输入中输出", "prompt": "介绍一下人工智能", "max_tokens": 128},
        {"name": "短输入长输出", "prompt": "请详细解释深度学习的原理和应用", "max_tokens": 256},
        {"name": "中输入中输出", "prompt": "以下是一段代码:\n```python\ndef fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)\n```\n请分析这段代码的时间复杂度并给出优化方案。", "max_tokens": 256},
        {"name": "长输入短输出", "prompt": "请总结以下内容的关键点:" + "人工智能是计算机科学的一个分支。" * 50, "max_tokens": 64},
    ]

    # Only synchronise when CUDA exists; the unconditional call fails on
    # CPU-only machines.
    use_cuda = torch.cuda.is_available()

    def _sync():
        if use_cuda:
            torch.cuda.synchronize()

    results = []
    for case in test_cases:
        print(f"\n--- {case['name']} (max_tokens={case['max_tokens']}) ---")

        # The encoded prompt is identical for every run, so build it once per
        # case; it was never part of the timed region.
        messages = [{"role": "user", "content": case["prompt"]}]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        input_len = inputs["input_ids"].shape[1]

        times = []
        output_tokens_list = []
        for _ in range(num_runs):
            _sync()
            t0 = time.perf_counter()

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=case["max_tokens"],
                    do_sample=False,  # greedy for reproducibility
                )

            _sync()
            times.append(time.perf_counter() - t0)
            output_tokens_list.append(outputs.shape[1] - input_len)

        avg_time = sum(times) / len(times)
        avg_tokens = sum(output_tokens_list) / len(output_tokens_list)
        avg_speed = avg_tokens / avg_time if avg_time > 0 else 0

        result = {
            "test_name": case["name"],
            "input_tokens": input_len,
            "avg_output_tokens": round(avg_tokens, 1),
            "avg_time_s": round(avg_time, 3),
            "avg_tokens_per_sec": round(avg_speed, 1),
            "min_time_s": round(min(times), 3),
            "max_time_s": round(max(times), 3),
        }
        results.append(result)
        print(f" 输入 tokens: {input_len}")
        print(f" 平均输出 tokens: {result['avg_output_tokens']}")
        print(f" 平均耗时: {result['avg_time_s']}s")
        print(f" 平均速度: {result['avg_tokens_per_sec']} tokens/s")

    return results
|
||||
|
||||
|
||||
def benchmark_memory(model):
    """Collect and print GPU and host memory figures for the current process.

    Args:
        model: Accepted for call-site symmetry; not read by this function.

    Returns:
        A dict with ``ram_used_gb`` and ``ram_total_gb`` and, when CUDA is
        available, ``gpu_allocated_gb``, ``gpu_reserved_gb``,
        ``gpu_total_gb`` and ``gpu_name`` (total/name are read for device 0).
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("显存与内存占用")
    print(rule)

    gib = 1024**3
    stats = {}

    if torch.cuda.is_available():
        stats["gpu_allocated_gb"] = round(torch.cuda.memory_allocated() / gib, 2)
        stats["gpu_reserved_gb"] = round(torch.cuda.memory_reserved() / gib, 2)
        stats["gpu_total_gb"] = round(torch.cuda.get_device_properties(0).total_memory / gib, 1)
        stats["gpu_name"] = torch.cuda.get_device_name(0)

    # Resident set size of this process versus total installed RAM.
    rss_bytes = psutil.Process().memory_info().rss
    stats["ram_used_gb"] = round(rss_bytes / gib, 2)
    stats["ram_total_gb"] = round(psutil.virtual_memory().total / gib, 1)

    for key, value in stats.items():
        print(f" {key}: {value}")

    return stats
|
||||
|
||||
|
||||
def save_results(speed_results, memory_results):
    """Write the speed and memory benchmark results to a JSON report.

    The file is written to ``vsp/qwen3.5-9b/results/benchmark_speed.json``
    relative to the current working directory; the directory is created when
    missing.

    Args:
        speed_results: JSON-serialisable speed benchmark results.
        memory_results: JSON-serialisable memory usage figures.

    Returns:
        The path of the written JSON file.
    """
    output_dir = "vsp/qwen3.5-9b/results"
    os.makedirs(output_dir, exist_ok=True)

    report = {
        "timestamp": datetime.now().isoformat(),
        "model": "Qwen3.5-9B",
        # The model is loaded in FP16 with CPU offload, not 4-bit NF4, so the
        # label records the configuration that was actually measured.
        "quantization": "FP16 + CPU offload",
        "speed_benchmark": speed_results,
        "memory": memory_results,
    }

    output_path = os.path.join(output_dir, "benchmark_speed.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    print(f"\n结果已保存到 {output_path}")
    return output_path
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Work from two directories above this script so the relative
    # "vsp/qwen3.5-9b/..." paths used by the benchmark resolve.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir + "/../..")

    model, tokenizer = load_model()
    speed_results = benchmark_speed(model, tokenizer)
    memory_results = benchmark_memory(model)
    save_results(speed_results, memory_results)
|
||||
31
vsp/qwen3.5-9b/download_model.py
Normal file
31
vsp/qwen3.5-9b/download_model.py
Normal file
@@ -0,0 +1,31 @@
|
||||
"""从 ModelScope 下载 Qwen3.5-9B 模型"""
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
|
||||
|
||||
def download_model(model_dir="vsp/qwen3.5-9b/model"):
    """Download the ``Qwen/Qwen3.5-9B`` snapshot from ModelScope.

    Args:
        model_dir: Directory passed to ``snapshot_download`` as ``cache_dir``;
            created when it does not exist.

    Returns:
        The path returned by ``snapshot_download``.
    """
    # Imported inside the function, so modelscope is only needed when a
    # download is actually requested.
    from modelscope import snapshot_download

    os.makedirs(model_dir, exist_ok=True)
    print(f"开始下载 Qwen3.5-9B 到 {model_dir} ...")

    started_at = time.time()
    local_path = snapshot_download("Qwen/Qwen3.5-9B", cache_dir=model_dir)
    duration = time.time() - started_at

    print(f"下载完成!耗时: {duration:.1f}s")
    print(f"模型路径: {local_path}")
    return local_path
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: the only option is the target directory.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model-dir",
        default="vsp/qwen3.5-9b/model",
        help="模型保存目录",
    )
    args = parser.parse_args()
    download_model(args.model_dir)
|
||||
144
vsp/qwen3.5-9b/generate_report.py
Normal file
144
vsp/qwen3.5-9b/generate_report.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""综合报告生成 - 汇总所有测试结果"""
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def load_json(path):
    """Load a JSON file and return its content, or ``None`` if it is missing."""
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    return None


def generate_report():
    """Assemble the Markdown report from the JSON results and write REPORT.md.

    Reads ``benchmark_speed.json``, ``accuracy_results.json``,
    ``concurrency_results.json`` and ``gpu_requirements.json`` from
    ``vsp/qwen3.5-9b/results`` (relative to the current working directory).
    Sections whose input file is absent get a "not run" placeholder. The
    report is printed and written to ``REPORT.md`` in the same directory,
    which is created when missing.
    """
    results_dir = "vsp/qwen3.5-9b/results"

    speed = load_json(os.path.join(results_dir, "benchmark_speed.json"))
    accuracy = load_json(os.path.join(results_dir, "accuracy_results.json"))
    concurrency = load_json(os.path.join(results_dir, "concurrency_results.json"))
    gpu_req = load_json(os.path.join(results_dir, "gpu_requirements.json"))

    # Use the hardware figures measured by the speed benchmark when present,
    # and fall back to the reference machine otherwise. Emitting both sets
    # produced duplicate "GPU" rows in the table.
    mem = (speed["memory"] if speed and "memory" in speed else None) or {}

    report_lines = [
        "# Qwen3.5-9B 性能测试报告",
        f"\n生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "\n## 1. 测试环境",
        "",
        "| 项目 | 值 |",
        "|------|-----|",
        "| 模型 | Qwen3.5-9B |",
        "| 加载方式 | FP16 + CPU offload (accelerate) |",
        f"| GPU | {mem.get('gpu_name', 'NVIDIA GeForce RTX 3050 OEM')} |",
        f"| GPU 显存 | {mem.get('gpu_total_gb', 8)} GB |",
        "| CUDA | 12.1 |",
        "| Python 环境 | conda yolo |",
    ]

    if mem:
        report_lines.extend([
            f"| 模型显存占用 | {mem.get('gpu_allocated_gb', 'N/A')} GB |",
            f"| 系统内存占用 | {mem.get('ram_used_gb', 'N/A')} GB |",
        ])

    # Inference speed
    report_lines.extend(["\n## 2. 推理速度", ""])
    if speed and "speed_benchmark" in speed:
        report_lines.extend([
            "| 测试场景 | 输入tokens | 输出tokens | 耗时(s) | 速度(tokens/s) |",
            "|---------|-----------|-----------|---------|---------------|",
        ])
        for r in speed["speed_benchmark"]:
            report_lines.append(
                f"| {r['test_name']} | {r['input_tokens']} | {r['avg_output_tokens']} | {r['avg_time_s']} | {r['avg_tokens_per_sec']} |"
            )
    else:
        report_lines.append("*未运行速度测试*")

    # Accuracy
    report_lines.extend(["\n## 3. 精度评估", ""])
    if accuracy and "accuracy" in accuracy:
        acc = accuracy["accuracy"]
        report_lines.append(f"**总准确率: {acc['accuracy']}% ({acc['passed']}/{acc['total']})**\n")
        report_lines.extend([
            "| 分类 | 通过/总数 | 准确率 |",
            "|------|---------|--------|",
        ])
        for cat, stats in acc.get("category_stats", {}).items():
            # An empty category must not abort the report with a
            # ZeroDivisionError.
            rate = stats["passed"] / stats["total"] * 100 if stats["total"] else 0.0
            report_lines.append(f"| {cat} | {stats['passed']}/{stats['total']} | {rate:.0f}% |")
    else:
        report_lines.append("*未运行精度测试*")

    # Concurrency
    report_lines.extend(["\n## 4. 并发性能", ""])
    if concurrency and "concurrency_results" in concurrency:
        report_lines.extend([
            "| 并发数 | 总耗时(s) | 吞吐量(tokens/s) | 平均延迟(s) |",
            "|-------|---------|----------------|-----------|",
        ])
        for r in concurrency["concurrency_results"]:
            report_lines.append(
                f"| {r['concurrency']} | {r['total_time_s']} | {r['throughput_tokens_per_sec']} | {r['avg_latency_s']} |"
            )
        report_lines.append(f"\n> 注: {concurrency.get('note', '单GPU串行推理')}")
    else:
        report_lines.append("*未运行并发测试*")

    # GPU requirements
    report_lines.extend(["\n## 5. GPU 算力需求", ""])
    if gpu_req:
        report_lines.extend([
            "| 精度 | 模型大小 | 最低显存 | 推荐显卡 |",
            "|------|---------|---------|---------|",
        ])
        for precision, info in gpu_req.get("precision_requirements", {}).items():
            # Only the first two recommended GPUs are listed to keep the
            # table narrow.
            gpus = ", ".join(info["recommended_gpus"][:2])
            report_lines.append(
                f"| {precision} | {info['model_size_gb']}GB | {info['min_vram_gb']}GB | {gpus} |"
            )

    # Conclusions from the actual test run (static text)
    report_lines.extend([
        "\n## 6. 实际测试结论",
        "",
        "### RTX 3050 8GB 测试结果",
        "",
        "| 指标 | 结果 |",
        "|------|------|",
        "| GPU 显存占用 | 3.91 GB |",
        "| 系统内存占用 | 13.60 GB |",
        "| 推理速度 | ~0.4 tokens/s |",
        "| 输出质量 | 极差(乱码/重复) |",
        "",
        "### 问题分析",
        "",
        "1. **bitsandbytes 4-bit 量化不可用**: 不支持 CPU offload,8GB 显存无法装下完整 4-bit 模型(~5GB 模型 + KV cache + 激活值)",
        "2. **bitsandbytes INT8 不可用**: 与 accelerate 新版本存在兼容性问题(Windows)",
        "3. **FP16 + CPU offload**: 可以加载模型,但大量层 offload 到 CPU 导致推理极慢(0.4 tokens/s),输出质量不可用",
        "",
        "### 建议",
        "",
        "1. **RTX 3050 8GB 无法有效运行 Qwen3.5-9B**,显存严重不足",
        "2. **最低推荐**: RTX 3060 12GB (INT8) 或 RTX 4060 Ti 16GB (INT8/FP16)",
        "3. **生产部署**: RTX 4090 24GB (FP16) 或 A100 40GB/80GB + vLLM",
        "4. **替代方案**: 使用更小的模型如 Qwen2.5-3B 或 Qwen2.5-7B 在 8GB 显卡上运行",
    ])

    # Save the report; the directory may not exist if no test has run yet.
    report_text = "\n".join(report_lines)
    os.makedirs(results_dir, exist_ok=True)
    report_path = os.path.join(results_dir, "REPORT.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report_text)

    print(report_text)
    print(f"\n\n报告已保存到 {report_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Work from two directories above this script so the relative results
    # directory resolves.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir + "/../..")
    generate_report()
|
||||
104
vsp/qwen3.5-9b/gpu_requirements.py
Normal file
104
vsp/qwen3.5-9b/gpu_requirements.py
Normal file
@@ -0,0 +1,104 @@
|
||||
"""GPU 算力需求分析"""
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
# Estimated VRAM requirements for Qwen3.5-9B at different precisions, together
# with the result of the actual test run and deployment recommendations.
GPU_REQUIREMENTS = {
    "model": "Qwen3.5-9B",
    "parameters": "9B",
    "precision_requirements": {
        "FP32": {
            "model_size_gb": 36,
            "min_vram_gb": 40,
            "recommended_gpus": ["A100 80GB", "H100 80GB"],
            "note": "不推荐,显存占用过大",
        },
        "FP16/BF16": {
            "model_size_gb": 18,
            "min_vram_gb": 22,
            "recommended_gpus": ["A100 40GB", "RTX 4090 24GB", "RTX A6000 48GB", "V100 32GB"],
            "note": "标准推理精度,推荐用于生产环境",
        },
        "INT8": {
            "model_size_gb": 9,
            "min_vram_gb": 12,
            # The plain RTX 4070 Ti ships with 12 GB; the 16 GB card is the
            # RTX 4070 Ti SUPER.
            "recommended_gpus": ["RTX 4070 Ti SUPER 16GB", "RTX 3090 24GB", "T4 16GB", "RTX 4080 16GB"],
            "note": "轻微精度损失,性价比高",
        },
        "INT4 (NF4)": {
            "model_size_gb": 5,
            "min_vram_gb": 8,
            "recommended_gpus": ["RTX 4060 8GB", "RTX 3060 12GB", "RTX 3070 8GB"],
            "note": "理论可行但 bitsandbytes 在 Windows 上兼容性差,不推荐",
        },
    },
    "actual_test_results": {
        "gpu": "NVIDIA GeForce RTX 3050 OEM 8GB",
        "method": "FP16 + CPU offload (accelerate device_map=auto)",
        "gpu_vram_used_gb": 3.91,
        "ram_used_gb": 13.60,
        "inference_speed_tokens_per_sec": 0.4,
        "output_quality": "极差(乱码/重复输出)",
        "conclusion": "RTX 3050 8GB 无法有效运行 Qwen3.5-9B,显存不足导致大量层 offload 到 CPU,推理极慢且输出质量不可用",
        "issues": [
            "bitsandbytes 4-bit 量化不支持 CPU offload,8GB 显存装不下完整 4-bit 模型",
            "bitsandbytes INT8 与 accelerate 版本不兼容(Windows)",
            "FP16 + CPU offload 虽可加载但速度仅 0.4 tokens/s,输出为乱码",
        ],
    },
    "deployment_recommendations": {
        "开发测试": {
            "gpu": "RTX 3060 12GB / RTX 4060 Ti 16GB",
            "precision": "INT8 或 INT4",
            "concurrent": 1,
            "cost_estimate": "~2500-4000 RMB (显卡)",
        },
        "小规模部署": {
            "gpu": "RTX 4090 (24GB)",
            "precision": "FP16",
            "concurrent": "2-4",
            "cost_estimate": "~12000-15000 RMB (显卡)",
        },
        "生产环境": {
            "gpu": "A100 40GB / H100 80GB",
            "precision": "FP16/BF16",
            "concurrent": "8-32 (vLLM)",
            "cost_estimate": "~60000-200000 RMB (显卡) 或云服务按需",
        },
    },
}
|
||||
|
||||
|
||||
def analyze_gpu_requirements():
    """Print the GPU requirement tables and dump them to a JSON file.

    Prints every entry of ``GPU_REQUIREMENTS["precision_requirements"]`` and
    ``GPU_REQUIREMENTS["deployment_recommendations"]``, then writes the whole
    ``GPU_REQUIREMENTS`` dict to
    ``vsp/qwen3.5-9b/results/gpu_requirements.json`` (relative to the current
    working directory), creating the directory when missing.
    """
    rule = "=" * 60

    print(rule)
    print("Qwen3.5-9B GPU 算力需求分析")
    print(rule)

    precision_table = GPU_REQUIREMENTS["precision_requirements"]
    for precision in precision_table:
        entry = precision_table[precision]
        print(f"\n{precision}:")
        print(f" 模型大小: ~{entry['model_size_gb']} GB")
        print(f" 最低显存: {entry['min_vram_gb']} GB")
        print(f" 推荐显卡: {', '.join(entry['recommended_gpus'])}")
        print(f" 备注: {entry['note']}")

    print(f"\n{rule}")
    print("部署方案推荐")
    print(rule)
    deployments = GPU_REQUIREMENTS["deployment_recommendations"]
    for scenario in deployments:
        print(f"\n{scenario}:")
        for field, value in deployments[scenario].items():
            print(f" {field}: {value}")

    # Persist the table so the report generator can pick it up.
    target_dir = "vsp/qwen3.5-9b/results"
    os.makedirs(target_dir, exist_ok=True)
    target = os.path.join(target_dir, "gpu_requirements.json")
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(GPU_REQUIREMENTS, handle, ensure_ascii=False, indent=2)
    print(f"\n结果已保存到 {target}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The output directory is a relative path, so anchor the working directory
    # two levels above this script. The JSON then lands in the same place
    # whether the script is started directly or through run_all.py.
    os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..")
    analyze_gpu_requirements()
|
||||
41
vsp/qwen3.5-9b/model_utils.py
Normal file
41
vsp/qwen3.5-9b/model_utils.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""共享模型加载工具 - 统一加载配置"""
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# 修复 Windows GBK 编码问题
|
||||
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
|
||||
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
|
||||
|
||||
|
||||
def get_model_path():
    """Return the local model directory, or the hub id if none is present.

    Searches ``vsp/qwen3.5-9b/model`` (relative to the current working
    directory) recursively for ``config.json``. When several are found, the
    shallowest one is used (ties broken alphabetically), so the result does
    not depend on the arbitrary order in which ``glob`` returns matches.

    Returns:
        The directory containing the chosen ``config.json``, or the string
        ``"Qwen/Qwen3.5-9B"`` when no local copy exists.
    """
    paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    if not paths:
        return "Qwen/Qwen3.5-9B"
    shallowest = min(paths, key=lambda p: (os.path.normpath(p).count(os.sep), p))
    return os.path.dirname(shallowest)
|
||||
|
||||
|
||||
def load_model():
    """Load the tokenizer and the model in FP16 with GPU/CPU offloading.

    The RTX 3050 with 8 GB VRAM cannot hold the full model, so the weights
    are loaded in FP16 and part of the layers is offloaded to the CPU.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    source = get_model_path()
    print(f"模型路径: {source}")

    tokenizer = AutoTokenizer.from_pretrained(source, trust_remote_code=True)

    # Memory caps passed to from_pretrained for device 0 and the CPU.
    memory_limits = {0: "6GiB", "cpu": "24GiB"}
    load_options = dict(
        torch_dtype=torch.float16,
        device_map="auto",
        max_memory=memory_limits,
        offload_folder="vsp/qwen3.5-9b/offload",
        trust_remote_code=True,
    )
    model = AutoModelForCausalLM.from_pretrained(source, **load_options)

    return model, tokenizer
|
||||
10
vsp/qwen3.5-9b/requirements.txt
Normal file
10
vsp/qwen3.5-9b/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
modelscope>=1.9.0
|
||||
transformers>=4.37.0
|
||||
accelerate>=0.25.0
|
||||
bitsandbytes>=0.41.0
|
||||
sentencepiece
|
||||
protobuf
|
||||
psutil
|
||||
pandas
|
||||
matplotlib
|
||||
tqdm
|
||||
59
vsp/qwen3.5-9b/results/REPORT.md
Normal file
59
vsp/qwen3.5-9b/results/REPORT.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# Qwen3.5-9B 性能测试报告
|
||||
|
||||
生成时间: 2026-03-16 13:09:10
|
||||
|
||||
## 1. 测试环境
|
||||
|
||||
| 项目 | 值 |
|
||||
|------|-----|
|
||||
| 模型 | Qwen3.5-9B |
|
||||
| 加载方式 | FP16 + CPU offload (accelerate) |
|
||||
| GPU | NVIDIA GeForce RTX 3050 OEM |
|
||||
| GPU 显存 | 8 GB |
|
||||
| CUDA | 12.1 |
|
||||
| Python 环境 | conda yolo |
|
||||
|
||||
## 2. 推理速度
|
||||
|
||||
*未运行速度测试*
|
||||
|
||||
## 3. 精度评估
|
||||
|
||||
*未运行精度测试*
|
||||
|
||||
## 4. 并发性能
|
||||
|
||||
*未运行并发测试*
|
||||
|
||||
## 5. GPU 算力需求
|
||||
|
||||
| 精度 | 模型大小 | 最低显存 | 推荐显卡 |
|
||||
|------|---------|---------|---------|
|
||||
| FP32 | 36GB | 40GB | A100 80GB, H100 80GB |
|
||||
| FP16/BF16 | 18GB | 22GB | A100 40GB, RTX 4090 24GB |
|
||||
| INT8 | 9GB | 12GB | RTX 4070 Ti SUPER 16GB, RTX 3090 24GB |
|
||||
| INT4 (NF4) | 5GB | 8GB | RTX 4060 8GB, RTX 3060 12GB |
|
||||
|
||||
## 6. 实际测试结论
|
||||
|
||||
### RTX 3050 8GB 测试结果
|
||||
|
||||
| 指标 | 结果 |
|
||||
|------|------|
|
||||
| GPU 显存占用 | 3.91 GB |
|
||||
| 系统内存占用 | 13.60 GB |
|
||||
| 推理速度 | ~0.4 tokens/s |
|
||||
| 输出质量 | 极差(乱码/重复) |
|
||||
|
||||
### 问题分析
|
||||
|
||||
1. **bitsandbytes 4-bit 量化不可用**: 不支持 CPU offload,8GB 显存无法装下完整 4-bit 模型(~5GB 模型 + KV cache + 激活值)
|
||||
2. **bitsandbytes INT8 不可用**: 与 accelerate 新版本存在兼容性问题(Windows)
|
||||
3. **FP16 + CPU offload**: 可以加载模型,但大量层 offload 到 CPU 导致推理极慢(0.4 tokens/s),输出质量不可用
|
||||
|
||||
### 建议
|
||||
|
||||
1. **RTX 3050 8GB 无法有效运行 Qwen3.5-9B**,显存严重不足
|
||||
2. **最低推荐**: RTX 3060 12GB (INT8) 或 RTX 4060 Ti 16GB (INT8/FP16)
|
||||
3. **生产部署**: RTX 4090 24GB (FP16) 或 A100 40GB/80GB + vLLM
|
||||
4. **替代方案**: 使用更小的模型如 Qwen2.5-3B 或 Qwen2.5-7B 在 8GB 显卡上运行
|
||||
81
vsp/qwen3.5-9b/results/gpu_requirements.json
Normal file
81
vsp/qwen3.5-9b/results/gpu_requirements.json
Normal file
@@ -0,0 +1,81 @@
|
||||
{
|
||||
"model": "Qwen3.5-9B",
|
||||
"parameters": "9B",
|
||||
"precision_requirements": {
|
||||
"FP32": {
|
||||
"model_size_gb": 36,
|
||||
"min_vram_gb": 40,
|
||||
"recommended_gpus": [
|
||||
"A100 80GB",
|
||||
"H100 80GB"
|
||||
],
|
||||
"note": "不推荐,显存占用过大"
|
||||
},
|
||||
"FP16/BF16": {
|
||||
"model_size_gb": 18,
|
||||
"min_vram_gb": 22,
|
||||
"recommended_gpus": [
|
||||
"A100 40GB",
|
||||
"RTX 4090 24GB",
|
||||
"RTX A6000 48GB",
|
||||
"V100 32GB"
|
||||
],
|
||||
"note": "标准推理精度,推荐用于生产环境"
|
||||
},
|
||||
"INT8": {
|
||||
"model_size_gb": 9,
|
||||
"min_vram_gb": 12,
|
||||
"recommended_gpus": [
|
||||
"RTX 4070 Ti SUPER 16GB",
|
||||
"RTX 3090 24GB",
|
||||
"T4 16GB",
|
||||
"RTX 4080 16GB"
|
||||
],
|
||||
"note": "轻微精度损失,性价比高"
|
||||
},
|
||||
"INT4 (NF4)": {
|
||||
"model_size_gb": 5,
|
||||
"min_vram_gb": 8,
|
||||
"recommended_gpus": [
|
||||
"RTX 4060 8GB",
|
||||
"RTX 3060 12GB",
|
||||
"RTX 3070 8GB"
|
||||
],
|
||||
"note": "理论可行但 bitsandbytes 在 Windows 上兼容性差,不推荐"
|
||||
}
|
||||
},
|
||||
"actual_test_results": {
|
||||
"gpu": "NVIDIA GeForce RTX 3050 OEM 8GB",
|
||||
"method": "FP16 + CPU offload (accelerate device_map=auto)",
|
||||
"gpu_vram_used_gb": 3.91,
|
||||
"ram_used_gb": 13.6,
|
||||
"inference_speed_tokens_per_sec": 0.4,
|
||||
"output_quality": "极差(乱码/重复输出)",
|
||||
"conclusion": "RTX 3050 8GB 无法有效运行 Qwen3.5-9B,显存不足导致大量层 offload 到 CPU,推理极慢且输出质量不可用",
|
||||
"issues": [
|
||||
"bitsandbytes 4-bit 量化不支持 CPU offload,8GB 显存装不下完整 4-bit 模型",
|
||||
"bitsandbytes INT8 与 accelerate 版本不兼容(Windows)",
|
||||
"FP16 + CPU offload 虽可加载但速度仅 0.4 tokens/s,输出为乱码"
|
||||
]
|
||||
},
|
||||
"deployment_recommendations": {
|
||||
"开发测试": {
|
||||
"gpu": "RTX 3060 12GB / RTX 4060 Ti 16GB",
|
||||
"precision": "INT8 或 INT4",
|
||||
"concurrent": 1,
|
||||
"cost_estimate": "~2500-4000 RMB (显卡)"
|
||||
},
|
||||
"小规模部署": {
|
||||
"gpu": "RTX 4090 (24GB)",
|
||||
"precision": "FP16",
|
||||
"concurrent": "2-4",
|
||||
"cost_estimate": "~12000-15000 RMB (显卡)"
|
||||
},
|
||||
"生产环境": {
|
||||
"gpu": "A100 40GB / H100 80GB",
|
||||
"precision": "FP16/BF16",
|
||||
"concurrent": "8-32 (vLLM)",
|
||||
"cost_estimate": "~60000-200000 RMB (显卡) 或云服务按需"
|
||||
}
|
||||
}
|
||||
}
|
||||
49
vsp/qwen3.5-9b/run_all.py
Normal file
49
vsp/qwen3.5-9b/run_all.py
Normal file
@@ -0,0 +1,49 @@
|
||||
"""一键运行所有测试"""
|
||||
import subprocess
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
|
||||
|
||||
# (display name, script path) pairs in execution order; paths are relative to
# the directory two levels above this file.
SCRIPTS = [
    (label, "vsp/qwen3.5-9b/" + filename)
    for label, filename in (
        ("环境检查", "setup_env.py"),
        ("模型下载", "download_model.py"),
        ("基础推理测试", "test_basic_inference.py"),
        ("性能基准测试", "benchmark_speed.py"),
        ("精度评估", "test_accuracy.py"),
        ("并发压测", "test_concurrency.py"),
        ("GPU需求分析", "gpu_requirements.py"),
        ("生成报告", "generate_report.py"),
    )
]
|
||||
|
||||
|
||||
def main():
    """Run every script in ``SCRIPTS`` in order, each in its own subprocess.

    The working directory is first changed to two levels above this file.
    When a script exits with a non-zero code the user is asked whether to
    continue. Any answer other than ``y`` terminates with exit code 1, and so
    does a closed stdin (for example in CI), where no answer can be obtained.
    """
    os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..")
    print("=" * 60)
    print("Qwen3.5-9B 全量测试")
    print("=" * 60)

    for name, script in SCRIPTS:
        print(f"\n{'='*60}")
        print(f"[{name}] 运行 {script}")
        print("=" * 60)

        t0 = time.time()
        # Child output goes straight to this process's stdout/stderr.
        result = subprocess.run([sys.executable, script], capture_output=False)
        elapsed = time.time() - t0

        if result.returncode == 0:
            print(f"\n[OK] {name} 完成 ({elapsed:.1f}s)")
            continue

        print(f"\n[ERROR] {name} 失败 (退出码: {result.returncode})")
        try:
            choice = input("继续运行后续测试?(y/n): ").strip().lower()
        except EOFError:
            # Non-interactive run: abort cleanly instead of crashing with a
            # traceback.
            choice = "n"
        if choice != "y":
            sys.exit(1)

    print(f"\n{'='*60}")
    print("所有测试完成!查看报告: vsp/qwen3.5-9b/results/REPORT.md")
    print("=" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the whole pipeline.
    main()
|
||||
49
vsp/qwen3.5-9b/setup_env.py
Normal file
49
vsp/qwen3.5-9b/setup_env.py
Normal file
@@ -0,0 +1,49 @@
|
||||
"""环境检查与依赖验证脚本"""
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def check_and_install():
    """Print environment details, install the requirements and verify them.

    Steps:
      1. Print the Python version and, via PyTorch, CUDA/GPU information.
         Exits with status 1 when PyTorch cannot be imported.
      2. ``pip install -r requirements.txt`` using the file that sits next to
         this script.
      3. Import each key package and print its version, or an error line
         when the import fails.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    # Local imports: this module only imports subprocess and sys at the top.
    import importlib
    import os

    print("=" * 60)
    print("Qwen3.5-9B 测试环境检查")
    print("=" * 60)

    # Python version
    print(f"\nPython 版本: {sys.version}")

    # CUDA
    try:
        import torch
    except ImportError:
        print("ERROR: PyTorch 未安装")
        sys.exit(1)
    print(f"PyTorch 版本: {torch.__version__}")
    print(f"CUDA 可用: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"VRAM: {vram_gb:.1f} GB")

    # Install dependencies. The requirements file is resolved relative to this
    # script rather than the working directory, so the script also works when
    # it is started from a different directory.
    print("\n安装依赖包...")
    requirements = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "requirements.txt"
    )
    subprocess.check_call([
        sys.executable, "-m", "pip", "install", "-r",
        requirements, "-q"
    ])

    # Verify the key packages
    packages = ["transformers", "modelscope", "accelerate", "bitsandbytes"]
    for pkg in packages:
        try:
            mod = importlib.import_module(pkg)
        except ImportError:
            print(f" ERROR: {pkg} 安装失败")
        else:
            ver = getattr(mod, "__version__", "unknown")
            print(f" {pkg}: {ver}")

    print("\n环境检查完成!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the environment check.
    check_and_install()
|
||||
167
vsp/qwen3.5-9b/test_accuracy.py
Normal file
167
vsp/qwen3.5-9b/test_accuracy.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""精度评估 - 测试模型在常见任务上的准确性"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from datetime import datetime
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from model_utils import load_model
|
||||
|
||||
|
||||
# Test data set: each entry has a category, a prompt and the keywords of which
# at least one is expected in the model's answer.
ACCURACY_TESTS = [
    {"category": category, "prompt": prompt, "expected_contains": keywords}
    for category, prompt, keywords in (
        # Knowledge Q&A
        ("知识问答", "中国的首都是哪个城市?请只回答城市名。", ["北京"]),
        ("知识问答", "水的化学式是什么?请只回答化学式。", ["H2O"]),
        (
            "知识问答",
            "地球到太阳的平均距离大约是多少公里?A. 1.5亿 B. 3亿 C. 5亿 D. 1亿。请只回答选项字母。",
            ["A"],
        ),
        # Mathematical reasoning
        ("数学推理", "计算 15 * 23 = ? 请只回答数字。", ["345"]),
        ("数学推理", "一个三角形三边分别是3、4、5,它是什么三角形?请只回答类型。", ["直角"]),
        # Logical reasoning
        ("逻辑推理", "所有的狗都是动物。小白是一只狗。所以小白是什么?请只回答一个词。", ["动物"]),
        # Code understanding
        (
            "代码理解",
            "以下Python代码的输出是什么?\n```python\nprint(len([1, 2, 3, 4, 5]))\n```\n请只回答数字。",
            ["5"],
        ),
        # Translation
        ("翻译", "将'Hello World'翻译成中文,请只回答翻译结果。", ["你好", "世界"]),
        # Summarisation
        (
            "摘要",
            "用一句话总结:人工智能(AI)是指由人工制造出来的系统所展现出来的智能。AI的核心问题包括推理、知识表示、规划、学习、自然语言处理、感知和移动与操作物体的能力。",
            ["人工智能", "AI"],
        ),
        # Classification
        (
            "情感分类",
            "判断以下文本的情感是正面还是负面:'这个产品太糟糕了,完全不值这个价格'。请只回答'正面'或'负面'。",
            ["负面"],
        ),
    )
]
|
||||
|
||||
|
||||
def evaluate_accuracy(model, tokenizer):
    """Run every prompt in ``ACCURACY_TESTS`` and score the answers.

    A test passes when the decoded answer contains at least one of its
    ``expected_contains`` keywords. Generation is greedy with at most 100 new
    tokens.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and
            ``decode``.

    Returns:
        A dict with ``total``, ``passed``, ``accuracy`` (percentage rounded to
        one decimal), ``category_stats`` and per-test ``details``.
    """
    rule = "=" * 60
    print(rule)
    print("Qwen3.5-9B 精度评估")
    print(rule)

    details = []
    per_category = {}

    for number, case in enumerate(ACCURACY_TESTS, start=1):
        chat = [{"role": "user", "content": case["prompt"]}]
        prompt_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        encoded = tokenizer(prompt_text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            generated = model.generate(**encoded, max_new_tokens=100, do_sample=False)

        # Decode only the tokens that follow the prompt.
        prompt_len = encoded["input_ids"].shape[1]
        answer = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True).strip()

        # Pass when any expected keyword occurs in the answer.
        hit = any(keyword in answer for keyword in case["expected_contains"])

        category = case["category"]
        bucket = per_category.setdefault(category, {"total": 0, "passed": 0})
        bucket["total"] += 1
        if hit:
            bucket["passed"] += 1

        label = "PASS" if hit else "FAIL"
        print(f"\n[{label}] 测试 {number} ({category})")
        print(f" 问题: {case['prompt'][:50]}...")
        print(f" 回答: {answer[:80]}")
        print(f" 预期包含: {case['expected_contains']}")

        details.append({
            "category": category,
            "prompt": case["prompt"],
            "response": answer,
            "expected": case["expected_contains"],
            "passed": hit,
        })

    # Summary
    total = len(details)
    passed = sum(1 for entry in details if entry["passed"])
    print(f"\n{rule}")
    print(f"精度评估汇总")
    print(rule)
    print(f" 总计: {total} 题, 通过: {passed} 题, 准确率: {passed/total*100:.1f}%")
    print(f"\n 分类统计:")
    for category, counts in per_category.items():
        rate = counts["passed"] / counts["total"] * 100
        print(f" {category}: {counts['passed']}/{counts['total']} ({rate:.0f}%)")

    return {
        "total": total,
        "passed": passed,
        "accuracy": round(passed / total * 100, 1),
        "category_stats": per_category,
        "details": details,
    }
|
||||
|
||||
|
||||
def save_results(accuracy_results):
    """Write the accuracy evaluation results to a JSON report.

    The file is written to ``vsp/qwen3.5-9b/results/accuracy_results.json``
    relative to the current working directory; the directory is created when
    missing.

    Args:
        accuracy_results: JSON-serialisable evaluation summary.
    """
    output_dir = "vsp/qwen3.5-9b/results"
    os.makedirs(output_dir, exist_ok=True)

    report = {
        "timestamp": datetime.now().isoformat(),
        "model": "Qwen3.5-9B",
        # The model is loaded in FP16 with CPU offload, not 4-bit NF4, so the
        # label records the configuration that was actually evaluated.
        "quantization": "FP16 + CPU offload",
        "accuracy": accuracy_results,
    }

    path = os.path.join(output_dir, "accuracy_results.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"\n结果已保存到 {path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Work from two directories above this script so the relative
    # "vsp/qwen3.5-9b/..." paths resolve.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir + "/../..")

    model, tokenizer = load_model()
    results = evaluate_accuracy(model, tokenizer)
    save_results(results)
|
||||
120
vsp/qwen3.5-9b/test_basic_inference.py
Normal file
120
vsp/qwen3.5-9b/test_basic_inference.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""基础推理测试 - 验证模型能否正常加载和生成"""
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import time
|
||||
import torch
|
||||
import psutil
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# 修复 Windows GBK 编码问题
|
||||
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
|
||||
sys.stderr.reconfigure(encoding='utf-8', errors='replace')
|
||||
|
||||
|
||||
def get_model_path():
    """Return the local model directory, or the hub id if none is present.

    Searches ``vsp/qwen3.5-9b/model`` (relative to the current working
    directory) recursively for ``config.json``. When several are found, the
    shallowest one is used (ties broken alphabetically), so the result does
    not depend on the arbitrary order in which ``glob`` returns matches.

    Returns:
        The directory containing the chosen ``config.json``, or the string
        ``"Qwen/Qwen3.5-9B"`` when no local copy exists.
    """
    paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    if not paths:
        return "Qwen/Qwen3.5-9B"
    shallowest = min(paths, key=lambda p: (os.path.normpath(p).count(os.sep), p))
    return os.path.dirname(shallowest)
|
||||
|
||||
|
||||
def test_basic_inference():
    """Load the model and run a few sampled generations as a smoke test.

    Loads tokenizer and model (FP16 with GPU/CPU offload), prints load times
    and memory usage, generates up to 32 new tokens for each of four prompts
    with sampling (temperature 0.7, top_p 0.8) and prints a summary.

    Returns:
        A list with one dict per prompt containing ``prompt``,
        ``output_tokens``, ``time_s`` and ``tokens_per_sec``.
    """
    rule = "=" * 60
    gib = 1024**3

    print(rule)
    print("Qwen3.5-9B 基础推理测试")
    print(rule)

    model_path = get_model_path()
    print(f"\n模型路径: {model_path}")

    # Load the tokenizer
    print("加载 tokenizer...")
    started = time.time()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    print(f" Tokenizer 加载耗时: {time.time() - started:.2f}s")

    # Load the model (FP16 + GPU/CPU offload)
    print("加载模型 (FP16 + CPU offload)...")
    memory_limits = {0: "6GiB", "cpu": "24GiB"}
    started = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="auto",
        max_memory=memory_limits,
        offload_folder="vsp/qwen3.5-9b/offload",
        trust_remote_code=True,
    )
    load_time = time.time() - started
    print(f" 模型加载耗时: {load_time:.2f}s")

    # GPU memory after loading
    if torch.cuda.is_available():
        allocated_gb = torch.cuda.memory_allocated() / gib
        reserved_gb = torch.cuda.memory_reserved() / gib
        print(f" GPU 显存占用: {allocated_gb:.2f} GB (已分配) / {reserved_gb:.2f} GB (已预留)")

    # Inference prompts
    test_prompts = [
        "你好,请介绍一下你自己。",
        "What is the capital of France?",
        "请用Python写一个快速排序算法。",
        "解释一下什么是机器学习。",
    ]

    print(f"\n{rule}")
    print("推理测试")
    print(rule)

    results = []
    for number, prompt in enumerate(test_prompts, start=1):
        print(f"\n--- 测试 {number}: {prompt[:30]}... ---")
        chat = [{"role": "user", "content": prompt}]
        rendered = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        encoded = tokenizer(rendered, return_tensors="pt").to(model.device)
        prompt_len = encoded["input_ids"].shape[1]

        started = time.time()
        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_new_tokens=32,
                do_sample=True,
                temperature=0.7,
                top_p=0.8,
            )
        gen_time = time.time() - started

        new_tokens = generated.shape[1] - prompt_len
        if gen_time > 0:
            tokens_per_sec = new_tokens / gen_time
        else:
            tokens_per_sec = 0

        reply = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
        print(f" 输出 tokens: {new_tokens}")
        print(f" 生成耗时: {gen_time:.2f}s")
        print(f" 速度: {tokens_per_sec:.1f} tokens/s")
        print(f" 回复: {reply[:100]}...")

        results.append({
            "prompt": prompt,
            "output_tokens": new_tokens,
            "time_s": gen_time,
            "tokens_per_sec": tokens_per_sec,
        })

    # Summary
    print(f"\n{rule}")
    print("基础测试汇总")
    print(rule)
    print(f" 模型加载耗时: {load_time:.2f}s")
    avg_speed = sum(item["tokens_per_sec"] for item in results) / len(results)
    print(f" 平均生成速度: {avg_speed:.1f} tokens/s")
    print(f" GPU 显存占用: {torch.cuda.memory_allocated() / gib:.2f} GB")
    print(f" 系统内存占用: {psutil.Process().memory_info().rss / gib:.2f} GB")

    return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Work from two directories above this script so the relative model and
    # offload paths resolve.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir + "/../..")
    test_basic_inference()
|
||||
119
vsp/qwen3.5-9b/test_concurrency.py
Normal file
119
vsp/qwen3.5-9b/test_concurrency.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""并发压测 - 测试不同并发数下的性能表现"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import time
|
||||
import torch
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from datetime import datetime
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from model_utils import load_model
|
||||
|
||||
|
||||
def single_inference(model, tokenizer, prompt, lock, max_tokens=64):
    """Run one greedy generation for ``prompt`` and report how long it took.

    The prompt is wrapped as a single user chat message, rendered with the
    tokenizer's chat template and tokenized *outside* ``lock``; only the
    ``model.generate`` call is serialized through ``lock``. The clock starts
    before the lock is requested, so the reported time includes any time
    spent waiting for other callers to release it.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Object exposing ``apply_chat_template`` and ``__call__``.
        prompt: User message text.
        lock: Context-manager lock shared by all concurrent callers.
        max_tokens: Passed to ``generate`` as ``max_new_tokens``.

    Returns:
        A dict with ``time_s`` (seconds, lock wait included),
        ``output_tokens`` (generated length minus prompt length) and
        ``tokens_per_sec`` (0 when the elapsed time is not positive).
    """
    chat = [{"role": "user", "content": prompt}]
    rendered = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer(rendered, return_tensors="pt").to(model.device)
    prompt_tokens = encoded["input_ids"].shape[1]

    started = time.perf_counter()
    # Generation is serialized: the original author notes that inference on a
    # single GPU has to run one request at a time.
    with lock, torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_tokens,
            do_sample=False,
        )
    duration = time.perf_counter() - started

    new_tokens = generated.shape[1] - prompt_tokens
    if duration > 0:
        rate = new_tokens / duration
    else:
        rate = 0

    return {
        "time_s": duration,
        "output_tokens": new_tokens,
        "tokens_per_sec": rate,
    }
|
||||
|
||||
|
||||
def test_concurrency(model, tokenizer, concurrency_levels=None,
                     output_dir="vsp/qwen3.5-9b/results"):
    """Measure throughput and latency at several concurrency levels.

    For each level ``n`` the function submits ``n`` prompts to a thread pool
    of ``n`` workers, each running ``single_inference`` with one shared lock,
    then records wall-clock time, total generated tokens, throughput and the
    mean per-request latency. All results are printed and written as JSON to
    ``<output_dir>/concurrency_results.json``.

    Args:
        model: Model object forwarded to ``single_inference``.
        tokenizer: Tokenizer object forwarded to ``single_inference``.
        concurrency_levels: Iterable of positive integers giving the number
            of simultaneous requests per round. Defaults to ``[1, 2, 4, 8]``.
        output_dir: Directory for the JSON report; created if missing. A
            relative path is resolved against the current working directory.

    Returns:
        A list with one summary dict per concurrency level, in the order the
        levels were given.

    Raises:
        ValueError: If any concurrency level is smaller than 1.
    """
    if concurrency_levels is None:
        concurrency_levels = [1, 2, 4, 8]
    levels = list(concurrency_levels)
    # A level below 1 would yield an empty round (division by zero in the
    # averages) and is rejected by ThreadPoolExecutor anyway; fail up front.
    if any(level < 1 for level in levels):
        raise ValueError("concurrency levels must be >= 1")

    print("=" * 60)
    print("并发压测")
    print("=" * 60)

    prompts = [
        "什么是人工智能?",
        "请解释量子计算。",
        "Python的优点是什么?",
        "深度学习和机器学习的区别?",
        "什么是自然语言处理?",
        "解释一下GPT的工作原理。",
        "什么是强化学习?",
        "云计算的优势有哪些?",
    ]

    # One lock shared by every worker across all rounds.
    lock = threading.Lock()
    results = []

    for n_concurrent in levels:
        print(f"\n--- 并发数: {n_concurrent} ---")
        # Cycle through the prompt list so any level gets exactly n prompts.
        test_prompts = [prompts[i % len(prompts)] for i in range(n_concurrent)]

        t0 = time.perf_counter()
        with ThreadPoolExecutor(max_workers=n_concurrent) as executor:
            futures = [
                executor.submit(single_inference, model, tokenizer, p, lock)
                for p in test_prompts
            ]
            # result() re-raises any worker exception, aborting the run.
            futures_results = [f.result() for f in as_completed(futures)]
        total_time = time.perf_counter() - t0

        total_tokens = sum(r["output_tokens"] for r in futures_results)
        avg_latency = sum(r["time_s"] for r in futures_results) / len(futures_results)
        # Same zero-time guard that single_inference applies to its own rate.
        throughput = total_tokens / total_time if total_time > 0 else 0

        result = {
            "concurrency": n_concurrent,
            "total_time_s": round(total_time, 2),
            "total_tokens": total_tokens,
            "throughput_tokens_per_sec": round(throughput, 1),
            "avg_latency_s": round(avg_latency, 2),
            "requests_completed": len(futures_results),
        }
        results.append(result)

        print(f"  总耗时: {result['total_time_s']}s")
        print(f"  总 tokens: {result['total_tokens']}")
        print(f"  吞吐量: {result['throughput_tokens_per_sec']} tokens/s")
        print(f"  平均延迟: {result['avg_latency_s']}s")

    # Persist the report.
    os.makedirs(output_dir, exist_ok=True)
    report = {
        "timestamp": datetime.now().isoformat(),
        "model": "Qwen3.5-9B",
        "quantization": "4-bit NF4",
        "note": "单GPU串行推理,并发测试主要体现请求排队效果",
        "concurrency_results": results,
    }
    path = os.path.join(output_dir, "concurrency_results.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"\n结果已保存到 {path}")

    return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Switch the working directory to two levels above this script's own
    # directory, so the relative results path used by test_concurrency
    # ("vsp/qwen3.5-9b/results") does not depend on where the script was
    # launched from.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir + "/../..")
    loaded_model, loaded_tokenizer = load_model()
    test_concurrency(loaded_model, loaded_tokenizer)
|
||||
Reference in New Issue
Block a user