feat: 添加基础推理测试脚本（4-bit 量化）

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 11:45:49 +08:00
parent c2ce4f0a78
commit 1a96de6058
1 changed files with 120 additions and 0 deletions
--- a/vsp/qwen3.5-9b/test_basic_inference.py
+++ b/vsp/qwen3.5-9b/test_basic_inference.py
@@ -0,0 +1,120 @@
+"""基础推理测试 - 验证模型能否正常加载和生成"""
+import os
+import glob
+import time
+import torch
+import psutil
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+
+def get_model_path():
+    """Return the directory of a local config.json under vsp/qwen3.5-9b/model, else the fallback name Qwen/Qwen3.5-9B."""
+    paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)  # pattern is relative to the current working directory
+    if paths:
+        return os.path.dirname(paths[0])  # first match wins; NOTE(review): confirm ordering if several config.json files exist
+    return "Qwen/Qwen3.5-9B"
+
+
+def test_basic_inference():
+    """Load the model with a 4-bit config, generate for four fixed prompts, print timing stats, and return the per-prompt results."""
+    print("=" * 60)
+    print("Qwen3.5-9B 基础推理测试")
+    print("=" * 60)
+
+    # 4-bit quantization config (quantization is required on an RTX 3050 8GB)
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_use_double_quant=True,
+    )
+
+    model_path = get_model_path()
+    print(f"\n模型路径: {model_path}")
+
+    # Load the tokenizer
+    print("加载 tokenizer...")
+    t0 = time.time()
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    print(f"  Tokenizer 加载耗时: {time.time() - t0:.2f}s")
+
+    # Load the model (4-bit quantized)
+    print("加载模型 (4-bit 量化)...")
+    t0 = time.time()
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    load_time = time.time() - t0  # kept for the summary printed at the end
+    print(f"  模型加载耗时: {load_time:.2f}s")
+
+    # GPU memory usage right after loading (only reported when CUDA is available)
+    if torch.cuda.is_available():
+        mem_used = torch.cuda.memory_allocated() / 1024**3
+        mem_reserved = torch.cuda.memory_reserved() / 1024**3
+        print(f"  GPU 显存占用: {mem_used:.2f} GB (已分配) / {mem_reserved:.2f} GB (已预留)")
+
+    # Prompts used for the inference test
+    test_prompts = [
+        "你好，请介绍一下你自己。",
+        "What is the capital of France?",
+        "请用Python写一个快速排序算法。",
+        "解释一下什么是机器学习。",
+    ]
+
+    print(f"\n{'='*60}")
+    print("推理测试")
+    print(f"{'='*60}")
+
+    results = []  # one dict per prompt: prompt, output_tokens, time_s, tokens_per_sec
+    for i, prompt in enumerate(test_prompts):
+        print(f"\n--- 测试 {i+1}: {prompt[:30]}... ---")
+        messages = [{"role": "user", "content": prompt}]
+        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+        input_len = inputs["input_ids"].shape[1]  # prompt length, used below to separate prompt from generated output
+
+        t0 = time.time()
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=256,
+                do_sample=True,  # NOTE(review): sampling is on and no seed is set in this function, so outputs likely differ between runs
+                temperature=0.7,
+                top_p=0.8,
+            )
+        gen_time = time.time() - t0
+        output_len = outputs.shape[1] - input_len  # total sequence length minus prompt length
+        tokens_per_sec = output_len / gen_time if gen_time > 0 else 0  # guard against a zero elapsed time
+
+        response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)  # decode only the part after the prompt
+        print(f"  输出 tokens: {output_len}")
+        print(f"  生成耗时: {gen_time:.2f}s")
+        print(f"  速度: {tokens_per_sec:.1f} tokens/s")
+        print(f"  回复: {response[:100]}...")
+
+        results.append({
+            "prompt": prompt,
+            "output_tokens": output_len,
+            "time_s": gen_time,
+            "tokens_per_sec": tokens_per_sec,
+        })
+
+    # Summary
+    print(f"\n{'='*60}")
+    print("基础测试汇总")
+    print(f"{'='*60}")
+    print(f"  模型加载耗时: {load_time:.2f}s")
+    avg_speed = sum(r["tokens_per_sec"] for r in results) / len(results)  # mean of per-prompt speeds, not total tokens / total time
+    print(f"  平均生成速度: {avg_speed:.1f} tokens/s")
+    print(f"  GPU 显存占用: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")  # NOTE(review): not behind the is_available() guard used after model load
+    print(f"  系统内存占用: {psutil.Process().memory_info().rss / 1024**3:.2f} GB")
+
+    return results
+
+
+if __name__ == "__main__":
+    os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..")  # move two levels above this file's directory so the relative "vsp/qwen3.5-9b/model" glob resolves
+    test_basic_inference()