# qwen-test/vsp/qwen3.5-9b/test_basic_inference.py

"""基础推理测试 - 验证模型能否正常加载和生成"""
import os
import sys
import time
import torch
import psutil

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from model_utils import load_model, apply_chat

def reconfigure_utf8(stream):
    """Switch a text stream to UTF-8 output if it supports reconfiguration.

    Args:
        stream: A candidate text stream such as ``sys.stdout``. It may be
            ``None`` or any object that lacks a ``reconfigure`` method.

    Returns:
        bool: ``True`` when ``stream.reconfigure`` was called, ``False`` when
        the stream was skipped because it has no such method.
    """
    # reconfigure() is provided by io.TextIOWrapper only. Replacement streams
    # (IDE consoles, notebooks, custom redirectors) and a missing stream
    # (None) do not have it, so skip them instead of failing at import time.
    reconfigure = getattr(stream, "reconfigure", None)
    if reconfigure is None:
        return False
    # errors="replace" substitutes unencodable characters rather than raising.
    reconfigure(encoding="utf-8", errors="replace")
    return True


# Work around the Windows GBK console encoding problem by forcing UTF-8.
reconfigure_utf8(sys.stdout)
reconfigure_utf8(sys.stderr)


def test_basic_inference():
    """Run a smoke test of model loading and text generation.

    Loads the model and tokenizer through ``load_model()``, prints the load
    time and (when CUDA is available) the GPU memory in use, then generates
    up to 32 new tokens with sampling for each of four fixed prompts. For
    every prompt the number of generated tokens, the elapsed time, the
    throughput and the first 100 characters of the reply are printed. A
    summary with the mean throughput and memory usage is printed at the end.

    Returns:
        list[dict]: One entry per prompt with the keys ``"prompt"``,
        ``"output_tokens"``, ``"time_s"`` and ``"tokens_per_sec"``.
    """
    print("=" * 60)
    print("Qwen3.5-9B 基础推理测试 (4-bit NF4 量化, 纯GPU)")
    print("=" * 60)

    # Load the model. perf_counter() is a monotonic, high-resolution clock,
    # which makes it the appropriate choice for measuring elapsed intervals.
    print("\n加载模型...")
    load_start = time.perf_counter()
    model, tokenizer = load_model()
    load_time = time.perf_counter() - load_start
    print(f"  模型加载耗时: {load_time:.2f}s")

    # GPU memory is reported only when CUDA is present; the same flag guards
    # the summary below so both reports behave consistently.
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        mem_used = torch.cuda.memory_allocated() / 1024**3
        mem_reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"  GPU 显存占用: {mem_used:.2f} GB (已分配) / {mem_reserved:.2f} GB (已预留)")

    # Prompts exercised by the inference test.
    test_prompts = [
        "你好，请介绍一下你自己。",
        "What is the capital of France?",
        "请用Python写一个快速排序算法。",
        "解释一下什么是机器学习。",
    ]

    print(f"\n{'='*60}")
    print("推理测试")
    print(f"{'='*60}")

    results = []
    for i, prompt in enumerate(test_prompts):
        print(f"\n--- 测试 {i+1}: {prompt[:30]}... ---")
        messages = [{"role": "user", "content": prompt}]
        text = apply_chat(tokenizer, messages)
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        # Prompt length in tokens; used to separate new tokens from the prompt.
        input_len = inputs["input_ids"].shape[1]

        gen_start = time.perf_counter()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=32,
                do_sample=True,
                temperature=0.7,
                top_p=0.8,
            )
        gen_time = time.perf_counter() - gen_start
        # The output sequence contains the prompt followed by the new tokens.
        output_len = outputs.shape[1] - input_len
        tokens_per_sec = output_len / gen_time if gen_time > 0 else 0

        # Decode only the newly generated part of the first (only) sequence.
        response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
        print(f"  输出 tokens: {output_len}")
        print(f"  生成耗时: {gen_time:.2f}s")
        print(f"  速度: {tokens_per_sec:.1f} tokens/s")
        print(f"  回复: {response[:100]}...")

        results.append({
            "prompt": prompt,
            "output_tokens": output_len,
            "time_s": gen_time,
            "tokens_per_sec": tokens_per_sec,
        })

    # Summary.
    print(f"\n{'='*60}")
    print("基础测试汇总")
    print(f"{'='*60}")
    print(f"  模型加载耗时: {load_time:.2f}s")
    # Unweighted mean of the per-prompt throughput values.
    avg_speed = sum(r["tokens_per_sec"] for r in results) / len(results)
    print(f"  平均生成速度: {avg_speed:.1f} tokens/s")
    if cuda_available:
        print(f"  GPU 显存占用: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    # Resident set size of the current process.
    print(f"  系统内存占用: {psutil.Process().memory_info().rss / 1024**3:.2f} GB")

    return results


if __name__ == "__main__":
    # Switch the working directory to two levels above this script's folder
    # before running the test.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(os.path.join(script_dir, os.pardir, os.pardir))
    test_basic_inference()