# qwen-test/vsp/qwen3.5-9b/test_basic_inference.py

"""基础推理测试 - 验证模型能否正常加载和生成"""
import os
import glob
import time
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def get_model_path() -> str:
    """Return the local model directory, or a default model identifier.

    Recursively searches ``vsp/qwen3.5-9b/model`` for ``config.json`` files.
    The pattern is relative, so the result depends on the current working
    directory.

    Returns:
        The directory containing the selected ``config.json``. When several
        are found, the one closest to the model root wins, with ties broken
        lexicographically. When none is found, the string
        ``"Qwen/Qwen3.5-9B"`` is returned instead.
    """
    matches = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
    if not matches:
        return "Qwen/Qwen3.5-9B"

    def _rank(path):
        # Shallowest first; the path itself breaks ties so the choice is
        # stable across filesystems.
        return (len(os.path.normpath(path).split(os.sep)), path)

    # glob yields entries in directory-listing order, which is arbitrary, so
    # taking the first element would make the selection nondeterministic.
    return os.path.dirname(min(matches, key=_rank))


def test_basic_inference() -> list:
    """Load the model with 4-bit quantization, run sample prompts, print stats.

    Resolves the model location with ``get_model_path()``, loads the
    tokenizer and the quantized model, generates a reply of up to 256 new
    tokens for each of four fixed prompts, and prints load time, per-prompt
    throughput, and memory usage.

    Returns:
        A list with one dict per prompt, holding the keys ``"prompt"``,
        ``"output_tokens"``, ``"time_s"`` and ``"tokens_per_sec"``.
    """
    print("=" * 60)
    print("Qwen3.5-9B 基础推理测试")
    print("=" * 60)

    # Queried once; gates every CUDA-specific call below.
    has_cuda = torch.cuda.is_available()

    # 4-bit quantization config (original note: an RTX 3050 8GB requires
    # quantization).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    model_path = get_model_path()
    print(f"\n模型路径: {model_path}")

    # Load the tokenizer. perf_counter() is used for every interval because
    # it is monotonic, unlike the wall clock returned by time.time().
    print("加载 tokenizer...")
    t0 = time.perf_counter()
    # NOTE(review): trust_remote_code=True runs code shipped with the model
    # repository; keep only for trusted sources.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    print(f"  Tokenizer 加载耗时: {time.perf_counter() - t0:.2f}s")

    # Load the model (4-bit quantized).
    print("加载模型 (4-bit 量化)...")
    t0 = time.perf_counter()
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
    load_time = time.perf_counter() - t0
    print(f"  模型加载耗时: {load_time:.2f}s")

    # GPU memory right after loading.
    if has_cuda:
        mem_used = torch.cuda.memory_allocated() / 1024**3
        mem_reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"  GPU 显存占用: {mem_used:.2f} GB (已分配) / {mem_reserved:.2f} GB (已预留)")

    # Prompts used for the inference run.
    test_prompts = [
        "你好，请介绍一下你自己。",
        "What is the capital of France?",
        "请用Python写一个快速排序算法。",
        "解释一下什么是机器学习。",
    ]

    print(f"\n{'='*60}")
    print("推理测试")
    print(f"{'='*60}")

    results = []
    for i, prompt in enumerate(test_prompts):
        print(f"\n--- 测试 {i+1}: {prompt[:30]}... ---")
        messages = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        input_len = inputs["input_ids"].shape[1]

        # CUDA kernels run asynchronously; synchronize on both sides of the
        # timed region so the interval covers exactly this generation.
        if has_cuda:
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=True,
                temperature=0.7,
                top_p=0.8,
            )
        if has_cuda:
            torch.cuda.synchronize()
        gen_time = time.perf_counter() - t0

        # The output sequence contains the prompt; count only the new tokens.
        output_len = outputs.shape[1] - input_len
        tokens_per_sec = output_len / gen_time if gen_time > 0 else 0

        response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
        print(f"  输出 tokens: {output_len}")
        print(f"  生成耗时: {gen_time:.2f}s")
        print(f"  速度: {tokens_per_sec:.1f} tokens/s")
        print(f"  回复: {response[:100]}...")

        results.append({
            "prompt": prompt,
            "output_tokens": output_len,
            "time_s": gen_time,
            "tokens_per_sec": tokens_per_sec,
        })

    # Summary.
    print(f"\n{'='*60}")
    print("基础测试汇总")
    print(f"{'='*60}")
    print(f"  模型加载耗时: {load_time:.2f}s")
    # Mean of the per-prompt speeds (not total tokens over total time).
    avg_speed = sum(r["tokens_per_sec"] for r in results) / len(results)
    print(f"  平均生成速度: {avg_speed:.1f} tokens/s")
    # Guarded like the post-load report so no GPU figure is printed when
    # CUDA is unavailable.
    if has_cuda:
        print(f"  GPU 显存占用: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"  系统内存占用: {psutil.Process().memory_info().rss / 1024**3:.2f} GB")

    return results


if __name__ == "__main__":
    # Switch to the directory two levels above this file before running, so
    # the relative model path searched by get_model_path() is resolved from
    # there.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(os.path.join(script_dir, os.pardir, os.pardir))
    test_basic_inference()