From 1a96de605829931a12d9f7f9a7e39bd1d21a19a4 Mon Sep 17 00:00:00 2001
From: 16337 <1633794139@qq.com>
Date: Mon, 16 Mar 2026 11:45:49 +0800
Subject: [PATCH] feat: add basic inference test script (4-bit quantization)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.6
---
 vsp/qwen3.5-9b/test_basic_inference.py | 120 +++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 vsp/qwen3.5-9b/test_basic_inference.py

diff --git a/vsp/qwen3.5-9b/test_basic_inference.py b/vsp/qwen3.5-9b/test_basic_inference.py
new file mode 100644
index 0000000..c4ead6e
--- /dev/null
+++ b/vsp/qwen3.5-9b/test_basic_inference.py
@@ -0,0 +1,120 @@
+"""Basic inference test - verify the model loads and generates correctly."""
+import os
+import glob
+import time
+import torch
+import psutil
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+
+def get_model_path():
+    """Return the local model directory if present, else the Hub model ID."""
+    paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True)
+    if paths:
+        return os.path.dirname(paths[0])
+    return "Qwen/Qwen3.5-9B"
+
+
+def test_basic_inference():
+    """Basic inference test."""
+    print("=" * 60)
+    print("Qwen3.5-9B basic inference test")
+    print("=" * 60)
+
+    # 4-bit quantization config (mandatory to fit the RTX 3050's 8 GB)
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_use_double_quant=True,
+    )
+
+    model_path = get_model_path()
+    print(f"\nModel path: {model_path}")
+
+    # Load tokenizer
+    print("Loading tokenizer...")
+    t0 = time.time()
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    print(f"  Tokenizer load time: {time.time() - t0:.2f}s")
+
+    # Load model (4-bit quantized)
+    print("Loading model (4-bit quantized)...")
+    t0 = time.time()
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    load_time = time.time() - t0
+    print(f"  Model load time: {load_time:.2f}s")
+
+    # GPU memory usage
+    if torch.cuda.is_available():
+        mem_used = torch.cuda.memory_allocated() / 1024**3
+        mem_reserved = torch.cuda.memory_reserved() / 1024**3
+        print(f"  GPU memory: {mem_used:.2f} GB allocated / {mem_reserved:.2f} GB reserved")
+
+    # Test prompts (mixed Chinese/English to exercise both languages)
+    test_prompts = [
+        "你好，请介绍一下你自己。",  # "Hello, please introduce yourself."
+        "What is the capital of France?",
+        "请用Python写一个快速排序算法。",  # "Write a quicksort algorithm in Python."
+        "解释一下什么是机器学习。",  # "Explain what machine learning is."
+    ]
+
+    print(f"\n{'='*60}")
+    print("Inference tests")
+    print(f"{'='*60}")
+
+    results = []
+    for i, prompt in enumerate(test_prompts):
+        print(f"\n--- Test {i+1}: {prompt[:30]}... ---")
---") + messages = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = tokenizer(text, return_tensors="pt").to(model.device) + input_len = inputs["input_ids"].shape[1] + + t0 = time.time() + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=256, + do_sample=True, + temperature=0.7, + top_p=0.8, + ) + gen_time = time.time() - t0 + output_len = outputs.shape[1] - input_len + tokens_per_sec = output_len / gen_time if gen_time > 0 else 0 + + response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True) + print(f" 输出 tokens: {output_len}") + print(f" 生成耗时: {gen_time:.2f}s") + print(f" 速度: {tokens_per_sec:.1f} tokens/s") + print(f" 回复: {response[:100]}...") + + results.append({ + "prompt": prompt, + "output_tokens": output_len, + "time_s": gen_time, + "tokens_per_sec": tokens_per_sec, + }) + + # 汇总 + print(f"\n{'='*60}") + print("基础测试汇总") + print(f"{'='*60}") + print(f" 模型加载耗时: {load_time:.2f}s") + avg_speed = sum(r["tokens_per_sec"] for r in results) / len(results) + print(f" 平均生成速度: {avg_speed:.1f} tokens/s") + print(f" GPU 显存占用: {torch.cuda.memory_allocated() / 1024**3:.2f} GB") + print(f" 系统内存占用: {psutil.Process().memory_info().rss / 1024**3:.2f} GB") + + return results + + +if __name__ == "__main__": + os.chdir(os.path.dirname(os.path.abspath(__file__)) + "/../..") + test_basic_inference()