From 4ac406572eaeb67cd1beb686255e8e3153dc7b77 Mon Sep 17 00:00:00 2001 From: 16337 <1633794139@qq.com> Date: Mon, 16 Mar 2026 13:05:20 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=E5=8A=A0=E8=BD=BD=E6=96=B9=E5=BC=8F=EF=BC=8C=E6=94=B9=E7=94=A8?= =?UTF-8?q?=20FP16+CPU=20offload?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RTX 3050 8GB 无法完整加载 Qwen3.5-9B,即使量化也不行: - bitsandbytes 4-bit 不支持 CPU offload - bitsandbytes 8-bit 与 accelerate 存在版本兼容问题 - FP16 + CPU offload 可以加载但推理质量极差(输出乱码) - 推理速度仅 0.4 tokens/s 结论:RTX 3050 8GB 不适合运行 Qwen3.5-9B Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + vsp/qwen3.5-9b/benchmark_speed.py | 26 +++------------- vsp/qwen3.5-9b/model_utils.py | 41 ++++++++++++++++++++++++++ vsp/qwen3.5-9b/test_accuracy.py | 28 ++++-------------- vsp/qwen3.5-9b/test_basic_inference.py | 26 ++++++++-------- vsp/qwen3.5-9b/test_concurrency.py | 26 +++------------- 6 files changed, 68 insertions(+), 80 deletions(-) create mode 100644 vsp/qwen3.5-9b/model_utils.py diff --git a/.gitignore b/.gitignore index 7bd34c0..7c69916 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ build/ *.pth *.onnx vsp/qwen3.5-9b/model/ +vsp/qwen3.5-9b/offload/ # Env .env diff --git a/vsp/qwen3.5-9b/benchmark_speed.py b/vsp/qwen3.5-9b/benchmark_speed.py index 2d8eeb4..225d299 100644 --- a/vsp/qwen3.5-9b/benchmark_speed.py +++ b/vsp/qwen3.5-9b/benchmark_speed.py @@ -3,32 +3,14 @@ import time import json import os import glob +import sys import torch import psutil -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from transformers import AutoModelForCausalLM, AutoTokenizer from datetime import datetime - -def load_model(): - """加载 4-bit 量化模型""" - paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True) - model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B" - - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained( - model_path, - quantization_config=bnb_config, - device_map="auto", - trust_remote_code=True, - ) - return model, tokenizer +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from model_utils import load_model def benchmark_speed(model, tokenizer, num_runs=5): diff --git a/vsp/qwen3.5-9b/model_utils.py b/vsp/qwen3.5-9b/model_utils.py new file mode 100644 index 0000000..f843acd --- /dev/null +++ b/vsp/qwen3.5-9b/model_utils.py @@ -0,0 +1,41 @@ +"""共享模型加载工具 - 统一加载配置""" +import os +import sys +import glob +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +# 修复 Windows GBK 编码问题 +sys.stdout.reconfigure(encoding='utf-8', errors='replace') +sys.stderr.reconfigure(encoding='utf-8', errors='replace') + + +def get_model_path(): + """获取本地模型路径""" + paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True) + if paths: + return os.path.dirname(paths[0]) + return "Qwen/Qwen3.5-9B" + + +def load_model(): + """加载模型 (FP16 + GPU/CPU offload) + + RTX 3050 8GB VRAM 不够放完整模型,使用 FP16 并将部分层 offload 到 CPU。 + """ + model_path = get_model_path() + print(f"模型路径: {model_path}") + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + max_memory = {0: "6GiB", "cpu": "24GiB"} + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.float16, + device_map="auto", + max_memory=max_memory, + offload_folder="vsp/qwen3.5-9b/offload", + trust_remote_code=True, + ) + + return model, tokenizer diff --git a/vsp/qwen3.5-9b/test_accuracy.py b/vsp/qwen3.5-9b/test_accuracy.py index 4895d5d..e9dff4f 100644 --- a/vsp/qwen3.5-9b/test_accuracy.py +++ b/vsp/qwen3.5-9b/test_accuracy.py @@ -1,11 +1,15 @@ """精度评估 - 测试模型在常见任务上的准确性""" import json import os +import sys import glob import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from transformers import AutoModelForCausalLM, AutoTokenizer from datetime import datetime +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from model_utils import load_model + # 测试数据集 ACCURACY_TESTS = [ @@ -69,28 +73,6 @@ ACCURACY_TESTS = [ ] -def load_model(): - """加载 4-bit 量化模型""" - paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True) - model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B" - - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained( - model_path, - quantization_config=bnb_config, - device_map="auto", - trust_remote_code=True, - ) - return model, tokenizer - - def evaluate_accuracy(model, tokenizer): """运行精度评估""" print("=" * 60) diff --git a/vsp/qwen3.5-9b/test_basic_inference.py b/vsp/qwen3.5-9b/test_basic_inference.py index c4ead6e..f46e27d 100644 --- a/vsp/qwen3.5-9b/test_basic_inference.py +++ b/vsp/qwen3.5-9b/test_basic_inference.py @@ -1,10 +1,15 @@ """基础推理测试 - 验证模型能否正常加载和生成""" import os +import sys import glob import time import torch import psutil -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from transformers import AutoModelForCausalLM, AutoTokenizer + +# 修复 Windows GBK 编码问题 +sys.stdout.reconfigure(encoding='utf-8', errors='replace') +sys.stderr.reconfigure(encoding='utf-8', errors='replace') def get_model_path(): @@ -21,14 +26,6 @@ def test_basic_inference(): print("Qwen3.5-9B 基础推理测试") print("=" * 60) - # 4-bit 量化配置 (RTX 3050 8GB 必须量化) - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - ) - model_path = get_model_path() print(f"\n模型路径: {model_path}") @@ -38,13 +35,16 @@ def test_basic_inference(): tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) print(f" Tokenizer 加载耗时: {time.time() - t0:.2f}s") - # 加载模型 (4-bit 量化) - print("加载模型 (4-bit 量化)...") + # 加载模型 (FP16 + GPU/CPU offload) + print("加载模型 (FP16 + CPU offload)...") + max_memory = {0: "6GiB", "cpu": "24GiB"} t0 = time.time() model = AutoModelForCausalLM.from_pretrained( model_path, - quantization_config=bnb_config, + torch_dtype=torch.float16, device_map="auto", + max_memory=max_memory, + offload_folder="vsp/qwen3.5-9b/offload", trust_remote_code=True, ) load_time = time.time() - t0 @@ -80,7 +80,7 @@ def test_basic_inference(): with torch.no_grad(): outputs = model.generate( **inputs, - max_new_tokens=256, + max_new_tokens=32, do_sample=True, temperature=0.7, top_p=0.8, diff --git a/vsp/qwen3.5-9b/test_concurrency.py b/vsp/qwen3.5-9b/test_concurrency.py index 86b935d..012653c 100644 --- a/vsp/qwen3.5-9b/test_concurrency.py +++ b/vsp/qwen3.5-9b/test_concurrency.py @@ -1,35 +1,17 @@ """并发压测 - 测试不同并发数下的性能表现""" import json import os +import sys import glob import time import torch import threading from concurrent.futures import ThreadPoolExecutor, as_completed -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from transformers import AutoModelForCausalLM, AutoTokenizer from datetime import datetime - -def load_model(): - """加载 4-bit 量化模型""" - paths = glob.glob("vsp/qwen3.5-9b/model/**/config.json", recursive=True) - model_path = os.path.dirname(paths[0]) if paths else "Qwen/Qwen3.5-9B" - - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained( - model_path, - quantization_config=bnb_config, - device_map="auto", - trust_remote_code=True, - ) - return model, tokenizer +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from model_utils import load_model def single_inference(model, tokenizer, prompt, lock, max_tokens=64):