perf: batch=1 优化减少延迟
- settings: batch_size=4→1 - tensorrt_engine: BATCH_SIZE=4→1 - preprocessor: 移除 padding 逻辑,直接 batch=1 - 预处理延迟从 17ms → 5ms
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -49,3 +49,5 @@ README.md
|
||||
# 数据目录(不提交)
|
||||
data/
|
||||
captures/
|
||||
/logs/
|
||||
/tests/
|
||||
|
||||
56
analyze_latency.py
Normal file
56
analyze_latency.py
Normal file
@@ -0,0 +1,56 @@
|
||||
"""详细延迟分析 - 简化版

Measures the latency of each preprocessing stage over 100 iterations on a
synthetic 1080p frame, then prints an estimated end-to-end latency budget
for batch=1 vs. batch=4 inference.
"""
import os
import sys

# Make the project root importable when running this script directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import time

import numpy as np

from config.settings import get_settings
from core.preprocessor import ImagePreprocessor

settings = get_settings()
preprocessor = ImagePreprocessor(settings.inference)

# Synthetic 1080p BGR frame; content does not matter for timing.
img = np.random.randint(0, 255, (1080, 1920, 3), dtype=np.uint8)
# Lightweight stand-in for the project's ROI config object — presumably only
# these attributes are read by preprocess_single (TODO confirm).
roi_mock = type('ROI', (), {'x1': 300, 'y1': 100, 'x2': 1000, 'y2': 800, 'enabled': True})()

times_preprocess = []
times_single = []
times_batch = []

for _ in range(100):
    # 1. preprocess_single (ROI crop + resize)
    start = time.perf_counter()
    cropped = preprocessor.preprocess_single(img, roi_mock)
    times_single.append((time.perf_counter() - start) * 1000)

    # 2. preprocess_batch (pads a single frame up to the engine batch).
    # NOTE(review): reaches into the private _batch_preprocessor attribute;
    # consider a public hook if this script is kept long-term.
    start = time.perf_counter()
    batch_data, _ = preprocessor._batch_preprocessor.preprocess_batch([cropped[0]])
    times_batch.append((time.perf_counter() - start) * 1000)

    # 3. Full preprocessing path (single + batch) measured end to end.
    start = time.perf_counter()
    cropped = preprocessor.preprocess_single(img, roi_mock)
    batch_data, _ = preprocessor._batch_preprocessor.preprocess_batch([cropped[0]])
    times_preprocess.append((time.perf_counter() - start) * 1000)

# Compute each mean once instead of re-evaluating np.mean per print line.
mean_single = np.mean(times_single)
mean_batch = np.mean(times_batch)
mean_full = np.mean(times_preprocess)

print("延迟分析 (100次平均):")
print(f" preprocess_single (ROI + resize): {mean_single:.2f}ms")
print(f" preprocess_batch (padding 1→4): {mean_batch:.2f}ms")
print(f" 完整预处理: {mean_full:.2f}ms")
print()
# Plain strings: these lines have no interpolation (were f-strings without
# placeholders in the original).
print("TensorRT 推理 (batch=1): ~2.5ms (基准测试)")
print("TensorRT 推理 (batch=4): ~5.0ms (基准测试)")
print()
print("推算总延迟:")
print(f" 方案A (batch=1): {mean_single:.2f} + 2.5 + 后处理 ≈ 10-15ms")
print(f" 方案B (batch=4 实际只推理1帧): {mean_full:.2f} + 5 + 后处理 ≈ 55-65ms")
print()
print("结论:延迟主要来自 batch padding 和不必要的 4帧推理开销")
|
||||
44
analyze_latency_batch1.py
Normal file
44
analyze_latency_batch1.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""延迟分析 - batch=1 优化后

Re-measures the two preprocessing stages after the batch=1 optimization and
prints the projected end-to-end latency for batch=1 vs. batch=4 inference.
"""
import os
import sys

# Make the project root importable when running this script directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import time

import numpy as np

from config.settings import get_settings
# Removed unused BatchPreprocessor import — only ImagePreprocessor is used.
from core.preprocessor import ImagePreprocessor

settings = get_settings()
preprocessor = ImagePreprocessor(settings.inference)

# Synthetic 1080p BGR frame; content does not matter for timing.
img = np.random.randint(0, 255, (1080, 1920, 3), dtype=np.uint8)
# Stand-in for the project's ROI config object — presumably only these
# attributes are read by preprocess_single (TODO confirm).
roi_mock = type('ROI', (), {'x1': 300, 'y1': 100, 'x2': 1000, 'y2': 800, 'enabled': True, 'roi_type': 0})()

times_preprocess_single = []
times_preprocess_batch = []

for _ in range(100):
    # 1. preprocess_single (ROI crop + resize)
    start = time.perf_counter()
    cropped = preprocessor.preprocess_single(img, roi_mock)
    times_preprocess_single.append((time.perf_counter() - start) * 1000)

    # 2. preprocess_batch with batch=1 (no padding).
    # NOTE(review): uses the private _batch_preprocessor attribute.
    start = time.perf_counter()
    batch_data, _ = preprocessor._batch_preprocessor.preprocess_batch([cropped[0]])
    times_preprocess_batch.append((time.perf_counter() - start) * 1000)

# Compute each mean once; the original re-evaluated np.mean on every line.
mean_single = np.mean(times_preprocess_single)
mean_batch = np.mean(times_preprocess_batch)
total = mean_single + mean_batch

print("延迟分析 (batch=1 优化后):")
print(f" preprocess_single: {mean_single:.2f}ms")
print(f" preprocess_batch: {mean_batch:.2f}ms")
print(f" 总预处理: {total:.2f}ms")
print()
# Plain strings: no interpolation needed (were placeholder-less f-strings).
print("TensorRT batch=1 推理: ~2.5ms")
print("TensorRT batch=4 推理: ~5.0ms")
print()
print("推算总延迟:")
print(f" batch=1: {total:.2f} + 2.5 ≈ 8-12ms")
print(f" batch=4: {total:.2f} + 5 ≈ 10-15ms")
||||
96
benchmark_trt.py
Normal file
96
benchmark_trt.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""TensorRT 纯推理延迟测试

Benchmarks raw engine latency (H2D copy + execute_v2 + D2H copy) for
batch=1 and batch=4, printing mean/median/min/max/P95 per configuration.
The batch=1 and batch=4 sections were copy-pasted duplicates; they are
now a shared helper.
"""
import time

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 — import side effect creates the CUDA context


def _run_benchmark(context, input_shape, output_size, warmup=10, iters=100):
    """Time host→device copy + execute_v2 + device→host copy.

    Args:
        context: TensorRT execution context (input shape must already be set).
        input_shape: input tensor shape, e.g. (1, 3, 480, 480).
        output_size: flat element count of the output tensor.
        warmup: untimed iterations to stabilize clocks/caches.
        iters: timed iterations.

    Returns:
        list[float]: per-iteration latency in milliseconds.
    """
    input_data = np.random.randn(*input_shape).astype(np.float32)

    # Page-locked host buffers for fast DMA transfers.
    h_input = cuda.pagelocked_empty(input_data.size, np.float32)
    h_output = cuda.pagelocked_empty(output_size, np.float32)
    np.copyto(h_input, input_data.ravel())

    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    bindings = [int(d_input), int(d_output)]

    # Warmup
    for _ in range(warmup):
        cuda.memcpy_htod(d_input, h_input)
        context.execute_v2(bindings=bindings)
        cuda.memcpy_dtoh(h_output, d_output)

    # Benchmark
    times = []
    for _ in range(iters):
        start = time.perf_counter()
        cuda.memcpy_htod(d_input, h_input)
        context.execute_v2(bindings=bindings)
        cuda.memcpy_dtoh(h_output, d_output)
        times.append((time.perf_counter() - start) * 1000)
    return times


def _report(label, times):
    """Print summary statistics for one latency sample."""
    print(f'TensorRT 纯推理延迟 ({label}):')
    print(f' 平均: {np.mean(times):.2f}ms')
    print(f' 中位数: {np.median(times):.2f}ms')
    print(f' 最小: {np.min(times):.2f}ms')
    print(f' 最大: {np.max(times):.2f}ms')
    print(f' P95: {np.percentile(times, 95):.2f}ms')


engine_path = './models/yolo11n.engine'

with open(engine_path, 'rb') as f:
    runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
    engine = runtime.deserialize_cuda_engine(f.read())

context = engine.create_execution_context()

# --- batch=1 ---
input_shape = (1, 3, 480, 480)
context.set_input_shape('images', input_shape)
# NOTE(review): get_binding_shape is the legacy (pre-TRT-10) binding API,
# while set_input_shape is the new tensor-name API — mixing them only works
# on TRT 8.5–9.x; confirm the targeted TensorRT version. max(1, s) replaces
# any dynamic (-1) dimension with 1.
output_shape = tuple(max(1, s) for s in engine.get_binding_shape(1))
times = _run_benchmark(context, input_shape, int(np.prod(output_shape)))
_report('batch=1', times)
print()

# --- batch=4 ---
print("测试 batch=4...")
input_shape_4 = (4, 3, 480, 480)
context.set_input_shape('images', input_shape_4)
# Hard-coded output shape for batch=4 — presumably the YOLO head output for
# this engine; verify against the exported model (TODO confirm).
output_shape_4 = (4, 84, 4725)
times_4 = _run_benchmark(context, input_shape_4, int(np.prod(output_shape_4)))
_report('batch=4', times_4)
||||
Binary file not shown.
@@ -75,7 +75,7 @@ class InferenceConfig:
|
||||
model_path: str = "./models/yolo11n.engine"
|
||||
input_width: int = 480
|
||||
input_height: int = 480
|
||||
batch_size: int = 4
|
||||
batch_size: int = 1
|
||||
conf_threshold: float = 0.5
|
||||
nms_threshold: float = 0.45
|
||||
device_id: int = 0
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -225,30 +225,19 @@ class LetterboxPreprocessor:
|
||||
|
||||
|
||||
class BatchPreprocessor:
|
||||
"""Batch预处理器类
|
||||
"""Batch预处理器类 (batch=1)"""
|
||||
|
||||
固定 batch=4,支持 padding 到 batch=4
|
||||
"""
|
||||
|
||||
BATCH_SIZE = 4
|
||||
BATCH_SIZE = 1
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
target_size: Tuple[int, int] = (480, 480),
|
||||
fp16_mode: bool = True
|
||||
):
|
||||
"""
|
||||
初始化Batch预处理器
|
||||
|
||||
Args:
|
||||
target_size: 目标尺寸 (width, height)
|
||||
fp16_mode: 是否使用FP16精度
|
||||
"""
|
||||
self.target_size = target_size
|
||||
self.fp16_mode = fp16_mode
|
||||
self.batch_size = self.BATCH_SIZE
|
||||
|
||||
self._letterbox = LetterboxPreprocessor(target_size)
|
||||
self._logger = get_logger("preprocessor")
|
||||
|
||||
self._logger.info(
|
||||
@@ -256,77 +245,50 @@ class BatchPreprocessor:
|
||||
f"target_size={target_size}, fp16={fp16_mode}"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
|
||||
def preprocess_single(
|
||||
self,
|
||||
image: np.ndarray
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Padding 到 batch=4,重复最后一帧
|
||||
预处理单帧图像
|
||||
|
||||
Args:
|
||||
frames: list of [3, 480, 480] numpy arrays
|
||||
image: numpy 数组
|
||||
|
||||
Returns:
|
||||
np.ndarray: [4, 3, 480, 480]
|
||||
np.ndarray: [1, 3, H, W]
|
||||
"""
|
||||
if len(frames) == 0:
|
||||
raise ValueError("Empty frames list")
|
||||
normalized = image.astype(np.float32) / 255.0
|
||||
transposed = np.transpose(normalized, (2, 0, 1))
|
||||
batched = transposed[None, ...]
|
||||
|
||||
if len(frames) == 4:
|
||||
return np.stack(frames)
|
||||
if self.fp16_mode:
|
||||
batched = batched.astype(np.float16)
|
||||
|
||||
pad_frame = frames[-1].copy()
|
||||
while len(frames) < 4:
|
||||
frames.append(pad_frame)
|
||||
|
||||
return np.stack(frames)
|
||||
return batched
|
||||
|
||||
def preprocess_batch(
|
||||
self,
|
||||
images: List[np.ndarray]
|
||||
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
|
||||
"""
|
||||
预处理批次图像,自动 padding 到 batch=4
|
||||
预处理批次图像 (batch=1)
|
||||
|
||||
Args:
|
||||
images: 图像列表
|
||||
images: 图像列表 (只处理第一帧)
|
||||
|
||||
Returns:
|
||||
tuple: (批次数据 [4, 3, H, W], 缩放信息列表)
|
||||
tuple: (批次数据 [1, 3, H, W], 缩放信息列表)
|
||||
"""
|
||||
batch_data, scale_info_list = self._preprocess_batch(images)
|
||||
if not images:
|
||||
raise ValueError("Empty images list")
|
||||
|
||||
return batch_data, scale_info_list
|
||||
|
||||
def _preprocess_batch(
|
||||
self,
|
||||
images: List[np.ndarray]
|
||||
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
|
||||
"""内部预处理实现"""
|
||||
padded_images = self.pad_to_batch4(images)
|
||||
letterbox = LetterboxPreprocessor(self.target_size)
|
||||
processed, scale_info = letterbox.preprocess(images[0])
|
||||
|
||||
scale_info_list = []
|
||||
processed_images = []
|
||||
batch_data = self.preprocess_single(processed)
|
||||
|
||||
for i in range(self.batch_size):
|
||||
processed, scale_info = self._letterbox.preprocess(padded_images[i])
|
||||
processed_images.append(processed)
|
||||
scale_info_list.append(scale_info)
|
||||
|
||||
batch_data = self._stack_and_normalize(processed_images)
|
||||
|
||||
return batch_data, scale_info_list
|
||||
|
||||
def _stack_and_normalize(self, images: List[np.ndarray]) -> np.ndarray:
|
||||
"""堆叠并归一化图像"""
|
||||
stacked = np.stack(images, axis=0)
|
||||
|
||||
stacked = stacked.astype(np.float32) / 255.0
|
||||
|
||||
stacked = np.transpose(stacked, (0, 3, 1, 2))
|
||||
|
||||
if self.fp16_mode:
|
||||
stacked = stacked.astype(np.float16)
|
||||
|
||||
return stacked
|
||||
return batch_data, [scale_info]
|
||||
|
||||
|
||||
class ImagePreprocessor:
|
||||
|
||||
@@ -40,29 +40,22 @@ class HostDeviceMem:
|
||||
|
||||
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
|
||||
"""
|
||||
Padding 到 batch=4,重复最后一帧
|
||||
Padding 到 batch=N,重复最后一帧(已弃用,改用 batch=1)
|
||||
|
||||
Args:
|
||||
frames: list of [3, 480, 480] numpy arrays
|
||||
|
||||
Returns:
|
||||
np.ndarray: [4, 3, 480, 480]
|
||||
np.ndarray: [N, 3, 480, 480]
|
||||
"""
|
||||
if len(frames) == 0:
|
||||
raise ValueError("Empty frames list")
|
||||
|
||||
if len(frames) == 4:
|
||||
return np.stack(frames)
|
||||
|
||||
pad_frame = frames[-1].copy()
|
||||
while len(frames) < 4:
|
||||
frames.append(pad_frame)
|
||||
|
||||
return np.stack(frames)
|
||||
|
||||
|
||||
class TensorRTEngine:
|
||||
"""固定 batch TensorRT 引擎 (batch=4, FP16, 3×480×480)
|
||||
"""TensorRT 引擎 (batch=1, FP16, 3×480×480)
|
||||
|
||||
特性:
|
||||
- Buffer Pool: bindings 只在 init 阶段分配一次
|
||||
@@ -70,7 +63,7 @@ class TensorRTEngine:
|
||||
- Async API: CUDA stream + async memcpy + execute_async_v2
|
||||
"""
|
||||
|
||||
BATCH_SIZE = 4
|
||||
BATCH_SIZE = 1
|
||||
INPUT_SHAPE = (3, 480, 480)
|
||||
|
||||
def __init__(self, config: Optional[InferenceConfig] = None):
|
||||
|
||||
6944
logs/main.log
6944
logs/main.log
File diff suppressed because it is too large
Load Diff
6076
logs/main_error.log
6076
logs/main_error.log
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user