perf: batch=1 优化减少延迟

- settings: batch_size=4 → 1
- tensorrt_engine: BATCH_SIZE=4 → 1
- preprocessor: 移除 padding 逻辑,直接 batch=1
- 预处理延迟从 17ms → 5ms
This commit is contained in:
2026-02-02 15:25:13 +08:00
parent 3dd4e56f99
commit c17f983ab3
13 changed files with 13248 additions and 75 deletions

View File

@@ -225,30 +225,19 @@ class LetterboxPreprocessor:
class BatchPreprocessor:
"""Batch预处理器类
"""Batch预处理器类 (batch=1)"""
固定 batch=4,支持 padding 到 batch=4
"""
BATCH_SIZE = 4
BATCH_SIZE = 1
def __init__(
self,
target_size: Tuple[int, int] = (480, 480),
fp16_mode: bool = True
):
"""
初始化Batch预处理器
Args:
target_size: 目标尺寸 (width, height)
fp16_mode: 是否使用FP16精度
"""
self.target_size = target_size
self.fp16_mode = fp16_mode
self.batch_size = self.BATCH_SIZE
self._letterbox = LetterboxPreprocessor(target_size)
self._logger = get_logger("preprocessor")
self._logger.info(
@@ -256,77 +245,50 @@ class BatchPreprocessor:
f"target_size={target_size}, fp16={fp16_mode}"
)
@staticmethod
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
def preprocess_single(
self,
image: np.ndarray
) -> np.ndarray:
"""
Padding 到 batch=4,重复最后一帧
预处理单帧图像
Args:
frames: list of [3, 480, 480] numpy arrays
image: numpy 数组
Returns:
np.ndarray: [4, 3, 480, 480]
np.ndarray: [1, 3, H, W]
"""
if len(frames) == 0:
raise ValueError("Empty frames list")
normalized = image.astype(np.float32) / 255.0
transposed = np.transpose(normalized, (2, 0, 1))
batched = transposed[None, ...]
if len(frames) == 4:
return np.stack(frames)
if self.fp16_mode:
batched = batched.astype(np.float16)
pad_frame = frames[-1].copy()
while len(frames) < 4:
frames.append(pad_frame)
return np.stack(frames)
return batched
def preprocess_batch(
self,
images: List[np.ndarray]
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
"""
预处理批次图像,自动 padding 到 batch=4
预处理批次图像 (batch=1)
Args:
images: 图像列表
images: 图像列表 (只处理第一帧)
Returns:
tuple: (批次数据 [4, 3, H, W], 缩放信息列表)
tuple: (批次数据 [1, 3, H, W], 缩放信息列表)
"""
batch_data, scale_info_list = self._preprocess_batch(images)
if not images:
raise ValueError("Empty images list")
return batch_data, scale_info_list
def _preprocess_batch(
self,
images: List[np.ndarray]
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
"""内部预处理实现"""
padded_images = self.pad_to_batch4(images)
letterbox = LetterboxPreprocessor(self.target_size)
processed, scale_info = letterbox.preprocess(images[0])
scale_info_list = []
processed_images = []
batch_data = self.preprocess_single(processed)
for i in range(self.batch_size):
processed, scale_info = self._letterbox.preprocess(padded_images[i])
processed_images.append(processed)
scale_info_list.append(scale_info)
batch_data = self._stack_and_normalize(processed_images)
return batch_data, scale_info_list
def _stack_and_normalize(self, images: List[np.ndarray]) -> np.ndarray:
"""堆叠并归一化图像"""
stacked = np.stack(images, axis=0)
stacked = stacked.astype(np.float32) / 255.0
stacked = np.transpose(stacked, (0, 3, 1, 2))
if self.fp16_mode:
stacked = stacked.astype(np.float16)
return stacked
return batch_data, [scale_info]
class ImagePreprocessor:

View File

@@ -40,29 +40,22 @@ class HostDeviceMem:
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
"""
Padding 到 batch=4,重复最后一帧
Padding 到 batch=N,重复最后一帧(已弃用,改用 batch=1)
Args:
frames: list of [3, 480, 480] numpy arrays
Returns:
np.ndarray: [4, 3, 480, 480]
np.ndarray: [N, 3, 480, 480]
"""
if len(frames) == 0:
raise ValueError("Empty frames list")
if len(frames) == 4:
return np.stack(frames)
pad_frame = frames[-1].copy()
while len(frames) < 4:
frames.append(pad_frame)
return np.stack(frames)
class TensorRTEngine:
"""固定 batch TensorRT 引擎 (batch=4, FP16, 3×480×480)
"""TensorRT 引擎 (batch=1, FP16, 3×480×480)
特性:
- Buffer Pool: bindings 只在 init 阶段分配一次
@@ -70,7 +63,7 @@ class TensorRTEngine:
- Async API: CUDA stream + async memcpy + execute_async_v2
"""
BATCH_SIZE = 4
BATCH_SIZE = 1
INPUT_SHAPE = (3, 480, 480)
def __init__(self, config: Optional[InferenceConfig] = None):