perf: batch=1 优化减少延迟

- settings: batch_size=4→1 - tensorrt_engine: BATCH_SIZE=4→1 - preprocessor: 移除 padding 逻辑，直接 batch=1 - 预处理延迟从 17ms 降至 5ms
2026-02-02 15:25:13 +08:00
parent 3dd4e56f99
commit c17f983ab3
13 changed files with 13248 additions and 75 deletions
--- a/core/pycache/preprocessor.cpython-310.pyc
+++ b/core/pycache/preprocessor.cpython-310.pyc
--- a/core/pycache/tensorrt_engine.cpython-310.pyc
+++ b/core/pycache/tensorrt_engine.cpython-310.pyc
--- a/core/preprocessor.py
+++ b/core/preprocessor.py
@@ -225,30 +225,19 @@ class LetterboxPreprocessor:


 class BatchPreprocessor:
-    """Batch预处理器类
+    """Batch预处理器类 (batch=1)"""
    
-    固定 batch=4，支持 padding 到 batch=4
-    """
-    
-    BATCH_SIZE = 4
+    BATCH_SIZE = 1
    
    def __init__(
        self,
        target_size: Tuple[int, int] = (480, 480),
        fp16_mode: bool = True
    ):
-        """
-        初始化Batch预处理器
-        
-        Args:
-            target_size: 目标尺寸 (width, height)
-            fp16_mode: 是否使用FP16精度
-        """
        self.target_size = target_size
        self.fp16_mode = fp16_mode
        self.batch_size = self.BATCH_SIZE
        
-        self._letterbox = LetterboxPreprocessor(target_size)
        self._logger = get_logger("preprocessor")
        
        self._logger.info(
@@ -256,77 +245,50 @@ class BatchPreprocessor:
            f"target_size={target_size}, fp16={fp16_mode}"
        )
    
-    @staticmethod
-    def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
+    def preprocess_single(
+        self,
+        image: np.ndarray
+    ) -> np.ndarray:
        """
-        Padding 到 batch=4，重复最后一帧
+        预处理单帧图像
        
        Args:
-            frames: list of [3, 480, 480] numpy arrays
+            image: numpy 数组
        
        Returns:
-            np.ndarray: [4, 3, 480, 480]
+            np.ndarray: [1, 3, H, W]
        """
-        if len(frames) == 0:
-            raise ValueError("Empty frames list")
+        normalized = image.astype(np.float32) / 255.0
+        transposed = np.transpose(normalized, (2, 0, 1))
+        batched = transposed[None, ...]
        
-        if len(frames) == 4:
-            return np.stack(frames)
+        if self.fp16_mode:
+            batched = batched.astype(np.float16)
        
-        pad_frame = frames[-1].copy()
-        while len(frames) < 4:
-            frames.append(pad_frame)
-        
-        return np.stack(frames)
+        return batched
    
    def preprocess_batch(
        self,
        images: List[np.ndarray]
    ) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
        """
-        预处理批次图像，自动 padding 到 batch=4
+        预处理批次图像 (batch=1)
        
        Args:
-            images: 图像列表
+            images: 图像列表 (只处理第一帧)
        
        Returns:
-            tuple: (批次数据 [4, 3, H, W], 缩放信息列表)
+            tuple: (批次数据 [1, 3, H, W], 缩放信息列表)
        """
-        batch_data, scale_info_list = self._preprocess_batch(images)
+        if not images:
+            raise ValueError("Empty images list")
        
-        return batch_data, scale_info_list
-    
-    def _preprocess_batch(
-        self,
-        images: List[np.ndarray]
-    ) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
-        """内部预处理实现"""
-        padded_images = self.pad_to_batch4(images)
+        letterbox = LetterboxPreprocessor(self.target_size)
+        processed, scale_info = letterbox.preprocess(images[0])
        
-        scale_info_list = []
-        processed_images = []
+        batch_data = self.preprocess_single(processed)
        
-        for i in range(self.batch_size):
-            processed, scale_info = self._letterbox.preprocess(padded_images[i])
-            processed_images.append(processed)
-            scale_info_list.append(scale_info)
-        
-        batch_data = self._stack_and_normalize(processed_images)
-        
-        return batch_data, scale_info_list
-    
-    def _stack_and_normalize(self, images: List[np.ndarray]) -> np.ndarray:
-        """堆叠并归一化图像"""
-        stacked = np.stack(images, axis=0)
-        
-        stacked = stacked.astype(np.float32) / 255.0
-        
-        stacked = np.transpose(stacked, (0, 3, 1, 2))
-        
-        if self.fp16_mode:
-            stacked = stacked.astype(np.float16)
-        
-        return stacked
+        return batch_data, [scale_info]


 class ImagePreprocessor:
--- a/core/tensorrt_engine.py
+++ b/core/tensorrt_engine.py
@@ -40,29 +40,22 @@ class HostDeviceMem:

 def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
    """
-    Padding 到 batch=4，重复最后一帧
+    Padding 到 batch=N，重复最后一帧（已弃用，改用 batch=1）
    
    Args:
        frames: list of [3, 480, 480] numpy arrays
    
    Returns:
-        np.ndarray: [4, 3, 480, 480]
+        np.ndarray: [N, 3, 480, 480]
    """
    if len(frames) == 0:
        raise ValueError("Empty frames list")
    
-    if len(frames) == 4:
-        return np.stack(frames)
-    
-    pad_frame = frames[-1].copy()
-    while len(frames) < 4:
-        frames.append(pad_frame)
-    
    return np.stack(frames)


 class TensorRTEngine:
-    """固定 batch TensorRT 引擎 (batch=4, FP16, 3×480×480)
+    """TensorRT 引擎 (batch=1, FP16, 3×480×480)
    
    特性：
    - Buffer Pool: bindings 只在 init 阶段分配一次
@@ -70,7 +63,7 @@ class TensorRTEngine:
    - Async API: CUDA stream + async memcpy + execute_async_v2
    """
    
-    BATCH_SIZE = 4
+    BATCH_SIZE = 1
    INPUT_SHAPE = (3, 480, 480)
    
    def __init__(self, config: Optional[InferenceConfig] = None):