perf: batch=1 优化减少延迟

- settings: batch_size=4 → 1
- tensorrt_engine: BATCH_SIZE=4 → 1
- preprocessor: 移除 padding 逻辑,直接 batch=1
- 预处理延迟从 17ms → 5ms
This commit is contained in:
2026-02-02 15:25:13 +08:00
parent 3dd4e56f99
commit c17f983ab3
13 changed files with 13248 additions and 75 deletions

2
.gitignore vendored
View File

@@ -49,3 +49,5 @@ README.md
# 数据目录(不提交) # 数据目录(不提交)
data/ data/
captures/ captures/
/logs/
/tests/

56
analyze_latency.py Normal file
View File

@@ -0,0 +1,56 @@
"""Detailed latency analysis - simplified version.

Measures, over 100 iterations, the latency of each preprocessing stage
(ROI crop/resize via ``preprocess_single``, batching via
``preprocess_batch``, and the two combined) and prints the averages
together with an estimated end-to-end latency per batching strategy.

Fixes vs. previous version: removed the unused ``cv2`` import, hoisted
the repeated ``np.mean`` calls, and added a ``main()`` entry point so
importing the module has no side effects.
"""
import sys
import os

# Make the project root importable when running this script directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import time
import numpy as np

from config.settings import get_settings
from core.preprocessor import ImagePreprocessor


def main() -> None:
    """Run the preprocessing latency benchmark and print a summary."""
    settings = get_settings()
    preprocessor = ImagePreprocessor(settings.inference)

    # Simulated 1080p BGR frame plus a mock ROI object exposing the
    # attributes the preprocessor reads (x1/y1/x2/y2/enabled).
    img = np.random.randint(0, 255, (1080, 1920, 3), dtype=np.uint8)
    roi_mock = type('ROI', (), {'x1': 300, 'y1': 100, 'x2': 1000, 'y2': 800, 'enabled': True})()

    times_preprocess = []
    times_single = []
    times_batch = []
    for _ in range(100):
        # 1. preprocess_single: ROI crop + resize only.
        start = time.perf_counter()
        cropped = preprocessor.preprocess_single(img, roi_mock)
        times_single.append((time.perf_counter() - start) * 1000)

        # 2. preprocess_batch: a single frame handed to the batch
        #    preprocessor (padding to the engine batch size, if any,
        #    happens inside preprocess_batch).
        start = time.perf_counter()
        batch_data, _ = preprocessor._batch_preprocessor.preprocess_batch([cropped[0]])
        times_batch.append((time.perf_counter() - start) * 1000)

        # 3. Full pipeline: single-frame preprocessing + batching combined.
        start = time.perf_counter()
        cropped = preprocessor.preprocess_single(img, roi_mock)
        batch_data, _ = preprocessor._batch_preprocessor.preprocess_batch([cropped[0]])
        times_preprocess.append((time.perf_counter() - start) * 1000)

    # Compute each mean once instead of re-evaluating np.mean per print.
    mean_single = np.mean(times_single)
    mean_batch = np.mean(times_batch)
    mean_full = np.mean(times_preprocess)

    print("延迟分析 (100次平均):")
    print(f" preprocess_single (ROI + resize): {mean_single:.2f}ms")
    print(f" preprocess_batch (padding 1→4): {mean_batch:.2f}ms")
    print(f" 完整预处理: {mean_full:.2f}ms")
    print()
    print(f"TensorRT 推理 (batch=1): ~2.5ms (基准测试)")
    print(f"TensorRT 推理 (batch=4): ~5.0ms (基准测试)")
    print()
    print("推算总延迟:")
    print(f" 方案A (batch=1): {mean_single:.2f} + 2.5 + 后处理 ≈ 10-15ms")
    print(f" 方案B (batch=4 实际只推理1帧): {mean_full:.2f} + 5 + 后处理 ≈ 55-65ms")
    print()
    print("结论:延迟主要来自 batch padding 和不必要的 4帧推理开销")


if __name__ == "__main__":
    main()

44
analyze_latency_batch1.py Normal file
View File

@@ -0,0 +1,44 @@
"""Latency analysis - after the batch=1 optimization.

Measures ``preprocess_single`` and ``preprocess_batch`` latency over
100 iterations and prints the averages plus an estimated total latency
for batch=1 versus batch=4 inference.

Fixes vs. previous version: removed the unused ``BatchPreprocessor``
import, hoisted the repeatedly-evaluated ``np.mean`` sums into local
variables, and added a ``main()`` entry point so importing the module
has no side effects.
"""
import sys
import os

# Make the project root importable when running this script directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import time
import numpy as np

from config.settings import get_settings
from core.preprocessor import ImagePreprocessor


def main() -> None:
    """Run the batch=1 preprocessing benchmark and print a summary."""
    settings = get_settings()
    preprocessor = ImagePreprocessor(settings.inference)

    # Simulated 1080p BGR frame plus a mock ROI object exposing the
    # attributes the preprocessor reads.
    img = np.random.randint(0, 255, (1080, 1920, 3), dtype=np.uint8)
    roi_mock = type('ROI', (), {'x1': 300, 'y1': 100, 'x2': 1000, 'y2': 800, 'enabled': True, 'roi_type': 0})()

    times_preprocess_single = []
    times_preprocess_batch = []
    for _ in range(100):
        # 1. ROI crop + resize.
        start = time.perf_counter()
        cropped = preprocessor.preprocess_single(img, roi_mock)
        times_preprocess_single.append((time.perf_counter() - start) * 1000)

        # 2. Batch preprocessing (batch=1, no padding).
        start = time.perf_counter()
        batch_data, _ = preprocessor._batch_preprocessor.preprocess_batch([cropped[0]])
        times_preprocess_batch.append((time.perf_counter() - start) * 1000)

    # Compute each mean once; the original recomputed np.mean per line.
    mean_single = np.mean(times_preprocess_single)
    mean_batch = np.mean(times_preprocess_batch)
    mean_total = mean_single + mean_batch

    print("延迟分析 (batch=1 优化后):")
    print(f" preprocess_single: {mean_single:.2f}ms")
    print(f" preprocess_batch: {mean_batch:.2f}ms")
    print(f" 总预处理: {mean_total:.2f}ms")
    print()
    print(f"TensorRT batch=1 推理: ~2.5ms")
    print(f"TensorRT batch=4 推理: ~5.0ms")
    print()
    print("推算总延迟:")
    print(f" batch=1: {mean_total:.2f} + 2.5 ≈ 8-12ms")
    print(f" batch=4: {mean_total:.2f} + 5 ≈ 10-15ms")


if __name__ == "__main__":
    main()

96
benchmark_trt.py Normal file
View File

@@ -0,0 +1,96 @@
"""TensorRT pure-inference latency benchmark.

Loads a serialized engine and runs 100 timed passes at batch=1 and
batch=4 — each sample includes the host-to-device copy, the inference
call, and the device-to-host copy — then prints mean / median / min /
max / P95 latency for each batch size.

NOTE(review): uses the legacy binding-index API (get_binding_shape,
execute_v2 with a bindings list), which was removed in TensorRT 10 —
confirm the installed TensorRT version.
"""
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # import side effect: creates the CUDA context
import time

# Load and deserialize the engine from disk.
engine_path = './models/yolo11n.engine'
with open(engine_path, 'rb') as f:
    runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# ---- batch=1 setup ----
input_shape = (1, 3, 480, 480)
input_data = np.random.randn(*input_shape).astype(np.float32)
context.set_input_shape('images', input_shape)
# Binding index 1 is taken to be the output; max(1, s) clamps any
# dynamic (-1) dimension to 1 so the host buffer size is positive.
output_shape = tuple(max(1, s) for s in engine.get_binding_shape(1))
output_size = int(np.prod(output_shape))
# Page-locked (pinned) host buffers for fast host<->device DMA copies.
h_input = cuda.pagelocked_empty(input_data.size, np.float32)
h_output = cuda.pagelocked_empty(output_size, np.float32)
np.copyto(h_input, input_data.ravel())
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
# execute_v2 takes device pointers as plain ints.
bindings = [int(d_input), int(d_output)]

# Warmup: let CUDA/TensorRT reach steady state before timing.
for _ in range(10):
    cuda.memcpy_htod(d_input, h_input)
    context.execute_v2(bindings=bindings)
    cuda.memcpy_dtoh(h_output, d_output)

# Benchmark: each sample times H2D copy + inference + D2H copy.
times = []
for _ in range(100):
    start = time.perf_counter()
    cuda.memcpy_htod(d_input, h_input)
    context.execute_v2(bindings=bindings)
    cuda.memcpy_dtoh(h_output, d_output)
    times.append((time.perf_counter() - start) * 1000)

print(f'TensorRT 纯推理延迟 (batch=1):')
print(f' 平均: {np.mean(times):.2f}ms')
print(f' 中位数: {np.median(times):.2f}ms')
print(f' 最小: {np.min(times):.2f}ms')
print(f' 最大: {np.max(times):.2f}ms')
print(f' P95: {np.percentile(times, 95):.2f}ms')
print()

# ---- repeat the measurement at batch=4 on the same context ----
print("测试 batch=4...")
input_shape_4 = (4, 3, 480, 480)
input_data_4 = np.random.randn(*input_shape_4).astype(np.float32)
context.set_input_shape('images', input_shape_4)
# Output shape is hard-coded here: 84 channels x 4725 anchors —
# NOTE(review): presumably matches a 480x480 YOLO head; keep in sync
# with the engine if the model or input size changes.
output_shape_4 = (4, 84, 4725)
output_size_4 = int(np.prod(output_shape_4))
h_input_4 = cuda.pagelocked_empty(input_data_4.size, np.float32)
h_output_4 = cuda.pagelocked_empty(output_size_4, np.float32)
np.copyto(h_input_4, input_data_4.ravel())
d_input_4 = cuda.mem_alloc(h_input_4.nbytes)
d_output_4 = cuda.mem_alloc(h_output_4.nbytes)
bindings_4 = [int(d_input_4), int(d_output_4)]

# Warmup at the new batch size.
for _ in range(10):
    cuda.memcpy_htod(d_input_4, h_input_4)
    context.execute_v2(bindings=bindings_4)
    cuda.memcpy_dtoh(h_output_4, d_output_4)

# Benchmark batch=4, same timing methodology as above.
times_4 = []
for _ in range(100):
    start = time.perf_counter()
    cuda.memcpy_htod(d_input_4, h_input_4)
    context.execute_v2(bindings=bindings_4)
    cuda.memcpy_dtoh(h_output_4, d_output_4)
    times_4.append((time.perf_counter() - start) * 1000)

print(f'TensorRT 纯推理延迟 (batch=4):')
print(f' 平均: {np.mean(times_4):.2f}ms')
print(f' 中位数: {np.median(times_4):.2f}ms')
print(f' 最小: {np.min(times_4):.2f}ms')
print(f' 最大: {np.max(times_4):.2f}ms')
print(f' P95: {np.percentile(times_4, 95):.2f}ms')

View File

@@ -75,7 +75,7 @@ class InferenceConfig:
model_path: str = "./models/yolo11n.engine" model_path: str = "./models/yolo11n.engine"
input_width: int = 480 input_width: int = 480
input_height: int = 480 input_height: int = 480
batch_size: int = 4 batch_size: int = 1
conf_threshold: float = 0.5 conf_threshold: float = 0.5
nms_threshold: float = 0.45 nms_threshold: float = 0.45
device_id: int = 0 device_id: int = 0

View File

@@ -225,30 +225,19 @@ class LetterboxPreprocessor:
class BatchPreprocessor: class BatchPreprocessor:
"""Batch预处理器类 """Batch预处理器类 (batch=1)"""
固定 batch=4支持 padding 到 batch=4 BATCH_SIZE = 1
"""
BATCH_SIZE = 4
def __init__( def __init__(
self, self,
target_size: Tuple[int, int] = (480, 480), target_size: Tuple[int, int] = (480, 480),
fp16_mode: bool = True fp16_mode: bool = True
): ):
"""
初始化Batch预处理器
Args:
target_size: 目标尺寸 (width, height)
fp16_mode: 是否使用FP16精度
"""
self.target_size = target_size self.target_size = target_size
self.fp16_mode = fp16_mode self.fp16_mode = fp16_mode
self.batch_size = self.BATCH_SIZE self.batch_size = self.BATCH_SIZE
self._letterbox = LetterboxPreprocessor(target_size)
self._logger = get_logger("preprocessor") self._logger = get_logger("preprocessor")
self._logger.info( self._logger.info(
@@ -256,77 +245,50 @@ class BatchPreprocessor:
f"target_size={target_size}, fp16={fp16_mode}" f"target_size={target_size}, fp16={fp16_mode}"
) )
@staticmethod def preprocess_single(
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray: self,
image: np.ndarray
) -> np.ndarray:
""" """
Padding 到 batch=4重复最后一帧 预处理单帧图像
Args: Args:
frames: list of [3, 480, 480] numpy arrays image: numpy 数组
Returns: Returns:
np.ndarray: [4, 3, 480, 480] np.ndarray: [1, 3, H, W]
""" """
if len(frames) == 0: normalized = image.astype(np.float32) / 255.0
raise ValueError("Empty frames list") transposed = np.transpose(normalized, (2, 0, 1))
batched = transposed[None, ...]
if len(frames) == 4: if self.fp16_mode:
return np.stack(frames) batched = batched.astype(np.float16)
pad_frame = frames[-1].copy() return batched
while len(frames) < 4:
frames.append(pad_frame)
return np.stack(frames)
def preprocess_batch( def preprocess_batch(
self, self,
images: List[np.ndarray] images: List[np.ndarray]
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]: ) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
""" """
预处理批次图像,自动 padding 到 batch=4 预处理批次图像 (batch=1)
Args: Args:
images: 图像列表 images: 图像列表 (只处理第一帧)
Returns: Returns:
tuple: (批次数据 [4, 3, H, W], 缩放信息列表) tuple: (批次数据 [1, 3, H, W], 缩放信息列表)
""" """
batch_data, scale_info_list = self._preprocess_batch(images) if not images:
raise ValueError("Empty images list")
return batch_data, scale_info_list letterbox = LetterboxPreprocessor(self.target_size)
processed, scale_info = letterbox.preprocess(images[0])
def _preprocess_batch(
self,
images: List[np.ndarray]
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
"""内部预处理实现"""
padded_images = self.pad_to_batch4(images)
scale_info_list = [] batch_data = self.preprocess_single(processed)
processed_images = []
for i in range(self.batch_size): return batch_data, [scale_info]
processed, scale_info = self._letterbox.preprocess(padded_images[i])
processed_images.append(processed)
scale_info_list.append(scale_info)
batch_data = self._stack_and_normalize(processed_images)
return batch_data, scale_info_list
def _stack_and_normalize(self, images: List[np.ndarray]) -> np.ndarray:
"""堆叠并归一化图像"""
stacked = np.stack(images, axis=0)
stacked = stacked.astype(np.float32) / 255.0
stacked = np.transpose(stacked, (0, 3, 1, 2))
if self.fp16_mode:
stacked = stacked.astype(np.float16)
return stacked
class ImagePreprocessor: class ImagePreprocessor:

View File

@@ -40,29 +40,22 @@ class HostDeviceMem:
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray: def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
""" """
Padding 到 batch=4,重复最后一帧 Padding 到 batch=N,重复最后一帧(已弃用,改用 batch=1
Args: Args:
frames: list of [3, 480, 480] numpy arrays frames: list of [3, 480, 480] numpy arrays
Returns: Returns:
np.ndarray: [4, 3, 480, 480] np.ndarray: [N, 3, 480, 480]
""" """
if len(frames) == 0: if len(frames) == 0:
raise ValueError("Empty frames list") raise ValueError("Empty frames list")
if len(frames) == 4:
return np.stack(frames)
pad_frame = frames[-1].copy()
while len(frames) < 4:
frames.append(pad_frame)
return np.stack(frames) return np.stack(frames)
class TensorRTEngine: class TensorRTEngine:
"""固定 batch TensorRT 引擎 (batch=4, FP16, 3×480×480) """TensorRT 引擎 (batch=1, FP16, 3×480×480)
特性: 特性:
- Buffer Pool: bindings 只在 init 阶段分配一次 - Buffer Pool: bindings 只在 init 阶段分配一次
@@ -70,7 +63,7 @@ class TensorRTEngine:
- Async API: CUDA stream + async memcpy + execute_async_v2 - Async API: CUDA stream + async memcpy + execute_async_v2
""" """
BATCH_SIZE = 4 BATCH_SIZE = 1
INPUT_SHAPE = (3, 480, 480) INPUT_SHAPE = (3, 480, 480)
def __init__(self, config: Optional[InferenceConfig] = None): def __init__(self, config: Optional[InferenceConfig] = None):

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -204,7 +204,7 @@ class EdgeInferenceService:
frame: VideoFrame, frame: VideoFrame,
roi roi
): ):
"""处理ROI帧固定 batch=4 推理""" """处理ROI帧batch=1 推理"""
try: try:
if not roi.enabled: if not roi.enabled:
return return