perf: batch=1 优化减少延迟

- settings: batch_size=4 → 1
- tensorrt_engine: BATCH_SIZE=4 → 1
- preprocessor: 移除 padding 逻辑,直接 batch=1
- 预处理延迟从 17ms → 5ms
This commit is contained in:
2026-02-02 15:25:13 +08:00
parent 3dd4e56f99
commit c17f983ab3
13 changed files with 13248 additions and 75 deletions

View File

@@ -40,29 +40,22 @@ class HostDeviceMem:
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
"""
Padding 到 batch=4,重复最后一帧
Padding 到 batch=N,重复最后一帧(已弃用,改用 batch=1)
Args:
frames: list of [3, 480, 480] numpy arrays
Returns:
np.ndarray: [4, 3, 480, 480]
np.ndarray: [N, 3, 480, 480]
"""
if len(frames) == 0:
raise ValueError("Empty frames list")
if len(frames) == 4:
return np.stack(frames)
pad_frame = frames[-1].copy()
while len(frames) < 4:
frames.append(pad_frame)
return np.stack(frames)
class TensorRTEngine:
"""固定 batch TensorRT 引擎 (batch=4, FP16, 3×480×480)
"""TensorRT 引擎 (batch=1, FP16, 3×480×480)
特性:
- Buffer Pool: bindings 只在 init 阶段分配一次
@@ -70,7 +63,7 @@ class TensorRTEngine:
- Async API: CUDA stream + async memcpy + execute_async_v2
"""
BATCH_SIZE = 4
BATCH_SIZE = 1
INPUT_SHAPE = (3, 480, 480)
def __init__(self, config: Optional[InferenceConfig] = None):