feat: refactor TensorRT for fixed batch=4

- tensorrt_engine.py: production-grade buffer pool
- preprocessor.py: add pad_to_batch4()
- postprocessor.py: support batched outputs
- settings.py: pin batch_size=4
2026-02-02 14:49:47 +08:00
parent 956bcbbc3e
commit 745cadc8e7
18 changed files with 68258 additions and 130 deletions

tensorrt_engine.py

@@ -1,9 +1,9 @@
"""
TensorRT推理引擎模块
固定 batch=4, FP16, 3×480×480
工业级实现Buffer Pool、异步推理、性能监控
"""
import ctypes
import logging
import threading
import time
@@ -38,8 +38,31 @@ class HostDeviceMem:
return f"Host:{self.host.shape}, Device:{int(self.device)}"
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
"""
Padding 到 batch=4重复最后一帧
Args:
frames: list of [3, 480, 480] numpy arrays
Returns:
np.ndarray: [4, 3, 480, 480]
"""
if len(frames) == 0:
raise ValueError("Empty frames list")
if len(frames) == 4:
return np.stack(frames)
pad_frame = frames[-1].copy()
while len(frames) < 4:
frames.append(pad_frame)
return np.stack(frames)
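A quick usage sketch for pad_to_batch4(); the frame values below are illustrative only:

# Illustrative usage of pad_to_batch4(); input frames are made up.
import numpy as np

frames = [np.zeros((3, 480, 480), dtype=np.float16) for _ in range(2)]
batch = pad_to_batch4(frames)
assert batch.shape == (4, 3, 480, 480)
# Slots 2 and 3 hold copies of the last real frame:
assert np.array_equal(batch[2], frames[-1]) and np.array_equal(batch[3], frames[-1])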
class TensorRTEngine:
"""工业级 TensorRT 引擎
"""固定 batch TensorRT 引擎 (batch=4, FP16, 3×480×480)
特性:
- Buffer Pool: bindings 只在 init 阶段分配一次
@@ -47,6 +70,9 @@ class TensorRTEngine:
- Async API: CUDA stream + async memcpy + execute_async_v2
"""
BATCH_SIZE = 4
INPUT_SHAPE = (3, 480, 480)
def __init__(self, config: Optional[InferenceConfig] = None):
if not TRT_AVAILABLE:
raise RuntimeError("TensorRT 未安装,请先安装 tensorrt 库")
@@ -68,7 +94,6 @@ class TensorRTEngine:
self._bindings: List[int] = []
self._inputs: List[HostDeviceMem] = []
self._outputs: List[HostDeviceMem] = []
self._binding_names: Dict[int, str] = {}
self._performance_stats = {
"inference_count": 0,
@@ -81,8 +106,8 @@ class TensorRTEngine:
        self._logger.info(
            f"TensorRT engine initialized: "
            f"{config.model_path}, "
-            f"{config.input_width}x{config.input_height}, "
-            f"batch={config.batch_size}, "
+            f"batch={self.BATCH_SIZE}, "
+            f"shape={self.INPUT_SHAPE}, "
            f"fp16={config.fp16_mode}"
        )
@@ -113,7 +138,7 @@ class TensorRTEngine:
"load", "TensorRT", engine_path, True
)
self._logger.info(f"TensorRT 引擎加载成功: {engine_path}")
self._logger.info(f" 输入: {len(self._inputs)}, 输出: {len(self._outputs)}")
self._logger.info(f" 输入: {len(self._inputs)}, 输出: {len(self._outputs)}, batch={self.BATCH_SIZE}")
return True
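For context, a hedged sketch of the standard TensorRT 8.x deserialization flow that a load step like this typically wraps; variable names are illustrative, not this module's attributes:

# Hedged sketch of a standard TRT 8.x engine load; names are illustrative.
import tensorrt as trt

engine_path = "model.engine"  # placeholder path
trt_logger = trt.Logger(trt.Logger.WARNING)
with open(engine_path, "rb") as f, trt.Runtime(trt_logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()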
@@ -122,30 +147,31 @@ class TensorRTEngine:
return False
def _allocate_buffers(self):
"""Buffer Pool: 初始化阶段一次性分配所有 bindings(工业级关键点)"""
"""Buffer Pool: 初始化阶段一次性分配所有 bindings
对于动态 shape engine使用配置中的 batch_size 作为默认大小
"""
self._bindings = []
self._inputs = []
self._outputs = []
self._binding_names = {}
for binding_idx in range(self._engine.num_bindings):
name = self._engine.get_binding_name(binding_idx)
+            shape = list(self._engine.get_binding_shape(binding_idx))
            dtype = trt.nptype(self._engine.get_binding_dtype(binding_idx))
-            shape = self._engine.get_binding_shape(binding_idx)
            self._binding_names[binding_idx] = name
+            if shape[0] == -1:
+                shape[0] = self.BATCH_SIZE
-            shape = tuple(max(1, s) if s < 0 else s for s in shape)
size = trt.volume(shape)
            try:
                host_mem = cuda.pagelocked_empty(size, dtype)
                device_mem = cuda.mem_alloc(host_mem.nbytes)
            except Exception as e:
                self._logger.warning(f"pagelocked memory allocation failed, falling back to plain numpy: {e}")
                host_mem = np.zeros(size, dtype=dtype)
                device_mem = cuda.mem_alloc(host_mem.nbytes)
self._bindings.append(int(device_mem))
mem_pair = HostDeviceMem(host_mem, device_mem)
@@ -159,24 +185,13 @@ class TensorRTEngine:
raise RuntimeError("No input bindings found")
if len(self._outputs) == 0:
raise RuntimeError("No output bindings found")
        self._logger.debug(
            f"Buffer Pool allocated: "
            f"inputs={[int(i.device) for i in self._inputs]}, "
            f"outputs={[int(o.device) for o in self._outputs]}"
        )
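As a sanity check on pool sizing: the fixed input binding (4, 3, 480, 480) in FP16 is 2,764,800 elements, i.e. about 5.3 MB of pinned host memory plus a matching device allocation. A minimal check:

# Sanity check of the fixed-batch FP16 input buffer size.
import numpy as np

shape = (4, 3, 480, 480)                       # BATCH_SIZE x INPUT_SHAPE
size = int(np.prod(shape))                     # 2,764,800 elements, same as trt.volume(shape)
nbytes = size * np.dtype(np.float16).itemsize  # 5,529,600 bytes, about 5.3 MB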
    def _get_output_shape(self, binding_idx: int) -> Tuple[int, ...]:
        """Get the resolved shape of an output binding"""
        # Query the execution context (not the engine) so dynamic dims are
        # resolved after set_input_shape(); bindings are addressed by index.
        return tuple(self._context.get_binding_shape(binding_idx))
-    def infer(self, input_np: np.ndarray) -> Tuple[List[np.ndarray], float]:
+    def infer(self, input_batch: np.ndarray) -> Tuple[List[np.ndarray], float]:
        """
        Run inference (production-grade async mode).
        Args:
-            input_np: numpy input; shape must match the engine
+            input_batch: numpy input, shape = [batch, 3, 480, 480], dtype = np.float16
        Returns:
            tuple: (list of output arrays, inference time in ms)
@@ -187,17 +202,20 @@ class TensorRTEngine:
        if len(self._inputs) == 0:
            raise RuntimeError("Input buffers not allocated")
+        batch_size = input_batch.shape[0]
        start_time = time.perf_counter()
        self._cuda_context.push()
        try:
-            input_np = np.ascontiguousarray(input_np)
+            input_batch = np.ascontiguousarray(input_batch)
-            input_name = self._binding_names[0]
-            self._context.set_input_shape(input_name, input_np.shape)
+            input_name = self._engine.get_binding_name(0)
+            actual_shape = list(input_batch.shape)
+            self._context.set_input_shape(input_name, actual_shape)
-            np.copyto(self._inputs[0].host, input_np.ravel())
+            np.copyto(self._inputs[0].host, input_batch.ravel())
cuda.memcpy_htod_async(
self._inputs[0].device,
@@ -210,28 +228,20 @@ class TensorRTEngine:
stream_handle=self._stream.handle
)
-            results = []
            for out in self._outputs:
                cuda.memcpy_dtoh_async(
                    out.host,
                    out.device,
                    self._stream
                )
-                results.append(out.host.copy())
            self._stream.synchronize()
            inference_time_ms = (time.perf_counter() - start_time) * 1000
-            batch_size = input_np.shape[0]
-            self._update_performance_stats(inference_time_ms, batch_size)
+            output_shapes = []
+            for i in range(len(self._inputs), self._engine.num_bindings):
+                output_shapes.append(self._get_output_shape(i))
+            results = []
+            for idx, out in enumerate(self._outputs):
+                shape = output_shapes[idx] if idx < len(output_shapes) else out.host.shape
+                # Copy out of the pooled buffer so results survive the next infer()
+                results.append(out.host.reshape(shape).copy())
+            self._update_performance_stats(inference_time_ms, self.BATCH_SIZE)
return results, inference_time_ms
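Putting it together, a hedged end-to-end sketch of the intended call path; only the pad_to_batch4() and infer() signatures come from this diff, and the InferenceConfig construction (field names taken from the log lines above) is assumed:

# Hedged end-to-end sketch; engine construction/loading details are assumed,
# only pad_to_batch4() and TensorRTEngine.infer() come from this commit.
import numpy as np

config = InferenceConfig(model_path="model.engine", fp16_mode=True)  # placeholder fields
engine = TensorRTEngine(config)
# ... load the serialized engine as in load() above ...

frames = [np.zeros((3, 480, 480), dtype=np.float16) for _ in range(3)]
batch = pad_to_batch4(frames)             # pads to (4, 3, 480, 480)
outputs, latency_ms = engine.infer(np.ascontiguousarray(batch))
print(f"{len(outputs)} outputs in {latency_ms:.2f} ms")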