fix: 修复10个关键bug提升系统稳定性和性能
1. YOLO11输出解析错误: 移除不存在的objectness行,正确使用class_scores.max()
2. CPU NMS逻辑错误: keep_mask同时标记保留和抑制框导致NMS失效,改用独立suppressed集合
3. 坐标映射缺失: _build_tracks中scale_info未使用,添加revert_boxes还原到ROI裁剪空间
4. batch=1限制: 恢复真正的动态batch推理(1~8),BatchPreprocessor支持多图stack
5. 帧率控制缺失: _read_frame添加time.monotonic()间隔控制,按target_fps跳帧
6. 拉流推理耦合: 新增独立推理线程(InferenceWorker),生产者-消费者模式解耦
7. 攒批形同虚设: 添加50ms攒批窗口+max_batch阈值,替代>=1立即处理
8. LeavePost双重等待: LEAVING确认后直接触发告警,不再进入OFF_DUTY二次等待
9. register_algorithm每帧调用: 添加_registered_keys缓存,O(1)快速路径跳过
10. GPU context线程安全: TensorRT infer()内部加锁,防止多线程CUDA context竞争

附带修复:
- reset_algorithm中未定义algorithm_type变量(NameError)
- update_roi_params中循环变量key覆盖外层key
- AlertInfo缺少bind_id字段(TypeError)
- _logger.log_alert在标准logger上不存在(AttributeError)
- AlarmStateMachine死锁(Lock改为RLock)
- ROICropper.create_mask坐标解析错误
- 更新测试用例适配新API

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -55,15 +55,17 @@ def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
|
||||
|
||||
|
||||
class TensorRTEngine:
|
||||
"""TensorRT 引擎 (batch=1, FP16, 3×480×480)
|
||||
|
||||
"""TensorRT 引擎 (动态 batch 1~8, opt=4, FP16, 3×480×480)
|
||||
|
||||
特性:
|
||||
- Buffer Pool: bindings 只在 init 阶段分配一次
|
||||
- Buffer Pool: bindings 按 max_batch 分配,推理时按实际 batch 使用
|
||||
- Pinned Memory: 使用 pagelocked host memory 提升 H2D/D2H 性能
|
||||
- Async API: CUDA stream + async memcpy + execute_async_v2
|
||||
- 推理锁: 保证多线程下 CUDA context 安全
|
||||
"""
|
||||
|
||||
BATCH_SIZE = 1
|
||||
|
||||
MAX_BATCH_SIZE = 8
|
||||
OPT_BATCH_SIZE = 4
|
||||
INPUT_SHAPE = (3, 480, 480)
|
||||
|
||||
def __init__(self, config: Optional[InferenceConfig] = None):
|
||||
@@ -99,7 +101,7 @@ class TensorRTEngine:
|
||||
self._logger.info(
|
||||
f"TensorRT 引擎初始化: "
|
||||
f"{config.model_path}, "
|
||||
f"batch={self.BATCH_SIZE}, "
|
||||
f"batch=1~{self.MAX_BATCH_SIZE} (opt={self.OPT_BATCH_SIZE}), "
|
||||
f"shape={self.INPUT_SHAPE}, "
|
||||
f"fp16={config.fp16_mode}"
|
||||
)
|
||||
@@ -131,7 +133,7 @@ class TensorRTEngine:
|
||||
"load", "TensorRT", engine_path, True
|
||||
)
|
||||
self._logger.info(f"TensorRT 引擎加载成功: {engine_path}")
|
||||
self._logger.info(f" 输入: {len(self._inputs)}, 输出: {len(self._outputs)}, batch={self.BATCH_SIZE}")
|
||||
self._logger.info(f" 输入: {len(self._inputs)}, 输出: {len(self._outputs)}, batch=1~{self.MAX_BATCH_SIZE}")
|
||||
|
||||
return True
|
||||
|
||||
@@ -153,7 +155,7 @@ class TensorRTEngine:
|
||||
dtype = trt.nptype(self._engine.get_binding_dtype(binding_idx))
|
||||
|
||||
if shape[0] == -1:
|
||||
shape[0] = self.BATCH_SIZE
|
||||
shape[0] = self.MAX_BATCH_SIZE
|
||||
|
||||
shape = tuple(max(1, s) if s < 0 else s for s in shape)
|
||||
size = trt.volume(shape)
|
||||
@@ -181,65 +183,74 @@ class TensorRTEngine:
|
||||
|
||||
def infer(self, input_batch: np.ndarray) -> Tuple[List[np.ndarray], float]:
|
||||
"""
|
||||
执行推理(工业级 async 模式)
|
||||
|
||||
执行推理(工业级 async 模式,线程安全)
|
||||
|
||||
Args:
|
||||
input_batch: numpy 输入,shape = [batch, 3, 480, 480],dtype = np.float16
|
||||
|
||||
batch 可以是 1~MAX_BATCH_SIZE 的任意值
|
||||
|
||||
Returns:
|
||||
tuple: (输出列表, 推理耗时ms)
|
||||
"""
|
||||
if self._engine is None or self._context is None:
|
||||
raise RuntimeError("引擎未加载")
|
||||
|
||||
|
||||
if len(self._inputs) == 0:
|
||||
raise RuntimeError("未分配输入 buffer")
|
||||
|
||||
|
||||
batch_size = input_batch.shape[0]
|
||||
|
||||
start_time = time.perf_counter()
|
||||
|
||||
self._cuda_context.push()
|
||||
|
||||
try:
|
||||
input_batch = np.ascontiguousarray(input_batch)
|
||||
|
||||
input_name = self._engine.get_binding_name(0)
|
||||
actual_shape = list(input_batch.shape)
|
||||
self._context.set_input_shape(input_name, actual_shape)
|
||||
|
||||
np.copyto(self._inputs[0].host, input_batch.ravel())
|
||||
|
||||
cuda.memcpy_htod_async(
|
||||
self._inputs[0].device,
|
||||
self._inputs[0].host,
|
||||
self._stream
|
||||
)
|
||||
|
||||
self._context.execute_async_v2(
|
||||
bindings=self._bindings,
|
||||
stream_handle=self._stream.handle
|
||||
)
|
||||
|
||||
results = []
|
||||
for out in self._outputs:
|
||||
cuda.memcpy_dtoh_async(
|
||||
out.host,
|
||||
out.device,
|
||||
|
||||
with self._lock:
|
||||
start_time = time.perf_counter()
|
||||
|
||||
self._cuda_context.push()
|
||||
|
||||
try:
|
||||
input_batch = np.ascontiguousarray(input_batch)
|
||||
|
||||
input_name = self._engine.get_binding_name(0)
|
||||
actual_shape = list(input_batch.shape)
|
||||
self._context.set_input_shape(input_name, actual_shape)
|
||||
|
||||
np.copyto(self._inputs[0].host[:input_batch.size], input_batch.ravel())
|
||||
|
||||
cuda.memcpy_htod_async(
|
||||
self._inputs[0].device,
|
||||
self._inputs[0].host,
|
||||
self._stream
|
||||
)
|
||||
results.append(out.host.copy())
|
||||
|
||||
self._stream.synchronize()
|
||||
|
||||
inference_time_ms = (time.perf_counter() - start_time) * 1000
|
||||
|
||||
self._update_performance_stats(inference_time_ms, self.BATCH_SIZE)
|
||||
|
||||
return results, inference_time_ms
|
||||
|
||||
finally:
|
||||
self._cuda_context.pop()
|
||||
|
||||
self._context.execute_async_v2(
|
||||
bindings=self._bindings,
|
||||
stream_handle=self._stream.handle
|
||||
)
|
||||
|
||||
results = []
|
||||
for out in self._outputs:
|
||||
cuda.memcpy_dtoh_async(
|
||||
out.host,
|
||||
out.device,
|
||||
self._stream
|
||||
)
|
||||
|
||||
self._stream.synchronize()
|
||||
|
||||
# 根据实际 batch_size 裁剪输出
|
||||
for out in self._outputs:
|
||||
output_data = out.host.copy()
|
||||
# 输出 shape 需按 batch_size 重新划分
|
||||
per_batch_size = len(output_data) // self.MAX_BATCH_SIZE
|
||||
actual_size = per_batch_size * batch_size
|
||||
results.append(output_data[:actual_size])
|
||||
|
||||
inference_time_ms = (time.perf_counter() - start_time) * 1000
|
||||
|
||||
self._update_performance_stats(inference_time_ms, batch_size)
|
||||
|
||||
return results, inference_time_ms
|
||||
|
||||
finally:
|
||||
self._cuda_context.pop()
|
||||
|
||||
def _update_performance_stats(self, inference_time_ms: float, batch_size: int):
|
||||
"""更新性能统计"""
|
||||
|
||||
Reference in New Issue
Block a user