fix: 修复10个关键bug提升系统稳定性和性能
1. YOLO11输出解析错误: 移除不存在的objectness行,正确使用class_scores.max()
2. CPU NMS逻辑错误: keep_mask同时标记保留和抑制框导致NMS失效,改用独立suppressed集合
3. 坐标映射缺失: _build_tracks中scale_info未使用,添加revert_boxes还原到ROI裁剪空间
4. batch=1限制: 恢复真正的动态batch推理(1~8),BatchPreprocessor支持多图stack
5. 帧率控制缺失: _read_frame添加time.monotonic()间隔控制,按target_fps跳帧
6. 拉流推理耦合: 新增独立推理线程(InferenceWorker),生产者-消费者模式解耦
7. 攒批形同虚设: 添加50ms攒批窗口+max_batch阈值,替代>=1立即处理
8. LeavePost双重等待: LEAVING确认后直接触发告警,不再进入OFF_DUTY二次等待
9. register_algorithm每帧调用: 添加_registered_keys缓存,O(1)快速路径跳过
10. GPU context线程安全: TensorRT infer()内部加锁,防止多线程CUDA context竞争

附带修复:
- reset_algorithm中未定义algorithm_type变量(NameError)
- update_roi_params中循环变量key覆盖外层key
- AlertInfo缺少bind_id字段(TypeError)
- _logger.log_alert在标准logger上不存在(AttributeError)
- AlarmStateMachine死锁(Lock改为RLock)
- ROICropper.create_mask坐标解析错误
- 更新测试用例适配新API

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
105
main.py
105
main.py
@@ -59,7 +59,11 @@ class EdgeInferenceService:
|
||||
|
||||
self._batch_roi_queue: List[tuple] = []
|
||||
self._batch_lock = threading.Lock()
|
||||
|
||||
self._batch_event = threading.Event()
|
||||
self._inference_thread: Optional[threading.Thread] = None
|
||||
self._max_batch_size = 8
|
||||
self._batch_timeout_sec = 0.05 # 50ms 攒批窗口
|
||||
|
||||
self._logger.info("Edge_Inference_Service 初始化开始")
|
||||
|
||||
def _init_database(self):
|
||||
@@ -205,59 +209,59 @@ class EdgeInferenceService:
|
||||
|
||||
if not roi_items:
|
||||
return
|
||||
|
||||
|
||||
with self._batch_lock:
|
||||
self._batch_roi_queue.extend(roi_items)
|
||||
|
||||
batch_size = len(self._batch_roi_queue)
|
||||
if batch_size >= 1:
|
||||
self._batch_process_rois()
|
||||
|
||||
|
||||
# 通知推理线程有新数据
|
||||
self._batch_event.set()
|
||||
|
||||
self._performance_stats["total_frames_processed"] += 1
|
||||
|
||||
except Exception as e:
|
||||
self._logger.error(f"处理帧失败 {camera_id}: {e}")
|
||||
|
||||
def _batch_process_rois(self):
    """Drain the pending ROI queue and run batched inference on it.

    The queue (``self._batch_roi_queue``) is swapped for a fresh list while
    holding ``self._batch_lock``; all further work happens outside the lock.
    Each queued item is unpacked as
    ``(camera_id, roi, bind, frame, cropped_image, scale_info)``.

    The drained items are processed in slices of at most
    ``self._max_batch_size``: the crops of one slice are preprocessed
    together, sent through the "default" engine in a single ``infer`` call
    and post-processed; every item that yields at least one box is forwarded
    to ``self._handle_detections``.

    Returns:
        None. Returns early when the queue is empty, or when no "default"
        engine is available (the drained items are discarded in that case
        and an error is logged).

    Exceptions raised while processing a slice are logged and swallowed, so
    the remaining slices are still processed.
    """
    # Take ownership of the queued items; producers keep appending to the
    # fresh list while inference runs outside the lock.
    with self._batch_lock:
        roi_items = self._batch_roi_queue
        if not roi_items:
            return
        self._batch_roi_queue = []

    # Look the engine up before preprocessing so no work is wasted when it
    # is missing, and make the dropped items visible in the log.
    engine = self._engine_manager.get_engine("default")
    if engine is None:
        self._logger.error(f"推理引擎不可用, 丢弃 {len(roi_items)} 个 ROI")
        return

    # Never hand the engine more items than the configured batch limit; a
    # single wake-up can find more than that in the queue.
    max_batch = max(1, self._max_batch_size)

    for start in range(0, len(roi_items), max_batch):
        chunk = roi_items[start:start + max_batch]
        try:
            # One preprocess + one infer call for the whole slice.
            images = [item[4] for item in chunk]
            batch_data, _ = self._preprocessor._batch_preprocessor.preprocess_batch(
                images
            )
            outputs, _ = engine.infer(batch_data)

            batch_results = self._postprocessor.batch_process_detections(
                outputs,
                len(chunk),
                conf_threshold=self._settings.inference.conf_threshold,
            )

            for idx, (camera_id, roi, bind, frame, _, scale_info) in enumerate(chunk):
                boxes, scores, class_ids = batch_results[idx]
                if len(boxes) > 0:
                    self._handle_detections(
                        camera_id, roi, bind, frame,
                        boxes, scores, class_ids,
                        scale_info,
                    )
        except Exception as e:
            # Best-effort: a failing slice must not take down the caller or
            # prevent the following slices from being processed.
            self._logger.error(f"批量处理 ROI 失败: {e}")
|
||||
|
||||
@@ -273,11 +277,20 @@ class EdgeInferenceService:
|
||||
class_ids: any,
|
||||
scale_info: tuple
|
||||
) -> list:
|
||||
"""将检测结果转换为算法所需的 tracks 格式"""
|
||||
"""将检测结果转换为算法所需的 tracks 格式
|
||||
|
||||
坐标从 letterbox 空间还原到 ROI 裁剪空间
|
||||
"""
|
||||
tracks = []
|
||||
class_names = getattr(self._settings, 'class_names', ['person'])
|
||||
|
||||
for i, box in enumerate(boxes):
|
||||
|
||||
# 将 letterbox 坐标还原到 ROI 裁剪空间
|
||||
reverted_boxes = self._preprocessor.revert_boxes(
|
||||
[box.tolist() if hasattr(box, 'tolist') else list(box) for box in boxes],
|
||||
scale_info
|
||||
)
|
||||
|
||||
for i, box in enumerate(reverted_boxes):
|
||||
class_id = int(class_ids[i]) if class_ids[i] else 0
|
||||
track = {
|
||||
"track_id": f"{roi.roi_id}_{i}",
|
||||
@@ -287,7 +300,7 @@ class EdgeInferenceService:
|
||||
"matched_rois": [{"roi_id": roi.roi_id}],
|
||||
}
|
||||
tracks.append(track)
|
||||
|
||||
|
||||
return tracks
|
||||
|
||||
def _handle_detections(
|
||||
@@ -351,16 +364,37 @@ class EdgeInferenceService:
|
||||
)
|
||||
self._reporter.report_alert(alert_info, screenshot=frame.image)
|
||||
|
||||
self._logger.log_alert(
|
||||
alert.get("alert_type", "detection"),
|
||||
camera_id,
|
||||
roi_id,
|
||||
alert.get("confidence", 1.0)
|
||||
self._logger.info(
|
||||
f"告警已生成: type={alert.get('alert_type', 'detection')}, "
|
||||
f"camera={camera_id}, roi={roi_id}, "
|
||||
f"confidence={alert.get('confidence', 1.0)}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self._logger.error(f"处理检测结果失败: {e}")
|
||||
|
||||
def _inference_worker(self):
    """Inference thread body: collect queued ROI requests, then batch-infer.

    Loops until ``self._stop_event`` is set. Each iteration:

    1. Waits on ``self._batch_event`` for up to ``self._batch_timeout_sec``.
    2. If the queue is still empty, starts over.
    3. Otherwise keeps an accumulation window open until either
       ``self._batch_timeout_sec`` has elapsed since the first item was
       seen, the queue holds at least ``self._max_batch_size`` items, or a
       stop was requested.
    4. Calls ``self._batch_process_rois()`` to drain and process the queue.

    The window is measured against a monotonic deadline rather than a single
    ``Event.wait``: a single wait returns on the very next producer signal,
    which would close the window after only one more arrival.
    """
    import time  # local import: keeps this method self-contained

    while not self._stop_event.is_set():
        # Sleep until a producer signals new work or the idle timeout passes.
        self._batch_event.wait(timeout=self._batch_timeout_sec)
        self._batch_event.clear()

        with self._batch_lock:
            queue_size = len(self._batch_roi_queue)
        if queue_size == 0:
            continue

        # Accumulation window: bounded by a deadline so latency never
        # exceeds one timeout, however often producers signal.
        deadline = time.monotonic() + self._batch_timeout_sec
        while queue_size < self._max_batch_size and not self._stop_event.is_set():
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            self._batch_event.wait(timeout=remaining)
            self._batch_event.clear()
            with self._batch_lock:
                queue_size = len(self._batch_roi_queue)

        self._batch_process_rois()
|
||||
|
||||
def start(self):
|
||||
"""启动服务"""
|
||||
if self._running:
|
||||
@@ -370,7 +404,16 @@ class EdgeInferenceService:
|
||||
self._stop_event.clear()
|
||||
|
||||
self._load_cameras()
|
||||
|
||||
|
||||
# 启动独立推理线程(生产者-消费者模式)
|
||||
self._inference_thread = threading.Thread(
|
||||
target=self._inference_worker,
|
||||
name="InferenceWorker",
|
||||
daemon=True
|
||||
)
|
||||
self._inference_thread.start()
|
||||
self._logger.info("推理线程已启动")
|
||||
|
||||
self._stream_manager.start_all()
|
||||
|
||||
self._logger.info("Edge_Inference_Service 已启动")
|
||||
@@ -429,7 +472,11 @@ class EdgeInferenceService:
|
||||
|
||||
self._running = False
|
||||
self._stop_event.set()
|
||||
|
||||
self._batch_event.set() # 唤醒推理线程以退出
|
||||
|
||||
if self._inference_thread and self._inference_thread.is_alive():
|
||||
self._inference_thread.join(timeout=5)
|
||||
|
||||
if self._stream_manager:
|
||||
self._stream_manager.stop_all()
|
||||
self._stream_manager.close()
|
||||
|
||||
Reference in New Issue
Block a user