修复:P0+P1 生产稳定性和性能优化(6项)
P0 稳定性修复:
- 告警去重字典添加惰性清理机制,防止长时间运行内存溢出
- Redis 连接断开时显式 close() 后再置 None,防止文件描述符泄漏
- 截图消息 ACK 移至成功路径,失败消息留在 pending list 自动重试

P1 性能优化:
- GPU NMS 添加 torch.no_grad() + 显式释放临时张量,减少显存碎片
- 截图存储改为 Redis 原始 bytes,去掉 Base64 编解码开销(兼容旧格式)
- ROI 配置查询 N+1 改为 get_all_bindings() 单次 JOIN 查询

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
39
main.py
39
main.py
@@ -92,6 +92,10 @@ class EdgeInferenceService:
|
||||
# ROI级别告警去重:同ROI+同类型未resolve的告警不重复发送
|
||||
# key: f"{roi_id}_{alert_type}", value: alarm_id
|
||||
self._active_alarms: Dict[str, str] = {}
|
||||
self._active_alarms_time: Dict[str, datetime] = {} # 活跃告警创建时间
|
||||
self._cleanup_counter = 0
|
||||
self._cleanup_interval = 100 # 每 100 次 _handle_detections 清理一次
|
||||
self._active_alarm_max_age_sec = 3600 # 活跃告警最大存活时间(1小时)
|
||||
|
||||
self._logger.info("Edge_Inference_Service 初始化开始")
|
||||
|
||||
@@ -800,6 +804,12 @@ class EdgeInferenceService:
|
||||
):
|
||||
"""处理检测结果 - 算法接管判断权"""
|
||||
try:
|
||||
# 惰性清理过期去重记录
|
||||
self._cleanup_counter += 1
|
||||
if self._cleanup_counter >= self._cleanup_interval:
|
||||
self._cleanup_counter = 0
|
||||
self._cleanup_dedup_dicts(frame.timestamp)
|
||||
|
||||
if self._algorithm_manager is None:
|
||||
self._logger.warning("算法管理器不可用,跳过算法处理")
|
||||
return
|
||||
@@ -865,6 +875,7 @@ class EdgeInferenceService:
|
||||
for k, v in list(self._active_alarms.items()):
|
||||
if v == resolve_alarm_id:
|
||||
del self._active_alarms[k]
|
||||
self._active_alarms_time.pop(k, None)
|
||||
self._logger.debug(f"[去重] 活跃告警已清除: {k} -> {resolve_alarm_id}")
|
||||
break
|
||||
|
||||
@@ -938,6 +949,7 @@ class EdgeInferenceService:
|
||||
|
||||
# 记录活跃告警(用于 ROI 级去重)
|
||||
self._active_alarms[active_key] = alarm_info.alarm_id
|
||||
self._active_alarms_time[active_key] = frame.timestamp
|
||||
|
||||
# 回填 alarm_id 到算法实例(用于后续 resolve 追踪,泛化支持所有算法类型)
|
||||
algo = self._algorithm_manager.algorithms.get(roi_id, {}).get(f"{roi_id}_{bind.bind_id}", {}).get(alert_type)
|
||||
@@ -952,7 +964,32 @@ class EdgeInferenceService:
|
||||
|
||||
except Exception as e:
|
||||
self._logger.error(f"处理检测结果失败: {e}")
|
||||
|
||||
|
||||
def _cleanup_dedup_dicts(self, now: datetime):
    """Lazily purge stale entries from the alert de-duplication dictionaries.

    Two dictionaries are swept:

    * ``self._camera_alert_cooldown`` -- an entry is dropped once its stored
      timestamp is more than twice ``self._camera_cooldown_seconds`` older
      than ``now``.
    * ``self._active_alarms`` / ``self._active_alarms_time`` -- a key is
      dropped from both once its timestamp in ``_active_alarms_time`` is more
      than ``self._active_alarm_max_age_sec`` seconds older than ``now``.
      Each such removal is logged at warning level.

    A debug-level summary is logged only when at least one entry was removed.

    Args:
        now: Reference time that the stored timestamps are compared against.
    """
    # Pass 1: camera cooldown records that are well past their cooldown window.
    # Keys are collected first so the dict is never mutated while iterating.
    stale_cooldown_keys = []
    for cooldown_key, last_alert_at in self._camera_alert_cooldown.items():
        elapsed = (now - last_alert_at).total_seconds()
        if elapsed > self._camera_cooldown_seconds * 2:
            stale_cooldown_keys.append(cooldown_key)
    for cooldown_key in stale_cooldown_keys:
        self._camera_alert_cooldown.pop(cooldown_key)

    # Pass 2: active-alarm records that may have been left behind because
    # their resolve was lost.
    stale_active_keys = []
    for active_key, created_at in self._active_alarms_time.items():
        age = (now - created_at).total_seconds()
        if age > self._active_alarm_max_age_sec:
            stale_active_keys.append(active_key)
    for active_key in stale_active_keys:
        # pop(..., None) tolerates a key that is missing from _active_alarms.
        self._active_alarms.pop(active_key, None)
        self._active_alarms_time.pop(active_key, None)
        self._logger.warning(f"[去重] 活跃告警超时清除: {active_key}")

    # Nothing removed: skip the summary log.
    if not stale_cooldown_keys and not stale_active_keys:
        return

    cooldown_removed = len(stale_cooldown_keys)
    active_removed = len(stale_active_keys)
    self._logger.debug(
        f"[去重] 清理完成: cooldown={cooldown_removed}, active={active_removed}"
    )
|
||||
|
||||
def _inference_worker(self):
|
||||
"""推理线程:攒批窗口内收集 ROI 请求,批量推理"""
|
||||
while not self._stop_event.is_set():
|
||||
|
||||
Reference in New Issue
Block a user