修复:P0+P1 生产稳定性和性能优化(6项)

P0 稳定性修复:
- 告警去重字典添加惰性清理机制,防止长时间运行内存溢出
- Redis 连接断开时显式 close() 后再置 None,防止文件描述符泄漏
- 截图消息 ACK 移至成功路径,失败消息留在 pending list 自动重试

P1 性能优化:
- GPU NMS 添加 torch.no_grad() + 显式释放临时张量,减少显存碎片
- 截图存储改为 Redis 原始 bytes,去掉 Base64 编解码开销(兼容旧格式)
- ROI 配置查询 N+1 改为 get_all_bindings() 单次 JOIN 查询

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-07 14:05:57 +08:00
parent a9a5457583
commit 5a0265de52
8 changed files with 593 additions and 41 deletions

39
main.py
View File

@@ -92,6 +92,10 @@ class EdgeInferenceService:
# ROI级别告警去重:同ROI+同类型未resolve的告警不重复发送
# key: f"{roi_id}_{alert_type}", value: alarm_id
self._active_alarms: Dict[str, str] = {}
self._active_alarms_time: Dict[str, datetime] = {} # 活跃告警创建时间
self._cleanup_counter = 0
self._cleanup_interval = 100 # 每 100 次 _handle_detections 清理一次
self._active_alarm_max_age_sec = 3600 # 活跃告警最大存活时间1小时
self._logger.info("Edge_Inference_Service 初始化开始")
@@ -800,6 +804,12 @@ class EdgeInferenceService:
):
"""处理检测结果 - 算法接管判断权"""
try:
# 惰性清理过期去重记录
self._cleanup_counter += 1
if self._cleanup_counter >= self._cleanup_interval:
self._cleanup_counter = 0
self._cleanup_dedup_dicts(frame.timestamp)
if self._algorithm_manager is None:
self._logger.warning("算法管理器不可用,跳过算法处理")
return
@@ -865,6 +875,7 @@ class EdgeInferenceService:
for k, v in list(self._active_alarms.items()):
if v == resolve_alarm_id:
del self._active_alarms[k]
self._active_alarms_time.pop(k, None)
self._logger.debug(f"[去重] 活跃告警已清除: {k} -> {resolve_alarm_id}")
break
@@ -938,6 +949,7 @@ class EdgeInferenceService:
# 记录活跃告警(用于 ROI 级去重)
self._active_alarms[active_key] = alarm_info.alarm_id
self._active_alarms_time[active_key] = frame.timestamp
# 回填 alarm_id 到算法实例(用于后续 resolve 追踪,泛化支持所有算法类型)
algo = self._algorithm_manager.algorithms.get(roi_id, {}).get(f"{roi_id}_{bind.bind_id}", {}).get(alert_type)
@@ -952,7 +964,32 @@ class EdgeInferenceService:
except Exception as e:
self._logger.error(f"处理检测结果失败: {e}")
def _cleanup_dedup_dicts(self, now: datetime):
    """Purge stale entries from the alert de-duplication dictionaries.

    Two maps are swept:

    * ``_camera_alert_cooldown`` -- an entry is removed once its stored
      timestamp is more than twice ``_camera_cooldown_seconds`` older
      than ``now``.
    * ``_active_alarms`` / ``_active_alarms_time`` -- an entry is removed
      from both maps once its creation time is more than
      ``_active_alarm_max_age_sec`` older than ``now``; each removal is
      logged at warning level.

    A debug summary is logged only when at least one entry was removed.

    Args:
        now: Reference time that stored timestamps are compared against.
    """

    def _age_sec(stamp):
        # Seconds elapsed between a stored timestamp and the reference time.
        return (now - stamp).total_seconds()

    # Collect keys first so the dicts are not mutated while being iterated.
    stale_cooldown = []
    for cam_key, last_alert in self._camera_alert_cooldown.items():
        if _age_sec(last_alert) > self._camera_cooldown_seconds * 2:
            stale_cooldown.append(cam_key)
    for cam_key in stale_cooldown:
        self._camera_alert_cooldown.pop(cam_key)

    # Active alarms are keyed by their creation time in the companion map;
    # both maps are cleared together so they stay in sync.
    stale_active = []
    for alarm_key, created_at in self._active_alarms_time.items():
        if _age_sec(created_at) > self._active_alarm_max_age_sec:
            stale_active.append(alarm_key)
    for alarm_key in stale_active:
        self._active_alarms.pop(alarm_key, None)
        self._active_alarms_time.pop(alarm_key, None)
        self._logger.warning(f"[去重] 活跃告警超时清除: {alarm_key}")

    # Nothing removed: skip the summary log.
    if not (stale_cooldown or stale_active):
        return
    self._logger.debug(
        f"[去重] 清理完成: cooldown={len(stale_cooldown)}, active={len(stale_active)}"
    )
def _inference_worker(self):
"""推理线程:攒批窗口内收集 ROI 请求,批量推理"""
while not self._stop_event.is_set():