修复:P0+P1 生产稳定性和性能优化(6项)

P0 稳定性修复:
- 告警去重字典添加惰性清理机制,防止长时间运行内存溢出
- Redis 连接断开时显式 close() 后再置 None,防止文件描述符泄漏
- 截图消息 ACK 移至成功路径,失败消息留在 pending list 自动重试

P1 性能优化:
- GPU NMS 添加 torch.no_grad() + 显式释放临时张量,减少显存碎片
- 截图存储改为 Redis 原始 bytes,去掉 Base64 编解码开销(兼容旧格式)
- ROI 配置查询 N+1 改为 get_all_bindings() 单次 JOIN 查询

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-07 14:05:57 +08:00
parent a9a5457583
commit 5a0265de52
8 changed files with 593 additions and 41 deletions

View File

@@ -47,6 +47,7 @@ class AlarmUploadWorker:
self._logger = logging.getLogger("alarm_upload_worker")
self._redis: Optional[redis.Redis] = None
self._redis_binary: Optional[redis.Redis] = None # 用于读取截图 bytes
self._cos_client = None # 懒初始化
self._thread: Optional[threading.Thread] = None
@@ -80,6 +81,16 @@ class AlarmUploadWorker:
)
self._redis.ping()
self._logger.info(f"Worker Redis 连接成功: {redis_cfg.host}:{redis_cfg.port}/{redis_cfg.db}")
# 二进制 Redis 连接(用于读取截图 bytes,不做 decode)
self._redis_binary = redis.Redis(
host=redis_cfg.host,
port=redis_cfg.port,
db=redis_cfg.db,
password=redis_cfg.password,
decode_responses=False,
socket_connect_timeout=5,
)
except Exception as e:
self._logger.error(f"Worker Redis 连接失败: {e}")
return
@@ -136,6 +147,12 @@ class AlarmUploadWorker:
except Exception:
pass
if self._redis_binary:
try:
self._redis_binary.close()
except Exception:
pass
self._logger.info("AlarmUploadWorker 已停止")
def _worker_loop(self):
@@ -184,21 +201,43 @@ class AlarmUploadWorker:
self._logger.info(f"开始处理告警: {alarm_id} (retry={retry_count})")
# Step 1: 上传截图到 COS(从 base64 解码后直接上传字节流)
# Step 1: 上传截图到 COS
snapshot_key = (alarm_data.get("ext_data") or {}).get("_snapshot_key")
snapshot_b64 = alarm_data.get("snapshot_b64")
object_key = None
if snapshot_b64:
if snapshot_key:
# 新格式:从独立 Redis key 获取原始 bytes
try:
image_bytes = self._redis_binary.get(snapshot_key) if self._redis_binary else None
if image_bytes is None:
self._logger.warning(f"截图 key 已过期: {snapshot_key}, 无截图继续上报")
else:
object_key = self._upload_snapshot_to_cos(
image_bytes, alarm_id, alarm_data.get("device_id", "unknown")
)
if object_key is None:
self._handle_retry(alarm_json, "COS 上传失败")
return
# 上传成功后删除临时 key
try:
if self._redis_binary:
self._redis_binary.delete(snapshot_key)
except Exception:
pass
except Exception as e:
self._logger.error(f"截图获取/上传失败: {e}")
self._handle_retry(alarm_json, f"截图处理失败: {e}")
return
elif snapshot_b64:
# 兼容旧格式 (Base64)
try:
import base64
image_bytes = base64.b64decode(snapshot_b64)
object_key = self._upload_snapshot_to_cos(
image_bytes,
alarm_id,
alarm_data.get("device_id", "unknown"),
image_bytes, alarm_id, alarm_data.get("device_id", "unknown")
)
if object_key is None:
# COS 上传失败,进入重试
self._handle_retry(alarm_json, "COS 上传失败")
return
except Exception as e:

View File

@@ -215,6 +215,15 @@ class ConfigSyncManager:
logger.error(f"本地 Redis 连接失败: {e}")
self._local_redis = None
def _safe_close_cloud_redis(self):
"""安全关闭云端 Redis 连接"""
if self._cloud_redis is not None:
try:
self._cloud_redis.close()
except Exception:
pass
self._cloud_redis = None
def _init_cloud_redis(self):
"""初始化云端 Redis 连接"""
try:
@@ -238,7 +247,7 @@ class ConfigSyncManager:
except Exception as e:
logger.warning(f"云端 Redis 连接失败(将使用本地缓存运行): {e}")
self._cloud_redis = None
self._safe_close_cloud_redis()
def _init_database(self):
"""初始化 SQLite 数据库连接"""
@@ -311,9 +320,7 @@ class ConfigSyncManager:
try:
cameras = self._db_manager.get_all_camera_configs()
rois = self._db_manager.get_all_roi_configs()
binds = []
for roi in rois:
binds.extend(self._db_manager.get_bindings_by_roi(roi["roi_id"]))
binds = self._db_manager.get_all_bindings()
logger.info(f"[EDGE] Loading config from local db ({source})...")
logger.info(f"[EDGE] Camera count = {len(cameras)}")
logger.info(f"[EDGE] ROI count = {len(rois)}")
@@ -378,7 +385,7 @@ class ConfigSyncManager:
if self._stop_event.is_set():
return
logger.warning(f"云端 Redis 连接断开: {e}, {backoff}s 后重连...")
self._cloud_redis = None
self._safe_close_cloud_redis()
self._stop_event.wait(backoff)
backoff = min(backoff * 2, max_backoff)
@@ -776,10 +783,7 @@ class ConfigSyncManager:
bindings_list = self._db_manager.get_bindings_by_camera(camera_id)
else:
roi_configs = self._db_manager.get_all_roi_configs()
bindings_list = []
for roi in roi_configs:
bindings = self._db_manager.get_bindings_by_roi(roi['roi_id'])
bindings_list.extend(bindings)
bindings_list = self._db_manager.get_all_bindings()
roi_dict = {r['roi_id']: r for r in roi_configs}
bindings_dict: Dict[str, list] = {}
@@ -857,8 +861,7 @@ class ConfigSyncManager:
binds: List[Dict[str, Any]] = []
rois = self._db_manager.get_all_roi_configs()
for roi in rois:
binds.extend(self._db_manager.get_bindings_by_roi(roi["roi_id"]))
binds = self._db_manager.get_all_bindings()
return binds
def get_algo_bind_from_redis(self, bind_id: str) -> Optional[Dict[str, Any]]:

View File

@@ -78,22 +78,24 @@ class NMSProcessor:
max_output_size: int
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""GPU 加速 NMS"""
boxes_t = torch.from_numpy(boxes).cuda()
scores_t = torch.from_numpy(scores).cuda()
keep = torch_nms(boxes_t, scores_t, iou_threshold=self.nms_threshold)
keep_np = keep.cpu().numpy()
if len(keep_np) > max_output_size:
top_k = np.argsort(scores[keep_np])[::-1][:max_output_size]
keep_np = keep_np[top_k]
return (
keep_np.astype(np.int32),
scores[keep_np],
class_ids[keep_np] if class_ids is not None else np.array([])
)
with torch.no_grad():
boxes_t = torch.from_numpy(boxes).cuda()
scores_t = torch.from_numpy(scores).cuda()
keep = torch_nms(boxes_t, scores_t, iou_threshold=self.nms_threshold)
keep_np = keep.cpu().numpy()
del boxes_t, scores_t, keep
if len(keep_np) > max_output_size:
top_k = np.argsort(scores[keep_np])[::-1][:max_output_size]
keep_np = keep_np[top_k]
return (
keep_np.astype(np.int32),
scores[keep_np],
class_ids[keep_np] if class_ids is not None else np.array([])
)
def _process_cpu(
self,

View File

@@ -112,9 +112,20 @@ class ResultReporter:
self._logger.info(
f"Redis 连接成功: {redis_cfg.host}:{redis_cfg.port}/{redis_cfg.db}"
)
# 二进制 Redis 连接(用于存储截图 bytes,不做 decode)
self._redis_binary = redis.Redis(
host=redis_cfg.host,
port=redis_cfg.port,
db=redis_cfg.db,
password=redis_cfg.password,
decode_responses=False,
socket_connect_timeout=5,
)
except Exception as e:
self._logger.error(f"Redis 连接失败: {e}")
self._redis = None
self._redis_binary = None
def report_alarm(self, alarm_info: AlarmInfo, screenshot: Optional[np.ndarray] = None) -> bool:
"""
@@ -129,13 +140,22 @@ class ResultReporter:
"""
self._performance_stats["alerts_generated"] += 1
# 将截图编码为 JPEG base64直接通过 Redis 传递给 Worker 上传 COS
# 将截图编码为 JPEG,直接存储 bytes 到独立 Redis key避免 Base64 开销
if screenshot is not None:
try:
import cv2
import base64
success, buffer = cv2.imencode('.jpg', screenshot, [cv2.IMWRITE_JPEG_QUALITY, 85])
if success:
if success and self._redis_binary is not None:
snapshot_key = f"local:alarm:snapshot:{alarm_info.alarm_id}"
# 直接存储 JPEG bytes避免 Base64 编解码开销
self._redis_binary.set(snapshot_key, buffer.tobytes(), ex=3600)
alarm_info.snapshot_b64 = None
if alarm_info.ext_data is None:
alarm_info.ext_data = {}
alarm_info.ext_data["_snapshot_key"] = snapshot_key
elif success:
# 降级:无二进制 Redis 连接时使用 Base64
import base64
alarm_info.snapshot_b64 = base64.b64encode(buffer.tobytes()).decode('ascii')
else:
self._logger.warning("截图 JPEG 编码失败")
@@ -211,6 +231,12 @@ class ResultReporter:
except Exception:
pass
if hasattr(self, '_redis_binary') and self._redis_binary:
try:
self._redis_binary.close()
except Exception:
pass
self._logger.info("ResultReporter 清理完成")
def cleanup(self):

View File

@@ -59,6 +59,7 @@ class ScreenshotHandler:
self._thread: Optional[threading.Thread] = None
self._stop_event = threading.Event()
self._last_pending_check = 0.0
# ==================== 生命周期 ====================
@@ -180,20 +181,26 @@ class ScreenshotHandler:
backoff = 5 # 重置退避
# 每 60 秒检查一次 pending 消息
import time as _time
if _time.time() - self._last_pending_check > 60:
self._last_pending_check = _time.time()
self._cleanup_pending_messages()
for stream_name, messages in results:
for msg_id, fields in messages:
try:
self._handle_request(fields)
except Exception as e:
logger.error("[截图] 处理请求失败: %s", e)
finally:
# ACK 消息
# 处理成功才 ACK
try:
self._cloud_redis.xack(
SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP, msg_id
)
except Exception:
pass
except Exception as e:
logger.error("[截图] 处理请求失败 (msg_id=%s): %s", msg_id, e)
# 不 ACK消息留在 pending list 等待重试
except redis.ConnectionError as e:
if self._stop_event.is_set():
@@ -409,3 +416,38 @@ class ScreenshotHandler:
logger.info("[截图] 降级写 Redis 成功: request_id=%s", request_id)
except Exception as e:
logger.error("[截图] 降级写 Redis 也失败: %s", e)
# ==================== Pending message cleanup ====================
_MAX_RETRY_COUNT = 3
_PENDING_IDLE_MS = 30000  # only touch messages pending for more than 30 seconds

def _cleanup_pending_messages(self):
    """Discard pending-list messages that exceeded the retry budget.

    Scans up to 50 of this consumer's pending entries. An entry that has
    been idle for at least _PENDING_IDLE_MS and delivered more than
    _MAX_RETRY_COUNT times is ACKed, i.e. dropped for good. Every failure
    is swallowed: this is best-effort housekeeping and must never disturb
    the main consume loop.
    """
    try:
        entries = self._cloud_redis.xpending_range(
            SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP,
            min="-", max="+", count=50,
            consumername=self._consumer_name
        )
        for item in entries:
            # Too fresh — it may still be mid-processing; leave it alone.
            if item['time_since_delivered'] < self._PENDING_IDLE_MS:
                continue
            retries = item['times_delivered']
            # Still within the retry budget; let the normal path retry it.
            if retries <= self._MAX_RETRY_COUNT:
                continue
            stale_id = item['message_id']
            logger.warning(
                "[截图] 消息超过最大重试次数,丢弃: msg_id=%s, retries=%d",
                stale_id, retries
            )
            try:
                self._cloud_redis.xack(
                    SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP, stale_id
                )
            except Exception:
                # ACK failed: the entry stays pending and is retried next sweep.
                pass
    except Exception as e:
        logger.debug("[截图] 检查 pending list: %s", e)