修复:P0+P1 生产稳定性和性能优化(6项)
P0 稳定性修复:
- 告警去重字典添加惰性清理机制,防止长时间运行内存溢出
- Redis 连接断开时显式 close() 后再置 None,防止文件描述符泄漏
- 截图消息 ACK 移至成功路径,失败消息留在 pending list 自动重试

P1 性能优化:
- GPU NMS 添加 torch.no_grad() + 显式释放临时张量,减少显存碎片
- 截图存储改为 Redis 原始 bytes,去掉 Base64 编解码开销(兼容旧格式)
- ROI 配置查询 N+1 改为 get_all_bindings() 单次 JOIN 查询

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -47,6 +47,7 @@ class AlarmUploadWorker:
|
||||
self._logger = logging.getLogger("alarm_upload_worker")
|
||||
|
||||
self._redis: Optional[redis.Redis] = None
|
||||
self._redis_binary: Optional[redis.Redis] = None # 用于读取截图 bytes
|
||||
self._cos_client = None # 懒初始化
|
||||
|
||||
self._thread: Optional[threading.Thread] = None
|
||||
@@ -80,6 +81,16 @@ class AlarmUploadWorker:
|
||||
)
|
||||
self._redis.ping()
|
||||
self._logger.info(f"Worker Redis 连接成功: {redis_cfg.host}:{redis_cfg.port}/{redis_cfg.db}")
|
||||
|
||||
# 二进制 Redis 连接(用于读取截图 bytes,不做 decode)
|
||||
self._redis_binary = redis.Redis(
|
||||
host=redis_cfg.host,
|
||||
port=redis_cfg.port,
|
||||
db=redis_cfg.db,
|
||||
password=redis_cfg.password,
|
||||
decode_responses=False,
|
||||
socket_connect_timeout=5,
|
||||
)
|
||||
except Exception as e:
|
||||
self._logger.error(f"Worker Redis 连接失败: {e}")
|
||||
return
|
||||
@@ -136,6 +147,12 @@ class AlarmUploadWorker:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if self._redis_binary:
|
||||
try:
|
||||
self._redis_binary.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
self._logger.info("AlarmUploadWorker 已停止")
|
||||
|
||||
def _worker_loop(self):
|
||||
@@ -184,21 +201,43 @@ class AlarmUploadWorker:
|
||||
|
||||
self._logger.info(f"开始处理告警: {alarm_id} (retry={retry_count})")
|
||||
|
||||
# Step 1: 上传截图到 COS(从 base64 解码后直接上传字节流)
|
||||
# Step 1: 上传截图到 COS
|
||||
snapshot_key = (alarm_data.get("ext_data") or {}).get("_snapshot_key")
|
||||
snapshot_b64 = alarm_data.get("snapshot_b64")
|
||||
object_key = None
|
||||
|
||||
if snapshot_b64:
|
||||
if snapshot_key:
|
||||
# 新格式:从独立 Redis key 获取原始 bytes
|
||||
try:
|
||||
image_bytes = self._redis_binary.get(snapshot_key) if self._redis_binary else None
|
||||
if image_bytes is None:
|
||||
self._logger.warning(f"截图 key 已过期: {snapshot_key}, 无截图继续上报")
|
||||
else:
|
||||
object_key = self._upload_snapshot_to_cos(
|
||||
image_bytes, alarm_id, alarm_data.get("device_id", "unknown")
|
||||
)
|
||||
if object_key is None:
|
||||
self._handle_retry(alarm_json, "COS 上传失败")
|
||||
return
|
||||
# 上传成功后删除临时 key
|
||||
try:
|
||||
if self._redis_binary:
|
||||
self._redis_binary.delete(snapshot_key)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
self._logger.error(f"截图获取/上传失败: {e}")
|
||||
self._handle_retry(alarm_json, f"截图处理失败: {e}")
|
||||
return
|
||||
elif snapshot_b64:
|
||||
# 兼容旧格式 (Base64)
|
||||
try:
|
||||
import base64
|
||||
image_bytes = base64.b64decode(snapshot_b64)
|
||||
object_key = self._upload_snapshot_to_cos(
|
||||
image_bytes,
|
||||
alarm_id,
|
||||
alarm_data.get("device_id", "unknown"),
|
||||
image_bytes, alarm_id, alarm_data.get("device_id", "unknown")
|
||||
)
|
||||
if object_key is None:
|
||||
# COS 上传失败,进入重试
|
||||
self._handle_retry(alarm_json, "COS 上传失败")
|
||||
return
|
||||
except Exception as e:
|
||||
|
||||
@@ -215,6 +215,15 @@ class ConfigSyncManager:
|
||||
logger.error(f"本地 Redis 连接失败: {e}")
|
||||
self._local_redis = None
|
||||
|
||||
def _safe_close_cloud_redis(self):
    """Close the cloud Redis client, if there is one, and drop the reference.

    This is a best-effort teardown: an error raised by ``close()`` is logged
    at debug level and never propagated to the caller. In every case
    ``self._cloud_redis`` is ``None`` when this method returns, so it is safe
    to call repeatedly and from error-handling paths.
    """
    client = self._cloud_redis
    if client is None:
        return

    # Drop the reference before closing so the attribute is cleared even if
    # close() raises something we do not catch below.
    self._cloud_redis = None
    try:
        client.close()
    except Exception as e:
        # Closing is only cleanup; record the failure instead of hiding it.
        logger.debug("关闭云端 Redis 连接失败: %s", e)
|
||||
|
||||
def _init_cloud_redis(self):
|
||||
"""初始化云端 Redis 连接"""
|
||||
try:
|
||||
@@ -238,7 +247,7 @@ class ConfigSyncManager:
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"云端 Redis 连接失败(将使用本地缓存运行): {e}")
|
||||
self._cloud_redis = None
|
||||
self._safe_close_cloud_redis()
|
||||
|
||||
def _init_database(self):
|
||||
"""初始化 SQLite 数据库连接"""
|
||||
@@ -311,9 +320,7 @@ class ConfigSyncManager:
|
||||
try:
|
||||
cameras = self._db_manager.get_all_camera_configs()
|
||||
rois = self._db_manager.get_all_roi_configs()
|
||||
binds = []
|
||||
for roi in rois:
|
||||
binds.extend(self._db_manager.get_bindings_by_roi(roi["roi_id"]))
|
||||
binds = self._db_manager.get_all_bindings()
|
||||
logger.info(f"[EDGE] Loading config from local db ({source})...")
|
||||
logger.info(f"[EDGE] Camera count = {len(cameras)}")
|
||||
logger.info(f"[EDGE] ROI count = {len(rois)}")
|
||||
@@ -378,7 +385,7 @@ class ConfigSyncManager:
|
||||
if self._stop_event.is_set():
|
||||
return
|
||||
logger.warning(f"云端 Redis 连接断开: {e}, {backoff}s 后重连...")
|
||||
self._cloud_redis = None
|
||||
self._safe_close_cloud_redis()
|
||||
self._stop_event.wait(backoff)
|
||||
backoff = min(backoff * 2, max_backoff)
|
||||
|
||||
@@ -776,10 +783,7 @@ class ConfigSyncManager:
|
||||
bindings_list = self._db_manager.get_bindings_by_camera(camera_id)
|
||||
else:
|
||||
roi_configs = self._db_manager.get_all_roi_configs()
|
||||
bindings_list = []
|
||||
for roi in roi_configs:
|
||||
bindings = self._db_manager.get_bindings_by_roi(roi['roi_id'])
|
||||
bindings_list.extend(bindings)
|
||||
bindings_list = self._db_manager.get_all_bindings()
|
||||
|
||||
roi_dict = {r['roi_id']: r for r in roi_configs}
|
||||
bindings_dict: Dict[str, list] = {}
|
||||
@@ -857,8 +861,7 @@ class ConfigSyncManager:
|
||||
|
||||
binds: List[Dict[str, Any]] = []
|
||||
rois = self._db_manager.get_all_roi_configs()
|
||||
for roi in rois:
|
||||
binds.extend(self._db_manager.get_bindings_by_roi(roi["roi_id"]))
|
||||
binds = self._db_manager.get_all_bindings()
|
||||
return binds
|
||||
|
||||
def get_algo_bind_from_redis(self, bind_id: str) -> Optional[Dict[str, Any]]:
|
||||
|
||||
@@ -78,22 +78,24 @@ class NMSProcessor:
|
||||
max_output_size: int
|
||||
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
|
||||
"""GPU 加速 NMS"""
|
||||
boxes_t = torch.from_numpy(boxes).cuda()
|
||||
scores_t = torch.from_numpy(scores).cuda()
|
||||
|
||||
keep = torch_nms(boxes_t, scores_t, iou_threshold=self.nms_threshold)
|
||||
|
||||
keep_np = keep.cpu().numpy()
|
||||
|
||||
if len(keep_np) > max_output_size:
|
||||
top_k = np.argsort(scores[keep_np])[::-1][:max_output_size]
|
||||
keep_np = keep_np[top_k]
|
||||
|
||||
return (
|
||||
keep_np.astype(np.int32),
|
||||
scores[keep_np],
|
||||
class_ids[keep_np] if class_ids is not None else np.array([])
|
||||
)
|
||||
with torch.no_grad():
|
||||
boxes_t = torch.from_numpy(boxes).cuda()
|
||||
scores_t = torch.from_numpy(scores).cuda()
|
||||
|
||||
keep = torch_nms(boxes_t, scores_t, iou_threshold=self.nms_threshold)
|
||||
|
||||
keep_np = keep.cpu().numpy()
|
||||
del boxes_t, scores_t, keep
|
||||
|
||||
if len(keep_np) > max_output_size:
|
||||
top_k = np.argsort(scores[keep_np])[::-1][:max_output_size]
|
||||
keep_np = keep_np[top_k]
|
||||
|
||||
return (
|
||||
keep_np.astype(np.int32),
|
||||
scores[keep_np],
|
||||
class_ids[keep_np] if class_ids is not None else np.array([])
|
||||
)
|
||||
|
||||
def _process_cpu(
|
||||
self,
|
||||
|
||||
@@ -112,9 +112,20 @@ class ResultReporter:
|
||||
self._logger.info(
|
||||
f"Redis 连接成功: {redis_cfg.host}:{redis_cfg.port}/{redis_cfg.db}"
|
||||
)
|
||||
|
||||
# 二进制 Redis 连接(用于存储截图 bytes,不做 decode)
|
||||
self._redis_binary = redis.Redis(
|
||||
host=redis_cfg.host,
|
||||
port=redis_cfg.port,
|
||||
db=redis_cfg.db,
|
||||
password=redis_cfg.password,
|
||||
decode_responses=False,
|
||||
socket_connect_timeout=5,
|
||||
)
|
||||
except Exception as e:
|
||||
self._logger.error(f"Redis 连接失败: {e}")
|
||||
self._redis = None
|
||||
self._redis_binary = None
|
||||
|
||||
def report_alarm(self, alarm_info: AlarmInfo, screenshot: Optional[np.ndarray] = None) -> bool:
|
||||
"""
|
||||
@@ -129,13 +140,22 @@ class ResultReporter:
|
||||
"""
|
||||
self._performance_stats["alerts_generated"] += 1
|
||||
|
||||
# 将截图编码为 JPEG base64,直接通过 Redis 传递给 Worker 上传 COS
|
||||
# 将截图编码为 JPEG,直接存储 bytes 到独立 Redis key,避免 Base64 开销
|
||||
if screenshot is not None:
|
||||
try:
|
||||
import cv2
|
||||
import base64
|
||||
success, buffer = cv2.imencode('.jpg', screenshot, [cv2.IMWRITE_JPEG_QUALITY, 85])
|
||||
if success:
|
||||
if success and self._redis_binary is not None:
|
||||
snapshot_key = f"local:alarm:snapshot:{alarm_info.alarm_id}"
|
||||
# 直接存储 JPEG bytes,避免 Base64 编解码开销
|
||||
self._redis_binary.set(snapshot_key, buffer.tobytes(), ex=3600)
|
||||
alarm_info.snapshot_b64 = None
|
||||
if alarm_info.ext_data is None:
|
||||
alarm_info.ext_data = {}
|
||||
alarm_info.ext_data["_snapshot_key"] = snapshot_key
|
||||
elif success:
|
||||
# 降级:无二进制 Redis 连接时使用 Base64
|
||||
import base64
|
||||
alarm_info.snapshot_b64 = base64.b64encode(buffer.tobytes()).decode('ascii')
|
||||
else:
|
||||
self._logger.warning("截图 JPEG 编码失败")
|
||||
@@ -211,6 +231,12 @@ class ResultReporter:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if hasattr(self, '_redis_binary') and self._redis_binary:
|
||||
try:
|
||||
self._redis_binary.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
self._logger.info("ResultReporter 清理完成")
|
||||
|
||||
def cleanup(self):
|
||||
|
||||
@@ -59,6 +59,7 @@ class ScreenshotHandler:
|
||||
|
||||
self._thread: Optional[threading.Thread] = None
|
||||
self._stop_event = threading.Event()
|
||||
self._last_pending_check = 0.0
|
||||
|
||||
# ==================== 生命周期 ====================
|
||||
|
||||
@@ -180,20 +181,26 @@ class ScreenshotHandler:
|
||||
|
||||
backoff = 5 # 重置退避
|
||||
|
||||
# 每 60 秒检查一次 pending 消息
|
||||
import time as _time
|
||||
if _time.time() - self._last_pending_check > 60:
|
||||
self._last_pending_check = _time.time()
|
||||
self._cleanup_pending_messages()
|
||||
|
||||
for stream_name, messages in results:
|
||||
for msg_id, fields in messages:
|
||||
try:
|
||||
self._handle_request(fields)
|
||||
except Exception as e:
|
||||
logger.error("[截图] 处理请求失败: %s", e)
|
||||
finally:
|
||||
# ACK 消息
|
||||
# 处理成功才 ACK
|
||||
try:
|
||||
self._cloud_redis.xack(
|
||||
SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP, msg_id
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.error("[截图] 处理请求失败 (msg_id=%s): %s", msg_id, e)
|
||||
# 不 ACK,消息留在 pending list 等待重试
|
||||
|
||||
except redis.ConnectionError as e:
|
||||
if self._stop_event.is_set():
|
||||
@@ -409,3 +416,38 @@ class ScreenshotHandler:
|
||||
logger.info("[截图] 降级写 Redis 成功: request_id=%s", request_id)
|
||||
except Exception as e:
|
||||
logger.error("[截图] 降级写 Redis 也失败: %s", e)
|
||||
|
||||
# ==================== Pending 消息清理 ====================
|
||||
|
||||
_MAX_RETRY_COUNT = 3
|
||||
_PENDING_IDLE_MS = 30000 # 消息 pending 超过 30 秒才处理
|
||||
|
||||
def _cleanup_pending_messages(self):
    """Retry or discard this consumer's stale pending snapshot requests.

    Looks at up to 50 entries of this consumer's pending list and, for each
    entry that has been idle for at least ``_PENDING_IDLE_MS``:

    * if it was delivered more than ``_MAX_RETRY_COUNT`` times, it is ACKed
      (i.e. dropped) with a warning;
    * otherwise it is re-claimed for this consumer and passed to
      ``_handle_request`` again; it is ACKed only when that call succeeds.

    Re-claiming is what makes the retry limit meaningful: a message that is
    left un-ACKed is not handed out again by a normal consumer-group read,
    so without an explicit claim its delivery counter would never grow.
    NOTE(review): this assumes the main read loop only asks for new
    messages; confirm it does not already re-read its own pending entries.

    Any error (Redis failure, unexpected entry shape) is logged at debug
    level and ends the current pass; nothing is raised to the caller.
    """

    def _ack(message_id):
        # Best-effort ACK: a failure leaves the entry pending, and it will be
        # looked at again on the next pass.
        try:
            self._cloud_redis.xack(
                SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP, message_id
            )
        except Exception as ack_err:
            logger.debug("[截图] ACK 失败 (msg_id=%s): %s", message_id, ack_err)

    try:
        pending = self._cloud_redis.xpending_range(
            SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP,
            min="-", max="+", count=50,
            consumername=self._consumer_name
        )
        for entry in pending:
            msg_id = entry['message_id']
            delivery_count = entry['times_delivered']
            idle_ms = entry['time_since_delivered']

            # Recently delivered entries may still be in flight.
            if idle_ms < self._PENDING_IDLE_MS:
                continue

            if delivery_count > self._MAX_RETRY_COUNT:
                logger.warning(
                    "[截图] 消息超过最大重试次数,丢弃: msg_id=%s, retries=%d",
                    msg_id, delivery_count
                )
                _ack(msg_id)
                continue

            # Claim the entry again: this returns its payload and counts as
            # another delivery, so repeated failures eventually hit the limit.
            claimed = self._cloud_redis.xclaim(
                SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP, self._consumer_name,
                min_idle_time=self._PENDING_IDLE_MS, message_ids=[msg_id]
            )
            for _claimed_id, fields in claimed:
                if not fields:
                    # No payload came back (presumably the entry was trimmed
                    # or deleted from the stream), so there is nothing to retry.
                    logger.warning(
                        "[截图] pending 消息内容已不存在,丢弃: msg_id=%s", msg_id
                    )
                    _ack(msg_id)
                    continue
                try:
                    self._handle_request(fields)
                except Exception as e:
                    # Leave it pending; it will be retried on a later pass.
                    logger.error("[截图] 处理请求失败 (msg_id=%s): %s", msg_id, e)
                else:
                    _ack(msg_id)
    except Exception as e:
        logger.debug("[截图] 检查 pending list: %s", e)
|
||||
|
||||
Reference in New Issue
Block a user