修复:P0+P1 生产稳定性和性能优化(共 6 项)

P0 稳定性修复:
- 告警去重字典添加惰性清理机制,防止长时间运行导致内存溢出
- Redis 连接断开时先显式 close() 再置 None,防止文件描述符泄漏
- 截图消息 ACK 移至成功路径,失败消息留在 pending list 中自动重试

P1 性能优化:
- GPU NMS 添加 torch.no_grad() 并显式释放临时张量,减少显存碎片
- 截图存储改为 Redis 原始 bytes,去掉 Base64 编解码开销(兼容旧格式)
- ROI 配置查询由 N+1 次查询改为 get_all_bindings() 单次 JOIN 查询

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-07 14:05:57 +08:00
parent a9a5457583
commit 5a0265de52
8 changed files with 593 additions and 41 deletions

View File

@@ -47,6 +47,7 @@ class AlarmUploadWorker:
self._logger = logging.getLogger("alarm_upload_worker")
self._redis: Optional[redis.Redis] = None
self._redis_binary: Optional[redis.Redis] = None # 用于读取截图 bytes
self._cos_client = None # 懒初始化
self._thread: Optional[threading.Thread] = None
@@ -80,6 +81,16 @@ class AlarmUploadWorker:
)
self._redis.ping()
self._logger.info(f"Worker Redis 连接成功: {redis_cfg.host}:{redis_cfg.port}/{redis_cfg.db}")
# 二进制 Redis 连接(用于读取截图 bytes,不做 decode)
self._redis_binary = redis.Redis(
host=redis_cfg.host,
port=redis_cfg.port,
db=redis_cfg.db,
password=redis_cfg.password,
decode_responses=False,
socket_connect_timeout=5,
)
except Exception as e:
self._logger.error(f"Worker Redis 连接失败: {e}")
return
@@ -136,6 +147,12 @@ class AlarmUploadWorker:
except Exception:
pass
if self._redis_binary:
try:
self._redis_binary.close()
except Exception:
pass
self._logger.info("AlarmUploadWorker 已停止")
def _worker_loop(self):
@@ -184,21 +201,43 @@ class AlarmUploadWorker:
self._logger.info(f"开始处理告警: {alarm_id} (retry={retry_count})")
# Step 1: 上传截图到 COS(从 base64 解码后直接上传字节流)
# Step 1: 上传截图到 COS
snapshot_key = (alarm_data.get("ext_data") or {}).get("_snapshot_key")
snapshot_b64 = alarm_data.get("snapshot_b64")
object_key = None
if snapshot_b64:
if snapshot_key:
# 新格式:从独立 Redis key 获取原始 bytes
try:
image_bytes = self._redis_binary.get(snapshot_key) if self._redis_binary else None
if image_bytes is None:
self._logger.warning(f"截图 key 已过期: {snapshot_key}, 无截图继续上报")
else:
object_key = self._upload_snapshot_to_cos(
image_bytes, alarm_id, alarm_data.get("device_id", "unknown")
)
if object_key is None:
self._handle_retry(alarm_json, "COS 上传失败")
return
# 上传成功后删除临时 key
try:
if self._redis_binary:
self._redis_binary.delete(snapshot_key)
except Exception:
pass
except Exception as e:
self._logger.error(f"截图获取/上传失败: {e}")
self._handle_retry(alarm_json, f"截图处理失败: {e}")
return
elif snapshot_b64:
# 兼容旧格式 (Base64)
try:
import base64
image_bytes = base64.b64decode(snapshot_b64)
object_key = self._upload_snapshot_to_cos(
image_bytes,
alarm_id,
alarm_data.get("device_id", "unknown"),
image_bytes, alarm_id, alarm_data.get("device_id", "unknown")
)
if object_key is None:
# COS 上传失败,进入重试
self._handle_retry(alarm_json, "COS 上传失败")
return
except Exception as e: