修复:P0+P1 生产稳定性和性能优化(6项)

P0 稳定性修复:
- 告警去重字典添加惰性清理机制,防止长时间运行内存溢出
- Redis 连接断开时显式 close() 后再置 None,防止文件描述符泄漏
- 截图消息 ACK 移至成功路径,失败消息留在 pending list 自动重试

P1 性能优化:
- GPU NMS 添加 torch.no_grad() + 显式释放临时张量,减少显存碎片
- 截图存储改为 Redis 原始 bytes,去掉 Base64 编解码开销(兼容旧格式)
- ROI 配置查询 N+1 改为 get_all_bindings() 单次 JOIN 查询

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-07 14:05:57 +08:00
parent a9a5457583
commit 5a0265de52
8 changed files with 593 additions and 41 deletions

View File

@@ -59,6 +59,7 @@ class ScreenshotHandler:
self._thread: Optional[threading.Thread] = None
self._stop_event = threading.Event()
self._last_pending_check = 0.0
# ==================== 生命周期 ====================
@@ -180,20 +181,26 @@ class ScreenshotHandler:
backoff = 5 # 重置退避
# 每 60 秒检查一次 pending 消息
import time as _time
if _time.time() - self._last_pending_check > 60:
self._last_pending_check = _time.time()
self._cleanup_pending_messages()
for stream_name, messages in results:
for msg_id, fields in messages:
try:
self._handle_request(fields)
except Exception as e:
logger.error("[截图] 处理请求失败: %s", e)
finally:
# ACK 消息
# 处理成功才 ACK
try:
self._cloud_redis.xack(
SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP, msg_id
)
except Exception:
pass
except Exception as e:
logger.error("[截图] 处理请求失败 (msg_id=%s): %s", msg_id, e)
# 不 ACK消息留在 pending list 等待重试
except redis.ConnectionError as e:
if self._stop_event.is_set():
@@ -409,3 +416,38 @@ class ScreenshotHandler:
logger.info("[截图] 降级写 Redis 成功: request_id=%s", request_id)
except Exception as e:
logger.error("[截图] 降级写 Redis 也失败: %s", e)
# ==================== Pending 消息清理 ====================
_MAX_RETRY_COUNT = 3
_PENDING_IDLE_MS = 30000 # 消息 pending 超过 30 秒才处理
def _cleanup_pending_messages(self):
"""清理 pending list 中重试次数过多的消息"""
try:
pending = self._cloud_redis.xpending_range(
SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP,
min="-", max="+", count=50,
consumername=self._consumer_name
)
for entry in pending:
msg_id = entry['message_id']
delivery_count = entry['times_delivered']
idle_ms = entry['time_since_delivered']
if idle_ms < self._PENDING_IDLE_MS:
continue
if delivery_count > self._MAX_RETRY_COUNT:
logger.warning(
"[截图] 消息超过最大重试次数,丢弃: msg_id=%s, retries=%d",
msg_id, delivery_count
)
try:
self._cloud_redis.xack(
SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP, msg_id
)
except Exception:
pass
except Exception as e:
logger.debug("[截图] 检查 pending list: %s", e)