修复:P0+P1 生产稳定性和性能优化(6项)
P0 稳定性修复: - 告警去重字典添加惰性清理机制,防止长时间运行内存溢出 - Redis 连接断开时显式 close() 后再置 None,防止文件描述符泄漏 - 截图消息 ACK 移至成功路径,失败消息留在 pending list 自动重试 P1 性能优化: - GPU NMS 添加 torch.no_grad() + 显式释放临时张量,减少显存碎片 - 截图存储改为 Redis 原始 bytes,去掉 Base64 编解码开销(兼容旧格式) - ROI 配置查询 N+1 改为 get_all_bindings() 单次 JOIN 查询 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -59,6 +59,7 @@ class ScreenshotHandler:
|
||||
|
||||
self._thread: Optional[threading.Thread] = None
|
||||
self._stop_event = threading.Event()
|
||||
self._last_pending_check = 0.0
|
||||
|
||||
# ==================== 生命周期 ====================
|
||||
|
||||
@@ -180,20 +181,26 @@ class ScreenshotHandler:
|
||||
|
||||
backoff = 5 # 重置退避
|
||||
|
||||
# 每 60 秒检查一次 pending 消息
|
||||
import time as _time
|
||||
if _time.time() - self._last_pending_check > 60:
|
||||
self._last_pending_check = _time.time()
|
||||
self._cleanup_pending_messages()
|
||||
|
||||
for stream_name, messages in results:
|
||||
for msg_id, fields in messages:
|
||||
try:
|
||||
self._handle_request(fields)
|
||||
except Exception as e:
|
||||
logger.error("[截图] 处理请求失败: %s", e)
|
||||
finally:
|
||||
# ACK 消息
|
||||
# 处理成功才 ACK
|
||||
try:
|
||||
self._cloud_redis.xack(
|
||||
SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP, msg_id
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.error("[截图] 处理请求失败 (msg_id=%s): %s", msg_id, e)
|
||||
# 不 ACK,消息留在 pending list 等待重试
|
||||
|
||||
except redis.ConnectionError as e:
|
||||
if self._stop_event.is_set():
|
||||
@@ -409,3 +416,38 @@ class ScreenshotHandler:
|
||||
logger.info("[截图] 降级写 Redis 成功: request_id=%s", request_id)
|
||||
except Exception as e:
|
||||
logger.error("[截图] 降级写 Redis 也失败: %s", e)
|
||||
|
||||
# ==================== Pending 消息清理 ====================
|
||||
|
||||
_MAX_RETRY_COUNT = 3
|
||||
_PENDING_IDLE_MS = 30000 # 消息 pending 超过 30 秒才处理
|
||||
|
||||
def _cleanup_pending_messages(self):
|
||||
"""清理 pending list 中重试次数过多的消息"""
|
||||
try:
|
||||
pending = self._cloud_redis.xpending_range(
|
||||
SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP,
|
||||
min="-", max="+", count=50,
|
||||
consumername=self._consumer_name
|
||||
)
|
||||
for entry in pending:
|
||||
msg_id = entry['message_id']
|
||||
delivery_count = entry['times_delivered']
|
||||
idle_ms = entry['time_since_delivered']
|
||||
|
||||
if idle_ms < self._PENDING_IDLE_MS:
|
||||
continue
|
||||
|
||||
if delivery_count > self._MAX_RETRY_COUNT:
|
||||
logger.warning(
|
||||
"[截图] 消息超过最大重试次数,丢弃: msg_id=%s, retries=%d",
|
||||
msg_id, delivery_count
|
||||
)
|
||||
try:
|
||||
self._cloud_redis.xack(
|
||||
SNAP_REQUEST_STREAM, SNAP_CONSUMER_GROUP, msg_id
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.debug("[截图] 检查 pending list: %s", e)
|
||||
|
||||
Reference in New Issue
Block a user