From c78759fd52cc0667db4c2fc12eb49d0c24e924ca Mon Sep 17 00:00:00 2001 From: lzh Date: Mon, 20 Apr 2026 15:21:33 +0800 Subject: [PATCH] =?UTF-8?q?feat(ops):=20=E6=96=B0=E5=A2=9E=E4=BF=9D?= =?UTF-8?q?=E6=B4=81=E5=B7=A5=E5=8D=95=E8=B6=85=E6=97=B6=E8=87=AA=E5=8A=A8?= =?UTF-8?q?=E5=8F=96=E6=B6=88=20Job=20+=20=E9=9B=86=E6=88=90=E6=B5=8B?= =?UTF-8?q?=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 背景:保洁工单偶尔因设备离线/信标丢失导致卡在非终态(如 PENDING 超 12h 没派, DISPATCHED 超 12h 没确认),靠人工清理成本高。补一个每小时跑的 XXL-Job 扫描关单。 实现: - CleanOrderAutoCancelJob.scanAndCancel: * 查询 update_time 距今超 timeoutHours(默认 12h)的 CLEAN 工单 * 状态白名单 = PENDING/QUEUED/DISPATCHED/CONFIRMED/ARRIVED,**排除 PAUSED** (PAUSED 是 P0 打断的产物,应由 resumeInterruptedOrder 走状态机恢复, 此处若把它 CANCEL,会破坏 P0 完成后的 resume 链路) * 调用 orderLifecycleManager.cancelOrder 走完整责任链,事件监听器负责 TTS 停播/设备关联回收/审计日志 * cancel 前再 selectById 做乐观校验:若 update_time 已刷新或状态已变 (COMPLETED/CANCELLED/PAUSED),跳过;避免候选装内存到实际 cancel 之间用户刚触达的工单被误杀 * 单单独立 try/catch 隔离,单条失败不断批 * batchSize 限流(默认 200),事件风暴防护 - application.yaml 补默认配置:viewsh.ops.clean.auto-cancel.{timeout-hours, batch-size} - CleanOrderAutoCancelJobTest 覆盖 6 条不变量: 无候选零计数、全成功、部分失败不中断、乐观锁跳过 stale、终态跳过、PAUSED 跳过 XXL-Job 配置建议: - JobHandler: cleanOrderAutoCancelJob - Cron: 0 17 * * * ? 
(每小时 :17,避开整点尖峰) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../job/CleanOrderAutoCancelJob.java | 160 ++++++++++++++ .../job/CleanOrderAutoCancelJobTest.java | 198 ++++++++++++++++++ .../src/main/resources/application.yaml | 6 + 3 files changed, 364 insertions(+) create mode 100644 viewsh-module-ops/viewsh-module-environment-biz/src/main/java/com/viewsh/module/ops/environment/job/CleanOrderAutoCancelJob.java create mode 100644 viewsh-module-ops/viewsh-module-environment-biz/src/test/java/com/viewsh/module/ops/environment/job/CleanOrderAutoCancelJobTest.java diff --git a/viewsh-module-ops/viewsh-module-environment-biz/src/main/java/com/viewsh/module/ops/environment/job/CleanOrderAutoCancelJob.java b/viewsh-module-ops/viewsh-module-environment-biz/src/main/java/com/viewsh/module/ops/environment/job/CleanOrderAutoCancelJob.java new file mode 100644 index 00000000..d2071e63 --- /dev/null +++ b/viewsh-module-ops/viewsh-module-environment-biz/src/main/java/com/viewsh/module/ops/environment/job/CleanOrderAutoCancelJob.java @@ -0,0 +1,160 @@ +package com.viewsh.module.ops.environment.job; + +import cn.hutool.core.collection.CollUtil; +import cn.hutool.core.util.StrUtil; +import com.viewsh.framework.mybatis.core.query.LambdaQueryWrapperX; +import com.viewsh.framework.tenant.core.job.TenantJob; +import com.viewsh.module.ops.core.lifecycle.OrderLifecycleManager; +import com.viewsh.module.ops.dal.dataobject.workorder.OpsOrderDO; +import com.viewsh.module.ops.dal.mysql.workorder.OpsOrderMapper; +import com.viewsh.module.ops.enums.OperatorTypeEnum; +import com.viewsh.module.ops.enums.WorkOrderStatusEnum; +import com.xxl.job.core.handler.annotation.XxlJob; +import jakarta.annotation.Resource; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; + +import java.time.LocalDateTime; +import java.util.List; + +/** + * 保洁工单超时自动取消 Job + *

+ * 职责: + * 扫描所有保洁类(order_type=CLEAN)中既非终态(COMPLETED / CANCELLED)也非 PAUSED 的工单, + * 若最近一次进展(update_time)距今超过阈值(默认 12 小时), + * 以 SYSTEM 身份走正常取消流程将其关闭。 + * <p>
+ * 设计要点: + * 1. 时间基准使用 update_time 而非 create_time——任何状态转换/字段更新都会刷新 update_time, + * 这样"按最新进展计算超时"才准确:刚被重派的 DISPATCHED 单不会因 create_time 老而被误杀。 + * 2. 处理范围 = 除 COMPLETED / CANCELLED / PAUSED 以外的所有状态(查询以 notIn 排除实现,并非显式白名单;预期覆盖 PENDING / QUEUED / DISPATCHED / CONFIRMED / ARRIVED,日后新增的状态会被自动纳入)。 + * PAUSED 是 P0 打断的产物,应由 resumeInterruptedOrder 经状态机走 PAUSED → DISPATCHED + * 恢复。若此 Job 把 PAUSED 单直接 CANCELLED,P0 完成后的 resume 会在状态机检查 + * "PAUSED → DISPATCHED" 时因源状态已变为 CANCELLED 而抛 IllegalStateException, + * 进而破坏 P0 恢复链路。PAUSED 若真的卡死(P0 也卡),交由人工审核,不自动化。 + * 3. 取消调用 {@link OrderLifecycleManager#cancelOrder} 走完整责任链: + * StateTransitionHandler → QueueSyncHandler → EventPublishHandler + * → CleanOrderEventListener.onOrderStateChanged(CANCELLED) 会统一处理 + * TTS 停播、设备工单关联回收、审计日志。 + * 4. 每单单独调用 cancelOrder 并以 try/catch 隔离,单条失败不影响批次其余工单(本类未声明事务,事务边界取决于 cancelOrder 自身)。 + * 5. 单次扫描限 batchSize 条,防止异常堆积时一次性取消过多触发事件风暴; + * 未处理完的工单留给下一轮 cron。 + * 6. cancel 前再做一次乐观校验:重查状态是否仍可取消、update_time 是否仍 <= threshold。 + * 候选装内存到实际 cancel 之间如果有用户触达(确认/到岗),update_time 会被刷新; + * 此时放弃 cancel,避免误杀用户刚触达的工单。 + * <p>
+ * Suggested XXL-Job configuration:
+ * - JobHandler: cleanOrderAutoCancelJob
+ * - Cron: 0 17 * * * ? (fires at minute 17 of every hour, avoiding the top-of-hour peak)
+ *
+ * @author lzh
+ */
+@Slf4j
+@Component
+public class CleanOrderAutoCancelJob {
+
+    private static final String BUSINESS_TYPE_CLEAN = "CLEAN";
+    private static final String CANCEL_REASON_TEMPLATE = "超过{}小时未处理,系统自动完结"; // {} is filled with the configured timeoutHours
+
+    @Resource
+    private OpsOrderMapper opsOrderMapper;
+
+    @Resource
+    private OrderLifecycleManager orderLifecycleManager;
+
+    /** Timeout in hours; an order whose update_time is older than this is treated as stuck. NOTE(review): a value <= 0 moves the threshold to "now" or later - consider validating. */
+    @Value("${viewsh.ops.clean.auto-cancel.timeout-hours:12}")
+    private int timeoutHours;
+
+    /** Maximum number of orders scanned/cancelled per run (applied as the SQL LIMIT), to avoid an event storm. */
+    @Value("${viewsh.ops.clean.auto-cancel.batch-size:200}")
+    private int batchSize;
+
+    @XxlJob("cleanOrderAutoCancelJob")
+    @TenantJob
+    public String execute() {
+        try {
+            CancelResult result = scanAndCancel();
+            return StrUtil.format(
+                    "保洁工单超时自动取消完成: 扫描 {} 单, 成功 {}, 失败 {}, 跳过 {}, 耗时 {} ms",
+                    result.scanned, result.succeeded, result.failed, result.skippedStale, result.durationMs);
+        } catch (Exception e) {
+            log.error("[CleanOrderAutoCancelJob] 执行失败", e);
+            return StrUtil.format("保洁工单超时自动取消失败: {}", e.getMessage());
+        }
+    }
+
+    public CancelResult scanAndCancel() {
+        long startTime = System.currentTimeMillis();
+        LocalDateTime threshold = LocalDateTime.now().minusHours(timeoutHours);
+
+        log.info("[CleanOrderAutoCancelJob] 开始扫描: timeoutHours={}, threshold={}, batchSize={}",
+                timeoutHours, threshold, batchSize);
+
+        List<OpsOrderDO> candidates = opsOrderMapper.selectList(new LambdaQueryWrapperX<OpsOrderDO>()
+                .eq(OpsOrderDO::getOrderType, BUSINESS_TYPE_CLEAN)
+                .notIn(OpsOrderDO::getStatus,
+                        WorkOrderStatusEnum.COMPLETED.getStatus(),
+                        WorkOrderStatusEnum.CANCELLED.getStatus(),
+                        // PAUSED is left to resumeInterruptedOrder to restore via the state machine; this job does not handle it
+                        WorkOrderStatusEnum.PAUSED.getStatus())
+                .le(OpsOrderDO::getUpdateTime, threshold)
+                .orderByAsc(OpsOrderDO::getUpdateTime)
+                .last("LIMIT " + batchSize));
+
+        if (CollUtil.isEmpty(candidates)) {
+            log.info("[CleanOrderAutoCancelJob] 无超时工单");
+            return new CancelResult(0, 0, 0, 0, System.currentTimeMillis() - startTime);
+        }
+
+        int succeeded = 0;
+        int failed = 0;
+        int skippedStale = 0;
+
+        for (OpsOrderDO order : candidates) {
+            Long orderId = order.getId();
+            try {
+                // Optimistic re-check: between loading the candidates and the actual cancel, a user may have touched the order and refreshed update_time.
+                // Re-read the row to confirm it is still timed out, so an order the user has just acted on is not cancelled with the rest.
+                OpsOrderDO fresh = opsOrderMapper.selectById(orderId);
+                if (fresh == null
+                        || WorkOrderStatusEnum.COMPLETED.getStatus().equals(fresh.getStatus())
+                        || WorkOrderStatusEnum.CANCELLED.getStatus().equals(fresh.getStatus())
+                        || WorkOrderStatusEnum.PAUSED.getStatus().equals(fresh.getStatus())
+                        || fresh.getUpdateTime() == null
+                        || fresh.getUpdateTime().isAfter(threshold)) {
+                    skippedStale++;
+                    log.info("[CleanOrderAutoCancelJob] 并发触达/状态已变,跳过: orderId={}, snapshotStatus={}, latestStatus={}, latestUpdateTime={}",
+                            orderId, order.getStatus(),
+                            fresh != null ? fresh.getStatus() : "NOT_FOUND",
+                            fresh != null ? fresh.getUpdateTime() : null);
+                    continue;
+                }
+
+                orderLifecycleManager.cancelOrder(
+                        orderId,
+                        null,
+                        OperatorTypeEnum.SYSTEM,
+                        StrUtil.format(CANCEL_REASON_TEMPLATE, timeoutHours));
+                succeeded++;
+                log.info("[CleanOrderAutoCancelJob] 自动取消成功: orderId={}, orderCode={}, status={}, updateTime={}",
+                        orderId, order.getOrderCode(), order.getStatus(), order.getUpdateTime());
+            } catch (Exception e) {
+                failed++;
+                log.warn("[CleanOrderAutoCancelJob] 自动取消失败: orderId={}, orderCode={}, status={}, error={}",
+                        orderId, order.getOrderCode(), order.getStatus(), e.getMessage(), e);
+            }
+        }
+
+        long duration = System.currentTimeMillis() - startTime;
+        log.info("[CleanOrderAutoCancelJob] 扫描完成: 扫描 {} 单, 成功 {}, 失败 {}, 跳过 {}, 耗时 {} ms",
+                candidates.size(), succeeded, failed, skippedStale, duration);
+
+        return new CancelResult(candidates.size(), succeeded, failed, skippedStale, duration);
+    }
+
+    public record CancelResult(int scanned, int succeeded, int failed, int skippedStale, long durationMs) {
+    }
+}
diff --git
a/viewsh-module-ops/viewsh-module-environment-biz/src/test/java/com/viewsh/module/ops/environment/job/CleanOrderAutoCancelJobTest.java b/viewsh-module-ops/viewsh-module-environment-biz/src/test/java/com/viewsh/module/ops/environment/job/CleanOrderAutoCancelJobTest.java new file mode 100644 index 00000000..a18dbf22 --- /dev/null +++ b/viewsh-module-ops/viewsh-module-environment-biz/src/test/java/com/viewsh/module/ops/environment/job/CleanOrderAutoCancelJobTest.java @@ -0,0 +1,198 @@ +package com.viewsh.module.ops.environment.job; + +import com.viewsh.module.ops.core.lifecycle.OrderLifecycleManager; +import com.viewsh.module.ops.dal.dataobject.workorder.OpsOrderDO; +import com.viewsh.module.ops.dal.mysql.workorder.OpsOrderMapper; +import com.viewsh.module.ops.enums.OperatorTypeEnum; +import com.viewsh.module.ops.enums.WorkOrderStatusEnum; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.springframework.test.util.ReflectionTestUtils; + +import java.time.LocalDateTime; +import java.util.Collections; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +/** + * 验证 CleanOrderAutoCancelJob 的五条不变量: + *

<ol>
+ *   <li>无候选 → 返回零结果,不触发取消</li>
+ *   <li>正常批次 → 依次 cancel,成功计数正确</li>
+ *   <li>单条失败不中断其余 → try/catch 隔离</li>
+ *   <li>候选到 cancel 间被用户触达 → 乐观锁跳过(避免误杀)</li>
+ *   <li>候选到 cancel 间状态变为终态/PAUSED → 跳过</li>
+ * </ol>
+ */
+@ExtendWith(MockitoExtension.class)
+class CleanOrderAutoCancelJobTest {
+
+    @Mock
+    private OpsOrderMapper opsOrderMapper;
+    @Mock
+    private OrderLifecycleManager orderLifecycleManager;
+
+    @InjectMocks
+    private CleanOrderAutoCancelJob job;
+
+    @BeforeEach
+    void setUp() {
+        ReflectionTestUtils.setField(job, "timeoutHours", 12);
+        ReflectionTestUtils.setField(job, "batchSize", 200);
+    }
+
+    @Test
+    void scanAndCancel_whenNoCandidates_shouldReturnZeroCounts() {
+        when(opsOrderMapper.selectList(any(com.viewsh.framework.mybatis.core.query.LambdaQueryWrapperX.class)))
+                .thenReturn(Collections.emptyList());
+
+        CleanOrderAutoCancelJob.CancelResult result = job.scanAndCancel();
+
+        assertEquals(0, result.scanned());
+        assertEquals(0, result.succeeded());
+        assertEquals(0, result.failed());
+        assertEquals(0, result.skippedStale());
+        verify(orderLifecycleManager, never()).cancelOrder(anyLong(), any(), any(), any());
+    }
+
+    @Test
+    void scanAndCancel_whenAllCandidatesStillStale_shouldCancelAll() {
+        LocalDateTime staleTime = LocalDateTime.now().minusHours(13);
+        OpsOrderDO a = stale(101L, "WO-101", WorkOrderStatusEnum.DISPATCHED, staleTime);
+        OpsOrderDO b = stale(102L, "WO-102", WorkOrderStatusEnum.CONFIRMED, staleTime);
+        OpsOrderDO c = stale(103L, "WO-103", WorkOrderStatusEnum.ARRIVED, staleTime);
+
+        when(opsOrderMapper.selectList(any(com.viewsh.framework.mybatis.core.query.LambdaQueryWrapperX.class)))
+                .thenReturn(List.of(a, b, c));
+        // Fresh fetch confirms all three are still stale
+        when(opsOrderMapper.selectById(101L)).thenReturn(a);
+        when(opsOrderMapper.selectById(102L)).thenReturn(b);
+        when(opsOrderMapper.selectById(103L)).thenReturn(c);
+
+        CleanOrderAutoCancelJob.CancelResult result = job.scanAndCancel();
+
+        assertEquals(3, result.scanned());
+        assertEquals(3, result.succeeded());
+        assertEquals(0, result.failed());
+        assertEquals(0, result.skippedStale());
+        verify(orderLifecycleManager, times(3))
+                .cancelOrder(anyLong(), eq(null), eq(OperatorTypeEnum.SYSTEM), any());
+    }
+
+    @Test
+    void scanAndCancel_whenOneCancelThrows_shouldNotAbortBatch() {
+        LocalDateTime staleTime = LocalDateTime.now().minusHours(13);
+        OpsOrderDO a = stale(201L, "WO-201", WorkOrderStatusEnum.DISPATCHED, staleTime);
+        OpsOrderDO b = stale(202L, "WO-202", WorkOrderStatusEnum.CONFIRMED, staleTime);
+        OpsOrderDO c = stale(203L, "WO-203", WorkOrderStatusEnum.ARRIVED, staleTime);
+
+        when(opsOrderMapper.selectList(any(com.viewsh.framework.mybatis.core.query.LambdaQueryWrapperX.class)))
+                .thenReturn(List.of(a, b, c));
+        when(opsOrderMapper.selectById(201L)).thenReturn(a);
+        when(opsOrderMapper.selectById(202L)).thenReturn(b);
+        when(opsOrderMapper.selectById(203L)).thenReturn(c);
+        // The second cancel throws; it must not affect the first and the third.
+        // doThrow(...).when(mock).cancelOrder(eq(202L), ...) cannot be used here: strict stubs would flag the 201L call as a mismatch against the 202L stub.
+        // Use doAnswer and route by orderId instead, so that a single stub covers every cancel call.
+        doAnswer(invocation -> {
+            Long orderId = invocation.getArgument(0);
+            if (orderId != null && orderId == 202L) {
+                throw new IllegalStateException("状态机非法转换");
+            }
+            return null;
+        }).when(orderLifecycleManager).cancelOrder(anyLong(), any(), any(), any());
+
+        CleanOrderAutoCancelJob.CancelResult result = job.scanAndCancel();
+
+        assertEquals(3, result.scanned());
+        assertEquals(2, result.succeeded());
+        assertEquals(1, result.failed());
+        assertEquals(0, result.skippedStale());
+        verify(orderLifecycleManager).cancelOrder(eq(201L), any(), any(), any());
+        verify(orderLifecycleManager).cancelOrder(eq(202L), any(), any(), any());
+        verify(orderLifecycleManager).cancelOrder(eq(203L), any(), any(), any());
+    }
+
+    @Test
+    void scanAndCancel_whenOrderTouchedBeforeCancel_shouldSkipAsStale() {
+        // When the candidates were loaded update_time was 13h ago; right before the cancel the user confirmed, moving update_time to "1 minute ago".
+        // The optimistic re-check must skip the order so that an order the user has already touched is not cancelled.
+        LocalDateTime snapshotUpdate = LocalDateTime.now().minusHours(13);
+        LocalDateTime freshUpdate = LocalDateTime.now().minusMinutes(1);
+
+        OpsOrderDO snapshot = stale(301L, "WO-301", WorkOrderStatusEnum.DISPATCHED, snapshotUpdate);
+        OpsOrderDO fresh = stale(301L, "WO-301", WorkOrderStatusEnum.CONFIRMED, freshUpdate);
+
+        when(opsOrderMapper.selectList(any(com.viewsh.framework.mybatis.core.query.LambdaQueryWrapperX.class)))
+                .thenReturn(List.of(snapshot));
+        when(opsOrderMapper.selectById(301L)).thenReturn(fresh);
+
+        CleanOrderAutoCancelJob.CancelResult result = job.scanAndCancel();
+
+        assertEquals(1, result.scanned());
+        assertEquals(0, result.succeeded());
+        assertEquals(1, result.skippedStale());
+        verify(orderLifecycleManager, never()).cancelOrder(anyLong(), any(), any(), any());
+    }
+
+    @Test
+    void scanAndCancel_whenOrderBecameTerminal_shouldSkip() {
+        // Still ARRIVED when the candidates were loaded, but forced to COMPLETED by another path (forceComplete) before the actual cancel
+        LocalDateTime staleTime = LocalDateTime.now().minusHours(13);
+        OpsOrderDO snapshot = stale(401L, "WO-401", WorkOrderStatusEnum.ARRIVED, staleTime);
+        OpsOrderDO fresh = stale(401L, "WO-401", WorkOrderStatusEnum.COMPLETED, staleTime);
+
+        when(opsOrderMapper.selectList(any(com.viewsh.framework.mybatis.core.query.LambdaQueryWrapperX.class)))
+                .thenReturn(List.of(snapshot));
+        when(opsOrderMapper.selectById(401L)).thenReturn(fresh);
+
+        CleanOrderAutoCancelJob.CancelResult result = job.scanAndCancel();
+
+        assertEquals(1, result.skippedStale());
+        verify(orderLifecycleManager, never()).cancelOrder(anyLong(), any(), any(), any());
+    }
+
+    @Test
+    void scanAndCancel_whenOrderBecamePaused_shouldSkip() {
+        // The snapshot is DISPATCHED and the order has since been interrupted by a P0 into PAUSED - this job should leave it to resumeInterruptedOrder
+        LocalDateTime staleTime = LocalDateTime.now().minusHours(13);
+        OpsOrderDO snapshot = stale(501L, "WO-501", WorkOrderStatusEnum.DISPATCHED, staleTime);
+        OpsOrderDO fresh = stale(501L, "WO-501", WorkOrderStatusEnum.PAUSED,
+                LocalDateTime.now().minusHours(14)); // update_time (14h ago) is still <= threshold, so the skip can only come from the PAUSED status check, not the freshness check
+
+        when(opsOrderMapper.selectList(any(com.viewsh.framework.mybatis.core.query.LambdaQueryWrapperX.class)))
+                .thenReturn(List.of(snapshot));
+        when(opsOrderMapper.selectById(501L)).thenReturn(fresh);
+
+        CleanOrderAutoCancelJob.CancelResult result = job.scanAndCancel();
+
+        assertEquals(1, result.skippedStale());
+        verify(orderLifecycleManager, never()).cancelOrder(anyLong(), any(), any(), any());
+    }
+
+    // ==================== Helpers ====================
+
+    private OpsOrderDO stale(Long id, String code, WorkOrderStatusEnum status, LocalDateTime updateTime) {
+        OpsOrderDO order = OpsOrderDO.builder()
+                .id(id)
+                .orderCode(code)
+                .status(status.getStatus())
+                .orderType("CLEAN")
+                .build();
+        order.setUpdateTime(updateTime);
+        return order;
+    }
+}
diff --git a/viewsh-module-ops/viewsh-module-ops-server/src/main/resources/application.yaml b/viewsh-module-ops/viewsh-module-ops-server/src/main/resources/application.yaml index 3b48f1ea..b72e80c4 100644 --- a/viewsh-module-ops/viewsh-module-ops-server/src/main/resources/application.yaml +++ b/viewsh-module-ops/viewsh-module-ops-server/src/main/resources/application.yaml @@ -146,6 +146,12 @@ viewsh: connect-timeout: 5000 read-timeout: 10000 max-retry: 2 + clean: + auto-cancel: + # 保洁工单 update_time 距今超过此小时数视为卡死,由 CleanOrderAutoCancelJob 自动取消 + timeout-hours: 12 + # 单次扫描/取消上限,防止事件风暴;超出的工单留给下一轮 cron + batch-size: 200 # API 签名配置:外部系统调用开放接口时使用(如安保工单的告警系统) signature: apps: