fix(ops): 修复同一工牌并行多单的状态错乱

线上观察:管理员手动取消一个僵尸 DISPATCHED 单会引发"越清越多"——
系统顺势派队列首条给仍在工作的保洁员,监听器再用"旧工单残留"机制
尝试取消当前正在执行的工单,该取消走 REQUIRES_NEW 独立事务且吞异常,
最终新单落地、旧单残留,同一设备挂多个非终态工单。

修复两处:

1. DispatchEngineImpl.autoDispatchNext 入口加设备空闲校验:
   若执行人名下还有 DISPATCHED/CONFIRMED/ARRIVED/PAUSED 工单(排除
   completedOrderId),直接早返回,不再派发。所有调用方(保洁/安保
   handleCancelled、asyncCompleteAndDispatchNext、xxl-job 空闲扫描)
   自动受保护。新增 OpsOrderMapper.selectActiveByAssignee。

2. BadgeDeviceStatusEventListener.handleDispatched 移除"残留取消":
   旧逻辑用 REQUIRES_NEW 事务 + 吞异常,是对数据已错乱场景的暴力兜底,
   失败时导致误杀。改为只打 ERROR 告警暴露问题,仅清理 Redis 关联。
   真正的防线在 DispatchEngine 入口。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
lzh
2026-04-20 10:54:54 +08:00
parent 6bbd49355d
commit 4d85659277
3 changed files with 51 additions and 32 deletions

View File

@@ -15,9 +15,6 @@ import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.springframework.context.event.EventListener;
import org.springframework.stereotype.Component;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.TransactionDefinition;
import org.springframework.transaction.support.TransactionTemplate;
/**
* 工牌设备状态事件监听器
@@ -87,9 +84,6 @@ public class BadgeDeviceStatusEventListener {
@Resource
private OrderLifecycleManager orderLifecycleManager;
@Resource
private PlatformTransactionManager transactionManager;
/**
* 监听工单状态变更事件,同步更新设备工单关联
* <p>
@@ -180,40 +174,27 @@ public class BadgeDeviceStatusEventListener {
/**
* 处理工单推送状态(首次设置工单关联)
* <p>
* 若 Redis 里检测到旧 orderId(正常业务不应出现),仅打 ERROR 告警并清理 Redis 关联。
* 此前版本会在此处"自动取消旧工单",但那是对"数据已错乱"场景的暴力兜底:
* <ul>
* <li>取消使用 REQUIRES_NEW 独立事务且吞异常,失败时新单照常落地,旧单残留,形成越清越多</li>
* <li>真正的防线应在 DispatchEngine.autoDispatchNext 入口做设备空闲校验</li>
* </ul>
* 现改为被动告警,暴露问题等待定位,避免误杀保洁员正在执行的任务。
*/
private void handleDispatched(Long deviceId, Long orderId, OpsOrderDO order) {
// 检查并清理旧工单(防止工单切换时状态残留)
BadgeDeviceStatusDTO deviceStatus = badgeDeviceStatusService.getBadgeStatus(deviceId);
if (deviceStatus != null && deviceStatus.getCurrentOpsOrderId() != null) {
Long oldOrderId = deviceStatus.getCurrentOpsOrderId();
if (!oldOrderId.equals(orderId)) {
log.warn("[BadgeDeviceStatusEventListener] 派发新工单时检测到旧工单残留: " +
"deviceId={}, oldOrderId={}, newOrderId={}", deviceId, oldOrderId, orderId);
// 检查旧工单是否仍在进行中,如果是则先取消
OpsOrderDO oldOrder = opsOrderMapper.selectById(oldOrderId);
if (oldOrder != null) {
WorkOrderStatusEnum oldStatus = WorkOrderStatusEnum.fromStatus(oldOrder.getStatus());
if (oldStatus == WorkOrderStatusEnum.DISPATCHED
|| oldStatus == WorkOrderStatusEnum.CONFIRMED
|| oldStatus == WorkOrderStatusEnum.ARRIVED) {
// 旧工单仍在进行,先取消
// 使用 REQUIRES_NEW 独立事务,避免内层异常标记外层事务 rollback-only
log.warn("[BadgeDeviceStatusEventListener] 取消残留的旧工单: oldOrderId={}", oldOrderId);
try {
TransactionTemplate txTemplate = new TransactionTemplate(transactionManager);
txTemplate.setPropagationBehavior(TransactionDefinition.PROPAGATION_REQUIRES_NEW);
txTemplate.executeWithoutResult(status -> {
orderLifecycleManager.cancelOrder(oldOrderId, deviceId,
OperatorTypeEnum.SYSTEM, "新工单派发,自动取消旧工单");
});
} catch (Exception e) {
log.error("[BadgeDeviceStatusEventListener] 取消旧工单失败: oldOrderId={}", oldOrderId, e);
}
}
}
String oldStatus = oldOrder != null ? oldOrder.getStatus() : "NOT_FOUND";
log.error("[BadgeDeviceStatusEventListener] 派发新工单时检测到旧工单残留(数据可能已错乱,需人工核查): " +
"deviceId={}, oldOrderId={}, oldStatus={}, newOrderId={}",
deviceId, oldOrderId, oldStatus, orderId);
// 确保设备状态清理(无论旧工单是否取消成功)
// 清理 Redis 中对旧工单的关联(纯 Redis 操作,不触达状态机)
badgeDeviceStatusService.clearCurrentOrder(deviceId);
}
}

View File

@@ -178,6 +178,22 @@ public class DispatchEngineImpl implements DispatchEngine {
public DispatchResult autoDispatchNext(Long completedOrderId, Long assigneeId) {
log.info("任务完成后自动派发下一单: completedOrderId={}, assigneeId={}", completedOrderId, assigneeId);
if (assigneeId == null) {
log.warn("autoDispatchNext 缺少执行人,跳过派发: completedOrderId={}", completedOrderId);
return DispatchResult.success("缺少执行人,跳过派发", null);
}
// 空闲校验:若执行人仍挂着其他活跃工单(DISPATCHED/CONFIRMED/ARRIVED/PAUSED),
// 说明设备尚未真正空闲,不应再派发新任务——否则会触发"同一设备并行多单"的状态错乱,
// 典型场景是管理员手动取消一个僵尸 DISPATCHED 单时,handleCancelled 会调到这里。
List<OpsOrderDO> activeOrders = orderMapper.selectActiveByAssignee(assigneeId, completedOrderId);
if (!activeOrders.isEmpty()) {
OpsOrderDO head = activeOrders.get(0);
log.info("执行人仍有活跃工单,跳过自动派发: assigneeId={}, completedOrderId={}, activeCount={}, sampleOrderId={}, sampleStatus={}",
assigneeId, completedOrderId, activeOrders.size(), head.getId(), head.getStatus());
return DispatchResult.success("执行人非空闲,跳过派发", assigneeId);
}
Long fallbackAreaId = null;
OpsOrderDO completedOrder = orderMapper.selectById(completedOrderId);
if (completedOrder != null) {

View File

@@ -92,6 +92,28 @@ public interface OpsOrderMapper extends BaseMapperX<OpsOrderDO> {
.last("LIMIT 1"));
}
/**
 * Lists the work orders of an assignee that have not finished yet, i.e. orders whose
 * status is DISPATCHED, CONFIRMED, ARRIVED or PAUSED.
 * <p>
 * Meant for the idle check at dispatch entry points such as autoDispatchNext: when the
 * assignee still holds an active order, no new task should be dispatched, which prevents
 * cascading dispatches where every cleanup produces yet another order.
 *
 * @param assigneeId     assignee ID (the badge device ID)
 * @param excludeOrderId ID of an order to leave out of the result, typically the order whose
 *                       completion or cancellation triggered this dispatch round; may be
 *                       {@code null}, in which case no order is excluded
 * @return the active orders, sorted by creation time in ascending order
 */
default List<OpsOrderDO> selectActiveByAssignee(Long assigneeId, Long excludeOrderId) {
    LambdaQueryWrapperX<OpsOrderDO> activeQuery = new LambdaQueryWrapperX<>();
    activeQuery.eq(OpsOrderDO::getAssigneeId, assigneeId);
    // Only the non-terminal statuses count as "still active".
    activeQuery.in(OpsOrderDO::getStatus,
            WorkOrderStatusEnum.DISPATCHED.getStatus(),
            WorkOrderStatusEnum.CONFIRMED.getStatus(),
            WorkOrderStatusEnum.ARRIVED.getStatus(),
            WorkOrderStatusEnum.PAUSED.getStatus());
    // The exclusion is optional: without an ID, every active order is returned.
    if (excludeOrderId != null) {
        activeQuery.ne(OpsOrderDO::getId, excludeOrderId);
    }
    activeQuery.orderByAsc(OpsOrderDO::getCreateTime);
    return selectList(activeQuery);
}
// ==================== 统计聚合查询 ====================
/**