fix(ops): 修复同一工牌并行多单的状态错乱
线上观察:管理员手动取消一个僵尸 DISPATCHED 单会引发"越清越多"——系统顺势派队列首条给仍在工作的保洁员,监听器再用"旧工单残留"机制尝试取消当前正在执行的工单,该取消走 REQUIRES_NEW 独立事务且吞异常,最终新单落地、旧单残留,同一设备挂多个非终态工单。

修复两处:

1. DispatchEngineImpl.autoDispatchNext 入口加设备空闲校验:若执行人名下还有 DISPATCHED/CONFIRMED/ARRIVED/PAUSED 工单(排除 completedOrderId),直接早返回,不再派发。所有调用方(保洁/安保 handleCancelled、asyncCompleteAndDispatchNext、xxl-job 空闲扫描)自动受保护。新增 OpsOrderMapper.selectActiveByAssignee。
2. BadgeDeviceStatusEventListener.handleDispatched 移除"残留取消":旧逻辑用 REQUIRES_NEW 事务 + 吞异常,是对数据已错乱场景的暴力兜底,失败时导致误杀。改为只打 ERROR 告警暴露问题,仅清理 Redis 关联。真正的防线在 DispatchEngine 入口。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -15,9 +15,6 @@ import jakarta.annotation.Resource;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.context.event.EventListener;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.transaction.PlatformTransactionManager;
|
||||
import org.springframework.transaction.TransactionDefinition;
|
||||
import org.springframework.transaction.support.TransactionTemplate;
|
||||
|
||||
/**
|
||||
* 工牌设备状态事件监听器
|
||||
@@ -87,9 +84,6 @@ public class BadgeDeviceStatusEventListener {
|
||||
@Resource
|
||||
private OrderLifecycleManager orderLifecycleManager;
|
||||
|
||||
@Resource
|
||||
private PlatformTransactionManager transactionManager;
|
||||
|
||||
/**
|
||||
* 监听工单状态变更事件,同步更新设备工单关联
|
||||
* <p>
|
||||
@@ -180,40 +174,27 @@ public class BadgeDeviceStatusEventListener {
|
||||
|
||||
/**
|
||||
* 处理工单推送状态(首次设置工单关联)
|
||||
* <p>
|
||||
* 若 Redis 里检测到旧 orderId(正常业务不应出现),仅打 ERROR 告警并清理 Redis 关联。
|
||||
* 此前版本会在此处"自动取消旧工单",但那是对"数据已错乱"场景的暴力兜底:
|
||||
* <ul>
|
||||
* <li>取消使用 REQUIRES_NEW 独立事务且吞异常,失败时新单照常落地,旧单残留,形成越清越多</li>
|
||||
* <li>真正的防线应在 DispatchEngine.autoDispatchNext 入口做设备空闲校验</li>
|
||||
* </ul>
|
||||
* 现改为被动告警,暴露问题等待定位,避免误杀保洁员正在执行的任务。
|
||||
*/
|
||||
private void handleDispatched(Long deviceId, Long orderId, OpsOrderDO order) {
|
||||
// 检查并清理旧工单(防止工单切换时状态残留)
|
||||
BadgeDeviceStatusDTO deviceStatus = badgeDeviceStatusService.getBadgeStatus(deviceId);
|
||||
if (deviceStatus != null && deviceStatus.getCurrentOpsOrderId() != null) {
|
||||
Long oldOrderId = deviceStatus.getCurrentOpsOrderId();
|
||||
if (!oldOrderId.equals(orderId)) {
|
||||
log.warn("[BadgeDeviceStatusEventListener] 派发新工单时检测到旧工单残留: " +
|
||||
"deviceId={}, oldOrderId={}, newOrderId={}", deviceId, oldOrderId, orderId);
|
||||
|
||||
// 检查旧工单是否仍在进行中,如果是则先取消
|
||||
OpsOrderDO oldOrder = opsOrderMapper.selectById(oldOrderId);
|
||||
if (oldOrder != null) {
|
||||
WorkOrderStatusEnum oldStatus = WorkOrderStatusEnum.fromStatus(oldOrder.getStatus());
|
||||
if (oldStatus == WorkOrderStatusEnum.DISPATCHED
|
||||
|| oldStatus == WorkOrderStatusEnum.CONFIRMED
|
||||
|| oldStatus == WorkOrderStatusEnum.ARRIVED) {
|
||||
// 旧工单仍在进行,先取消
|
||||
// 使用 REQUIRES_NEW 独立事务,避免内层异常标记外层事务 rollback-only
|
||||
log.warn("[BadgeDeviceStatusEventListener] 取消残留的旧工单: oldOrderId={}", oldOrderId);
|
||||
try {
|
||||
TransactionTemplate txTemplate = new TransactionTemplate(transactionManager);
|
||||
txTemplate.setPropagationBehavior(TransactionDefinition.PROPAGATION_REQUIRES_NEW);
|
||||
txTemplate.executeWithoutResult(status -> {
|
||||
orderLifecycleManager.cancelOrder(oldOrderId, deviceId,
|
||||
OperatorTypeEnum.SYSTEM, "新工单派发,自动取消旧工单");
|
||||
});
|
||||
} catch (Exception e) {
|
||||
log.error("[BadgeDeviceStatusEventListener] 取消旧工单失败: oldOrderId={}", oldOrderId, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
String oldStatus = oldOrder != null ? oldOrder.getStatus() : "NOT_FOUND";
|
||||
log.error("[BadgeDeviceStatusEventListener] 派发新工单时检测到旧工单残留(数据可能已错乱,需人工核查): " +
|
||||
"deviceId={}, oldOrderId={}, oldStatus={}, newOrderId={}",
|
||||
deviceId, oldOrderId, oldStatus, orderId);
|
||||
|
||||
// 确保设备状态清理(无论旧工单是否取消成功)
|
||||
// 清理 Redis 中对旧工单的关联(纯 Redis 操作,不触达状态机)
|
||||
badgeDeviceStatusService.clearCurrentOrder(deviceId);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -178,6 +178,22 @@ public class DispatchEngineImpl implements DispatchEngine {
|
||||
public DispatchResult autoDispatchNext(Long completedOrderId, Long assigneeId) {
|
||||
log.info("任务完成后自动派发下一单: completedOrderId={}, assigneeId={}", completedOrderId, assigneeId);
|
||||
|
||||
if (assigneeId == null) {
|
||||
log.warn("autoDispatchNext 缺少执行人,跳过派发: completedOrderId={}", completedOrderId);
|
||||
return DispatchResult.success("缺少执行人,跳过派发", null);
|
||||
}
|
||||
|
||||
// 空闲校验:若执行人仍挂着其他活跃工单(DISPATCHED/CONFIRMED/ARRIVED/PAUSED),
|
||||
// 说明设备尚未真正空闲,不应再派发新任务——否则会触发"同一设备并行多单"的状态错乱,
|
||||
// 典型场景是管理员手动取消一个僵尸 DISPATCHED 单时,handleCancelled 会调到这里。
|
||||
List<OpsOrderDO> activeOrders = orderMapper.selectActiveByAssignee(assigneeId, completedOrderId);
|
||||
if (!activeOrders.isEmpty()) {
|
||||
OpsOrderDO head = activeOrders.get(0);
|
||||
log.info("执行人仍有活跃工单,跳过自动派发: assigneeId={}, completedOrderId={}, activeCount={}, sampleOrderId={}, sampleStatus={}",
|
||||
assigneeId, completedOrderId, activeOrders.size(), head.getId(), head.getStatus());
|
||||
return DispatchResult.success("执行人非空闲,跳过派发", assigneeId);
|
||||
}
|
||||
|
||||
Long fallbackAreaId = null;
|
||||
OpsOrderDO completedOrder = orderMapper.selectById(completedOrderId);
|
||||
if (completedOrder != null) {
|
||||
|
||||
@@ -92,6 +92,28 @@ public interface OpsOrderMapper extends BaseMapperX<OpsOrderDO> {
|
||||
.last("LIMIT 1"));
|
||||
}
|
||||
|
||||
/**
|
||||
* 查询执行人名下尚未结束的工单(DISPATCHED/CONFIRMED/ARRIVED/PAUSED)
|
||||
* <p>
|
||||
* 用于 autoDispatchNext 等调度入口的空闲校验:若该执行人仍挂着活跃工单,
|
||||
* 则不应再派发新任务,避免"越清越多"的级联派发。
|
||||
*
|
||||
* @param assigneeId 执行人ID(工牌设备ID)
|
||||
* @param excludeOrderId 需要排除的工单ID(通常是刚完成/取消触发本次调度的工单),可传 null
|
||||
* @return 活跃工单列表,按创建时间升序
|
||||
*/
|
||||
default List<OpsOrderDO> selectActiveByAssignee(Long assigneeId, Long excludeOrderId) {
|
||||
return selectList(new LambdaQueryWrapperX<OpsOrderDO>()
|
||||
.eq(OpsOrderDO::getAssigneeId, assigneeId)
|
||||
.in(OpsOrderDO::getStatus,
|
||||
WorkOrderStatusEnum.DISPATCHED.getStatus(),
|
||||
WorkOrderStatusEnum.CONFIRMED.getStatus(),
|
||||
WorkOrderStatusEnum.ARRIVED.getStatus(),
|
||||
WorkOrderStatusEnum.PAUSED.getStatus())
|
||||
.ne(excludeOrderId != null, OpsOrderDO::getId, excludeOrderId)
|
||||
.orderByAsc(OpsOrderDO::getCreateTime));
|
||||
}
|
||||
|
||||
// ==================== 统计聚合查询 ====================
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user