chore(ci): 部署加磁盘预检 + 部署后自动清理 Prod 本地镜像与 Registry
Some checks failed
Java CI with Maven / build (11) (push) Has been cancelled
Java CI with Maven / build (17) (push) Has been cancelled
Java CI with Maven / build (8) (push) Has been cancelled

- 新增 Pre-deploy Check:SSH 到 Prod/Registry 读根分区空闲,<5% 直接 fail(规避磁盘满时 sshd 连带崩溃导致的 scp 失败),5~10% 仅告警
- 新增 Cleanup Old Images stage:部署成功后每服务保留最近 3 个镜像
  * Prod 侧调用 scripts/cleanup.sh
  * Registry 侧调用 scripts/registry-cleanup.py + 触发容器内 garbage-collect
- scripts/cleanup.sh:去掉 volume prune 的交互 read(CI 下会卡住),支持 --keep/--prune-volumes/--registry 参数
- scripts/registry-cleanup.py:按 tag 内数字降序保留最新 N 个;覆盖 Docker v2/OCI 多种 manifest Accept;多 tag 指向同一 digest 去重;失败不影响发布

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
lzh
2026-04-24 11:20:37 +08:00
parent acd7a35e1d
commit 8c5c5ef44a
3 changed files with 467 additions and 35 deletions

131
Jenkinsfile vendored
View File

@@ -22,6 +22,8 @@ pipeline {
// 镜像仓库配置(Infra 服务器内网地址,Prod 服务器可通过内网拉取)
REGISTRY = '172.17.16.7:5000'
REGISTRY_HOST = '172.17.16.7'
REGISTRY_CONTAINER = 'registry'
DEPS_IMAGE = "${REGISTRY}/aiot-deps:latest"
// 服务配置
@@ -36,6 +38,13 @@ pipeline {
STAGING_DEPLOY_HOST = '172.17.16.7'
STAGING_DEPLOY_PATH = '/opt/aiot-platform-cloud'
// 磁盘守护阈值(%):低于 MIN 直接 fail,低于 WARN 仅告警
DISK_FREE_MIN_PCT = '5'
DISK_FREE_WARN_PCT = '10'
// 镜像保留份数(每服务)
IMAGE_KEEP_COUNT = '3'
// 性能配置 - 将动态调整
BUILD_TIMEOUT = 45
DEPLOY_TIMEOUT = 10
@@ -270,6 +279,30 @@ pipeline {
}
}
// Stage: gate the deployment on remote disk headroom before anything is shipped.
// Runs only when SERVICES_TO_BUILD is not the empty string AND the branch is
// master or release/next.
stage('Pre-deploy Check') {
when {
allOf {
// NOTE(review): an unset (null) SERVICES_TO_BUILD also satisfies `!= ''`;
// confirm an earlier stage always assigns it.
expression { env.SERVICES_TO_BUILD != '' }
anyOf {
branch 'master'
branch 'release/next'
}
}
}
steps {
script {
// Start timestamp, handed to recordStageMetrics at the end of the stage.
def stageStartTime = System.currentTimeMillis()
echo "🛡️ Pre-deploy health check: remote disk & SSH reachability"
// Check the disks of both the Prod and the Registry host; fail fast below the
// threshold so that a full disk cannot take sshd down in the middle of a deploy.
checkRemoteDiskOrFail(env.DEPLOY_HOST, 'Deploy')
checkRemoteDiskOrFail(env.REGISTRY_HOST, 'Registry')
// recordStageMetrics is defined outside this view; it receives the stage name
// and the start timestamp captured above.
recordStageMetrics('Pre-deploy Check', stageStartTime)
}
}
}
stage('Deploy') {
when {
allOf {
@@ -283,7 +316,7 @@ pipeline {
steps {
script {
def stageStartTime = System.currentTimeMillis()
def servicesToDeploy = env.SERVICES_TO_BUILD.split(',')
def sortedServices = sortServicesByDependency(servicesToDeploy)
@@ -375,6 +408,32 @@ pipeline {
}
}
}
// Stage: prune old images on the deploy host and in the registry.
// Runs only when SERVICES_TO_BUILD is not the empty string AND the branch is
// master or release/next.
stage('Cleanup Old Images') {
when {
allOf {
// NOTE(review): an unset (null) SERVICES_TO_BUILD also satisfies `!= ''`;
// confirm an earlier stage always assigns it.
expression { env.SERVICES_TO_BUILD != '' }
anyOf {
branch 'master'
branch 'release/next'
}
}
}
steps {
script {
// Start timestamp, handed to recordStageMetrics at the end of the stage.
def stageStartTime = System.currentTimeMillis()
echo "🧹 Cleaning up old images (keep=${env.IMAGE_KEEP_COUNT})"
// Prod/Staging, local: remove old images + dangling images + builder cache.
// NOTE(review): only env.DEPLOY_HOST is passed here; confirm it already points at
// the staging host on release/next.
cleanupDeployHost(env.DEPLOY_HOST, env.IMAGE_KEEP_COUNT)
// Registry: delete manifests according to the retention policy, then trigger
// garbage collection to actually release disk space.
cleanupRegistry(env.REGISTRY_HOST, env.REGISTRY_CONTAINER, env.IMAGE_KEEP_COUNT)
// recordStageMetrics is defined outside this view; it receives the stage name
// and the start timestamp captured above.
recordStageMetrics('Cleanup Old Images', stageStartTime)
}
}
}
}
post {
@@ -1073,3 +1132,73 @@ def getModulePathForService(String service) {
]
return map.get(service, service)
}
// ============================================
// 磁盘/清理相关 helper(Prod + Registry)
// ============================================
/**
 * Reads the free-space percentage of the root partition on a remote host over SSH
 * and gates the deployment on it.
 *
 * The build is failed through error() when:
 *   - the SSH command itself fails,
 *   - the remote output is not an integer in the range 0..100,
 *   - the free percentage is below env.DISK_FREE_MIN_PCT.
 * When the free percentage is below env.DISK_FREE_WARN_PCT only a warning is logged.
 *
 * @param host remote host to probe; the connection is made as root with env.SSH_KEY
 * @param role label that prefixes the log and error messages
 */
def checkRemoteDiskOrFail(String host, String role) {
    def sshOpts = "-o StrictHostKeyChecking=no -o ConnectTimeout=10 -i ${env.SSH_KEY}"
    def minPct = Integer.parseInt(env.DISK_FREE_MIN_PCT)
    def warnPct = Integer.parseInt(env.DISK_FREE_WARN_PCT)

    // Step 1: transport. Only a failing ssh/df/awk invocation is reported as an SSH problem.
    def raw
    try {
        // Field 5 of `df -P` is the used percentage (e.g. "42%"). awk's arithmetic reads
        // the leading digits of that field, so `100 - $5` yields the free percentage
        // without any gsub escaping.
        raw = sh(
            script: "ssh ${sshOpts} root@${host} \"df -P / | awk 'NR==2 { print 100 - \\\$5+0 }'\"",
            returnStdout: true
        ).trim()
    } catch (Exception e) {
        error("❌ [${role}] 无法通过 SSH 到 ${host} 读取磁盘(可能 sshd 已被磁盘满拖垮):${e.message}")
    }

    // Step 2: parsing. Kept outside the try above so that empty or non-numeric output is
    // not misreported as an unreachable host.
    def freePct = -1
    try {
        freePct = Integer.parseInt(raw)
    } catch (NumberFormatException ignored) {
        freePct = -1
    }
    if (freePct < 0 || freePct > 100) {
        error("❌ [${role}] ${host} 返回的磁盘空闲值无法解析:'${raw}'(期望 0-100 的整数)")
    }

    echo " ${role}@${host}: 根分区空闲 ${freePct}%"
    if (freePct < minPct) {
        error("❌ [${role}] ${host} 根分区空闲仅 ${freePct}% < ${minPct}%,终止部署避免二次失败。请先手动清理或跑 scripts/cleanup.sh")
    } else if (freePct < warnPct) {
        echo "⚠️ [${role}] ${host} 空闲 ${freePct}% < ${warnPct}%,本次部署后会触发自动清理"
    }
}
/**
 * Best-effort cleanup of local images on a deploy host.
 *
 * Copies scripts/cleanup.sh from the workspace to env.DEPLOY_PATH on the host and runs
 * it there with --keep and --registry. Ordinary failures are logged and swallowed so
 * that a failed cleanup never changes the result of the release. Build interruptions
 * (user abort, enclosing timeout) are rethrown so the pipeline actually stops.
 *
 * @param host deploy host; the connection is made as root with env.SSH_KEY
 * @param keep number of images to keep, passed through to cleanup.sh as --keep
 */
def cleanupDeployHost(String host, String keep) {
    def sshOpts = "-o StrictHostKeyChecking=no -o ConnectTimeout=10 -i ${env.SSH_KEY}"
    try {
        echo "📤 Syncing scripts/cleanup.sh to ${host}..."
        sh "scp ${sshOpts} scripts/cleanup.sh root@${host}:${env.DEPLOY_PATH}/cleanup.sh"
        sh """
            ssh ${sshOpts} root@${host} '
                cd ${env.DEPLOY_PATH}
                chmod +x cleanup.sh
                ./cleanup.sh --keep=${keep} --registry=${env.REGISTRY}
            '
        """
        echo "✅ Deploy host 清理完成"
    } catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e) {
        // An abort or timeout is not a cleanup failure: propagate it instead of
        // downgrading it to a warning and letting later steps run.
        throw e
    } catch (Exception e) {
        // A failed cleanup must not affect the release result; warn only.
        echo "⚠️ Deploy host 清理失败(不致命):${e.message}"
    }
}
/**
 * Best-effort cleanup of the image registry.
 *
 * Copies scripts/registry-cleanup.py to /tmp on the registry host and runs it with
 * python3 against http://localhost:5000, passing the retention count, the repository
 * list from env.CORE_SERVICES and the name of the registry container for garbage
 * collection. Ordinary failures are logged and swallowed; build interruptions (user
 * abort, enclosing timeout) are rethrown so the pipeline actually stops.
 *
 * @param host              registry host; the connection is made as root with env.SSH_KEY
 * @param registryContainer container name passed to the script as --gc-container
 * @param keep              number of images to keep per repository, passed as --keep
 */
def cleanupRegistry(String host, String registryContainer, String keep) {
    def sshOpts = "-o StrictHostKeyChecking=no -o ConnectTimeout=10 -i ${env.SSH_KEY}"
    try {
        echo "📤 Syncing scripts/registry-cleanup.py to ${host}..."
        sh "scp ${sshOpts} scripts/registry-cleanup.py root@${host}:/tmp/registry-cleanup.py"
        // NOTE(review): env.CORE_SERVICES is interpolated unquoted; confirm its format is
        // what --repos expects and that it contains no shell-special characters.
        sh """
            ssh ${sshOpts} root@${host} '
                python3 /tmp/registry-cleanup.py \
                    --registry http://localhost:5000 \
                    --keep ${keep} \
                    --repos ${env.CORE_SERVICES} \
                    --gc-container ${registryContainer}
            '
        """
        echo "✅ Registry 清理 + GC 完成"
    } catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e) {
        // An abort or timeout is not a cleanup failure: propagate it instead of
        // downgrading it to a warning.
        throw e
    } catch (Exception e) {
        // A failed registry cleanup must not affect the release result; warn only.
        echo "⚠️ Registry 清理失败(不致命):${e.message}"
    }
}