chore(ci): 部署加磁盘预检 + 部署后自动清理 Prod 本地镜像与 Registry
- 新增 Pre-deploy Check:SSH 到 Prod/Registry 读根分区空闲,<5% 直接 fail(规避磁盘满时 sshd 连带崩溃导致的 scp 失败),5~10% 仅告警
- 新增 Cleanup Old Images stage:部署成功后每服务保留最近 3 个镜像
  * Prod 侧调用 scripts/cleanup.sh
  * Registry 侧调用 scripts/registry-cleanup.py + 触发容器内 garbage-collect
- scripts/cleanup.sh:去掉 volume prune 的交互 read(CI 下会卡住),支持 --keep/--prune-volumes/--registry 参数
- scripts/registry-cleanup.py:按 tag 内数字降序保留最新 N 个;覆盖 Docker v2/OCI 多种 manifest Accept;多 tag 指向同一 digest 去重;失败不影响发布

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
131
Jenkinsfile
vendored
131
Jenkinsfile
vendored
@@ -22,6 +22,8 @@ pipeline {
|
||||
|
||||
// 镜像仓库配置(Infra 服务器内网地址,Prod 服务器可通过内网拉取)
|
||||
REGISTRY = '172.17.16.7:5000'
|
||||
REGISTRY_HOST = '172.17.16.7'
|
||||
REGISTRY_CONTAINER = 'registry'
|
||||
DEPS_IMAGE = "${REGISTRY}/aiot-deps:latest"
|
||||
|
||||
// 服务配置
|
||||
@@ -36,6 +38,13 @@ pipeline {
|
||||
STAGING_DEPLOY_HOST = '172.17.16.7'
|
||||
STAGING_DEPLOY_PATH = '/opt/aiot-platform-cloud'
|
||||
|
||||
// 磁盘守护阈值(%):低于 MIN 直接 fail;低于 WARN 仅告警
|
||||
DISK_FREE_MIN_PCT = '5'
|
||||
DISK_FREE_WARN_PCT = '10'
|
||||
|
||||
// 镜像保留份数(每服务)
|
||||
IMAGE_KEEP_COUNT = '3'
|
||||
|
||||
// 性能配置 - 将动态调整
|
||||
BUILD_TIMEOUT = 45
|
||||
DEPLOY_TIMEOUT = 10
|
||||
@@ -270,6 +279,30 @@ pipeline {
|
||||
}
|
||||
}
|
||||
|
||||
stage('Pre-deploy Check') {
    // Runs only when there is something to build and the branch is a deployable one.
    when {
        allOf {
            expression { env.SERVICES_TO_BUILD != '' }
            anyOf {
                branch 'master'
                branch 'release/next'
            }
        }
    }
    steps {
        script {
            def startedAt = System.currentTimeMillis()
            echo "🛡️ Pre-deploy health check: remote disk & SSH reachability"

            // Probe the deploy host first, then the registry host; either one
            // being below the minimum free-space threshold aborts the pipeline
            // before the Deploy stage starts copying files over SSH.
            def probeTargets = [
                [env.DEPLOY_HOST, 'Deploy'],
                [env.REGISTRY_HOST, 'Registry']
            ]
            for (target in probeTargets) {
                checkRemoteDiskOrFail(target[0], target[1])
            }

            recordStageMetrics('Pre-deploy Check', startedAt)
        }
    }
}
|
||||
|
||||
stage('Deploy') {
|
||||
when {
|
||||
allOf {
|
||||
@@ -283,7 +316,7 @@ pipeline {
|
||||
steps {
|
||||
script {
|
||||
def stageStartTime = System.currentTimeMillis()
|
||||
|
||||
|
||||
def servicesToDeploy = env.SERVICES_TO_BUILD.split(',')
|
||||
def sortedServices = sortServicesByDependency(servicesToDeploy)
|
||||
|
||||
@@ -375,6 +408,32 @@ pipeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
stage('Cleanup Old Images') {
    // Same gating as the deploy-related stages: something was built and the
    // branch is master or release/next.
    when {
        allOf {
            expression { env.SERVICES_TO_BUILD != '' }
            anyOf {
                branch 'master'
                branch 'release/next'
            }
        }
    }
    steps {
        script {
            def startedAt = System.currentTimeMillis()
            echo "🧹 Cleaning up old images (keep=${env.IMAGE_KEEP_COUNT})"

            def keepCount = env.IMAGE_KEEP_COUNT

            // Step 1: prune images on the deploy host via scripts/cleanup.sh.
            cleanupDeployHost(env.DEPLOY_HOST, keepCount)

            // Step 2: apply the retention policy to the registry and trigger
            // garbage collection in the registry container.
            cleanupRegistry(env.REGISTRY_HOST, env.REGISTRY_CONTAINER, keepCount)

            recordStageMetrics('Cleanup Old Images', startedAt)
        }
    }
}
|
||||
}
|
||||
|
||||
post {
|
||||
@@ -1073,3 +1132,73 @@ def getModulePathForService(String service) {
|
||||
]
|
||||
return map.get(service, service)
|
||||
}
|
||||
|
||||
// ============================================
|
||||
// 磁盘/清理相关 helper(Prod + Registry)
|
||||
// ============================================
|
||||
|
||||
// Checks the free space of the root filesystem on a remote host before deploying.
//
// Runs `df -P /` over SSH as root and computes free% as 100 minus the capacity
// column of the second output line.
//   - free% <  DISK_FREE_MIN_PCT  -> fails the build via error()
//   - free% <  DISK_FREE_WARN_PCT -> logs a warning and continues
//
// Parameters:
//   host - address of the machine to probe
//   role - label used only in log/error messages (e.g. 'Deploy', 'Registry')
//
// Fails the build (error()) when SSH cannot be executed, when the remote output
// is not an integer, or when free space is below the minimum threshold.
def checkRemoteDiskOrFail(String host, String role) {
    def sshOpts = "-o StrictHostKeyChecking=no -o ConnectTimeout=10 -i ${env.SSH_KEY}"
    def minPct = Integer.parseInt(env.DISK_FREE_MIN_PCT)
    def warnPct = Integer.parseInt(env.DISK_FREE_WARN_PCT)

    // Step 1: transport. Only a failing ssh invocation is reported as an SSH problem.
    def rawOutput
    try {
        // awk reads "42%" as 42 in arithmetic context, so no gsub is needed.
        rawOutput = sh(
            script: "ssh ${sshOpts} root@${host} \"df -P / | awk 'NR==2 { print 100 - \\\$5+0 }'\"",
            returnStdout: true
        ).trim()
    } catch (Exception e) {
        error("❌ [${role}] 无法通过 SSH 到 ${host} 读取磁盘(可能 sshd 已被磁盘满拖垮):${e.message}")
    }

    // Step 2: parsing. The remote pipeline `df | awk` exits 0 even when df itself
    // fails (awk then prints nothing), so empty or unexpected output must be
    // reported as a parsing problem rather than as an unreachable sshd.
    def freePct
    try {
        freePct = rawOutput as int
    } catch (NumberFormatException e) {
        error("❌ [${role}] ${host} 的 df 输出无法解析为空闲百分比(实际输出:'${rawOutput}'),请检查远端 df/awk 是否正常")
    }

    echo "  ${role}@${host}: 根分区空闲 ${freePct}%"
    if (freePct < minPct) {
        error("❌ [${role}] ${host} 根分区空闲仅 ${freePct}% < ${minPct}%,终止部署避免二次失败。请先手动清理或跑 scripts/cleanup.sh")
    } else if (freePct < warnPct) {
        echo "⚠️  [${role}] ${host} 空闲 ${freePct}% < ${warnPct}%,本次部署后会触发自动清理"
    }
}
|
||||
|
||||
// Prunes old images on a deploy host by running the repository's cleanup.sh there.
//
// Copies scripts/cleanup.sh to ${env.DEPLOY_PATH} on the host, then executes it
// with --keep and --registry. Any failure is caught and logged as a warning so
// that a cleanup problem never changes the result of the release.
//
// Parameters:
//   host - address of the machine to clean
//   keep - value passed to cleanup.sh as --keep
//
// NOTE(review): the remote directory always comes from env.DEPLOY_PATH, whatever
// `host` is passed in — confirm this is intended if a staging host is ever used.
def cleanupDeployHost(String host, String keep) {
    def sshOpts = "-o StrictHostKeyChecking=no -o ConnectTimeout=10 -i ${env.SSH_KEY}"
    try {
        echo "📤 Syncing scripts/cleanup.sh to ${host}..."
        sh "scp ${sshOpts} scripts/cleanup.sh root@${host}:${env.DEPLOY_PATH}/cleanup.sh"
        // `set -e` makes the remote script stop at the first failing command.
        // Without it, a failed `cd` would let chmod and ./cleanup.sh run in
        // root's login directory instead of the deploy path.
        sh """
            ssh ${sshOpts} root@${host} '
                set -e
                cd ${env.DEPLOY_PATH}
                chmod +x cleanup.sh
                ./cleanup.sh --keep=${keep} --registry=${env.REGISTRY}
            '
        """
        echo "✅ Deploy host 清理完成"
    } catch (Exception e) {
        // Best effort by design: a failed cleanup only produces a warning.
        echo "⚠️  Deploy host 清理失败(不致命):${e.message}"
    }
}
|
||||
|
||||
// Applies the image retention policy to the registry host.
//
// Uploads scripts/registry-cleanup.py to /tmp on the host and runs it with
// python3 against http://localhost:5000, passing the keep count, the repository
// list from env.CORE_SERVICES and the registry container name for the GC step.
// Any failure is caught and only logged; it does not fail the build.
//
// Parameters:
//   host              - address of the registry machine
//   registryContainer - value passed to the script as --gc-container
//   keep              - value passed to the script as --keep
def cleanupRegistry(String host, String registryContainer, String keep) {
    def sshFlags = "-o StrictHostKeyChecking=no -o ConnectTimeout=10 -i ${env.SSH_KEY}"
    def remoteScript = '/tmp/registry-cleanup.py'
    try {
        echo "📤 Syncing scripts/registry-cleanup.py to ${host}..."
        sh "scp ${sshFlags} scripts/registry-cleanup.py root@${host}:${remoteScript}"

        // Single remote invocation; the trailing backslashes join the argument
        // lines into one command.
        sh """
            ssh ${sshFlags} root@${host} '
                python3 ${remoteScript} \
                    --registry http://localhost:5000 \
                    --keep ${keep} \
                    --repos ${env.CORE_SERVICES} \
                    --gc-container ${registryContainer}
            '
        """

        echo "✅ Registry 清理 + GC 完成"
    } catch (Exception cleanupFailure) {
        // Non-fatal: report and carry on.
        echo "⚠️  Registry 清理失败(不致命):${cleanupFailure.message}"
    }
}
|
||||
|
||||
Reference in New Issue
Block a user