From 9a9524ac07a6a01207cc9e2dde3fb9aa23f59746 Mon Sep 17 00:00:00 2001 From: XuqmGroup Date: Sat, 13 Jun 2026 00:54:02 +0800 Subject: [PATCH] =?UTF-8?q?feat(=E5=AE=89=E5=85=A8=E4=B8=AD=E5=BF=83):=20?= =?UTF-8?q?=E4=B8=80=E9=94=AE=E6=9B=B4=E6=96=B0=E5=A2=9E=E5=8A=A0=E5=81=A5?= =?UTF-8?q?=E5=BA=B7=E6=A3=80=E6=9F=A5=E4=B8=8E=E8=87=AA=E5=8A=A8=E5=9B=9E?= =?UTF-8?q?=E6=BB=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 每个服务重建后轮询容器状态最长 60s: - 容器保持 running 10s 以上 → 更新成功 - 容器已 exited → 立即触发回滚(retag 旧镜像 ID 重建容器) - 超时未就绪 → 同样触发回滚 tenant-service 的自更新助手容器也包含相同逻辑: 60s 内不健康则 retag 旧镜像并重建,保证平台始终可访问。 拉取镜像前统一保存各服务旧镜像 ID(captureCurrentImageIds), 回滚时通过 docker tag 恢复旧版本。 Co-Authored-By: Claude Sonnet 4.6 --- .../tenant/service/SystemUpdateService.java | 244 +++++++++++++++++- 1 file changed, 234 insertions(+), 10 deletions(-) diff --git a/tenant-service/src/main/java/com/xuqm/tenant/service/SystemUpdateService.java b/tenant-service/src/main/java/com/xuqm/tenant/service/SystemUpdateService.java index 8c15b72..29efdf8 100644 --- a/tenant-service/src/main/java/com/xuqm/tenant/service/SystemUpdateService.java +++ b/tenant-service/src/main/java/com/xuqm/tenant/service/SystemUpdateService.java @@ -49,6 +49,11 @@ public class SystemUpdateService { "file-service", "tenant-web", "im-service", "push-service", "update-service", "license-service", "nginx" ); + // 健康检查配置:新容器需在此时间内保持 running 状态才视为健康 + private static final int HEALTH_CHECK_TIMEOUT_SEC = 60; + private static final int HEALTH_STABLE_REQUIRED_SEC = 10; + private static final int HEALTH_CHECK_INTERVAL_SEC = 5; + private static final Set ALLOWED_LOG_SERVICES = Set.of( "tenant-service", "file-service", "im-service", "push-service", "update-service", "license-service", "nginx", "tenant-web" @@ -297,6 +302,12 @@ public class SystemUpdateService { // 确保 tenant-service 在最后 toUpdate.remove("tenant-service"); + // 拉取前先保存所有服务的旧镜像 ID,用于启动失败时回滚 + List allToSnapshot = new ArrayList<>(toUpdate); + allToSnapshot.add("tenant-service"); + Map oldImageIds = captureCurrentImageIds(allToSnapshot); + emit.accept(" 已快照 " + oldImageIds.size() + " 个服务的旧版本镜像(更新失败时自动回滚)"); + emit.accept(">>> 拉取镜像(" + toUpdate.size() + " 个服务)..."); for (String svc : toUpdate) { emit.accept(" pulling " + svc + " ..."); @@ -306,7 +317,7 @@ public class SystemUpdateService { exec(emit, "docker", "compose", "-f", composeFile, "pull", "--quiet", "tenant-service"); emit.accept(">>> 镜像拉取完成"); - restartAndSelfUpdate(emit, composeFile); + restartAndSelfUpdate(emit, composeFile, oldImageIds); } /** 拉取最新镜像并重建所有容器。 */ @@ -314,13 +325,13 @@ public class SystemUpdateService { runSelectiveUpdate(emit, null); } - /** 保留数据,重置容器和数据库表结构。 */ + /** 保留数据,重置容器和数据库表结构。重置不涉及镜像变更,不做回滚。 */ public void runReset(Consumer emit) { String composeFile = deployRoot + "/docker-compose.yml"; patchConfigs(emit); resetDatabaseSchema(emit); - restartAndSelfUpdate(emit, composeFile); + restartAndSelfUpdate(emit, composeFile, Map.of()); } // ── 数据库重置(保留核心数据)────────────────────────────────────────────── @@ -774,13 +785,56 @@ public class SystemUpdateService { // ── 重启核心 ──────────────────────────────────────────────────────────────── - private void restartAndSelfUpdate(Consumer emit, String composeFile) { - emit.accept(">>> 重建各服务容器..."); + /** + * 重建各服务容器,并对每个服务进行健康检查。 + * 若新容器在 HEALTH_CHECK_TIMEOUT_SEC 内未保持稳定运行,自动回滚到旧镜像。 + * @param oldImageIds 拉取新镜像前保存的旧镜像 ID(sha256);为空时跳过回滚 + */ + private void restartAndSelfUpdate(Consumer emit, String composeFile, Map oldImageIds) { + emit.accept(">>> 重建各服务容器(含健康检查与自动回滚)..."); + List rolledBack = new ArrayList<>(); + List failed = new ArrayList<>(); + for (String svc : OTHER_SERVICES) { emit.accept(" restarting " + svc + " ..."); exec(emit, "docker", "compose", "-f", composeFile, "up", "-d", "--no-deps", "--force-recreate", svc); - emit.accept(" " + svc + " ✓"); + + boolean healthy = waitForServiceStable(emit, svc, HEALTH_CHECK_TIMEOUT_SEC); + if (healthy) { + emit.accept(" " + svc + " ✓"); + } else { + String oldId = oldImageIds.get(svc); + if (oldId != null && !oldId.isBlank()) { + emit.accept(" [警告] " + svc + " 启动失败,正在回滚旧版本..."); + boolean rollbackOk = rollbackService(emit, composeFile, svc, oldId); + if (rollbackOk) { + rolledBack.add(svc); + } else { + failed.add(svc); + } + } else { + emit.accept(" [错误] " + svc + " 启动失败且无旧镜像 ID,无法自动回滚"); + failed.add(svc); + } + // 输出该服务最近日志,辅助排查 + try { + String tail = getServiceLogs(svc, 30); + emit.accept(" --- " + svc + " 近期日志(末30行)---"); + for (String l : tail.split("\n")) { + if (!l.isBlank()) emit.accept(" " + l); + } + emit.accept(" ---"); + } catch (Exception ignored) {} + } + } + + if (!rolledBack.isEmpty()) { + emit.accept(">>> [警告] 以下服务已自动回滚到旧版本: " + String.join(", ", rolledBack)); + emit.accept(">>> 请检查代码或配置后重新发版。"); + } + if (!failed.isEmpty()) { + emit.accept(">>> [严重] 以下服务更新失败且回滚无效,需人工介入: " + String.join(", ", failed)); } emit.accept(">>> 启动自更新助手容器..."); @@ -791,7 +845,7 @@ public class SystemUpdateService { emit.accept("DONE"); return; } - boolean helperStarted = spawnSelfUpdater(composeFile, selfImage); + boolean helperStarted = spawnSelfUpdater(composeFile, selfImage, oldImageIds.getOrDefault("tenant-service", "")); if (helperStarted) { emit.accept(">>> 助手容器已就绪,tenant-service 即将重建(连接将短暂中断)..."); emit.accept("RESTART_SELF"); @@ -802,6 +856,146 @@ public class SystemUpdateService { } } + // ── 镜像快照与健康检查 ────────────────────────────────────────────────────── + + /** + * 在拉取新镜像前,保存各服务当前运行容器的镜像 ID(sha256)。 + * 存为 Map<serviceName, imageId>,用于更新失败时 docker tag 回旧版本。 + */ + private Map captureCurrentImageIds(List services) { + Map ids = new LinkedHashMap<>(); + for (String svc : services) { + try { + Process ps = new ProcessBuilder( + "docker", "ps", + "--filter", "label=com.docker.compose.service=" + svc, + "--format", "{{.ID}}" + ).redirectErrorStream(true).start(); + String containerId = new String(ps.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim(); + ps.waitFor(); + if (containerId.isEmpty()) continue; + containerId = containerId.split("\n")[0].trim(); + + Process inspect = new ProcessBuilder( + "docker", "inspect", "--format", "{{.Image}}", containerId + ).redirectErrorStream(true).start(); + String imageId = new String(inspect.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim(); + inspect.waitFor(); + if (!imageId.isEmpty()) { + ids.put(svc, imageId); + } + } catch (Exception e) { + log.warn("captureCurrentImageIds: failed for {}: {}", svc, e.getMessage()); + } + } + return ids; + } + + /** + * 轮询容器状态,直到容器持续 HEALTH_STABLE_REQUIRED_SEC 秒保持 running。 + * 若检测到容器已 exited,立即返回 false(快速失败)。 + */ + private boolean waitForServiceStable(Consumer emit, String service, int timeoutSeconds) { + int elapsed = 0; + int stableSeconds = 0; + + while (elapsed < timeoutSeconds) { + try { Thread.sleep(HEALTH_CHECK_INTERVAL_SEC * 1000L); } + catch (InterruptedException e) { Thread.currentThread().interrupt(); return false; } + elapsed += HEALTH_CHECK_INTERVAL_SEC; + stableSeconds += HEALTH_CHECK_INTERVAL_SEC; + + try { + // 检查 running 状态 + Process runPs = new ProcessBuilder( + "docker", "ps", + "--filter", "label=com.docker.compose.service=" + service, + "--filter", "status=running", + "--format", "{{.ID}}" + ).redirectErrorStream(true).start(); + String running = new String(runPs.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim(); + runPs.waitFor(); + + if (running.isEmpty()) { + // 检查是否已 exited(快速失败) + Process exitPs = new ProcessBuilder( + "docker", "ps", "-a", + "--filter", "label=com.docker.compose.service=" + service, + "--filter", "status=exited", + "--format", "{{.Status}}" + ).redirectErrorStream(true).start(); + String exited = new String(exitPs.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim(); + exitPs.waitFor(); + + if (!exited.isEmpty()) { + emit.accept(" [健康检查] " + service + " 已退出 (" + exited + "),快速判定失败"); + return false; + } + stableSeconds = 0; + emit.accept(" [健康检查] " + service + " 等待启动... (" + elapsed + "/" + timeoutSeconds + "s)"); + } else { + emit.accept(" [健康检查] " + service + " running (" + + stableSeconds + "/" + HEALTH_STABLE_REQUIRED_SEC + "s)"); + if (stableSeconds >= HEALTH_STABLE_REQUIRED_SEC) { + return true; + } + } + } catch (Exception e) { + stableSeconds = 0; + emit.accept(" [健康检查] " + service + " 状态查询异常: " + e.getMessage()); + } + } + emit.accept(" [健康检查] " + service + " 超时(" + timeoutSeconds + "s 内未就绪)"); + return false; + } + + /** + * 将指定服务回滚到旧镜像。 + * 先将旧镜像 ID 重新 tag 为 :latest,再 docker compose up 重建容器。 + * @return true 表示回滚后服务成功启动 + */ + private boolean rollbackService(Consumer emit, String composeFile, String service, String oldImageId) { + String imageName = resolveServiceImageName(service); + if (imageName != null) { + try { + Process tag = new ProcessBuilder("docker", "tag", oldImageId, imageName) + .redirectErrorStream(true).start(); + String tagOut = new String(tag.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim(); + int tagCode = tag.waitFor(); + if (tagCode != 0) { + emit.accept(" [回滚警告] docker tag 返回 " + tagCode + (tagOut.isEmpty() ? "" : ": " + tagOut)); + } + } catch (Exception e) { + emit.accept(" [回滚警告] 重新标记镜像失败: " + e.getMessage()); + } + } + + exec(emit, "docker", "compose", "-f", composeFile, "up", "-d", "--no-deps", "--force-recreate", service); + + boolean ok = waitForServiceStable(emit, service, 60); + if (ok) { + emit.accept(" [回滚] " + service + " 已回滚到旧版本 ✓"); + } else { + emit.accept(" [严重] " + service + " 回滚后仍无法启动,请人工介入!"); + emit.accept(" [诊断] docker logs $(docker ps -a --filter label=com.docker.compose.service=" + service + " -q --latest)"); + } + return ok; + } + + /** 从 .env 读取 REGISTRY 和 IMAGE_TAG,拼接服务完整镜像名(registry/service:tag)。 */ + private String resolveServiceImageName(String service) { + try { + Path envFile = Paths.get(deployRoot, ".env"); + String registry = readEnvValue(envFile, "REGISTRY"); + String imageTag = readEnvValue(envFile, "IMAGE_TAG"); + if (registry == null) return null; + if (imageTag == null || imageTag.isBlank()) imageTag = "latest"; + return registry + "/" + service + ":" + imageTag; + } catch (Exception e) { + return null; + } + } + // ── 配置文件热修复 ────────────────────────────────────────────────────────── private void patchConfigs(Consumer emit) { @@ -1001,13 +1195,43 @@ public class SystemUpdateService { } } - private boolean spawnSelfUpdater(String composeFile, String image) { + /** + * 启动自更新助手容器,负责重建 tenant-service(当前进程无法重建自身)。 + * 包含健康检查:若新容器在 60s 内未保持运行,自动回滚到旧镜像。 + * + * @param oldTenantImageId 更新前保存的旧镜像 ID,空字符串表示无法回滚 + */ + private boolean spawnSelfUpdater(String composeFile, String image, String oldTenantImageId) { try { new ProcessBuilder("docker", "rm", "-f", "xuqm-self-updater") .redirectErrorStream(true).start().waitFor(); - String shellCmd = "sleep 8 && docker compose -f " + composeFile - + " up -d --no-deps --force-recreate tenant-service"; + String tenantImageName = resolveServiceImageName("tenant-service"); + if (tenantImageName == null) tenantImageName = ""; + + // Shell 脚本:重建 → 等待健康 → 不健康则回滚 + // 健康检查:60s 内每 10s 轮询一次,连续 running 即视为成功 + String shellCmd = "sleep 8 && " + + "OLD_ID='" + oldTenantImageId.replace("'", "") + "' && " + + "IMG='" + tenantImageName.replace("'", "") + "' && " + + "docker compose -f " + composeFile + " up -d --no-deps --force-recreate tenant-service && " + + "HEALTHY=false && " + + "for i in 1 2 3 4 5 6; do " + + " sleep 10; " + + " if docker ps --filter 'label=com.docker.compose.service=tenant-service' " + + " --filter 'status=running' -q 2>/dev/null | grep -q .; then " + + " HEALTHY=true; break; " + + " fi; " + + " if docker ps -a --filter 'label=com.docker.compose.service=tenant-service' " + + " --filter 'status=exited' -q 2>/dev/null | grep -q .; then " + + " break; " + + " fi; " + + "done; " + + "if [ \"$HEALTHY\" != \"true\" ] && [ -n \"$OLD_ID\" ] && [ -n \"$IMG\" ]; then " + + " echo '[ROLLBACK] tenant-service unhealthy, reverting to old image...'; " + + " docker tag \"$OLD_ID\" \"$IMG\"; " + + " docker compose -f " + composeFile + " up -d --no-deps --force-recreate tenant-service; " + + "fi"; Process p = new ProcessBuilder( "docker", "run", "-d", "--rm",