feat(安全中心): 一键更新增加健康检查与自动回滚

每个服务重建后轮询容器状态最长 60s:
- 容器保持 running 10s 以上 → 更新成功
- 容器已 exited → 立即触发回滚(retag 旧镜像 ID 重建容器)
- 超时未就绪 → 同样触发回滚

tenant-service 的自更新助手容器也包含相同逻辑:
60s 内不健康则 retag 旧镜像并重建,保证平台始终可访问。

拉取镜像前统一保存各服务旧镜像 ID(captureCurrentImageIds),
回滚时通过 docker tag <old-id> <image:tag> 恢复旧版本。

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
这个提交包含在:
XuqmGroup 2026-06-13 00:54:02 +08:00
父节点 ffdb7c56fe
当前提交 9a9524ac07

查看文件

@ -49,6 +49,11 @@ public class SystemUpdateService {
"file-service", "tenant-web", "im-service", "push-service", "update-service", "license-service", "nginx"
);
// Health-check tuning for freshly recreated containers.
// Upper bound, in seconds, passed as the polling timeout when waiting for a service to stabilise.
private static final int HEALTH_CHECK_TIMEOUT_SEC = 60;
// Seconds of consecutive "running" observations required before a service counts as healthy.
private static final int HEALTH_STABLE_REQUIRED_SEC = 10;
// Seconds slept between two consecutive container-state polls.
private static final int HEALTH_CHECK_INTERVAL_SEC = 5;
private static final Set<String> ALLOWED_LOG_SERVICES = Set.of(
"tenant-service", "file-service", "im-service", "push-service",
"update-service", "license-service", "nginx", "tenant-web"
@ -297,6 +302,12 @@ public class SystemUpdateService {
// 确保 tenant-service 在最后
toUpdate.remove("tenant-service");
// 拉取前先保存所有服务的旧镜像 ID,用于启动失败时回滚
List<String> allToSnapshot = new ArrayList<>(toUpdate);
allToSnapshot.add("tenant-service");
Map<String, String> oldImageIds = captureCurrentImageIds(allToSnapshot);
emit.accept(" 已快照 " + oldImageIds.size() + " 个服务的旧版本镜像(更新失败时自动回滚)");
emit.accept(">>> 拉取镜像(" + toUpdate.size() + " 个服务)...");
for (String svc : toUpdate) {
emit.accept(" pulling " + svc + " ...");
@ -306,7 +317,7 @@ public class SystemUpdateService {
exec(emit, "docker", "compose", "-f", composeFile, "pull", "--quiet", "tenant-service");
emit.accept(">>> 镜像拉取完成");
restartAndSelfUpdate(emit, composeFile);
restartAndSelfUpdate(emit, composeFile, oldImageIds);
}
/** 拉取最新镜像并重建所有容器。 */
@ -314,13 +325,13 @@ public class SystemUpdateService {
runSelectiveUpdate(emit, null);
}
/** 保留数据,重置容器和数据库表结构。 */
/** 保留数据,重置容器和数据库表结构。重置不涉及镜像变更,不做回滚。 */
public void runReset(Consumer<String> emit) {
String composeFile = deployRoot + "/docker-compose.yml";
patchConfigs(emit);
resetDatabaseSchema(emit);
restartAndSelfUpdate(emit, composeFile);
restartAndSelfUpdate(emit, composeFile, Map.of());
}
// 数据库重置保留核心数据
@ -774,13 +785,56 @@ public class SystemUpdateService {
// 重启核心
private void restartAndSelfUpdate(Consumer<String> emit, String composeFile) {
emit.accept(">>> 重建各服务容器...");
/**
* 重建各服务容器,并对每个服务进行健康检查。
* 若新容器在 HEALTH_CHECK_TIMEOUT_SEC 内未保持稳定运行,则自动回滚到旧镜像。
* @param oldImageIds 拉取新镜像前保存的旧镜像 ID(sha256);为空时跳过回滚
*/
private void restartAndSelfUpdate(Consumer<String> emit, String composeFile, Map<String, String> oldImageIds) {
emit.accept(">>> 重建各服务容器(含健康检查与自动回滚)...");
List<String> rolledBack = new ArrayList<>();
List<String> failed = new ArrayList<>();
for (String svc : OTHER_SERVICES) {
emit.accept(" restarting " + svc + " ...");
exec(emit, "docker", "compose", "-f", composeFile,
"up", "-d", "--no-deps", "--force-recreate", svc);
boolean healthy = waitForServiceStable(emit, svc, HEALTH_CHECK_TIMEOUT_SEC);
if (healthy) {
emit.accept(" " + svc + "");
} else {
String oldId = oldImageIds.get(svc);
if (oldId != null && !oldId.isBlank()) {
emit.accept(" [警告] " + svc + " 启动失败,正在回滚旧版本...");
boolean rollbackOk = rollbackService(emit, composeFile, svc, oldId);
if (rollbackOk) {
rolledBack.add(svc);
} else {
failed.add(svc);
}
} else {
emit.accept(" [错误] " + svc + " 启动失败且无旧镜像 ID,无法自动回滚");
failed.add(svc);
}
// 输出该服务最近日志辅助排查
try {
String tail = getServiceLogs(svc, 30);
emit.accept(" --- " + svc + " 近期日志末30行---");
for (String l : tail.split("\n")) {
if (!l.isBlank()) emit.accept(" " + l);
}
emit.accept(" ---");
} catch (Exception ignored) {}
}
}
if (!rolledBack.isEmpty()) {
emit.accept(">>> [警告] 以下服务已自动回滚到旧版本: " + String.join(", ", rolledBack));
emit.accept(">>> 请检查代码或配置后重新发版。");
}
if (!failed.isEmpty()) {
emit.accept(">>> [严重] 以下服务更新失败且回滚无效,需人工介入: " + String.join(", ", failed));
}
emit.accept(">>> 启动自更新助手容器...");
@ -791,7 +845,7 @@ public class SystemUpdateService {
emit.accept("DONE");
return;
}
boolean helperStarted = spawnSelfUpdater(composeFile, selfImage);
boolean helperStarted = spawnSelfUpdater(composeFile, selfImage, oldImageIds.getOrDefault("tenant-service", ""));
if (helperStarted) {
emit.accept(">>> 助手容器已就绪,tenant-service 即将重建(连接将短暂中断)...");
emit.accept("RESTART_SELF");
@ -802,6 +856,146 @@ public class SystemUpdateService {
}
}
// 镜像快照与健康检查
/**
 * Snapshots the image ID used by the current container of each service, so a failed
 * update can later re-tag that image and roll back.
 *
 * <p>For every service the first container listed by {@code docker ps} for the compose
 * service label is inspected for its image ID. Services without a listed container, or
 * whose Docker commands fail, are left out of the result.
 *
 * <p>Only stdout of a command that exited with code 0 is trusted. Stderr is discarded
 * rather than merged, so a Docker error message can never be recorded as an image ID.
 *
 * @param services compose service names to snapshot
 * @return insertion-ordered map of service name to image ID; never {@code null}
 */
private Map<String, String> captureCurrentImageIds(List<String> services) {
    Map<String, String> ids = new LinkedHashMap<>();
    for (String svc : services) {
        try {
            String containerIds = captureDockerStdout(
                    "docker", "ps",
                    "--filter", "label=com.docker.compose.service=" + svc,
                    "--format", "{{.ID}}");
            if (containerIds.isEmpty()) {
                continue;
            }
            // Several containers may match the label; the first listed one is used.
            String containerId = containerIds.split("\n")[0].trim();
            String imageId = captureDockerStdout(
                    "docker", "inspect", "--format", "{{.Image}}", containerId);
            if (!imageId.isEmpty()) {
                ids.put(svc, imageId);
            }
        } catch (InterruptedException e) {
            // Restore the flag and stop: further waitFor() calls would fail immediately anyway.
            Thread.currentThread().interrupt();
            log.warn("captureCurrentImageIds: interrupted while inspecting {}", svc);
            break;
        } catch (Exception e) {
            log.warn("captureCurrentImageIds: failed for {}: {}", svc, e.getMessage());
        }
    }
    return ids;
}

/**
 * Runs a command and returns its trimmed stdout.
 *
 * @return stdout of the command, or an empty string when it exits with a non-zero code
 */
private String captureDockerStdout(String... command) throws java.io.IOException, InterruptedException {
    Process process = new ProcessBuilder(command)
            .redirectError(ProcessBuilder.Redirect.DISCARD)
            .start();
    String stdout = new String(process.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
    int exitCode = process.waitFor();
    if (exitCode != 0) {
        log.warn("captureCurrentImageIds: `{}` exited with code {}", String.join(" ", command), exitCode);
        return "";
    }
    return stdout;
}
/**
 * Polls the container state of a compose service until it has been observed running
 * for {@code HEALTH_STABLE_REQUIRED_SEC} consecutive seconds, or the timeout elapses.
 *
 * <p>Each iteration sleeps {@code HEALTH_CHECK_INTERVAL_SEC} seconds and then asks Docker
 * for running containers carrying the service label. If none is running and an exited
 * container exists, the method fails fast. A failed Docker query (I/O error or non-zero
 * exit code) resets the stability counter and is reported through {@code emit}; it is
 * never counted as a "running" observation.
 *
 * @param emit           sink for progress messages
 * @param service        compose service name
 * @param timeoutSeconds maximum time to wait, in seconds
 * @return {@code true} once the service is considered stable; {@code false} on exit,
 *         timeout, or when the calling thread is interrupted
 */
private boolean waitForServiceStable(Consumer<String> emit, String service, int timeoutSeconds) {
    int elapsed = 0;
    int stableSeconds = 0;
    while (elapsed < timeoutSeconds) {
        try {
            Thread.sleep(HEALTH_CHECK_INTERVAL_SEC * 1000L);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
        elapsed += HEALTH_CHECK_INTERVAL_SEC;
        stableSeconds += HEALTH_CHECK_INTERVAL_SEC;
        try {
            String running = queryServiceContainers(service, "running", "{{.ID}}", false);
            if (running.isEmpty()) {
                // Nothing running: distinguish "already exited" (fail fast) from "still starting".
                String exited = queryServiceContainers(service, "exited", "{{.Status}}", true);
                if (!exited.isEmpty()) {
                    emit.accept(" [健康检查] " + service + " 已退出 (" + exited + "),快速判定失败");
                    return false;
                }
                stableSeconds = 0;
                emit.accept(" [健康检查] " + service + " 等待启动... (" + elapsed + "/" + timeoutSeconds + "s)");
            } else {
                emit.accept(" [健康检查] " + service + " running ("
                        + stableSeconds + "/" + HEALTH_STABLE_REQUIRED_SEC + "s)");
                if (stableSeconds >= HEALTH_STABLE_REQUIRED_SEC) {
                    return true;
                }
            }
        } catch (InterruptedException e) {
            // Interrupted while waiting for docker: stop polling instead of swallowing the signal.
            Thread.currentThread().interrupt();
            return false;
        } catch (Exception e) {
            stableSeconds = 0;
            emit.accept(" [健康检查] " + service + " 状态查询异常: " + e.getMessage());
        }
    }
    emit.accept(" [健康检查] " + service + " 超时(" + timeoutSeconds + "s 内未就绪)");
    return false;
}

/**
 * Lists containers of a compose service filtered by status and returns the trimmed stdout.
 * Stderr is discarded so that Docker diagnostics cannot be mistaken for container data.
 *
 * @param includeStopped whether to pass {@code -a} so non-running containers are listed
 * @throws IllegalStateException if {@code docker ps} exits with a non-zero code
 */
private String queryServiceContainers(String service, String status, String format, boolean includeStopped)
        throws java.io.IOException, InterruptedException {
    List<String> command = new ArrayList<>();
    command.add("docker");
    command.add("ps");
    if (includeStopped) {
        command.add("-a");
    }
    command.add("--filter");
    command.add("label=com.docker.compose.service=" + service);
    command.add("--filter");
    command.add("status=" + status);
    command.add("--format");
    command.add(format);

    Process process = new ProcessBuilder(command)
            .redirectError(ProcessBuilder.Redirect.DISCARD)
            .start();
    String stdout = new String(process.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
    int exitCode = process.waitFor();
    if (exitCode != 0) {
        throw new IllegalStateException("docker ps exited with code " + exitCode);
    }
    return stdout;
}
/**
 * Rolls a service back to its previous image.
 *
 * <p>The old image ID is re-tagged to the service's image name (as resolved by
 * {@code resolveServiceImageName}) and the container is recreated with
 * {@code docker compose up}. The recreate is attempted even when re-tagging was not
 * possible, as a best effort to bring the service back up. In that case the container
 * is started from whatever image the tag currently points at, so the call is not
 * reported as a successful rollback.
 *
 * @param emit        sink for progress messages
 * @param composeFile path of the compose file
 * @param service     compose service name
 * @param oldImageId  image ID captured before the update
 * @return {@code true} only if the old image was re-tagged and the recreated service became stable
 */
private boolean rollbackService(Consumer<String> emit, String composeFile, String service, String oldImageId) {
    boolean retagged = false;
    String imageName = resolveServiceImageName(service);
    if (imageName == null) {
        emit.accept(" [回滚警告] 无法解析 " + service + " 的镜像名,跳过重新标记");
    } else {
        try {
            Process tag = new ProcessBuilder("docker", "tag", oldImageId, imageName)
                    .redirectErrorStream(true).start();
            String tagOut = new String(tag.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
            int tagCode = tag.waitFor();
            if (tagCode == 0) {
                retagged = true;
            } else {
                emit.accept(" [回滚警告] docker tag 返回 " + tagCode + (tagOut.isEmpty() ? "" : ": " + tagOut));
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            emit.accept(" [回滚警告] 重新标记镜像失败: " + e.getMessage());
            return false;
        } catch (Exception e) {
            emit.accept(" [回滚警告] 重新标记镜像失败: " + e.getMessage());
        }
    }

    exec(emit, "docker", "compose", "-f", composeFile, "up", "-d", "--no-deps", "--force-recreate", service);
    // Same 60s budget as the forward health check.
    boolean ok = waitForServiceStable(emit, service, 60);
    if (!ok) {
        emit.accept(" [严重] " + service + " 回滚后仍无法启动,请人工介入!");
        emit.accept(" [诊断] docker logs $(docker ps -a --filter label=com.docker.compose.service=" + service + " -q --latest)");
        return false;
    }
    if (!retagged) {
        // The service is up, but not on the old image: do not claim a rollback.
        emit.accept(" [回滚警告] " + service + " 已重新启动,但旧镜像未能恢复,当前运行的可能仍是新版本");
        return false;
    }
    emit.accept(" [回滚] " + service + " 已回滚到旧版本 ✓");
    return true;
}
/**
 * Builds the full image reference of a service from the deployment's {@code .env} file.
 *
 * <p>Reads {@code REGISTRY} and {@code IMAGE_TAG} and joins them as
 * {@code registry/service:tag}. A missing or blank {@code IMAGE_TAG} falls back to
 * {@code latest}. A missing or blank {@code REGISTRY} yields {@code null}, because
 * {@code "/service:tag"} is not a usable image reference.
 *
 * <p>NOTE(review): assumes the compose file names images {@code ${REGISTRY}/<service>:${IMAGE_TAG}} —
 * confirm against docker-compose.yml.
 *
 * @param service compose service name
 * @return the image reference, or {@code null} if it cannot be determined
 */
private String resolveServiceImageName(String service) {
    try {
        Path envFile = Paths.get(deployRoot, ".env");
        String registry = readEnvValue(envFile, "REGISTRY");
        if (registry == null || registry.isBlank()) {
            return null;
        }
        String imageTag = readEnvValue(envFile, "IMAGE_TAG");
        if (imageTag == null || imageTag.isBlank()) {
            imageTag = "latest";
        }
        return registry + "/" + service + ":" + imageTag;
    } catch (Exception e) {
        // Best effort: callers treat null as "image name unknown" and skip re-tagging.
        return null;
    }
}
// 配置文件热修复
private void patchConfigs(Consumer<String> emit) {
@ -1001,13 +1195,43 @@ public class SystemUpdateService {
}
}
private boolean spawnSelfUpdater(String composeFile, String image) {
/**
* 启动自更新助手容器,负责重建 tenant-service(当前进程无法重建自身)。
* 包含健康检查:若新容器在 60s 内未保持运行,则自动回滚到旧镜像。
*
* @param oldTenantImageId 更新前保存的旧镜像 ID;空字符串表示无法回滚
*/
private boolean spawnSelfUpdater(String composeFile, String image, String oldTenantImageId) {
try {
new ProcessBuilder("docker", "rm", "-f", "xuqm-self-updater")
.redirectErrorStream(true).start().waitFor();
String shellCmd = "sleep 8 && docker compose -f " + composeFile
+ " up -d --no-deps --force-recreate tenant-service";
String tenantImageName = resolveServiceImageName("tenant-service");
if (tenantImageName == null) tenantImageName = "";
// Shell 脚本:重建 → 等待健康 → 不健康则回滚
// 健康检查:60s 内每 10s 轮询一次,检测到 running 即视为成功;检测到 exited 则提前结束轮询
String shellCmd = "sleep 8 && "
+ "OLD_ID='" + oldTenantImageId.replace("'", "") + "' && "
+ "IMG='" + tenantImageName.replace("'", "") + "' && "
+ "docker compose -f " + composeFile + " up -d --no-deps --force-recreate tenant-service && "
+ "HEALTHY=false && "
+ "for i in 1 2 3 4 5 6; do "
+ " sleep 10; "
+ " if docker ps --filter 'label=com.docker.compose.service=tenant-service' "
+ " --filter 'status=running' -q 2>/dev/null | grep -q .; then "
+ " HEALTHY=true; break; "
+ " fi; "
+ " if docker ps -a --filter 'label=com.docker.compose.service=tenant-service' "
+ " --filter 'status=exited' -q 2>/dev/null | grep -q .; then "
+ " break; "
+ " fi; "
+ "done; "
+ "if [ \"$HEALTHY\" != \"true\" ] && [ -n \"$OLD_ID\" ] && [ -n \"$IMG\" ]; then "
+ " echo '[ROLLBACK] tenant-service unhealthy, reverting to old image...'; "
+ " docker tag \"$OLD_ID\" \"$IMG\"; "
+ " docker compose -f " + composeFile + " up -d --no-deps --force-recreate tenant-service; "
+ "fi";
Process p = new ProcessBuilder(
"docker", "run", "-d", "--rm",