feat(安全中心): 一键更新增加健康检查与自动回滚
每个服务重建后轮询容器状态最长 60s: - 容器保持 running 10s 以上 → 更新成功 - 容器已 exited → 立即触发回滚(retag 旧镜像 ID 重建容器) - 超时未就绪 → 同样触发回滚 tenant-service 的自更新助手容器也包含相同逻辑: 60s 内不健康则 retag 旧镜像并重建,保证平台始终可访问。 拉取镜像前统一保存各服务旧镜像 ID(captureCurrentImageIds), 回滚时通过 docker tag <old-id> <image:tag> 恢复旧版本。 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
这个提交包含在:
父节点
ffdb7c56fe
当前提交
9a9524ac07
@ -49,6 +49,11 @@ public class SystemUpdateService {
|
|||||||
"file-service", "tenant-web", "im-service", "push-service", "update-service", "license-service", "nginx"
|
"file-service", "tenant-web", "im-service", "push-service", "update-service", "license-service", "nginx"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Health-check settings: a freshly recreated container must stay in the running state
// for the required stable period, within the overall timeout, to count as healthy.
|
||||||
|
// Upper bound, in seconds, passed to the health-check wait for each rebuilt service.
private static final int HEALTH_CHECK_TIMEOUT_SEC = 60;
|
||||||
|
// Seconds of consecutive "running" observations needed before the service is accepted.
private static final int HEALTH_STABLE_REQUIRED_SEC = 10;
|
||||||
|
// Sleep between two status polls, in seconds; also the step by which elapsed time advances.
private static final int HEALTH_CHECK_INTERVAL_SEC = 5;
|
||||||
|
|
||||||
private static final Set<String> ALLOWED_LOG_SERVICES = Set.of(
|
private static final Set<String> ALLOWED_LOG_SERVICES = Set.of(
|
||||||
"tenant-service", "file-service", "im-service", "push-service",
|
"tenant-service", "file-service", "im-service", "push-service",
|
||||||
"update-service", "license-service", "nginx", "tenant-web"
|
"update-service", "license-service", "nginx", "tenant-web"
|
||||||
@ -297,6 +302,12 @@ public class SystemUpdateService {
|
|||||||
// 确保 tenant-service 在最后
|
// 确保 tenant-service 在最后
|
||||||
toUpdate.remove("tenant-service");
|
toUpdate.remove("tenant-service");
|
||||||
|
|
||||||
|
// 拉取前先保存所有服务的旧镜像 ID,用于启动失败时回滚
|
||||||
|
List<String> allToSnapshot = new ArrayList<>(toUpdate);
|
||||||
|
allToSnapshot.add("tenant-service");
|
||||||
|
Map<String, String> oldImageIds = captureCurrentImageIds(allToSnapshot);
|
||||||
|
emit.accept(" 已快照 " + oldImageIds.size() + " 个服务的旧版本镜像(更新失败时自动回滚)");
|
||||||
|
|
||||||
emit.accept(">>> 拉取镜像(" + toUpdate.size() + " 个服务)...");
|
emit.accept(">>> 拉取镜像(" + toUpdate.size() + " 个服务)...");
|
||||||
for (String svc : toUpdate) {
|
for (String svc : toUpdate) {
|
||||||
emit.accept(" pulling " + svc + " ...");
|
emit.accept(" pulling " + svc + " ...");
|
||||||
@ -306,7 +317,7 @@ public class SystemUpdateService {
|
|||||||
exec(emit, "docker", "compose", "-f", composeFile, "pull", "--quiet", "tenant-service");
|
exec(emit, "docker", "compose", "-f", composeFile, "pull", "--quiet", "tenant-service");
|
||||||
emit.accept(">>> 镜像拉取完成");
|
emit.accept(">>> 镜像拉取完成");
|
||||||
|
|
||||||
restartAndSelfUpdate(emit, composeFile);
|
restartAndSelfUpdate(emit, composeFile, oldImageIds);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** 拉取最新镜像并重建所有容器。 */
|
/** 拉取最新镜像并重建所有容器。 */
|
||||||
@ -314,13 +325,13 @@ public class SystemUpdateService {
|
|||||||
runSelectiveUpdate(emit, null);
|
runSelectiveUpdate(emit, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** 保留数据,重置容器和数据库表结构。 */
|
/** 保留数据,重置容器和数据库表结构。重置不涉及镜像变更,不做回滚。 */
|
||||||
public void runReset(Consumer<String> emit) {
|
public void runReset(Consumer<String> emit) {
|
||||||
String composeFile = deployRoot + "/docker-compose.yml";
|
String composeFile = deployRoot + "/docker-compose.yml";
|
||||||
|
|
||||||
patchConfigs(emit);
|
patchConfigs(emit);
|
||||||
resetDatabaseSchema(emit);
|
resetDatabaseSchema(emit);
|
||||||
restartAndSelfUpdate(emit, composeFile);
|
restartAndSelfUpdate(emit, composeFile, Map.of());
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── 数据库重置(保留核心数据)──────────────────────────────────────────────
|
// ── 数据库重置(保留核心数据)──────────────────────────────────────────────
|
||||||
@ -774,13 +785,56 @@ public class SystemUpdateService {
|
|||||||
|
|
||||||
// ── 重启核心 ────────────────────────────────────────────────────────────────
|
// ── 重启核心 ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
private void restartAndSelfUpdate(Consumer<String> emit, String composeFile) {
|
/**
|
||||||
emit.accept(">>> 重建各服务容器...");
|
* 重建各服务容器,并对每个服务进行健康检查。
|
||||||
|
* 若新容器在 HEALTH_CHECK_TIMEOUT_SEC 内未保持稳定运行,自动回滚到旧镜像。
|
||||||
|
* @param oldImageIds 拉取新镜像前保存的旧镜像 ID(sha256);为空时跳过回滚
|
||||||
|
*/
|
||||||
|
private void restartAndSelfUpdate(Consumer<String> emit, String composeFile, Map<String, String> oldImageIds) {
|
||||||
|
emit.accept(">>> 重建各服务容器(含健康检查与自动回滚)...");
|
||||||
|
List<String> rolledBack = new ArrayList<>();
|
||||||
|
List<String> failed = new ArrayList<>();
|
||||||
|
|
||||||
for (String svc : OTHER_SERVICES) {
|
for (String svc : OTHER_SERVICES) {
|
||||||
emit.accept(" restarting " + svc + " ...");
|
emit.accept(" restarting " + svc + " ...");
|
||||||
exec(emit, "docker", "compose", "-f", composeFile,
|
exec(emit, "docker", "compose", "-f", composeFile,
|
||||||
"up", "-d", "--no-deps", "--force-recreate", svc);
|
"up", "-d", "--no-deps", "--force-recreate", svc);
|
||||||
|
|
||||||
|
boolean healthy = waitForServiceStable(emit, svc, HEALTH_CHECK_TIMEOUT_SEC);
|
||||||
|
if (healthy) {
|
||||||
emit.accept(" " + svc + " ✓");
|
emit.accept(" " + svc + " ✓");
|
||||||
|
} else {
|
||||||
|
String oldId = oldImageIds.get(svc);
|
||||||
|
if (oldId != null && !oldId.isBlank()) {
|
||||||
|
emit.accept(" [警告] " + svc + " 启动失败,正在回滚旧版本...");
|
||||||
|
boolean rollbackOk = rollbackService(emit, composeFile, svc, oldId);
|
||||||
|
if (rollbackOk) {
|
||||||
|
rolledBack.add(svc);
|
||||||
|
} else {
|
||||||
|
failed.add(svc);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
emit.accept(" [错误] " + svc + " 启动失败且无旧镜像 ID,无法自动回滚");
|
||||||
|
failed.add(svc);
|
||||||
|
}
|
||||||
|
// 输出该服务最近日志,辅助排查
|
||||||
|
try {
|
||||||
|
String tail = getServiceLogs(svc, 30);
|
||||||
|
emit.accept(" --- " + svc + " 近期日志(末30行)---");
|
||||||
|
for (String l : tail.split("\n")) {
|
||||||
|
if (!l.isBlank()) emit.accept(" " + l);
|
||||||
|
}
|
||||||
|
emit.accept(" ---");
|
||||||
|
} catch (Exception ignored) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!rolledBack.isEmpty()) {
|
||||||
|
emit.accept(">>> [警告] 以下服务已自动回滚到旧版本: " + String.join(", ", rolledBack));
|
||||||
|
emit.accept(">>> 请检查代码或配置后重新发版。");
|
||||||
|
}
|
||||||
|
if (!failed.isEmpty()) {
|
||||||
|
emit.accept(">>> [严重] 以下服务更新失败且回滚无效,需人工介入: " + String.join(", ", failed));
|
||||||
}
|
}
|
||||||
|
|
||||||
emit.accept(">>> 启动自更新助手容器...");
|
emit.accept(">>> 启动自更新助手容器...");
|
||||||
@ -791,7 +845,7 @@ public class SystemUpdateService {
|
|||||||
emit.accept("DONE");
|
emit.accept("DONE");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
boolean helperStarted = spawnSelfUpdater(composeFile, selfImage);
|
boolean helperStarted = spawnSelfUpdater(composeFile, selfImage, oldImageIds.getOrDefault("tenant-service", ""));
|
||||||
if (helperStarted) {
|
if (helperStarted) {
|
||||||
emit.accept(">>> 助手容器已就绪,tenant-service 即将重建(连接将短暂中断)...");
|
emit.accept(">>> 助手容器已就绪,tenant-service 即将重建(连接将短暂中断)...");
|
||||||
emit.accept("RESTART_SELF");
|
emit.accept("RESTART_SELF");
|
||||||
@ -802,6 +856,146 @@ public class SystemUpdateService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── 镜像快照与健康检查 ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 在拉取新镜像前,保存各服务当前运行容器的镜像 ID(sha256)。
|
||||||
|
* 存为 Map<serviceName, imageId>,用于更新失败时 docker tag 回旧版本。
|
||||||
|
*/
|
||||||
|
private Map<String, String> captureCurrentImageIds(List<String> services) {
|
||||||
|
Map<String, String> ids = new LinkedHashMap<>();
|
||||||
|
for (String svc : services) {
|
||||||
|
try {
|
||||||
|
Process ps = new ProcessBuilder(
|
||||||
|
"docker", "ps",
|
||||||
|
"--filter", "label=com.docker.compose.service=" + svc,
|
||||||
|
"--format", "{{.ID}}"
|
||||||
|
).redirectErrorStream(true).start();
|
||||||
|
String containerId = new String(ps.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
|
||||||
|
ps.waitFor();
|
||||||
|
if (containerId.isEmpty()) continue;
|
||||||
|
containerId = containerId.split("\n")[0].trim();
|
||||||
|
|
||||||
|
Process inspect = new ProcessBuilder(
|
||||||
|
"docker", "inspect", "--format", "{{.Image}}", containerId
|
||||||
|
).redirectErrorStream(true).start();
|
||||||
|
String imageId = new String(inspect.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
|
||||||
|
inspect.waitFor();
|
||||||
|
if (!imageId.isEmpty()) {
|
||||||
|
ids.put(svc, imageId);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("captureCurrentImageIds: failed for {}: {}", svc, e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ids;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 轮询容器状态,直到容器持续 HEALTH_STABLE_REQUIRED_SEC 秒保持 running。
|
||||||
|
* 若检测到容器已 exited,立即返回 false(快速失败)。
|
||||||
|
*/
|
||||||
|
private boolean waitForServiceStable(Consumer<String> emit, String service, int timeoutSeconds) {
|
||||||
|
int elapsed = 0;
|
||||||
|
int stableSeconds = 0;
|
||||||
|
|
||||||
|
while (elapsed < timeoutSeconds) {
|
||||||
|
try { Thread.sleep(HEALTH_CHECK_INTERVAL_SEC * 1000L); }
|
||||||
|
catch (InterruptedException e) { Thread.currentThread().interrupt(); return false; }
|
||||||
|
elapsed += HEALTH_CHECK_INTERVAL_SEC;
|
||||||
|
stableSeconds += HEALTH_CHECK_INTERVAL_SEC;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// 检查 running 状态
|
||||||
|
Process runPs = new ProcessBuilder(
|
||||||
|
"docker", "ps",
|
||||||
|
"--filter", "label=com.docker.compose.service=" + service,
|
||||||
|
"--filter", "status=running",
|
||||||
|
"--format", "{{.ID}}"
|
||||||
|
).redirectErrorStream(true).start();
|
||||||
|
String running = new String(runPs.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
|
||||||
|
runPs.waitFor();
|
||||||
|
|
||||||
|
if (running.isEmpty()) {
|
||||||
|
// 检查是否已 exited(快速失败)
|
||||||
|
Process exitPs = new ProcessBuilder(
|
||||||
|
"docker", "ps", "-a",
|
||||||
|
"--filter", "label=com.docker.compose.service=" + service,
|
||||||
|
"--filter", "status=exited",
|
||||||
|
"--format", "{{.Status}}"
|
||||||
|
).redirectErrorStream(true).start();
|
||||||
|
String exited = new String(exitPs.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
|
||||||
|
exitPs.waitFor();
|
||||||
|
|
||||||
|
if (!exited.isEmpty()) {
|
||||||
|
emit.accept(" [健康检查] " + service + " 已退出 (" + exited + "),快速判定失败");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
stableSeconds = 0;
|
||||||
|
emit.accept(" [健康检查] " + service + " 等待启动... (" + elapsed + "/" + timeoutSeconds + "s)");
|
||||||
|
} else {
|
||||||
|
emit.accept(" [健康检查] " + service + " running ("
|
||||||
|
+ stableSeconds + "/" + HEALTH_STABLE_REQUIRED_SEC + "s)");
|
||||||
|
if (stableSeconds >= HEALTH_STABLE_REQUIRED_SEC) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
stableSeconds = 0;
|
||||||
|
emit.accept(" [健康检查] " + service + " 状态查询异常: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
emit.accept(" [健康检查] " + service + " 超时(" + timeoutSeconds + "s 内未就绪)");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 将指定服务回滚到旧镜像。
|
||||||
|
* 先将旧镜像 ID 重新 tag 为 :latest,再 docker compose up 重建容器。
|
||||||
|
* @return true 表示回滚后服务成功启动
|
||||||
|
*/
|
||||||
|
private boolean rollbackService(Consumer<String> emit, String composeFile, String service, String oldImageId) {
|
||||||
|
String imageName = resolveServiceImageName(service);
|
||||||
|
if (imageName != null) {
|
||||||
|
try {
|
||||||
|
Process tag = new ProcessBuilder("docker", "tag", oldImageId, imageName)
|
||||||
|
.redirectErrorStream(true).start();
|
||||||
|
String tagOut = new String(tag.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
|
||||||
|
int tagCode = tag.waitFor();
|
||||||
|
if (tagCode != 0) {
|
||||||
|
emit.accept(" [回滚警告] docker tag 返回 " + tagCode + (tagOut.isEmpty() ? "" : ": " + tagOut));
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
emit.accept(" [回滚警告] 重新标记镜像失败: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
exec(emit, "docker", "compose", "-f", composeFile, "up", "-d", "--no-deps", "--force-recreate", service);
|
||||||
|
|
||||||
|
boolean ok = waitForServiceStable(emit, service, 60);
|
||||||
|
if (ok) {
|
||||||
|
emit.accept(" [回滚] " + service + " 已回滚到旧版本 ✓");
|
||||||
|
} else {
|
||||||
|
emit.accept(" [严重] " + service + " 回滚后仍无法启动,请人工介入!");
|
||||||
|
emit.accept(" [诊断] docker logs $(docker ps -a --filter label=com.docker.compose.service=" + service + " -q --latest)");
|
||||||
|
}
|
||||||
|
return ok;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** 从 .env 读取 REGISTRY 和 IMAGE_TAG,拼接服务完整镜像名(registry/service:tag)。 */
|
||||||
|
private String resolveServiceImageName(String service) {
|
||||||
|
try {
|
||||||
|
Path envFile = Paths.get(deployRoot, ".env");
|
||||||
|
String registry = readEnvValue(envFile, "REGISTRY");
|
||||||
|
String imageTag = readEnvValue(envFile, "IMAGE_TAG");
|
||||||
|
if (registry == null) return null;
|
||||||
|
if (imageTag == null || imageTag.isBlank()) imageTag = "latest";
|
||||||
|
return registry + "/" + service + ":" + imageTag;
|
||||||
|
} catch (Exception e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── 配置文件热修复 ──────────────────────────────────────────────────────────
|
// ── 配置文件热修复 ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
private void patchConfigs(Consumer<String> emit) {
|
private void patchConfigs(Consumer<String> emit) {
|
||||||
@ -1001,13 +1195,43 @@ public class SystemUpdateService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean spawnSelfUpdater(String composeFile, String image) {
|
/**
|
||||||
|
* 启动自更新助手容器,负责重建 tenant-service(当前进程无法重建自身)。
|
||||||
|
* 包含健康检查:若新容器在 60s 内未保持运行,自动回滚到旧镜像。
|
||||||
|
*
|
||||||
|
* @param oldTenantImageId 更新前保存的旧镜像 ID,空字符串表示无法回滚
|
||||||
|
*/
|
||||||
|
private boolean spawnSelfUpdater(String composeFile, String image, String oldTenantImageId) {
|
||||||
try {
|
try {
|
||||||
new ProcessBuilder("docker", "rm", "-f", "xuqm-self-updater")
|
new ProcessBuilder("docker", "rm", "-f", "xuqm-self-updater")
|
||||||
.redirectErrorStream(true).start().waitFor();
|
.redirectErrorStream(true).start().waitFor();
|
||||||
|
|
||||||
String shellCmd = "sleep 8 && docker compose -f " + composeFile
|
String tenantImageName = resolveServiceImageName("tenant-service");
|
||||||
+ " up -d --no-deps --force-recreate tenant-service";
|
if (tenantImageName == null) tenantImageName = "";
|
||||||
|
|
||||||
|
// Shell 脚本:重建 → 等待健康 → 不健康则回滚
|
||||||
|
// 健康检查:60s 内每 10s 轮询一次,连续 running 即视为成功
|
||||||
|
String shellCmd = "sleep 8 && "
|
||||||
|
+ "OLD_ID='" + oldTenantImageId.replace("'", "") + "' && "
|
||||||
|
+ "IMG='" + tenantImageName.replace("'", "") + "' && "
|
||||||
|
+ "docker compose -f " + composeFile + " up -d --no-deps --force-recreate tenant-service && "
|
||||||
|
+ "HEALTHY=false && "
|
||||||
|
+ "for i in 1 2 3 4 5 6; do "
|
||||||
|
+ " sleep 10; "
|
||||||
|
+ " if docker ps --filter 'label=com.docker.compose.service=tenant-service' "
|
||||||
|
+ " --filter 'status=running' -q 2>/dev/null | grep -q .; then "
|
||||||
|
+ " HEALTHY=true; break; "
|
||||||
|
+ " fi; "
|
||||||
|
+ " if docker ps -a --filter 'label=com.docker.compose.service=tenant-service' "
|
||||||
|
+ " --filter 'status=exited' -q 2>/dev/null | grep -q .; then "
|
||||||
|
+ " break; "
|
||||||
|
+ " fi; "
|
||||||
|
+ "done; "
|
||||||
|
+ "if [ \"$HEALTHY\" != \"true\" ] && [ -n \"$OLD_ID\" ] && [ -n \"$IMG\" ]; then "
|
||||||
|
+ " echo '[ROLLBACK] tenant-service unhealthy, reverting to old image...'; "
|
||||||
|
+ " docker tag \"$OLD_ID\" \"$IMG\"; "
|
||||||
|
+ " docker compose -f " + composeFile + " up -d --no-deps --force-recreate tenant-service; "
|
||||||
|
+ "fi";
|
||||||
|
|
||||||
Process p = new ProcessBuilder(
|
Process p = new ProcessBuilder(
|
||||||
"docker", "run", "-d", "--rm",
|
"docker", "run", "-d", "--rm",
|
||||||
|
|||||||
正在加载...
在新工单中引用
屏蔽一个用户