fix(安全中心): 修复健康检查误判 force-recreate 旧容器为失败
force-recreate 会先停掉旧容器(status=exited),若此时健康检查轮询到旧容器的 exited 状态,会误判新容器失败并触发不必要的回滚。
修复方式:
- 新增 getNewestContainerId(),在 compose up 后立即拿到新容器 ID
- waitForServiceStable 接受 containerId 参数,通过 docker inspect 精确轮询新容器状态,完全隔离旧容器的干扰
- 退化路径(containerId=null)保留原有服务名轮询逻辑
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
这个提交包含在:
父节点
9a9524ac07
当前提交
9084831b2a
@@ -799,8 +799,10 @@ public class SystemUpdateService {
|
||||
emit.accept(" restarting " + svc + " ...");
|
||||
exec(emit, "docker", "compose", "-f", composeFile,
|
||||
"up", "-d", "--no-deps", "--force-recreate", svc);
|
||||
// 拿到 compose up 之后最新创建的容器 ID,排除旧容器干扰
|
||||
String newContainerId = getNewestContainerId(svc);
|
||||
|
||||
boolean healthy = waitForServiceStable(emit, svc, HEALTH_CHECK_TIMEOUT_SEC);
|
||||
boolean healthy = waitForServiceStable(emit, svc, newContainerId, HEALTH_CHECK_TIMEOUT_SEC);
|
||||
if (healthy) {
|
||||
emit.accept(" " + svc + " ✓");
|
||||
} else {
|
||||
@@ -892,12 +894,34 @@ public class SystemUpdateService {
|
||||
}
|
||||
|
||||
/**
|
||||
* 轮询容器状态,直到容器持续 HEALTH_STABLE_REQUIRED_SEC 秒保持 running。
|
||||
* 若检测到容器已 exited,立即返回 false(快速失败)。
|
||||
* 获取指定服务最新创建的容器 ID(含已停止容器)。
|
||||
* 在 docker compose up --force-recreate 之后立即调用,确保拿到新容器而非旧容器。
|
||||
*/
|
||||
private boolean waitForServiceStable(Consumer<String> emit, String service, int timeoutSeconds) {
|
||||
int elapsed = 0;
|
||||
int stableSeconds = 0;
|
||||
private String getNewestContainerId(String service) {
|
||||
try {
|
||||
Process p = new ProcessBuilder(
|
||||
"docker", "ps", "-a", "-n", "1",
|
||||
"--filter", "label=com.docker.compose.service=" + service,
|
||||
"--format", "{{.ID}}"
|
||||
).redirectErrorStream(true).start();
|
||||
String out = new String(p.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
|
||||
p.waitFor();
|
||||
return out.isEmpty() ? null : out.split("\n")[0].trim();
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 轮询指定容器的状态,直到该容器持续 HEALTH_STABLE_REQUIRED_SEC 秒保持 running。
|
||||
* 通过 containerId 精确定位新容器,避免 --force-recreate 停掉旧容器时的误判。
|
||||
* 若容器已 exited,立即返回 false(快速失败)。
|
||||
*
|
||||
* @param containerId 新容器 ID;为 null 时退化为服务名轮询
|
||||
*/
|
||||
private boolean waitForServiceStable(Consumer<String> emit, String service, String containerId, int timeoutSeconds) {
|
||||
int elapsed = 0;
|
||||
int stableSeconds = 0;
|
||||
|
||||
while (elapsed < timeoutSeconds) {
|
||||
try { Thread.sleep(HEALTH_CHECK_INTERVAL_SEC * 1000L); }
|
||||
@@ -906,39 +930,43 @@ public class SystemUpdateService {
|
||||
stableSeconds += HEALTH_CHECK_INTERVAL_SEC;
|
||||
|
||||
try {
|
||||
// 检查 running 状态
|
||||
Process runPs = new ProcessBuilder(
|
||||
"docker", "ps",
|
||||
"--filter", "label=com.docker.compose.service=" + service,
|
||||
"--filter", "status=running",
|
||||
"--format", "{{.ID}}"
|
||||
).redirectErrorStream(true).start();
|
||||
String running = new String(runPs.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
|
||||
runPs.waitFor();
|
||||
|
||||
if (running.isEmpty()) {
|
||||
// 检查是否已 exited(快速失败)
|
||||
Process exitPs = new ProcessBuilder(
|
||||
"docker", "ps", "-a",
|
||||
"--filter", "label=com.docker.compose.service=" + service,
|
||||
"--filter", "status=exited",
|
||||
"--format", "{{.Status}}"
|
||||
String statusLine;
|
||||
if (containerId != null) {
|
||||
// 直接 inspect 新容器,避免旧容器干扰
|
||||
Process ins = new ProcessBuilder(
|
||||
"docker", "inspect", "--format",
|
||||
"{{.State.Status}} {{.State.ExitCode}}", containerId
|
||||
).redirectErrorStream(true).start();
|
||||
String exited = new String(exitPs.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
|
||||
exitPs.waitFor();
|
||||
|
||||
if (!exited.isEmpty()) {
|
||||
emit.accept(" [健康检查] " + service + " 已退出 (" + exited + "),快速判定失败");
|
||||
return false;
|
||||
}
|
||||
stableSeconds = 0;
|
||||
emit.accept(" [健康检查] " + service + " 等待启动... (" + elapsed + "/" + timeoutSeconds + "s)");
|
||||
statusLine = new String(ins.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
|
||||
ins.waitFor();
|
||||
} else {
|
||||
// 退化模式:查 running 容器
|
||||
Process runPs = new ProcessBuilder(
|
||||
"docker", "ps",
|
||||
"--filter", "label=com.docker.compose.service=" + service,
|
||||
"--filter", "status=running",
|
||||
"--format", "{{.ID}}"
|
||||
).redirectErrorStream(true).start();
|
||||
String runOut = new String(runPs.getInputStream().readAllBytes(), StandardCharsets.UTF_8).trim();
|
||||
runPs.waitFor();
|
||||
statusLine = runOut.isEmpty() ? "unknown 0" : "running 0";
|
||||
}
|
||||
|
||||
if (statusLine.startsWith("running")) {
|
||||
emit.accept(" [健康检查] " + service + " running ("
|
||||
+ stableSeconds + "/" + HEALTH_STABLE_REQUIRED_SEC + "s)");
|
||||
if (stableSeconds >= HEALTH_STABLE_REQUIRED_SEC) {
|
||||
return true;
|
||||
}
|
||||
} else if (statusLine.startsWith("exited")) {
|
||||
// 快速失败:新容器已退出
|
||||
emit.accept(" [健康检查] " + service + " 已退出 (" + statusLine + "),快速判定失败");
|
||||
return false;
|
||||
} else {
|
||||
// created / paused / restarting 等中间状态
|
||||
stableSeconds = 0;
|
||||
emit.accept(" [健康检查] " + service + " 等待启动... status=" + statusLine
|
||||
+ " (" + elapsed + "/" + timeoutSeconds + "s)");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
stableSeconds = 0;
|
||||
@@ -971,8 +999,9 @@ public class SystemUpdateService {
|
||||
}
|
||||
|
||||
exec(emit, "docker", "compose", "-f", composeFile, "up", "-d", "--no-deps", "--force-recreate", service);
|
||||
String rollbackContainerId = getNewestContainerId(service);
|
||||
|
||||
boolean ok = waitForServiceStable(emit, service, 60);
|
||||
boolean ok = waitForServiceStable(emit, service, rollbackContainerId, 60);
|
||||
if (ok) {
|
||||
emit.accept(" [回滚] " + service + " 已回滚到旧版本 ✓");
|
||||
} else {
|
||||
|
||||
正在加载...
在新工单中引用
屏蔽一个用户