Skip to content

Commit 718d44e

Browse files
fix(bootstrap): surface Helm install failure on namespace timeout (#211)
Signed-off-by: Manoj-engineer <194872717+Manoj-engineer@users.noreply.github.com>
1 parent de9dcaa commit 718d44e

File tree

1 file changed

+154
-4
lines changed
  • crates/openshell-bootstrap/src

1 file changed

+154
-4
lines changed

crates/openshell-bootstrap/src/lib.rs

Lines changed: 154 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,10 @@ pub use crate::docker::{
4848
DockerPreflight, ExistingGatewayInfo, check_docker_available, create_ssh_docker_client,
4949
};
5050
pub use crate::metadata::{
51-
GatewayMetadata, clear_active_gateway, extract_host_from_ssh_destination, get_gateway_metadata,
52-
list_gateways, load_active_gateway, load_gateway_metadata, load_last_sandbox,
53-
remove_gateway_metadata, resolve_ssh_hostname, save_active_gateway, save_last_sandbox,
54-
store_gateway_metadata,
51+
GatewayMetadata, clear_active_gateway, extract_host_from_ssh_destination,
52+
get_gateway_metadata, list_gateways, load_active_gateway, load_gateway_metadata,
53+
load_last_sandbox, remove_gateway_metadata, resolve_ssh_hostname, save_active_gateway,
54+
save_last_sandbox, store_gateway_metadata,
5555
};
5656

5757
/// Options for remote SSH deployment.
@@ -950,6 +950,116 @@ async fn probe_container_dns(docker: &Docker, container_name: &str) -> Result<bo
950950
Ok(exit_code == 0 && output.contains("DNS_OK"))
951951
}
952952

953+
/// Query the status and logs of the `helm-install-<chart>` Job(s) that k3s runs
954+
/// at startup to deploy the embedded Helm charts (e.g. the openshell chart).
955+
///
956+
/// When the Job has failed we return a formatted string containing the Job
957+
/// failure reason and the last 30 lines of its pod logs so that callers can
958+
/// surface this as the *real* cause of the "namespace not ready" timeout.
959+
///
960+
/// Returns `None` when:
961+
/// - the exec into the container itself fails (container not running), or
962+
/// - no failed Helm install Job is found.
963+
async fn diagnose_helm_failure(
964+
docker: &Docker,
965+
container_name: &str,
966+
kubeconfig: &str,
967+
) -> Option<String> {
968+
// Find all Helm install Jobs that have at least one failed condition.
969+
// Query jobs that have either a non-zero status.failed count OR a pod in
970+
// Error/CrashLoopBackOff (status.failed stays "<none>" while the job is
971+
// still within its backoffLimit retry window).
972+
let (job_output, job_exit) = exec_capture_with_exit(
973+
docker,
974+
container_name,
975+
vec![
976+
"sh".to_string(),
977+
"-c".to_string(),
978+
format!(
979+
"KUBECONFIG={kubeconfig} kubectl get jobs -n kube-system \
980+
--no-headers -o custom-columns=NAME:.metadata.name,FAILED:.status.failed \
981+
2>/dev/null | awk '{{if ($2 != \"0\") print $1}}'"
982+
),
983+
],
984+
)
985+
.await
986+
.ok()?;
987+
988+
if job_exit != 0 || job_output.trim().is_empty() {
989+
return None;
990+
}
991+
992+
// Collect failed Helm install jobs (k3s names them `helm-install-<chart>`).
993+
let failed_jobs: Vec<&str> = job_output
994+
.lines()
995+
.map(str::trim)
996+
.filter(|l| !l.is_empty() && l.starts_with("helm-install-"))
997+
.collect();
998+
999+
if failed_jobs.is_empty() {
1000+
return None;
1001+
}
1002+
1003+
let mut parts: Vec<String> = Vec::new();
1004+
1005+
for job in &failed_jobs {
1006+
// Get the Job's status conditions for a concise failure reason.
1007+
let cond_output = exec_capture_with_exit(
1008+
docker,
1009+
container_name,
1010+
vec![
1011+
"sh".to_string(),
1012+
"-c".to_string(),
1013+
format!(
1014+
"KUBECONFIG={kubeconfig} kubectl get job {job} -n kube-system \
1015+
-o jsonpath='{{range .status.conditions[*]}}{{.type}}: {{.message}}{{\"\\n\"}}{{end}}' \
1016+
2>/dev/null"
1017+
),
1018+
],
1019+
)
1020+
.await
1021+
.map(|(out, _)| out)
1022+
.unwrap_or_default();
1023+
1024+
// Get the last 30 lines of logs from the Job's pod(s).
1025+
let log_output = exec_capture_with_exit(
1026+
docker,
1027+
container_name,
1028+
vec![
1029+
"sh".to_string(),
1030+
"-c".to_string(),
1031+
format!(
1032+
"KUBECONFIG={kubeconfig} kubectl logs -n kube-system \
1033+
-l job-name={job} --tail=30 2>&1"
1034+
),
1035+
],
1036+
)
1037+
.await
1038+
.map(|(out, _)| out)
1039+
.unwrap_or_default();
1040+
1041+
let mut section = format!("Job {job} failed.");
1042+
let cond = cond_output.trim();
1043+
if !cond.is_empty() {
1044+
section.push_str(&format!("\n Status: {}", cond.replace('\n', "\n ")));
1045+
}
1046+
let logs = log_output.trim();
1047+
if !logs.is_empty() {
1048+
section.push_str("\n Last job logs:");
1049+
for line in logs.lines().take(30) {
1050+
section.push_str(&format!("\n {line}"));
1051+
}
1052+
}
1053+
parts.push(section);
1054+
}
1055+
1056+
if parts.is_empty() {
1057+
None
1058+
} else {
1059+
Some(parts.join("\n\n"))
1060+
}
1061+
}
1062+
9531063
async fn wait_for_namespace(
9541064
docker: &Docker,
9551065
container_name: &str,
@@ -1040,6 +1150,20 @@ async fn wait_for_namespace(
10401150
}
10411151

10421152
if attempt + 1 == attempts {
1153+
// Before returning a generic timeout error, check whether a Helm
1154+
// install job failed. If so, surface the real Helm error so the
1155+
// user doesn't have to dig through job logs manually.
1156+
let helm_hint = diagnose_helm_failure(docker, container_name, kubeconfig).await;
1157+
if let Some(hint) = helm_hint {
1158+
return Err(miette::miette!(
1159+
"timed out waiting for namespace '{namespace}' to exist.\n\n\
1160+
A Helm install job appears to have failed — this is likely the root cause:\n\n\
1161+
{hint}\n\n\
1162+
To inspect the full job logs run:\n \
1163+
kubectl logs -n kube-system -l job-name=helm-install-openshell --tail=50"
1164+
))
1165+
.wrap_err("K8s namespace not ready");
1166+
}
10431167
let logs = fetch_recent_logs(docker, container_name, 40).await;
10441168
return Err(miette::miette!(
10451169
"timed out waiting for namespace '{namespace}' to exist: {output}\n{logs}"
@@ -1077,4 +1201,30 @@ mod tests {
10771201
);
10781202
}
10791203
}
1204+
1205+
/// Simulate the error message shape produced by `diagnose_helm_failure` and
/// ensure that `diagnose_failure` (in errors.rs) does not suppress or
/// override it — the Helm hint is intentionally surfaced verbatim inside
/// the `wait_for_namespace` timeout error, so we only need to verify the
/// string construction here rather than end-to-end container exec.
#[test]
fn helm_failure_hint_is_included_in_namespace_timeout_message() {
    // A representative hint, shaped like `diagnose_helm_failure` output.
    let hint = "Job helm-install-openshell failed.\n \
        Status: Failed: error validating \"\": apiVersion not set\n \
        Last job logs:\n Error: INSTALLATION FAILED: unable to build kubernetes \
        objects from release manifest: error validating data: apiVersion not set";

    // Render the timeout error exactly as `wait_for_namespace` would when
    // `diagnose_helm_failure` returns a non-None hint.
    let rendered = format!(
        "timed out waiting for namespace 'openshell' to exist.\n\n\
        A Helm install job appears to have failed — this is likely the root cause:\n\n\
        {hint}\n\n\
        To inspect the full job logs run:\n \
        kubectl logs -n kube-system -l job-name=helm-install-openshell --tail=50"
    );

    // Each key fragment of the diagnosis must survive into the final message.
    for needle in [
        "helm-install-openshell",
        "apiVersion not set",
        "INSTALLATION FAILED",
        "kubectl logs -n kube-system",
    ] {
        assert!(rendered.contains(needle));
    }
}
10801230
}

0 commit comments

Comments (0)