@@ -48,10 +48,10 @@ pub use crate::docker::{
4848 DockerPreflight , ExistingGatewayInfo , check_docker_available, create_ssh_docker_client,
4949} ;
5050pub use crate :: metadata:: {
51- GatewayMetadata , clear_active_gateway, extract_host_from_ssh_destination, get_gateway_metadata ,
52- list_gateways, load_active_gateway, load_gateway_metadata, load_last_sandbox ,
53- remove_gateway_metadata, resolve_ssh_hostname, save_active_gateway, save_last_sandbox ,
54- store_gateway_metadata,
51+ GatewayMetadata , clear_active_gateway, extract_host_from_ssh_destination,
52+ get_gateway_metadata , list_gateways, load_active_gateway, load_gateway_metadata,
53+ load_last_sandbox , remove_gateway_metadata, resolve_ssh_hostname, save_active_gateway,
54+ save_last_sandbox , store_gateway_metadata,
5555} ;
5656
5757/// Options for remote SSH deployment.
@@ -950,6 +950,116 @@ async fn probe_container_dns(docker: &Docker, container_name: &str) -> Result<bo
950950 Ok ( exit_code == 0 && output. contains ( "DNS_OK" ) )
951951}
952952
953+ /// Query the status and logs of the `helm-install-<chart>` Job(s) that k3s runs
954+ /// at startup to deploy the embedded Helm charts (e.g. the openshell chart).
955+ ///
956+ /// When the Job has failed we return a formatted string containing the Job
957+ /// failure reason and the last 30 lines of its pod logs so that callers can
958+ /// surface this as the *real* cause of the "namespace not ready" timeout.
959+ ///
960+ /// Returns `None` when:
961+ /// - the exec into the container itself fails (container not running), or
962+ /// - no failed Helm install Job is found.
963+ async fn diagnose_helm_failure (
964+ docker : & Docker ,
965+ container_name : & str ,
966+ kubeconfig : & str ,
967+ ) -> Option < String > {
968+ // Find all Helm install Jobs that have at least one failed condition.
969+ // Query jobs that have either a non-zero status.failed count OR a pod in
970+ // Error/CrashLoopBackOff (status.failed stays "<none>" while the job is
971+ // still within its backoffLimit retry window).
972+ let ( job_output, job_exit) = exec_capture_with_exit (
973+ docker,
974+ container_name,
975+ vec ! [
976+ "sh" . to_string( ) ,
977+ "-c" . to_string( ) ,
978+ format!(
979+ "KUBECONFIG={kubeconfig} kubectl get jobs -n kube-system \
980+ --no-headers -o custom-columns=NAME:.metadata.name,FAILED:.status.failed \
981+ 2>/dev/null | awk '{{if ($2 != \" 0\" ) print $1}}'"
982+ ) ,
983+ ] ,
984+ )
985+ . await
986+ . ok ( ) ?;
987+
988+ if job_exit != 0 || job_output. trim ( ) . is_empty ( ) {
989+ return None ;
990+ }
991+
992+ // Collect failed Helm install jobs (k3s names them `helm-install-<chart>`).
993+ let failed_jobs: Vec < & str > = job_output
994+ . lines ( )
995+ . map ( str:: trim)
996+ . filter ( |l| !l. is_empty ( ) && l. starts_with ( "helm-install-" ) )
997+ . collect ( ) ;
998+
999+ if failed_jobs. is_empty ( ) {
1000+ return None ;
1001+ }
1002+
1003+ let mut parts: Vec < String > = Vec :: new ( ) ;
1004+
1005+ for job in & failed_jobs {
1006+ // Get the Job's status conditions for a concise failure reason.
1007+ let cond_output = exec_capture_with_exit (
1008+ docker,
1009+ container_name,
1010+ vec ! [
1011+ "sh" . to_string( ) ,
1012+ "-c" . to_string( ) ,
1013+ format!(
1014+ "KUBECONFIG={kubeconfig} kubectl get job {job} -n kube-system \
1015+ -o jsonpath='{{range .status.conditions[*]}}{{.type}}: {{.message}}{{\" \\ n\" }}{{end}}' \
1016+ 2>/dev/null"
1017+ ) ,
1018+ ] ,
1019+ )
1020+ . await
1021+ . map ( |( out, _) | out)
1022+ . unwrap_or_default ( ) ;
1023+
1024+ // Get the last 30 lines of logs from the Job's pod(s).
1025+ let log_output = exec_capture_with_exit (
1026+ docker,
1027+ container_name,
1028+ vec ! [
1029+ "sh" . to_string( ) ,
1030+ "-c" . to_string( ) ,
1031+ format!(
1032+ "KUBECONFIG={kubeconfig} kubectl logs -n kube-system \
1033+ -l job-name={job} --tail=30 2>&1"
1034+ ) ,
1035+ ] ,
1036+ )
1037+ . await
1038+ . map ( |( out, _) | out)
1039+ . unwrap_or_default ( ) ;
1040+
1041+ let mut section = format ! ( "Job {job} failed." ) ;
1042+ let cond = cond_output. trim ( ) ;
1043+ if !cond. is_empty ( ) {
1044+ section. push_str ( & format ! ( "\n Status: {}" , cond. replace( '\n' , "\n " ) ) ) ;
1045+ }
1046+ let logs = log_output. trim ( ) ;
1047+ if !logs. is_empty ( ) {
1048+ section. push_str ( "\n Last job logs:" ) ;
1049+ for line in logs. lines ( ) . take ( 30 ) {
1050+ section. push_str ( & format ! ( "\n {line}" ) ) ;
1051+ }
1052+ }
1053+ parts. push ( section) ;
1054+ }
1055+
1056+ if parts. is_empty ( ) {
1057+ None
1058+ } else {
1059+ Some ( parts. join ( "\n \n " ) )
1060+ }
1061+ }
1062+
9531063async fn wait_for_namespace (
9541064 docker : & Docker ,
9551065 container_name : & str ,
@@ -1040,6 +1150,20 @@ async fn wait_for_namespace(
10401150 }
10411151
10421152 if attempt + 1 == attempts {
1153+ // Before returning a generic timeout error, check whether a Helm
1154+ // install job failed. If so, surface the real Helm error so the
1155+ // user doesn't have to dig through job logs manually.
1156+ let helm_hint = diagnose_helm_failure ( docker, container_name, kubeconfig) . await ;
1157+ if let Some ( hint) = helm_hint {
1158+ return Err ( miette:: miette!(
1159+ "timed out waiting for namespace '{namespace}' to exist.\n \n \
1160+ A Helm install job appears to have failed — this is likely the root cause:\n \n \
1161+ {hint}\n \n \
1162+ To inspect the full job logs run:\n \
1163+ kubectl logs -n kube-system -l job-name=helm-install-openshell --tail=50"
1164+ ) )
1165+ . wrap_err ( "K8s namespace not ready" ) ;
1166+ }
10431167 let logs = fetch_recent_logs ( docker, container_name, 40 ) . await ;
10441168 return Err ( miette:: miette!(
10451169 "timed out waiting for namespace '{namespace}' to exist: {output}\n {logs}"
@@ -1077,4 +1201,30 @@ mod tests {
10771201 ) ;
10781202 }
10791203 }
1204+
1205+ /// Simulate the error message shape produced by `diagnose_helm_failure` and
1206+ /// ensure that `diagnose_failure` (in errors.rs) does not suppress or
1207+ /// override it — the Helm hint is intentionally surfaced verbatim inside
1208+ /// the `wait_for_namespace` timeout error, so we only need to verify the
1209+ /// string construction here rather than end-to-end container exec.
1210+ #[ test]
1211+ fn helm_failure_hint_is_included_in_namespace_timeout_message ( ) {
1212+ // Replicate the error message that `wait_for_namespace` would produce
1213+ // when `diagnose_helm_failure` returns a non-None hint.
1214+ let helm_hint = "Job helm-install-openshell failed.\n \
1215+ Status: Failed: error validating \" \" : apiVersion not set\n \
1216+ Last job logs:\n Error: INSTALLATION FAILED: unable to build kubernetes \
1217+ objects from release manifest: error validating data: apiVersion not set";
1218+ let error_msg = format ! (
1219+ "timed out waiting for namespace 'openshell' to exist.\n \n \
1220+ A Helm install job appears to have failed — this is likely the root cause:\n \n \
1221+ {helm_hint}\n \n \
1222+ To inspect the full job logs run:\n \
1223+ kubectl logs -n kube-system -l job-name=helm-install-openshell --tail=50"
1224+ ) ;
1225+ assert ! ( error_msg. contains( "helm-install-openshell" ) ) ;
1226+ assert ! ( error_msg. contains( "apiVersion not set" ) ) ;
1227+ assert ! ( error_msg. contains( "INSTALLATION FAILED" ) ) ;
1228+ assert ! ( error_msg. contains( "kubectl logs -n kube-system" ) ) ;
1229+ }
10801230}
0 commit comments