Skip to content

Commit 107849e

Browse files
Merge pull request #30966 from BhargaviGudi/OCPBUGS-81716
OCPBUGS-81716: Add retry logic for transient network errors in restartKubeletOnNode
2 parents e03cfa1 + 7e1cff3 commit 107849e

File tree

2 files changed

+63
-18
lines changed

2 files changed

+63
-18
lines changed

test/extended/node/node_swap_cnv.go

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
255255
}()
256256

257257
g.By("Restarting kubelet to load the new configuration")
258-
err = restartKubeletOnNode(oc, cnvWorkerNode)
258+
err = restartKubeletOnNode(ctx, oc, cnvWorkerNode)
259259
o.Expect(err).NotTo(o.HaveOccurred())
260260

261261
g.By("Waiting for node to be ready after kubelet restart")
@@ -295,7 +295,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
295295
o.Expect(err).NotTo(o.HaveOccurred())
296296

297297
g.By("Restarting kubelet to apply LimitedSwap")
298-
err = restartKubeletOnNode(oc, cnvWorkerNode)
298+
err = restartKubeletOnNode(ctx, oc, cnvWorkerNode)
299299
o.Expect(err).NotTo(o.HaveOccurred())
300300
waitForNodeToBeReady(ctx, oc, cnvWorkerNode)
301301

@@ -356,7 +356,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
356356
framework.Logf("Created drop-in file: %s on %s", cnvDropInFilePath, cpNodeName)
357357

358358
g.By(fmt.Sprintf("Restarting kubelet on %s", cpNodeName))
359-
err = restartKubeletOnNode(oc, cpNodeName)
359+
err = restartKubeletOnNode(ctx, oc, cpNodeName)
360360
o.Expect(err).NotTo(o.HaveOccurred())
361361
waitForNodeToBeReady(ctx, oc, cpNodeName)
362362

@@ -413,7 +413,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
413413
framework.Logf("Confirmed: Directory does not exist after deletion")
414414

415415
g.By("Restarting kubelet")
416-
err = restartKubeletOnNode(oc, cnvWorkerNode)
416+
err = restartKubeletOnNode(ctx, oc, cnvWorkerNode)
417417
o.Expect(err).NotTo(o.HaveOccurred())
418418

419419
g.By("Waiting for node to be ready")
@@ -536,7 +536,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
536536

537537
g.By("Restarting kubelet")
538538
framework.Logf("Running: systemctl restart kubelet on node %s", cnvWorkerNode)
539-
err = restartKubeletOnNode(oc, cnvWorkerNode)
539+
err = restartKubeletOnNode(ctx, oc, cnvWorkerNode)
540540
o.Expect(err).NotTo(o.HaveOccurred())
541541
framework.Logf("Kubelet restart initiated, waiting for node to be ready...")
542542
waitForNodeToBeReady(ctx, oc, cnvWorkerNode)
@@ -704,14 +704,14 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
704704
framework.Logf("Removing: %s", file99)
705705
removeDropInFile(oc, cnvWorkerNode, file99)
706706
framework.Logf("Running: systemctl restart kubelet")
707-
restartKubeletOnNode(oc, cnvWorkerNode)
707+
restartKubeletOnNode(ctx, oc, cnvWorkerNode)
708708
waitForNodeToBeReady(ctx, oc, cnvWorkerNode)
709709
framework.Logf("Cleanup completed")
710710
}()
711711

712712
g.By("Restarting kubelet")
713713
framework.Logf("Running: systemctl restart kubelet")
714-
err = restartKubeletOnNode(oc, cnvWorkerNode)
714+
err = restartKubeletOnNode(ctx, oc, cnvWorkerNode)
715715
o.Expect(err).NotTo(o.HaveOccurred())
716716
framework.Logf("Waiting for node to be ready...")
717717
waitForNodeToBeReady(ctx, oc, cnvWorkerNode)
@@ -776,7 +776,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
776776
framework.Logf("Removing drop-in file from node: %s", node)
777777
removeDropInFile(oc, node, cnvDropInFilePath)
778778
framework.Logf("Restarting kubelet on node: %s", node)
779-
restartKubeletOnNode(oc, node)
779+
restartKubeletOnNode(ctx, oc, node)
780780
}
781781
for _, node := range cnvNodes {
782782
framework.Logf("Waiting for node %s to be ready...", node)
@@ -813,7 +813,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
813813
g.By("Restarting kubelet on all CNV nodes")
814814
for _, node := range cnvNodes {
815815
framework.Logf("Running: systemctl restart kubelet on node %s", node)
816-
err := restartKubeletOnNode(oc, node)
816+
err := restartKubeletOnNode(ctx, oc, node)
817817
o.Expect(err).NotTo(o.HaveOccurred())
818818
}
819819

@@ -946,13 +946,13 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
946946
framework.Logf("Note: OS swap was initially enabled, may need manual re-enable")
947947
}
948948
framework.Logf("Restarting kubelet on node: %s", cnvWorkerNode)
949-
restartKubeletOnNode(oc, cnvWorkerNode)
949+
restartKubeletOnNode(ctx, oc, cnvWorkerNode)
950950
waitForNodeToBeReady(ctx, oc, cnvWorkerNode)
951951
}()
952952

953953
g.By("Restarting kubelet with LimitedSwap config but no OS swap")
954954
framework.Logf("Running: systemctl restart kubelet on node %s", cnvWorkerNode)
955-
err = restartKubeletOnNode(oc, cnvWorkerNode)
955+
err = restartKubeletOnNode(ctx, oc, cnvWorkerNode)
956956
o.Expect(err).NotTo(o.HaveOccurred())
957957
framework.Logf("Waiting for node to be ready...")
958958
waitForNodeToBeReady(ctx, oc, cnvWorkerNode)
@@ -1084,7 +1084,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
10841084
framework.Logf("Removing drop-in file: %s", cnvDropInFilePath)
10851085
removeDropInFile(oc, cnvWorkerNode, cnvDropInFilePath)
10861086
framework.Logf("Restarting kubelet")
1087-
restartKubeletOnNode(oc, cnvWorkerNode)
1087+
restartKubeletOnNode(ctx, oc, cnvWorkerNode)
10881088
waitForNodeToBeReady(ctx, oc, cnvWorkerNode)
10891089
framework.Logf("Final cleanup completed")
10901090
}()
@@ -1153,7 +1153,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
11531153

11541154
g.By(fmt.Sprintf("Restarting kubelet with %s swap", swapSize.name))
11551155
framework.Logf("Running: systemctl restart kubelet")
1156-
err = restartKubeletOnNode(oc, cnvWorkerNode)
1156+
err = restartKubeletOnNode(ctx, oc, cnvWorkerNode)
11571157
o.Expect(err).NotTo(o.HaveOccurred())
11581158
waitForNodeToBeReady(ctx, oc, cnvWorkerNode)
11591159

@@ -1296,7 +1296,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr
12961296

12971297
g.By("Restarting kubelet")
12981298
framework.Logf("Running: systemctl restart kubelet")
1299-
err = restartKubeletOnNode(oc, cnvWorkerNode)
1299+
err = restartKubeletOnNode(ctx, oc, cnvWorkerNode)
13001300
o.Expect(err).NotTo(o.HaveOccurred())
13011301
framework.Logf("Waiting for node to be ready...")
13021302
waitForNodeToBeReady(ctx, oc, cnvWorkerNode)

test/extended/node/node_utils.go

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -197,9 +197,54 @@ func loadConfigFromFile(path string) string {
197197
}
198198

199199
// restartKubeletOnNode restarts the kubelet service on the specified node
200-
func restartKubeletOnNode(oc *exutil.CLI, nodeName string) error {
201-
_, err := ExecOnNodeWithChroot(oc, nodeName, "systemctl", "restart", "kubelet")
202-
return err
200+
// Retries on transient network errors which are common on real clusters
201+
func restartKubeletOnNode(ctx context.Context, oc *exutil.CLI, nodeName string) error {
202+
const maxAttempts = 3
203+
var lastErr error
204+
for attempt := 0; attempt < maxAttempts; attempt++ {
205+
_, err := ExecOnNodeWithChroot(oc, nodeName, "systemctl", "restart", "kubelet")
206+
if err == nil {
207+
return nil
208+
}
209+
lastErr = err
210+
if !isTransientNetworkError(err) {
211+
return fmt.Errorf("failed to restart kubelet on %s: %w", nodeName, err)
212+
}
213+
if attempt == maxAttempts-1 {
214+
break
215+
}
216+
backoff := time.Duration((attempt+1)*5) * time.Second
217+
framework.Logf("Attempt %d/%d to restart kubelet on %s failed: %v; retrying in %s",
218+
attempt+1, maxAttempts, nodeName, err, backoff)
219+
timer := time.NewTimer(backoff)
220+
select {
221+
case <-ctx.Done():
222+
timer.Stop()
223+
return fmt.Errorf("context canceled while restarting kubelet on %s: %w", nodeName, ctx.Err())
224+
case <-timer.C:
225+
}
226+
}
227+
return fmt.Errorf("failed to restart kubelet on %s after %d attempts: %w", nodeName, maxAttempts, lastErr)
228+
}
229+
230+
// isTransientNetworkError checks if the error is a transient network error worth retrying
231+
func isTransientNetworkError(err error) bool {
	// Reports whether err's message contains one of a fixed set of substrings
	// that indicate a transient network failure worth retrying. A nil error
	// is never transient.
	if err == nil {
		return false
	}

	// Match case-insensitively: Go's own network errors are lowercase
	// ("connection refused"), but messages relayed from libc-based tools are
	// typically capitalised ("Connection refused", "Connection timed out").
	msg := strings.ToLower(err.Error())

	// All patterns must be lowercase because msg has been lowercased.
	transientPatterns := []string{
		"connection refused",
		"connection reset",
		"connection timed out",
		"i/o timeout",
	}
	for _, pattern := range transientPatterns {
		if strings.Contains(msg, pattern) {
			return true
		}
	}
	return false
}
204249

205250
// waitForNodeToBeReady waits for a node to become Ready
@@ -228,7 +273,7 @@ func cleanupDropInAndRestartKubelet(ctx context.Context, oc *exutil.CLI, nodeNam
228273
framework.Logf("Removing drop-in file: %s", filePath)
229274
removeDropInFile(oc, nodeName, filePath)
230275
framework.Logf("Restarting kubelet on node: %s", nodeName)
231-
restartKubeletOnNode(oc, nodeName)
276+
restartKubeletOnNode(ctx, oc, nodeName)
232277
framework.Logf("Waiting for node to be ready...")
233278
waitForNodeToBeReady(ctx, oc, nodeName)
234279
}

0 commit comments

Comments
 (0)