Skip to content

Commit 8c04a45

Browse files
committed
Add [Late] test to collect CRI-O goroutine dumps via SIGUSR1
This test runs after all other tests to capture CRI-O's goroutine state on every node, providing visibility into stuck operations after the suite has exercised CRI-O under load. The test sends SIGUSR1 to CRI-O on each node, reads the resulting goroutine dump from /tmp/crio-goroutine-stacks-*.log, and fails hard if any goroutine is stuck in an image pull (IO wait > 30 min in docker.(*bodyReader).Read -> net.(*conn).Read). Signed-off-by: Ayato Tokubi <atokubi@redhat.com>
1 parent 31582ab commit 8c04a45

File tree

1 file changed

+131
-0
lines changed

1 file changed

+131
-0
lines changed
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
package node
2+
3+
import (
4+
"fmt"
5+
"regexp"
6+
"strconv"
7+
"strings"
8+
9+
g "github.com/onsi/ginkgo/v2"
10+
o "github.com/onsi/gomega"
11+
ote "github.com/openshift-eng/openshift-tests-extension/pkg/ginkgo"
12+
"github.com/openshift/origin/pkg/test/ginkgo/result"
13+
exutil "github.com/openshift/origin/test/extended/util"
14+
e2e "k8s.io/kubernetes/test/e2e/framework"
15+
)
16+
17+
// goroutineHeaderRe matches Go goroutine dump headers, e.g.:
18+
//
19+
// goroutine 64418 [IO wait, 189 minutes]:
20+
// goroutine 1 [running]:
21+
var goroutineHeaderRe = regexp.MustCompile(`goroutine \d+ \[`)
22+
23+
// ioWaitDurationRe extracts the wait duration in minutes from an IO wait goroutine header, e.g.:
24+
//
25+
// goroutine 64418 [IO wait, 189 minutes]:
26+
var ioWaitDurationRe = regexp.MustCompile(`goroutine \d+ \[IO wait, (\d+) minutes\]:`)
27+
28+
// findStuckImagePulls checks a goroutine dump for goroutines that indicate
29+
// a stuck image pull: IO wait > 30 minutes with net.(*conn).Read called by
30+
// image/v5/docker.(*bodyReader).Read.
31+
func findStuckImagePulls(dump string) []string {
32+
// Split the dump into individual goroutine blocks.
33+
// Each block starts with "goroutine <id> [".
34+
blocks := goroutineHeaderRe.Split(dump, -1)
35+
headers := goroutineHeaderRe.FindAllString(dump, -1)
36+
37+
var stuck []string
38+
for i, header := range headers {
39+
block := header + blocks[i+1]
40+
41+
// Check: state is "IO wait" with duration > 30 minutes
42+
match := ioWaitDurationRe.FindStringSubmatch(block)
43+
if match == nil {
44+
continue
45+
}
46+
minutes, err := strconv.Atoi(match[1])
47+
if err != nil || minutes <= 30 {
48+
continue
49+
}
50+
51+
// Check: stack contains net.(*conn).Read
52+
connReadIdx := strings.Index(block, "net.(*conn).Read")
53+
if connReadIdx < 0 {
54+
continue
55+
}
56+
57+
// Check: image/v5/docker.(*bodyReader).Read appears as an
58+
// ascendant (caller) of net.(*conn).Read. In a goroutine dump
59+
// callers appear below callees, so bodyReader must come after
60+
// conn.Read in the text.
61+
bodyReaderIdx := strings.Index(block, "image/v5/docker.(*bodyReader).Read")
62+
if bodyReaderIdx < 0 || bodyReaderIdx <= connReadIdx {
63+
continue
64+
}
65+
66+
stuck = append(stuck, strings.TrimSpace(block))
67+
}
68+
return stuck
69+
}
70+
71+
var _ = g.Describe("[sig-node][Late]", func() {
72+
defer g.GinkgoRecover()
73+
oc := exutil.NewCLIWithoutNamespace("crio-goroutine-dump")
74+
75+
g.It("CRI-O should report goroutine stacks on all nodes",
76+
ote.Informing(), func(ctx g.SpecContext) {
77+
78+
nodes, err := exutil.GetAllClusterNodes(oc)
79+
o.Expect(err).NotTo(o.HaveOccurred())
80+
o.Expect(nodes).NotTo(o.BeEmpty(), "expected at least one node")
81+
82+
// Send SIGUSR1 to CRI-O, wait for dump file, then read the latest one.
83+
// CRI-O writes goroutine stacks to /tmp/crio-goroutine-stacks-<timestamp>.log.
84+
shellCmd := `CRIO_PID=$(pgrep -x crio 2>/dev/null) && \
85+
if [ -z "$CRIO_PID" ]; then echo "CRIO_NOT_FOUND"; exit 0; fi && \
86+
kill -USR1 $CRIO_PID && \
87+
sleep 3 && \
88+
cat $(ls -t /tmp/crio-goroutine-stacks-*.log 2>/dev/null | head -1)`
89+
90+
var dumps []string
91+
var stuckPulls []string
92+
for _, node := range nodes {
93+
g.By(fmt.Sprintf("Sending SIGUSR1 to CRI-O on node %s", node.Name))
94+
95+
output, err := exutil.DebugNodeRetryWithOptionsAndChroot(
96+
oc, node.Name, debugNamespace,
97+
"sh", "-c", shellCmd,
98+
)
99+
if err != nil {
100+
e2e.Logf("WARNING: failed to get CRI-O goroutine dump from node %s: %v",
101+
node.Name, err)
102+
continue
103+
}
104+
105+
if strings.Contains(output, "CRIO_NOT_FOUND") {
106+
e2e.Logf("WARNING: CRI-O process not found on node %s, skipping",
107+
node.Name)
108+
continue
109+
}
110+
111+
o.Expect(goroutineHeaderRe.MatchString(output)).To(o.BeTrue(),
112+
"expected goroutine stacks in CRI-O dump from node %s, got:\n%s", node.Name, output)
113+
114+
dumps = append(dumps, fmt.Sprintf("=== node/%s ===\n%s", node.Name, output))
115+
116+
for _, goroutine := range findStuckImagePulls(output) {
117+
stuckPulls = append(stuckPulls, fmt.Sprintf("node/%s:\n%s", node.Name, goroutine))
118+
}
119+
}
120+
121+
// Fail hard if any goroutine is stuck in an image pull
122+
o.Expect(stuckPulls).To(o.BeEmpty(),
123+
"found CRI-O goroutines stuck in image pull (IO wait > 30 min):\n%s",
124+
strings.Join(stuckPulls, "\n\n"))
125+
126+
// Flake to ensure goroutine dumps are visible in test results
127+
if len(dumps) > 0 {
128+
result.Flakef("CRI-O goroutine dumps collected:\n%s", strings.Join(dumps, "\n\n"))
129+
}
130+
})
131+
})

0 commit comments

Comments
 (0)