Skip to content

Commit 0418318

Browse files
committed
Add [Late] test to collect CRI-O goroutine dumps via SIGUSR1
This test runs after all other tests to capture CRI-O's goroutine state on every node, providing visibility into stuck operations after the suite has exercised CRI-O under load. The test sends SIGUSR1 to CRI-O on each node, reads the resulting goroutine dump from /tmp/crio-goroutine-stacks-*.log, and fails hard if any goroutine is stuck in an image pull (IO wait > 30 min in docker.(*bodyReader).Read -> net.(*conn).Read). Signed-off-by: Ayato Tokubi <atokubi@redhat.com>
1 parent 31582ab commit 0418318

File tree

1 file changed

+149
-0
lines changed

1 file changed

+149
-0
lines changed
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
package node
2+
3+
import (
4+
"fmt"
5+
"regexp"
6+
"strconv"
7+
"strings"
8+
9+
g "github.com/onsi/ginkgo/v2"
10+
o "github.com/onsi/gomega"
11+
ote "github.com/openshift-eng/openshift-tests-extension/pkg/ginkgo"
12+
exutil "github.com/openshift/origin/test/extended/util"
13+
)
14+
15+
// goroutineHeaderRe matches Go goroutine dump headers, e.g.:
16+
//
17+
// goroutine 64418 [IO wait, 189 minutes]:
18+
// goroutine 1 [running]:
19+
var goroutineHeaderRe = regexp.MustCompile(`goroutine \d+ \[`)
20+
21+
// ioWaitDurationRe extracts the wait duration in minutes from an IO wait goroutine header, e.g.:
22+
//
23+
// goroutine 64418 [IO wait, 189 minutes]:
24+
var ioWaitDurationRe = regexp.MustCompile(`goroutine \d+ \[IO wait, (\d+) minutes\]:`)
25+
26+
// findStuckImagePulls checks a goroutine dump for goroutines that indicate
27+
// a stuck image pull: IO wait > 30 minutes with net.(*conn).Read called by
28+
// image/v5/docker.(*bodyReader).Read.
29+
func findStuckImagePulls(dump string) []string {
30+
// Split the dump into individual goroutine blocks.
31+
// Each block starts with "goroutine <id> [".
32+
blocks := goroutineHeaderRe.Split(dump, -1)
33+
headers := goroutineHeaderRe.FindAllString(dump, -1)
34+
35+
var stuck []string
36+
for i, header := range headers {
37+
block := header + blocks[i+1]
38+
39+
// Check: state is "IO wait" with duration > 30 minutes
40+
match := ioWaitDurationRe.FindStringSubmatch(block)
41+
if match == nil {
42+
continue
43+
}
44+
minutes, err := strconv.Atoi(match[1])
45+
if err != nil || minutes <= 30 {
46+
continue
47+
}
48+
49+
// Check: stack contains net.(*conn).Read
50+
connReadIdx := strings.Index(block, "net.(*conn).Read")
51+
if connReadIdx < 0 {
52+
continue
53+
}
54+
55+
// Check: image/v5/docker.(*bodyReader).Read appears as an
56+
// ascendant (caller) of net.(*conn).Read. In a goroutine dump
57+
// callers appear below callees, so bodyReader must come after
58+
// conn.Read in the text.
59+
bodyReaderIdx := strings.Index(block, "image/v5/docker.(*bodyReader).Read")
60+
if bodyReaderIdx < 0 || bodyReaderIdx <= connReadIdx {
61+
continue
62+
}
63+
64+
stuck = append(stuck, strings.TrimSpace(block))
65+
}
66+
return stuck
67+
}
68+
69+
var _ = g.Describe("[sig-node][Late]", func() {
70+
defer g.GinkgoRecover()
71+
oc := exutil.NewCLIWithoutNamespace("crio-goroutine-dump")
72+
73+
g.It("CRI-O should report goroutine stacks on all nodes",
74+
ote.Informing(), func(ctx g.SpecContext) {
75+
76+
nodes, err := exutil.GetAllClusterNodes(oc)
77+
o.Expect(err).NotTo(o.HaveOccurred())
78+
o.Expect(nodes).NotTo(o.BeEmpty(), "expected at least one node")
79+
80+
// Send SIGUSR1 to CRI-O and read the newly created dump file.
81+
// CRI-O writes goroutine stacks to /tmp/crio-goroutine-stacks-<timestamp>.log.
82+
// We snapshot existing files before signalling and wait for a new one
83+
// to appear, avoiding stale reads from previous runs.
84+
shellCmd := `CRIO_PID=$(pgrep -x crio 2>/dev/null) && \
85+
if [ -z "$CRIO_PID" ]; then echo "CRIO_NOT_FOUND"; exit 0; fi && \
86+
MARKER=$(mktemp) && \
87+
kill -USR1 $CRIO_PID && \
88+
for i in $(seq 1 10); do \
89+
NEW=$(find /tmp -maxdepth 1 -name 'crio-goroutine-stacks-*.log' -newer "$MARKER" 2>/dev/null | head -1) && \
90+
if [ -n "$NEW" ]; then rm -f "$MARKER"; cat "$NEW"; exit 0; fi; \
91+
sleep 1; \
92+
done && \
93+
rm -f "$MARKER" && echo "DUMP_TIMEOUT" && exit 1`
94+
95+
var dumps []string
96+
var stuckPulls []string
97+
var failedNodes []string
98+
for _, node := range nodes {
99+
g.By(fmt.Sprintf("Sending SIGUSR1 to CRI-O on node %s", node.Name))
100+
101+
output, err := exutil.DebugNodeRetryWithOptionsAndChroot(
102+
oc, node.Name, "default",
103+
"sh", "-c", shellCmd,
104+
)
105+
if err != nil {
106+
failedNodes = append(failedNodes,
107+
fmt.Sprintf("%s: debug pod failed: %v", node.Name, err))
108+
continue
109+
}
110+
111+
if strings.Contains(output, "CRIO_NOT_FOUND") {
112+
failedNodes = append(failedNodes,
113+
fmt.Sprintf("%s: CRI-O process not found", node.Name))
114+
continue
115+
}
116+
117+
if strings.Contains(output, "DUMP_TIMEOUT") {
118+
failedNodes = append(failedNodes,
119+
fmt.Sprintf("%s: timed out waiting for new goroutine dump file", node.Name))
120+
continue
121+
}
122+
123+
o.Expect(goroutineHeaderRe.MatchString(output)).To(o.BeTrue(),
124+
"expected goroutine stacks in CRI-O dump from node %s, got:\n%s", node.Name, output)
125+
126+
dumps = append(dumps, fmt.Sprintf("=== node/%s ===\n%s", node.Name, output))
127+
128+
for _, goroutine := range findStuckImagePulls(output) {
129+
stuckPulls = append(stuckPulls, fmt.Sprintf("node/%s:\n%s", node.Name, goroutine))
130+
}
131+
}
132+
133+
// Fail if any node did not produce a dump
134+
o.Expect(failedNodes).To(o.BeEmpty(),
135+
"failed to collect CRI-O goroutine dump from nodes:\n%s",
136+
strings.Join(failedNodes, "\n"))
137+
138+
// Fail hard if any goroutine is stuck in an image pull
139+
o.Expect(stuckPulls).To(o.BeEmpty(),
140+
"found CRI-O goroutines stuck in image pull (IO wait > 30 min):\n%s",
141+
strings.Join(stuckPulls, "\n\n"))
142+
143+
// Always fail so goroutine dumps are visible in test results.
144+
// The test is marked Informing so this won't block merges.
145+
if len(dumps) > 0 {
146+
g.Fail(fmt.Sprintf("CRI-O goroutine dumps collected:\n%s", strings.Join(dumps, "\n\n")))
147+
}
148+
})
149+
})

0 commit comments

Comments
 (0)