Skip to content

Commit f1853da

Browse files
committed
Add [Late] test to collect CRI-O goroutine dumps via SIGUSR1
Signed-off-by: Ayato Tokubi <atokubi@redhat.com>
1 parent 31582ab commit f1853da

File tree

1 file changed

+184
-0
lines changed

1 file changed

+184
-0
lines changed
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
package node
2+
3+
import (
4+
"fmt"
5+
"regexp"
6+
"strconv"
7+
"strings"
8+
"sync"
9+
10+
g "github.com/onsi/ginkgo/v2"
11+
o "github.com/onsi/gomega"
12+
ote "github.com/openshift-eng/openshift-tests-extension/pkg/ginkgo"
13+
exutil "github.com/openshift/origin/test/extended/util"
14+
)
15+
16+
// goroutineHeaderRe matches Go goroutine dump headers, e.g.:
17+
//
18+
// goroutine 64418 [IO wait, 189 minutes]:
19+
// goroutine 1 [running]:
20+
var goroutineHeaderRe = regexp.MustCompile(`goroutine \d+ \[`)
21+
22+
// ioWaitDurationRe extracts the wait duration in minutes from an IO wait goroutine header, e.g.:
23+
//
24+
// goroutine 64418 [IO wait, 189 minutes]:
25+
var ioWaitDurationRe = regexp.MustCompile(`goroutine \d+ \[IO wait, (\d+) minutes\]:`)
26+
27+
// findStuckImagePulls checks a goroutine dump for goroutines that indicate
28+
// a stuck image pull: IO wait > 30 minutes with net.(*conn).Read called by
29+
// image/v5/docker.(*bodyReader).Read.
30+
func findStuckImagePulls(dump string) []string {
31+
// Split the dump into individual goroutine blocks.
32+
// Each block starts with "goroutine <id> [".
33+
blocks := goroutineHeaderRe.Split(dump, -1)
34+
headers := goroutineHeaderRe.FindAllString(dump, -1)
35+
36+
var stuck []string
37+
for i, header := range headers {
38+
block := header + blocks[i+1]
39+
40+
// Check: state is "IO wait" with duration > 30 minutes
41+
match := ioWaitDurationRe.FindStringSubmatch(block)
42+
if match == nil {
43+
continue
44+
}
45+
minutes, err := strconv.Atoi(match[1])
46+
if err != nil || minutes <= 30 {
47+
continue
48+
}
49+
50+
// Check: stack contains net.(*conn).Read
51+
connReadIdx := strings.Index(block, "net.(*conn).Read")
52+
if connReadIdx < 0 {
53+
continue
54+
}
55+
56+
// Check: image/v5/docker.(*bodyReader).Read appears as an
57+
// ascendant (caller) of net.(*conn).Read. In a goroutine dump
58+
// callers appear below callees, so bodyReader must come after
59+
// conn.Read in the text.
60+
bodyReaderIdx := strings.Index(block, "image/v5/docker.(*bodyReader).Read")
61+
if bodyReaderIdx < 0 || bodyReaderIdx <= connReadIdx {
62+
continue
63+
}
64+
65+
stuck = append(stuck, strings.TrimSpace(block))
66+
}
67+
return stuck
68+
}
69+
70+
var _ = g.Describe("[sig-node][Late]", func() {
71+
defer g.GinkgoRecover()
72+
oc := exutil.NewCLIWithoutNamespace("crio-goroutine-dump")
73+
74+
g.It("CRI-O should report goroutine stacks on all nodes",
75+
ote.Informing(), func(ctx g.SpecContext) {
76+
77+
nodes, err := exutil.GetAllClusterNodes(oc)
78+
o.Expect(err).NotTo(o.HaveOccurred())
79+
o.Expect(nodes).NotTo(o.BeEmpty(), "expected at least one node")
80+
81+
// Send SIGUSR1 to CRI-O and read the newly created dump file.
82+
// CRI-O writes goroutine stacks to /tmp/crio-goroutine-stacks-<timestamp>.log.
83+
//
84+
// We access CRI-O's /tmp through /proc/<pid>/root/tmp rather than
85+
// the host /tmp so that the dump is visible even when CRI-O's
86+
// systemd unit uses PrivateTmp=yes (as observed on MicroShift).
87+
//
88+
// The script snapshots the latest dump file before signalling and
89+
// polls until a newer, non-empty (-s) file appears to avoid
90+
// reading a file that CRI-O has created but not yet finished writing.
91+
shellCmd := `CRIO_PID=$(pgrep -x crio 2>/dev/null)
92+
if [ -z "$CRIO_PID" ]; then echo "CRIO_NOT_FOUND"; exit 0; fi
93+
CRIO_TMP=/proc/$CRIO_PID/root/tmp
94+
BEFORE=$(ls -t "$CRIO_TMP"/crio-goroutine-stacks-*.log 2>/dev/null | head -1)
95+
kill -USR1 $CRIO_PID
96+
for i in $(seq 1 30); do
97+
LATEST=$(ls -t "$CRIO_TMP"/crio-goroutine-stacks-*.log 2>/dev/null | head -1)
98+
if [ -n "$LATEST" ] && [ "$LATEST" != "$BEFORE" ] && [ -s "$LATEST" ]; then
99+
cat "$LATEST"; exit 0
100+
fi
101+
sleep 1
102+
done
103+
echo "DUMP_TIMEOUT"; exit 1`
104+
105+
// nodeResult holds the output from a single node's goroutine dump.
106+
type nodeResult struct {
107+
name string
108+
output string
109+
err error
110+
stuckPulls []string
111+
}
112+
113+
results := make([]nodeResult, len(nodes))
114+
var wg sync.WaitGroup
115+
for i, node := range nodes {
116+
wg.Add(1)
117+
go func(i int, nodeName string) {
118+
defer g.GinkgoRecover()
119+
defer wg.Done()
120+
121+
g.By(fmt.Sprintf("Sending SIGUSR1 to CRI-O on node %s", nodeName))
122+
123+
output, err := exutil.DebugNodeRetryWithOptionsAndChroot(
124+
oc, nodeName, "default",
125+
"sh", "-c", shellCmd,
126+
)
127+
results[i] = nodeResult{name: nodeName, output: output, err: err}
128+
}(i, node.Name)
129+
}
130+
wg.Wait()
131+
132+
var dumps []string
133+
var stuckPulls []string
134+
var failedNodes []string
135+
for _, r := range results {
136+
// Check output-based diagnostics before the generic error
137+
// because DebugNodeRetryWithOptionsAndChroot may return
138+
// both output and an error (e.g. non-zero exit from the
139+
// DUMP_TIMEOUT branch).
140+
if strings.Contains(r.output, "CRIO_NOT_FOUND") {
141+
failedNodes = append(failedNodes,
142+
fmt.Sprintf("%s: CRI-O process not found", r.name))
143+
continue
144+
}
145+
146+
if strings.Contains(r.output, "DUMP_TIMEOUT") {
147+
failedNodes = append(failedNodes,
148+
fmt.Sprintf("%s: timed out waiting for new goroutine dump file", r.name))
149+
continue
150+
}
151+
152+
if r.err != nil {
153+
failedNodes = append(failedNodes,
154+
fmt.Sprintf("%s: debug pod failed: %v", r.name, r.err))
155+
continue
156+
}
157+
158+
o.Expect(goroutineHeaderRe.MatchString(r.output)).To(o.BeTrue(),
159+
"expected goroutine stacks in CRI-O dump from node %s, output length=%d, got:\n%s", r.name, len(r.output), r.output)
160+
161+
dumps = append(dumps, fmt.Sprintf("=== node/%s ===\n%s", r.name, r.output))
162+
163+
for _, goroutine := range findStuckImagePulls(r.output) {
164+
stuckPulls = append(stuckPulls, fmt.Sprintf("node/%s:\n%s", r.name, goroutine))
165+
}
166+
}
167+
168+
// Fail if any node did not produce a dump
169+
o.Expect(failedNodes).To(o.BeEmpty(),
170+
"failed to collect CRI-O goroutine dump from nodes:\n%s",
171+
strings.Join(failedNodes, "\n"))
172+
173+
// Fail hard if any goroutine is stuck in an image pull
174+
o.Expect(stuckPulls).To(o.BeEmpty(),
175+
"found CRI-O goroutines stuck in image pull (IO wait > 30 min):\n%s",
176+
strings.Join(stuckPulls, "\n\n"))
177+
178+
// Always fail so goroutine dumps are visible in test results.
179+
// The test is marked Informing so this won't block merges.
180+
if len(dumps) > 0 {
181+
g.Fail(fmt.Sprintf("CRI-O goroutine dumps collected:\n%s", strings.Join(dumps, "\n\n")))
182+
}
183+
})
184+
})

0 commit comments

Comments
 (0)