Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 184 additions & 0 deletions test/extended/node/crio_goroutinedump.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
package node

import (
"fmt"
"regexp"
"strconv"
"strings"
"sync"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
ote "github.com/openshift-eng/openshift-tests-extension/pkg/ginkgo"
exutil "github.com/openshift/origin/test/extended/util"
)

// goroutineHeaderRe matches Go goroutine dump headers, e.g.:
//
//	goroutine 64418 [IO wait, 189 minutes]:
//	goroutine 1 [running]:
var goroutineHeaderRe = regexp.MustCompile(`goroutine \d+ \[`)

// ioWaitDurationRe extracts the wait duration in minutes from an IO wait goroutine header, e.g.:
//
//	goroutine 64418 [IO wait, 189 minutes]:
var ioWaitDurationRe = regexp.MustCompile(`goroutine \d+ \[IO wait, (\d+) minutes\]:`)

// findStuckImagePulls scans a goroutine dump and returns the (trimmed) text of
// every goroutine that looks like a stuck image pull: IO wait for more than 30
// minutes with net.(*conn).Read called by image/v5/docker.(*bodyReader).Read.
func findStuckImagePulls(dump string) []string {
	// Pair each "goroutine <id> [" header with the body that follows it.
	// Split yields one leading segment before the first header, so body
	// idx+1 corresponds to header idx.
	headers := goroutineHeaderRe.FindAllString(dump, -1)
	bodies := goroutineHeaderRe.Split(dump, -1)

	var stuck []string
	for idx, hdr := range headers {
		goroutineText := hdr + bodies[idx+1]
		if isStuckImagePull(goroutineText) {
			stuck = append(stuck, strings.TrimSpace(goroutineText))
		}
	}
	return stuck
}

// isStuckImagePull reports whether a single goroutine block shows an image
// pull blocked in IO wait for more than 30 minutes.
func isStuckImagePull(block string) bool {
	// State must be "IO wait" with a parseable duration above 30 minutes.
	m := ioWaitDurationRe.FindStringSubmatch(block)
	if m == nil {
		return false
	}
	if minutes, err := strconv.Atoi(m[1]); err != nil || minutes <= 30 {
		return false
	}

	// The stack must contain net.(*conn).Read ...
	readPos := strings.Index(block, "net.(*conn).Read")
	if readPos < 0 {
		return false
	}

	// ... with image/v5/docker.(*bodyReader).Read as an ascendant (caller).
	// In a goroutine dump callers appear below callees, so the bodyReader
	// frame must occur after the conn.Read frame in the text.
	bodyPos := strings.Index(block, "image/v5/docker.(*bodyReader).Read")
	return bodyPos > readPos
}

var _ = g.Describe("[sig-node][Late]", func() {
defer g.GinkgoRecover()
oc := exutil.NewCLIWithoutNamespace("crio-goroutine-dump")

g.It("CRI-O should report goroutine stacks on all nodes",
ote.Informing(), func(ctx g.SpecContext) {

nodes, err := exutil.GetAllClusterNodes(oc)
o.Expect(err).NotTo(o.HaveOccurred())
o.Expect(nodes).NotTo(o.BeEmpty(), "expected at least one node")

// Send SIGUSR1 to CRI-O and read the newly created dump file.
// CRI-O writes goroutine stacks to /tmp/crio-goroutine-stacks-<timestamp>.log.
//
// We access CRI-O's /tmp through /proc/<pid>/root/tmp rather than
// the host /tmp so that the dump is visible even when CRI-O's
// systemd unit uses PrivateTmp=yes (as observed on MicroShift).
//
// The script snapshots the latest dump file before signalling and
// polls until a newer, non-empty (-s) file appears to avoid
// reading a file that CRI-O has created but not yet finished writing.
shellCmd := `CRIO_PID=$(pgrep -x crio 2>/dev/null)
if [ -z "$CRIO_PID" ]; then echo "CRIO_NOT_FOUND"; exit 0; fi
CRIO_TMP=/proc/$CRIO_PID/root/tmp
BEFORE=$(ls -t "$CRIO_TMP"/crio-goroutine-stacks-*.log 2>/dev/null | head -1)
kill -USR1 $CRIO_PID
for i in $(seq 1 30); do
LATEST=$(ls -t "$CRIO_TMP"/crio-goroutine-stacks-*.log 2>/dev/null | head -1)
if [ -n "$LATEST" ] && [ "$LATEST" != "$BEFORE" ] && [ -s "$LATEST" ]; then
cat "$LATEST"; exit 0
fi
sleep 1
done
echo "DUMP_TIMEOUT"; exit 1`

// nodeResult holds the output from a single node's goroutine dump.
type nodeResult struct {
name string
output string
err error
stuckPulls []string
}

results := make([]nodeResult, len(nodes))
var wg sync.WaitGroup
for i, node := range nodes {
wg.Add(1)
go func(i int, nodeName string) {
defer g.GinkgoRecover()
defer wg.Done()

g.By(fmt.Sprintf("Sending SIGUSR1 to CRI-O on node %s", nodeName))

output, err := exutil.DebugNodeRetryWithOptionsAndChroot(
oc, nodeName, "default",
"sh", "-c", shellCmd,
)
results[i] = nodeResult{name: nodeName, output: output, err: err}
}(i, node.Name)
}
wg.Wait()
Comment on lines +115 to +130
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Bound parallel debug sessions to avoid cluster/API overload.

Line 115 currently spawns one debug flow per node at once. On larger clusters this can create a burst of debug pods and induce throttling/timeouts, making this test flaky.

Suggested change
 			results := make([]nodeResult, len(nodes))
 			var wg sync.WaitGroup
+			sem := make(chan struct{}, 10) // tune as needed
 			for i, node := range nodes {
 				wg.Add(1)
 				go func(i int, nodeName string) {
 					defer g.GinkgoRecover()
 					defer wg.Done()
+					sem <- struct{}{}
+					defer func() { <-sem }()
 
 					g.By(fmt.Sprintf("Sending SIGUSR1 to CRI-O on node %s", nodeName))
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@test/extended/node/crio_goroutinedump.go` around lines 115 - 130, The test
spawns a goroutine per node concurrently which can overload the cluster/APIs;
limit concurrent debug sessions by adding a concurrency limiter (e.g., a
buffered semaphore channel or worker pool) around the loop so only a fixed
number (configurable, e.g., 5) of goroutines run at once; acquire the semaphore
before launching the goroutine that calls
exutil.DebugNodeRetryWithOptionsAndChroot and release it (close/receive) in the
goroutine's defer along with wg.Done(), ensuring results[i] still gets set and
g.GinkgoRecover() remains called; reference the existing loop variables (nodes,
i, node.Name), wg, results, nodeResult, shellCmd and
exutil.DebugNodeRetryWithOptionsAndChroot when implementing the limiter.


var dumps []string
var stuckPulls []string
var failedNodes []string
for _, r := range results {
// Check output-based diagnostics before the generic error
// because DebugNodeRetryWithOptionsAndChroot may return
// both output and an error (e.g. non-zero exit from the
// DUMP_TIMEOUT branch).
if strings.Contains(r.output, "CRIO_NOT_FOUND") {
failedNodes = append(failedNodes,
fmt.Sprintf("%s: CRI-O process not found", r.name))
continue
}

if strings.Contains(r.output, "DUMP_TIMEOUT") {
failedNodes = append(failedNodes,
fmt.Sprintf("%s: timed out waiting for new goroutine dump file", r.name))
continue
}

if r.err != nil {
failedNodes = append(failedNodes,
fmt.Sprintf("%s: debug pod failed: %v", r.name, r.err))
continue
}

o.Expect(goroutineHeaderRe.MatchString(r.output)).To(o.BeTrue(),
"expected goroutine stacks in CRI-O dump from node %s, output length=%d, got:\n%s", r.name, len(r.output), r.output)

dumps = append(dumps, fmt.Sprintf("=== node/%s ===\n%s", r.name, r.output))

for _, goroutine := range findStuckImagePulls(r.output) {
stuckPulls = append(stuckPulls, fmt.Sprintf("node/%s:\n%s", r.name, goroutine))
}
}

// Fail if any node did not produce a dump
o.Expect(failedNodes).To(o.BeEmpty(),
"failed to collect CRI-O goroutine dump from nodes:\n%s",
strings.Join(failedNodes, "\n"))

// Fail hard if any goroutine is stuck in an image pull
o.Expect(stuckPulls).To(o.BeEmpty(),
"found CRI-O goroutines stuck in image pull (IO wait > 30 min):\n%s",
strings.Join(stuckPulls, "\n\n"))
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// Always fail so goroutine dumps are visible in test results.
// The test is marked Informing so this won't block merges.
if len(dumps) > 0 {
g.Fail(fmt.Sprintf("CRI-O goroutine dumps collected:\n%s", strings.Join(dumps, "\n\n")))
}
Comment on lines +178 to +182
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

This makes the test always fail on success paths.

Line 146 forces failure whenever dumps are collected, so a healthy run can never pass. Prefer writing dumps to GinkgoWriter (or logs/artifacts) and reserve failure for actual fault conditions (failedNodes / stuckPulls).

Suggested fix
-			// Always fail so goroutine dumps are visible in test results.
-			// The test is marked Informing so this won't block merges.
-			if len(dumps) > 0 {
-				g.Fail(fmt.Sprintf("CRI-O goroutine dumps collected:\n%s", strings.Join(dumps, "\n\n")))
-			}
+			// Emit dumps for visibility without forcing spec failure.
+			if len(dumps) > 0 {
+				g.By("CRI-O goroutine dumps collected")
+				_, _ = fmt.Fprintf(g.GinkgoWriter, "%s\n", strings.Join(dumps, "\n\n"))
+			}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@test/extended/node/crio_goroutinedump.go` around lines 143 - 147, The current
block unconditionally calls g.Fail when dumps are present which causes
successful runs to fail; modify the CRI-O goroutine dump handling in the section
that references dumps and g.Fail so that dumps are written to GinkgoWriter (or
test logs/artifacts) instead of failing the test, and only call g.Fail if there
are actual fault conditions such as failedNodes or stuckPulls being non-empty;
update the fmt.Sprintf/g.Fail call to use GinkgoWriter.Printf (or similar) to
emit the joined dumps and add a conditional that preserves failing behavior only
when failedNodes or stuckPulls indicate a real error.

})
})