Skip to content

Commit 28e2d6d

Browse files
committed
Add test panic detection for CH and stress-ng
Implements test-level panic detection to catch Rust panics in Cloud Hypervisor (CH) integration tests and stress-ng test runs. Unlike kernel panics, these are panics reported in test output: they do not crash the VM, but they still indicate a test failure.
Key features:
- New PostTestPanicDetectedError exception for test-level panics (distinct from KernelPanicException).
- TEST_PANIC_PATTERNS detects the 'panicked at' and 'stack backtrace:' markers.
- check_test_panic() scans test output and appends the panic details to TestResult.message; it raises only when no TestResult is supplied.
- The original failure message (e.g., failing test names) is preserved and the panic context is added after it.
This enables catching test panics in CH integration tests and stress workloads while keeping detailed failure information for debugging.
1 parent d5f981c commit 28e2d6d

File tree

3 files changed

+180
-4
lines changed

3 files changed

+180
-4
lines changed

lisa/microsoft/testsuites/cloud_hypervisor/ch_tests_tool.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,12 @@
3030
Sed,
3131
Whoami,
3232
)
33-
from lisa.util import LisaException, UnsupportedDistroException, find_groups_in_lines
33+
from lisa.util import (
34+
LisaException,
35+
UnsupportedDistroException,
36+
check_test_panic,
37+
find_groups_in_lines,
38+
)
3439

3540

3641
@dataclass
@@ -302,6 +307,7 @@ def _process_test_results(
302307
hypervisor: str,
303308
log_path: Path,
304309
subtests: Set[str],
310+
test_name: str,
305311
) -> None:
306312
"""Process test results and handle various failure scenarios."""
307313
# Report subtest results and collect logs before doing any assertions.
@@ -318,6 +324,14 @@ def _process_test_results(
318324

319325
self._save_kernel_logs(log_path)
320326

327+
self._check_test_panic_from_logs(
328+
test_result=test_result,
329+
log_path=log_path,
330+
content=result.stdout,
331+
stage=f"{test_type} tests",
332+
test_name=test_name,
333+
)
334+
321335
has_failures = len(failures) > 0
322336
if result.is_timeout and has_failures:
323337
self._handle_timeout_failure(
@@ -384,6 +398,7 @@ def run_tests(
384398
hypervisor,
385399
log_path,
386400
subtests["subtest_set"],
401+
test_name,
387402
)
388403

389404
def run_metrics_tests(
@@ -438,6 +453,14 @@ def run_metrics_tests(
438453
)
439454
self._write_testcase_log(log_path, testcase, trace)
440455

456+
self._check_test_panic_from_logs(
457+
test_result=test_result,
458+
log_path=log_path,
459+
content=trace,
460+
stage=f"metrics test {testcase}",
461+
test_name=testcase,
462+
)
463+
441464
self._save_kernel_logs(log_path)
442465

443466
# Check for kernel panic after all tests complete
@@ -727,6 +750,37 @@ def _extract_diagnostic_info(
727750

728751
return ""
729752

753+
def _check_test_panic_from_logs(
    self,
    test_result: TestResult,
    log_path: Path,
    content: str,
    stage: str,
    test_name: str,
) -> None:
    """
    Scan captured test output for test panic markers.

    The text that is checked is ``content`` followed by the contents of
    ``<log_path>/<test_name>.log`` when that file can be read. Reading the
    log file is best-effort: a missing or unreadable file never fails the
    test run, the check simply proceeds with whatever text is available.

    Args:
        test_result: result object handed to ``check_test_panic`` so any
            detected panic is recorded on it.
        log_path: directory expected to contain ``<test_name>.log``.
        content: output already captured by the caller (may be empty).
        stage: label forwarded to ``check_test_panic``.
        test_name: test name used to locate the per-test log file.
    """
    # Start from the caller-supplied output; tolerate None/empty values.
    test_output = content or ""

    log_file = log_path / f"{test_name}.log"
    try:
        log_text = log_file.read_text(encoding="utf-8", errors="ignore")
    except FileNotFoundError:
        # No per-test log was written; only the supplied content is scanned.
        pass
    except OSError as error:
        # Best-effort only: keep going with the existing content, but leave
        # a trace so an unreadable log is not silently ignored.
        self._log.debug(
            f"Failed to read {log_file} for test panic check: {error}"
        )
    else:
        test_output = f"{test_output}\n{log_text}"

    # Nothing but whitespace means there is nothing to scan.
    if not test_output.strip():
        return

    check_test_panic(
        test_output,
        stage,
        self._log,
        test_result=test_result,
        node_name=self.node.name,
        source=f"{test_name}.log",
    )
783+
730784
def _extract_stdout_diagnostics(self, stdout: str) -> List[str]:
731785
"""Extract diagnostic information from stdout."""
732786
diagnostic_messages: List[str] = []

lisa/microsoft/testsuites/stress/stress_ng_suite.py

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from lisa.messages import TestStatus, send_sub_test_result_message
2727
from lisa.testsuite import TestResult
2828
from lisa.tools import StressNg
29-
from lisa.util import KernelPanicException, SkippedException
29+
from lisa.util import KernelPanicException, SkippedException, check_test_panic
3030
from lisa.util.logger import Logger
3131
from lisa.util.process import Process
3232

@@ -268,7 +268,11 @@ def _run_stress_ng_job(
268268
)
269269

270270
execution_status, execution_summary = self._monitor_stress_execution(
271-
stress_processes, nodes, log, job_file_name
271+
stress_processes,
272+
nodes,
273+
log,
274+
job_file_name,
275+
test_result,
272276
)
273277

274278
except Exception as execution_error:
@@ -339,6 +343,7 @@ def _monitor_stress_execution(
339343
nodes: List[RemoteNode],
340344
log: Logger,
341345
job_file_name: str,
346+
test_result: TestResult,
342347
) -> Tuple[TestStatus, str]:
343348
"""
344349
Monitor stress-ng execution and capture stress-ng info output.
@@ -355,9 +360,24 @@ def _monitor_stress_execution(
355360
for i, process in enumerate(stress_processes):
356361
node_name = nodes[i].name
357362
try:
358-
process.wait_result(timeout=self.TIME_OUT, expected_exit_code=0)
363+
result = process.wait_result(
364+
timeout=self.TIME_OUT,
365+
expected_exit_code=0,
366+
)
359367
log.info(f"{node_name} completed successfully")
360368

369+
# Check test output for panic markers
370+
test_output = f"{result.stdout}\n{result.stderr}".strip()
371+
if test_output:
372+
check_test_panic(
373+
test_output,
374+
stage=f"stress-ng job {job_file_name}",
375+
log=log,
376+
test_result=test_result,
377+
node_name=node_name,
378+
source="stress-ng output",
379+
)
380+
361381
# Process YAML output if applicable
362382
node_output = self._process_yaml_output(nodes[i], job_file_name, log)
363383

@@ -371,6 +391,36 @@ def _monitor_stress_execution(
371391
# Store the exception to re-raise after collecting all outputs
372392
exceptions_to_raise.append(e)
373393

394+
# Check test output for panic markers even on failure
395+
# Try multiple sources since log_buffer may not always be populated
396+
try:
397+
outputs = []
398+
buf = getattr(process, "log_buffer", None)
399+
if buf:
400+
outputs.append(buf.getvalue())
401+
402+
# If exception carries stdout/stderr, grab them too
403+
outputs.append(getattr(e, "stdout", ""))
404+
outputs.append(getattr(e, "stderr", ""))
405+
406+
# Fallback: exception message itself may contain output context
407+
outputs.append(str(e))
408+
409+
test_output = "\n".join([x for x in outputs if x]).strip()
410+
if test_output:
411+
check_test_panic(
412+
test_output,
413+
stage=f"stress-ng job {job_file_name} (failed)",
414+
log=log,
415+
test_result=test_result,
416+
node_name=node_name,
417+
source="stress-ng captured output",
418+
)
419+
except Exception:
420+
log.debug(
421+
f"Failed to check test panic on {node_name}", exc_info=True
422+
)
423+
374424
# Combine all node outputs, including node names for clarity
375425
execution_summary = f"Job: {job_file_name}\n\n"
376426
for i, node_output in enumerate(node_outputs):

lisa/util/__init__.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242

4343
if TYPE_CHECKING:
4444
from lisa.operating_system import OperatingSystem
45+
from lisa.testsuite import TestResult
4546
from lisa.util.logger import Logger
4647

4748
T = TypeVar("T")
@@ -145,6 +146,12 @@
145146
re.compile(r"(.*RIP: 0010:topology_sane.isra.*)$", re.MULTILINE),
146147
]
147148

149+
TEST_PANIC_PATTERNS: List[Pattern[str]] = [
150+
# Rust panic markers: a "panicked at" line or a "stack backtrace:" header.
# The patterns are matched independently, so either one alone is reported.
151+
re.compile(r"^(.*panicked at .*)$", re.MULTILINE),
152+
re.compile(r"^(.*stack backtrace:.*)$", re.MULTILINE | re.IGNORECASE),
153+
]
154+
148155
# Root filesystem mount failure patterns
149156
ROOTFS_FAILURE_PATTERNS: List[Pattern[str]] = [
150157
# Warning: dracut-initqueue timeout - starting timeout scripts
@@ -387,6 +394,24 @@ def __str__(self) -> str:
387394
)
388395

389396

397+
class PostTestPanicDetectedError(LisaException):
    """
    Signals that panic markers were found in test output.

    Attributes are stored on the instance so callers can inspect them:
    ``stage`` is the label of the step being checked, ``panics`` holds the
    matched lines and ``source`` names where the text came from.
    """

    def __init__(
        self,
        stage: str,
        panics: List[Any],
        source: str = "test log",
    ) -> None:
        # The attributes must exist before the message is rendered, because
        # __str__ reads them.
        self.stage = stage
        self.panics = panics
        self.source = source
        message = str(self)
        super().__init__(message)

    def __str__(self) -> str:
        location = f"{self.stage} found test panic in {self.source}. "
        details = f"Detected Test Panic lines: {self.panics}"
        return location + details
413+
414+
390415
class RootFsMountFailedException(LisaException):
391416
"""
392417
This exception is used to indicate root filesystem mount failure.
@@ -979,6 +1004,53 @@ def check_panic(content: str, stage: str, log: "Logger") -> None:
9791004
raise KernelPanicException(stage, panics)
9801005

9811006

1007+
def append_test_panic_to_test_result(
    test_result: "TestResult", node_name: str, panics: List[str]
) -> None:
    """
    Append a summary of detected test panic lines to ``test_result.message``.

    Any existing message is kept and the summary is added after a blank
    line. Duplicate panic lines are collapsed, preserving first-seen order.
    If the exact same summary is already part of the message, nothing is
    added, so calling this from both the success and the failure path does
    not repeat it.

    Args:
        test_result: object whose ``message`` attribute is updated in place.
        node_name: node the output came from; omitted from the header when
            empty.
        panics: matched panic lines; an empty list leaves the message
            untouched.
    """
    # Remove duplicates while preserving order
    unique_panics = list(dict.fromkeys(panics))
    if not unique_panics:
        # Nothing was detected, so do not add an empty summary.
        return

    header = "TEST PANIC DETECTED"
    if node_name:
        # Avoid a dangling "on " when the caller did not supply a node name.
        header = f"{header} on {node_name}"

    panic_lines = "\n".join(unique_panics)
    panic_summary = f"{header}\nDetected Test Panic Lines:\n{panic_lines}\n"

    existing_message = test_result.message
    if not existing_message:
        test_result.message = panic_summary
    elif panic_summary not in existing_message:
        # Skip when already recorded, to avoid duplicates from both the
        # success and failure paths.
        test_result.message = f"{existing_message}\n\n{panic_summary}"
1027+
1028+
1029+
def check_test_panic(
    content: str,
    stage: str,
    log: "Logger",
    test_result: Optional["TestResult"] = None,
    node_name: str = "",
    source: str = "test log",
) -> None:
    """
    Look for test panic markers in ``content`` and report any that are found.

    ``content`` is matched against ``TEST_PANIC_PATTERNS``. When matches
    exist and ``test_result`` is given, the panic lines are appended to its
    message and no exception is raised. Without a ``test_result``, a
    ``PostTestPanicDetectedError`` carrying ``stage``, the matched lines and
    ``source`` is raised instead.

    Args:
        content: text to scan; it is converted with ``str()`` first.
        stage: label used in the raised exception.
        log: logger used for the debug trace.
        test_result: optional result object to annotate instead of raising.
        node_name: node name included in the appended summary.
        source: description of where the text came from, used in the
            raised exception.
    """
    log.debug("checking test panic...")

    # Flatten the per-pattern match lists, dropping empty entries.
    panics = []
    for matches in find_patterns_in_lines(str(content), TEST_PANIC_PATTERNS):
        panics.extend(match for match in matches if match)

    if not panics:
        return

    if test_result is None:
        # Only raise exception if no test_result context
        raise PostTestPanicDetectedError(stage, panics, source)

    # Append panic info to existing test result message, don't raise
    append_test_panic_to_test_result(test_result, node_name, panics)
1052+
1053+
9821054
def check_rootfs_failure(content: str, log: "Logger") -> None:
9831055
"""
9841056
Check if console log contains root filesystem mount failure message.

0 commit comments

Comments
 (0)