Add test panic detection for CH and stress-ng

vyadavmsft · vyadavmsft · commit 28e2d6d20cfa · 2026-02-12T20:08:26.000-08:00
Implements test-level panic detection to catch Rust panics in Cloud Hypervisor
integration tests and stress-ng test runs. Unlike kernel panics, these are
test framework panics that don't cause VM crashes but still indicate test failures.

Key features:
- New PostTestPanicDetectedError exception for test-level panics (distinct from KernelPanicException)
- TEST_PANIC_PATTERNS: Detects 'panicked at' and 'stack backtrace:' markers
- check_test_panic() function scans output and appends panic details to TestResult.message; when no TestResult is passed it raises PostTestPanicDetectedError instead
- Preserves original failure message (e.g., test names) and adds panic context

This enables catching test panics in CH integration tests and stress workloads
while maintaining detailed failure information for debugging.
diff --git a/lisa/microsoft/testsuites/cloud_hypervisor/ch_tests_tool.py b/lisa/microsoft/testsuites/cloud_hypervisor/ch_tests_tool.py
@@ -30,7 +30,12 @@
     Sed,
     Whoami,
 )
-from lisa.util import LisaException, UnsupportedDistroException, find_groups_in_lines
+from lisa.util import (
+    LisaException,
+    UnsupportedDistroException,
+    check_test_panic,
+    find_groups_in_lines,
+)
 
 
 @dataclass
@@ -302,6 +307,7 @@ def _process_test_results(
         hypervisor: str,
         log_path: Path,
         subtests: Set[str],
+        test_name: str,
     ) -> None:
         """Process test results and handle various failure scenarios."""
         # Report subtest results and collect logs before doing any assertions.
@@ -318,6 +324,14 @@ def _process_test_results(
 
         self._save_kernel_logs(log_path)
 
+        self._check_test_panic_from_logs(
+            test_result=test_result,
+            log_path=log_path,
+            content=result.stdout,
+            stage=f"{test_type} tests",
+            test_name=test_name,
+        )
+
         has_failures = len(failures) > 0
         if result.is_timeout and has_failures:
             self._handle_timeout_failure(
@@ -384,6 +398,7 @@ def run_tests(
             hypervisor,
             log_path,
             subtests["subtest_set"],
+            test_name,
         )
 
     def run_metrics_tests(
@@ -438,6 +453,14 @@ def run_metrics_tests(
             )
             self._write_testcase_log(log_path, testcase, trace)
 
+            self._check_test_panic_from_logs(
+                test_result=test_result,
+                log_path=log_path,
+                content=trace,
+                stage=f"metrics test {testcase}",
+                test_name=testcase,
+            )
+
         self._save_kernel_logs(log_path)
 
         # Check for kernel panic after all tests complete
@@ -727,6 +750,37 @@ def _extract_diagnostic_info(
 
         return ""
 
+    def _check_test_panic_from_logs(
+        self,
+        test_result: TestResult,
+        log_path: Path,
+        content: str,
+        stage: str,
+        test_name: str,
+    ) -> None:
+        # Collect test output from provided content and log file
+        test_output = content or ""
+
+        log_file = log_path / f"{test_name}.log"
+        if log_file.exists():
+            try:
+                with open(log_file, "r", encoding="utf-8", errors="ignore") as f:
+                    test_output = f"{test_output}\n{f.read()}"
+            except Exception:
+                # Best-effort only; continue with existing content
+                pass
+
+        # Check the collected output for panic markers
+        if test_output.strip():
+            check_test_panic(
+                test_output,
+                stage,
+                self._log,
+                test_result=test_result,
+                node_name=self.node.name,
+                source=f"{test_name}.log",
+            )
+
     def _extract_stdout_diagnostics(self, stdout: str) -> List[str]:
         """Extract diagnostic information from stdout."""
         diagnostic_messages: List[str] = []
diff --git a/lisa/microsoft/testsuites/stress/stress_ng_suite.py b/lisa/microsoft/testsuites/stress/stress_ng_suite.py
@@ -26,7 +26,7 @@
 from lisa.messages import TestStatus, send_sub_test_result_message
 from lisa.testsuite import TestResult
 from lisa.tools import StressNg
-from lisa.util import KernelPanicException, SkippedException
+from lisa.util import KernelPanicException, SkippedException, check_test_panic
 from lisa.util.logger import Logger
 from lisa.util.process import Process
 
@@ -268,7 +268,11 @@ def _run_stress_ng_job(
             )
 
             execution_status, execution_summary = self._monitor_stress_execution(
-                stress_processes, nodes, log, job_file_name
+                stress_processes,
+                nodes,
+                log,
+                job_file_name,
+                test_result,
             )
 
         except Exception as execution_error:
@@ -339,6 +343,7 @@ def _monitor_stress_execution(
         nodes: List[RemoteNode],
         log: Logger,
         job_file_name: str,
+        test_result: TestResult,
     ) -> Tuple[TestStatus, str]:
         """
         Monitor stress-ng execution and capture stress-ng info output.
@@ -355,9 +360,24 @@ def _monitor_stress_execution(
         for i, process in enumerate(stress_processes):
             node_name = nodes[i].name
             try:
-                process.wait_result(timeout=self.TIME_OUT, expected_exit_code=0)
+                result = process.wait_result(
+                    timeout=self.TIME_OUT,
+                    expected_exit_code=0,
+                )
                 log.info(f"{node_name} completed successfully")
 
+                # Check test output for panic markers
+                test_output = f"{result.stdout}\n{result.stderr}".strip()
+                if test_output:
+                    check_test_panic(
+                        test_output,
+                        stage=f"stress-ng job {job_file_name}",
+                        log=log,
+                        test_result=test_result,
+                        node_name=node_name,
+                        source="stress-ng output",
+                    )
+
                 # Process YAML output if applicable
                 node_output = self._process_yaml_output(nodes[i], job_file_name, log)
 
@@ -371,6 +391,36 @@ def _monitor_stress_execution(
                 # Store the exception to re-raise after collecting all outputs
                 exceptions_to_raise.append(e)
 
+                # Check test output for panic markers even on failure
+                # Try multiple sources since log_buffer may not always be populated
+                try:
+                    outputs = []
+                    buf = getattr(process, "log_buffer", None)
+                    if buf:
+                        outputs.append(buf.getvalue())
+
+                    # If exception carries stdout/stderr, grab them too
+                    outputs.append(getattr(e, "stdout", ""))
+                    outputs.append(getattr(e, "stderr", ""))
+
+                    # Fallback: exception message itself may contain output context
+                    outputs.append(str(e))
+
+                    test_output = "\n".join([x for x in outputs if x]).strip()
+                    if test_output:
+                        check_test_panic(
+                            test_output,
+                            stage=f"stress-ng job {job_file_name} (failed)",
+                            log=log,
+                            test_result=test_result,
+                            node_name=node_name,
+                            source="stress-ng captured output",
+                        )
+                except Exception:
+                    log.debug(
+                        f"Failed to check test panic on {node_name}", exc_info=True
+                    )
+
         # Combine all node outputs, including node names for clarity
         execution_summary = f"Job: {job_file_name}\n\n"
         for i, node_output in enumerate(node_outputs):
diff --git a/lisa/util/__init__.py b/lisa/util/__init__.py
@@ -42,6 +42,7 @@
 
 if TYPE_CHECKING:
     from lisa.operating_system import OperatingSystem
+    from lisa.testsuite import TestResult
     from lisa.util.logger import Logger
 
 T = TypeVar("T")
@@ -145,6 +146,12 @@
     re.compile(r"(.*RIP: 0010:topology_sane.isra.*)$", re.MULTILINE),
 ]
 
+TEST_PANIC_PATTERNS: List[Pattern[str]] = [
+    # Rust panic markers: a "panicked at" line, or a "stack backtrace:" header
+    re.compile(r"^(.*panicked at .*)$", re.MULTILINE),
+    re.compile(r"^(.*stack backtrace:.*)$", re.MULTILINE | re.IGNORECASE),
+]
+
 # Root filesystem mount failure patterns
 ROOTFS_FAILURE_PATTERNS: List[Pattern[str]] = [
     # Warning: dracut-initqueue timeout - starting timeout scripts
@@ -387,6 +394,24 @@ def __str__(self) -> str:
         )
 
 
+class PostTestPanicDetectedError(LisaException):
+    """
+    This exception is raised when a test panic is detected in test output.
+    """
+
+    def __init__(self, stage: str, panics: List[Any], source: str = "test log") -> None:
+        self.stage = stage
+        self.panics = panics
+        self.source = source
+        super().__init__(str(self))
+
+    def __str__(self) -> str:
+        return (
+            f"{self.stage} found test panic in {self.source}. "
+            f"Detected Test Panic lines: {self.panics}"
+        )
+
+
 class RootFsMountFailedException(LisaException):
     """
     This exception is used to indicate root filesystem mount failure.
@@ -979,6 +1004,53 @@ def check_panic(content: str, stage: str, log: "Logger") -> None:
         raise KernelPanicException(stage, panics)
 
 
+def append_test_panic_to_test_result(
+    test_result: "TestResult", node_name: str, panics: List[str]
+) -> None:
+    # Remove duplicates while preserving order
+    unique_panics = list(dict.fromkeys(panics))
+    panic_lines = "\n".join(unique_panics)
+    panic_summary = (
+        f"TEST PANIC DETECTED on {node_name}\n"
+        f"Detected Test Panic Lines:\n{panic_lines}\n"
+    )
+
+    # Check if we've already added this panic to avoid duplicates from
+    # both success and failure paths
+    if test_result.message and panic_summary in test_result.message:
+        return
+
+    if test_result.message:
+        test_result.message += f"\n\n{panic_summary}"
+    else:
+        test_result.message = panic_summary
+
+
+def check_test_panic(
+    content: str,
+    stage: str,
+    log: "Logger",
+    test_result: Optional["TestResult"] = None,
+    node_name: str = "",
+    source: str = "test log",
+) -> None:
+    log.debug("checking test panic...")
+    panics = [
+        x
+        for sublist in find_patterns_in_lines(str(content), TEST_PANIC_PATTERNS)
+        for x in sublist
+        if x
+    ]
+
+    if panics:
+        if test_result is not None:
+            # Append panic info to existing test result message, don't raise
+            append_test_panic_to_test_result(test_result, node_name, panics)
+        else:
+            # Only raise exception if no test_result context
+            raise PostTestPanicDetectedError(stage, panics, source)
+
+
 def check_rootfs_failure(content: str, log: "Logger") -> None:
     """
     Check if console log contains root filesystem mount failure message.