Skip to content

Commit 6ab54d1

Browse files
feat(vm): add GPU passthrough with cloud-hypervisor backend and nvidia unbind hardening
Signed-off-by: Vincent Caux-Brisebois <vcauxbrisebo@nvidia.com>
1 parent 6957b78 commit 6ab54d1

35 files changed

+6019
-973
lines changed

.github/workflows/branch-checks.yml

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,46 @@ jobs:
74 74
if: always()
75 75
run: mise x -- sccache --show-stats
76 76

77+
vm:
78+
name: VM Checks
79+
runs-on: build-amd64
80+
container:
81+
image: ghcr.io/nvidia/openshell/ci:latest
82+
credentials:
83+
username: ${{ github.actor }}
84+
password: ${{ secrets.GITHUB_TOKEN }}
85+
steps:
86+
- uses: actions/checkout@v4
87+
88+
- name: Install tools
89+
run: mise install
90+
91+
- name: Configure sccache remote cache
92+
if: vars.SCCACHE_MEMCACHED_ENDPOINT != ''
93+
run: echo "SCCACHE_MEMCACHED_ENDPOINT=${{ vars.SCCACHE_MEMCACHED_ENDPOINT }}" >> "$GITHUB_ENV"
94+
95+
- name: Cache Rust target and registry
96+
uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2
97+
with:
98+
shared-key: rust-vm-checks
99+
cache-directories: .cache/sccache
100+
101+
- name: Compile openshell-vm
102+
run: cargo test -p openshell-vm --no-run
103+
104+
- name: Run openshell-vm unit tests
105+
run: cargo test -p openshell-vm --lib
106+
107+
- name: Run VM boot smoke test (skips without runtime bundle)
108+
run: cargo test -p openshell-vm --test vm_boot_smoke -- --nocapture
109+
110+
- name: Run GPU passthrough gate test (expect skip on non-GPU runner)
111+
run: cargo test -p openshell-vm --test gpu_passthrough_implementation -- --nocapture
112+
113+
- name: sccache stats
114+
if: always()
115+
run: mise x -- sccache --show-stats
116+
77 117
python:
78 118
name: Python (${{ matrix.runner }})
79 119
strategy:

.github/workflows/gpu-ci.yml

Lines changed: 138 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,138 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
name: GPU VM Passthrough CI
5+
6+
on:
7+
push:
8+
branches:
9+
- "pull-request/[0-9]+"
10+
workflow_dispatch: {}
11+
12+
permissions:
13+
contents: read
14+
pull-requests: read
15+
packages: write
16+
17+
jobs:
18+
pr_metadata:
19+
name: Resolve PR metadata
20+
runs-on: ubuntu-latest
21+
outputs:
22+
should_run: ${{ steps.gate.outputs.should_run }}
23+
steps:
24+
- id: get_pr_info
25+
if: github.event_name == 'push'
26+
continue-on-error: true
27+
uses: nv-gha-runners/get-pr-info@090577647b8ddc4e06e809e264f7881650ecdccf
28+
29+
- id: gate
30+
shell: bash
31+
env:
32+
EVENT_NAME: ${{ github.event_name }}
33+
GITHUB_SHA_VALUE: ${{ github.sha }}
34+
GET_PR_INFO_OUTCOME: ${{ steps.get_pr_info.outcome }}
35+
PR_INFO: ${{ steps.get_pr_info.outputs.pr-info }}
36+
run: |
37+
set -euo pipefail
38+
39+
if [ "$EVENT_NAME" != "push" ]; then
40+
echo "should_run=true" >> "$GITHUB_OUTPUT"
41+
exit 0
42+
fi
43+
44+
if [ "$GET_PR_INFO_OUTCOME" != "success" ]; then
45+
echo "should_run=false" >> "$GITHUB_OUTPUT"
46+
exit 0
47+
fi
48+
49+
head_sha="$(jq -r '.head.sha' <<< "$PR_INFO")"
50+
has_label="$(jq -r '[.labels[].name] | index("test:vm-gpu") != null' <<< "$PR_INFO")"
51+
52+
if [ "$head_sha" = "$GITHUB_SHA_VALUE" ] && [ "$has_label" = "true" ]; then
53+
should_run=true
54+
else
55+
should_run=false
56+
fi
57+
58+
echo "should_run=$should_run" >> "$GITHUB_OUTPUT"
59+
60+
gpu-passthrough-test:
61+
name: "GPU Passthrough (${{ matrix.name }})"
62+
needs: [pr_metadata]
63+
if: needs.pr_metadata.outputs.should_run == 'true'
64+
runs-on: ${{ matrix.runner }}
65+
timeout-minutes: 30
66+
strategy:
67+
fail-fast: false
68+
matrix:
69+
include:
70+
- name: linux-arm64
71+
runner: linux-arm64-gpu-l4-latest-1
72+
- name: linux-amd64
73+
runner: linux-amd64-gpu-rtxpro6000-latest-1
74+
container:
75+
image: ghcr.io/nvidia/openshell/ci:latest
76+
credentials:
77+
username: ${{ github.actor }}
78+
password: ${{ secrets.GITHUB_TOKEN }}
79+
options: --privileged
80+
env:
81+
CARGO_TERM_COLOR: always
82+
CARGO_INCREMENTAL: "0"
83+
MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
84+
OPENSHELL_VM_GPU_E2E: "1"
85+
# Match `configured_runtime_dir()` / embedded cache layout so GPU gate and CHV tests see cloud-hypervisor.
86+
OPENSHELL_VM_RUNTIME_DIR: ${{ github.workspace }}/target/debug/openshell-vm.runtime
87+
steps:
88+
- uses: actions/checkout@v4
89+
90+
- name: Install tools
91+
run: mise install
92+
93+
- name: Configure sccache remote cache
94+
if: vars.SCCACHE_MEMCACHED_ENDPOINT != ''
95+
run: echo "SCCACHE_MEMCACHED_ENDPOINT=${{ vars.SCCACHE_MEMCACHED_ENDPOINT }}" >> "$GITHUB_ENV"
96+
97+
- name: Cache Rust target and registry
98+
uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2
99+
with:
100+
shared-key: gpu-ci-${{ matrix.name }}
101+
cache-directories: .cache/sccache
102+
103+
- name: Build VM runtime
104+
run: mise run vm:build
105+
106+
- name: Run GPU passthrough gate test
107+
run: cargo test -p openshell-vm --test gpu_passthrough_implementation -- --nocapture
108+
109+
- name: Run VM boot smoke test
110+
env:
111+
OPENSHELL_VM_BACKEND: cloud-hypervisor
112+
run: cargo test -p openshell-vm --test vm_boot_smoke -- --nocapture
113+
114+
- name: sccache stats
115+
if: always()
116+
run: mise x -- sccache --show-stats
117+
118+
build-gateway:
119+
needs: [pr_metadata]
120+
if: needs.pr_metadata.outputs.should_run == 'true'
121+
uses: ./.github/workflows/docker-build.yml
122+
with:
123+
component: gateway
124+
125+
build-cluster:
126+
needs: [pr_metadata]
127+
if: needs.pr_metadata.outputs.should_run == 'true'
128+
uses: ./.github/workflows/docker-build.yml
129+
with:
130+
component: cluster
131+
132+
gpu-e2e:
133+
name: GPU E2E
134+
needs: [pr_metadata, build-gateway, build-cluster]
135+
if: needs.pr_metadata.outputs.should_run == 'true'
136+
uses: ./.github/workflows/e2e-gpu-test.yaml
137+
with:
138+
image-tag: ${{ github.sha }}

.github/workflows/test-gpu.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ jobs:
22 22
- id: get_pr_info
23 23
if: github.event_name == 'push'
24 24
continue-on-error: true
25-
uses: nv-gha-runners/get-pr-info@main
25+
uses: nv-gha-runners/get-pr-info@090577647b8ddc4e06e809e264f7881650ecdccf
26 26

27 27
- id: gate
28 28
shell: bash

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

architecture/README.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -301,4 +301,6 @@ This opens an interactive SSH session into the sandbox, with all provider creden
301 301
| [Inference Routing](inference-routing.md) | Transparent interception and sandbox-local routing of AI inference API calls to configured backends. |
302 302
| [System Architecture](system-architecture.md) | Top-level system architecture diagram with all deployable components and communication flows. |
303 303
| [Gateway Settings Channel](gateway-settings.md) | Runtime settings channel: two-tier key-value configuration, global policy override, settings registry, CLI/TUI commands. |
304+
| [Custom VM Runtime](custom-vm-runtime.md) | Dual-backend VM runtime (libkrun / cloud-hypervisor), kernel configuration, and build pipeline. |
305+
| [VM GPU Passthrough](vm-gpu-passthrough.md) | VFIO GPU passthrough for VMs: host preparation, safety checks, nvidia driver hardening, and troubleshooting. |
304 306
| [TUI](tui.md) | Terminal user interface for sandbox interaction. |

0 commit comments

Comments
 (0)