Skip to content

Commit 03a8341

Browse files
committed
feat(gpu): add WSL CDI spec watcher and set deviceIDStrategy to index
On WSL2 hosts the NVIDIA device plugin generates CDI specs that cannot be used directly by k3s containerd, because each spec contains a single device named "all" rather than a device named after its index or UUID.

Add a background watch_cdi_specs function to cluster-entrypoint.sh that:
- detects WSL2 via /dev/dxg presence
- handles specs already present at gateway restart
- uses inotifywait to watch for new/updated specs
- transforms the spec with jq (cdiVersion=0.5.0, devices[0].name="0")

Add inotify-tools and jq to the cluster image apt-get install block to support the watcher.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent 03486f8 commit 03a8341

File tree

3 files changed

+51
-0
lines changed

3 files changed

+51
-0
lines changed

deploy/docker/Dockerfile.images

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
244244
iptables \
245245
mount \
246246
dnsutils \
247+
inotify-tools \
248+
jq \
247249
&& apt-get install -y --only-upgrade gpgv \
248250
&& rm -rf /var/lib/apt/lists/*
249251

deploy/docker/cluster-entrypoint.sh

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,45 @@ fi
381381
# the k3s manifests directory so the Helm controller installs it automatically.
382382
# The nvidia-container-runtime binary is already on PATH (baked into the image)
383383
# so k3s registers the "nvidia" RuntimeClass at startup.
384+
CDI_SPEC_DIR="/var/run/cdi"
385+
CDI_WSL_INPUT="${CDI_SPEC_DIR}/k8s.device-plugin.nvidia.com-gpu.json"
386+
CDI_WSL_OUTPUT="${CDI_SPEC_DIR}/openshell-wsl.json"
387+
388+
transform_wsl_cdi_spec() {
389+
local tmp="${CDI_WSL_OUTPUT}.tmp.$$"
390+
if jq '.cdiVersion = "0.5.0" | .devices[0].name = "0"' \
391+
"$CDI_WSL_INPUT" > "$tmp" 2>/dev/null; then
392+
mv "$tmp" "$CDI_WSL_OUTPUT"
393+
echo "CDI: transformed WSL spec -> $CDI_WSL_OUTPUT"
394+
else
395+
rm -f "$tmp"
396+
echo "CDI: failed to transform WSL spec (jq error)"
397+
fi
398+
}
399+
400+
watch_cdi_specs() {
401+
if ! command -v inotifywait > /dev/null 2>&1; then
402+
echo "CDI: inotifywait not found, skipping spec watcher"
403+
return 1
404+
fi
405+
406+
mkdir -p "$CDI_SPEC_DIR"
407+
408+
# Process spec already present at startup (e.g. gateway restart)
409+
if [ -f "$CDI_WSL_INPUT" ] && grep -q '/dev/dxg' "$CDI_WSL_INPUT" 2>/dev/null; then
410+
transform_wsl_cdi_spec
411+
fi
412+
413+
# Watch for the spec to appear or be updated
414+
inotifywait -m -e close_write,moved_to --format '%f' "$CDI_SPEC_DIR" 2>/dev/null \
415+
| while IFS= read -r filename; do
416+
if [ "$filename" = "k8s.device-plugin.nvidia.com-gpu.json" ] \
417+
&& grep -q '/dev/dxg' "$CDI_WSL_INPUT" 2>/dev/null; then
418+
transform_wsl_cdi_spec
419+
fi
420+
done
421+
}
422+
384423
if [ "${GPU_ENABLED:-}" = "true" ]; then
385424
echo "GPU support enabled — deploying NVIDIA device plugin"
386425

@@ -391,6 +430,11 @@ if [ "${GPU_ENABLED:-}" = "true" ]; then
391430
cp "$manifest" "$K3S_MANIFESTS/"
392431
done
393432
fi
433+
434+
if [ -c /dev/dxg ]; then
435+
echo "WSL2 GPU detected (/dev/dxg present) — starting CDI spec watcher"
436+
watch_cdi_specs &
437+
fi
394438
fi
395439

396440
# ---------------------------------------------------------------------------

deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@
1616
# devices are injected via CDI hooks before container start. Sandbox pods only
1717
# need the nvidia.com/gpu resource request — no runtimeClassName is required.
1818
#
19+
# The device plugin is set to deviceIDStrategy=index so that device names are
20+
# numeric indices (e.g. "0"). This simplifies the conversion of CDI specs on WSL
21+
# systems, where we need to rename the *.nvidia.com/gpu=all device that is
22+
# generated by the device plugin to *.nvidia.com/gpu=0.
23+
#
1924
# k3s auto-detects nvidia-container-runtime on PATH and registers the "nvidia"
2025
# RuntimeClass automatically, so no manual RuntimeClass manifest is needed.
2126

0 commit comments

Comments
 (0)