Load nvidia kernel module

2024-01-26 22:33:33 +01:00 · 2024-01-26 22:33:33 +01:00 · da2c90374b
parent 79bd824505
commit da2c90374b
2 changed files with 23 additions and 16 deletions
--- a/README.md
+++ b/README.md
@@ -135,14 +135,6 @@ See [Advanced Networking](./NETWORKING.md) for more.

 The `jailmaker` script won't install Docker for you, but it can setup the jail with the capabilities required to run docker. You can manually install Docker inside the jail using the [official installation guide](https://docs.docker.com/engine/install/#server) or use [convenience script](https://get.docker.com).

-## Nvidia GPU
-
-To make passthrough of the nvidia GPU work, you need to schedule a Pre Init command. The reason is that TrueNAS SCALE by default doesn't load the nvidia kernel modules (and `jailmaker` doesn't do that either). [This screenshot](https://user-images.githubusercontent.com/1704047/222915803-d6dd51b0-c4dd-4189-84be-a04d38cca0b3.png) shows what the configuration should look like.
-
-```
-[ ! -f /dev/nvidia-uvm ] && modprobe nvidia-current-uvm && /usr/bin/nvidia-modprobe -c0 -u
-```
-
 ## Documentation

 Additional documentation contributed by the community can be found in [the docs directory](./docs/).
--- a/jlmkr.py
+++ b/jlmkr.py
@@ -104,6 +104,17 @@ def passthrough_intel(gpu_passthrough_intel, systemd_nspawn_additional_args):
 def passthrough_nvidia(
    gpu_passthrough_nvidia, systemd_nspawn_additional_args, jail_name
 ):
+    # Load the nvidia kernel module
+    if subprocess.run(["modprobe", "nvidia-current-uvm"]).returncode != 0:
+        eprint(
+            dedent(
+                """
+            Failed to load nvidia-current-uvm kernel module.
+            Skip passthrough of nvidia GPU."""
+            )
+        )
+        return
+
    jail_rootfs_path = get_jail_rootfs_path(jail_name)
    ld_so_conf_path = Path(
        os.path.join(jail_rootfs_path), f"etc/ld.so.conf.d/{SYMLINK_NAME}-nvidia.conf"
@@ -114,13 +125,11 @@ def passthrough_nvidia(
        ld_so_conf_path.unlink(missing_ok=True)
        return

-    try:
        # Run nvidia-smi to initialize the nvidia driver
        # If we can't run nvidia-smi successfully,
        # then nvidia-container-cli list will fail too:
        # we shouldn't continue with gpu passthrough
-        subprocess.run(["nvidia-smi", "-f", "/dev/null"], check=True)
-    except:
+    if subprocess.run(["nvidia-smi", "-f", "/dev/null"]).returncode != 0:
        eprint("Skip passthrough of nvidia GPU.")
        return

@@ -316,6 +325,11 @@ def start_jail(jail_name, check_startup_enabled=False):
        f"--directory={JAIL_ROOTFS_NAME}",
    ]

+    # TODO: split the docker_compatible option into separate options
+    #   - privileged (to disable seccomp, set DevicePolicy=auto and add all capabilities)
+    #   - how to call the option to enable ip_forward and bridge-nf-call?
+    # TODO: always add --bind-ro=/sys/module? Or only for privileged jails?
+
    if config.get("docker_compatible") == "1":
        # Enable ip forwarding on the host (docker needs it)
        print(1, file=open("/proc/sys/net/ipv4/ip_forward", "w"))
@@ -712,15 +726,16 @@ def create_jail(jail_name, distro="debian", release="bookworm"):

        gpu_passthrough_nvidia = 0

-        try:
+        if (
            subprocess.run(
-                ["nvidia-smi"],
-                check=True,
+                ["modprobe", "nvidia-current-uvm"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
-            )
+            ).returncode
+            == 0
+        ):
            nvidia_detected = True
-        except:
+        else:
            nvidia_detected = False

        if nvidia_detected: