Fix podman rootless install instructions

Manually add the required capabilities to the `newuidmap` and `newgidmap` binaries
Log jail name after initial setup
2024-05-08 22:59:03 +02:00 · 2024-05-08 22:49:28 +02:00 · 2024-05-08 22:25:19 +02:00 · 2024-05-08 22:24:59 +02:00 · 2024-05-08 22:21:06 +02:00 · 2024-05-08 18:59:58 +02:00
9 changed files with 145 additions and 129 deletions
--- a/jlmkr.py
+++ b/jlmkr.py
@ -4,7 +4,7 @@
 with full access to all files via bind mounts, \
 thanks to systemd-nspawn!"""

-__version__ = "1.3.0"
+__version__ = "1.4.0"

 __disclaimer__ = """USE THIS SCRIPT AT YOUR OWN RISK!
 IT COMES WITHOUT WARRANTY AND IS NOT SUPPORTED BY IXSYSTEMS."""
@ -43,15 +43,13 @@ docker_compatible=0
 # Turning off seccomp filtering improves performance at the expense of security
 seccomp=1

-# Add additional systemd-nspawn flags
-# E.g. to mount host storage in the jail (--bind-ro for readonly):
-# --bind='/mnt/pool/dataset:/home' --bind-ro=/etc/certificates
-# E.g. macvlan networking:
-# --network-macvlan=eno1 --resolv-conf=bind-host
-# E.g. bridge networking:
-# --network-bridge=br1 --resolv-conf=bind-host
-# E.g. allow syscalls required by docker:
-# --system-call-filter='add_key keyctl bpf'
+# Below you may add additional systemd-nspawn flags behind systemd_nspawn_user_args=
+# To mount host storage in the jail, you may add: --bind='/mnt/pool/dataset:/home' 
+# To readonly mount host storage, you may add: --bind-ro=/etc/certificates
+# To use macvlan networking add: --network-macvlan=eno1 --resolv-conf=bind-host
+# To use bridge networking add: --network-bridge=br1 --resolv-conf=bind-host
+# Ensure to change eno1/br1 to the interface name you want to use
+# To allow syscalls required by docker add: --system-call-filter='add_key keyctl bpf'
 systemd_nspawn_user_args=

 # Specify command/script to run on the HOST before starting the jail
@ -73,10 +71,8 @@ post_stop_hook=
 distro=debian
 release=bookworm

-# Specify command/script to run IN THE JAIL before the first start
+# Specify command/script to run IN THE JAIL on the first start (once networking is ready in the jail)
 # Useful to install packages on top of the base rootfs
-# NOTE: this script will run in the host networking namespace and
-# ignores all systemd_nspawn_user_args such as bind mounts
 initial_setup=
 # initial_setup=bash -c 'apt-get update && apt-get -y upgrade'

@ -545,55 +541,6 @@ def start_jail(jail_name):

    seccomp = config.my_getboolean("seccomp")

-    # Handle initial setup
-    initial_setup = config.my_get("initial_setup")
-
-    # Alternative method to setup on first boot:
-    # https://www.undrground.org/2021/01/25/adding-a-single-run-task-via-systemd/
-    # If there's no machine-id, then this the first time the jail is started
-    if initial_setup and not os.path.exists(
-        os.path.join(jail_rootfs_path, "etc/machine-id")
-    ):
-        initial_setup_file = None
-
-        if initial_setup.startswith("#!"):
-            # Write a script file and call that
-            initial_setup_file = os.path.abspath(
-                os.path.join(jail_path, ".initial_setup")
-            )
-            print(initial_setup, file=open(initial_setup_file, "w"))
-            stat_chmod(initial_setup_file, 0o700)
-            cmd = [
-                "systemd-nspawn",
-                "-q",
-                "-D",
-                jail_rootfs_path,
-                f"--bind-ro={initial_setup_file}:/root/initial_startup",
-                "/root/initial_startup",
-            ]
-        else:
-            # Run the command directly if it doesn't start with a shebang
-            cmd = [
-                "systemd-nspawn",
-                "-q",
-                "-D",
-                jail_rootfs_path,
-                *shlex.split(initial_setup),
-            ]
-
-        returncode = subprocess.run(cmd).returncode
-
-        # Cleanup the initial_setup_file
-        if initial_setup_file:
-            Path(initial_setup_file).unlink(missing_ok=True)
-
-        if returncode != 0:
-            eprint("Failed to run initial setup:")
-            eprint(initial_setup)
-            eprint()
-            eprint("Abort starting jail.")
-            return returncode
-
    systemd_run_additional_args = [
        f"--unit={SYMLINK_NAME}-{jail_name}",
        f"--working-directory=./{jail_path}",
@ -605,6 +552,24 @@ def start_jail(jail_name):
        f"--directory={JAIL_ROOTFS_NAME}",
    ]

+    # The systemd-nspawn manual explicitly mentions:
+    # Device nodes may not be created
+    # https://www.freedesktop.org/software/systemd/man/systemd-nspawn.html
+    # This means docker images containing device nodes can't be pulled
+    # https://github.com/moby/moby/issues/35245
+    #
+    # The solution is to use DevicePolicy=auto
+    # https://github.com/kinvolk/kube-spawn/pull/328
+    #
+    # DevicePolicy=auto is the default for systemd-run and allows access to all devices
+    # as long as we don't add any --property=DeviceAllow= flags
+    # https://manpages.debian.org/bookworm/systemd/systemd.resource-control.5.en.html
+    #
+    # We can now successfully run:
+    # mknod /dev/port c 1 4
+    # Or pull docker images containing device nodes:
+    # docker pull oraclelinux@sha256:d49469769e4701925d5145c2676d5a10c38c213802cf13270ec3a12c9c84d643
+
    if config.my_getboolean("docker_compatible"):
        eprint("WARNING: DEPRECATED OPTION")
        eprint(
@ -704,23 +669,28 @@ def start_jail(jail_name):
            "--setenv=SYSTEMD_SECCOMP=0",
        ]

-    # The systemd-nspawn manual explicitly mentions:
-    # Device nodes may not be created
-    # https://www.freedesktop.org/software/systemd/man/systemd-nspawn.html
-    # This means docker images containing device nodes can't be pulled
-    # https://github.com/moby/moby/issues/35245
-    #
-    # The solution is to use DevicePolicy=auto
-    # https://github.com/kinvolk/kube-spawn/pull/328
-    #
-    # DevicePolicy=auto is the default for systemd-run and allows access to all devices
-    # as long as we don't add any --property=DeviceAllow= flags
-    # https://manpages.debian.org/bookworm/systemd/systemd.resource-control.5.en.html
-    #
-    # We can now successfully run:
-    # mknod /dev/port c 1 4
-    # Or pull docker images containing device nodes:
-    # docker pull oraclelinux@sha256:d49469769e4701925d5145c2676d5a10c38c213802cf13270ec3a12c9c84d643
+    initial_setup = False
+
+    # If there's no machine-id, then this is the first time the jail is started
+    if not os.path.exists(os.path.join(jail_rootfs_path, "etc/machine-id")) and (
+        initial_setup := config.my_get("initial_setup")
+    ):
+        if not initial_setup.startswith("#!"):
+            initial_setup = "#!/bin/sh\n" + initial_setup
+
+        initial_setup_file_jailed_path = "/root/jlmkr-initial-setup"
+        initial_setup_file_host_path = os.path.abspath(
+            jail_rootfs_path + initial_setup_file_jailed_path
+        )
+
+        # Write a script file to call during initial setup
+        print(initial_setup, file=open(initial_setup_file_host_path, "w"))
+        stat_chmod(initial_setup_file_host_path, 0o700)
+
+        # Ensure the jail init system is ready before we start the initial_setup
+        systemd_nspawn_additional_args += [
+            "--notify-ready=yes",
+        ]

    cmd = [
        "systemd-run",
@ -755,6 +725,44 @@ def start_jail(jail_name):
            )
        )

+        return returncode
+
+    # Handle initial setup after jail is up and running (for the first time)
+    if initial_setup:
+        print("About to run the initial setup.")
+        print("Waiting for networking in the jail to be ready.")
+        print("Please wait (this may take 90s in case of bridge networking)...")
+        returncode = exec_jail(
+            jail_name,
+            [
+                "--",
+                "systemd-run",
+                f"--unit={os.path.basename(initial_setup_file_jailed_path)}",
+                "--quiet",
+                "--pipe",
+                "--wait",
+                "--service-type=exec",
+                "--property=After=network-online.target",
+                "--property=Wants=network-online.target",
+                initial_setup_file_jailed_path,
+            ],
+        )
+
+        # Cleanup the initial_setup_file_host_path
+        if initial_setup_file_host_path:
+            Path(initial_setup_file_host_path).unlink(missing_ok=True)
+
+        if returncode != 0:
+            eprint("Tried to run the following commands inside the jail:")
+            eprint(initial_setup)
+            eprint()
+            eprint(
+                f"""{RED}{BOLD}Failed to run initial setup... you may want to stop and remove the jail and try again.{NORMAL}"""
+            )
+            return returncode
+        else:
+            print(f"Done with initial setup of jail {jail_name}!")
+
    return returncode


@ -922,11 +930,12 @@ def get_zfs_dataset(path):
    """
    Get ZFS dataset path.
    """
+
    def clean_field(field):
        # Put back spaces which were encoded
        # https://github.com/openzfs/zfs/issues/11182
-        return field.replace('\\040', ' ')
-    
+        return field.replace("\\040", " ")
+
    path = os.path.realpath(path)
    with open("/proc/mounts", "r") as f:
        for line in f:
@ -1025,11 +1034,14 @@ def get_text_editor():
        if editor := os.environ.get(key):
            return shutil.which(editor)

-    return get_from_environ("VISUAL") \
-        or get_from_environ("EDITOR") \
-        or shutil.which("editor") \
-        or shutil.which("/usr/bin/editor") \
+    return (
+        get_from_environ("VISUAL")
+        or get_from_environ("EDITOR")
+        or shutil.which("editor")
+        or shutil.which("/usr/bin/editor")
        or "nano"
+    )
+

 def interactive_config():
    config = KeyValueParser()
@ -1517,9 +1529,7 @@ def edit_jail(jail_name):

    jail_config_path = get_jail_config_path(jail_name)

-    returncode = subprocess.run(
-        [get_text_editor(), jail_config_path]
-    ).returncode
+    returncode = subprocess.run([get_text_editor(), jail_config_path]).returncode

    if returncode != 0:
        eprint(f"An error occurred while editing {jail_config_path}.")
--- a/templates/docker/README.md
+++ b/templates/docker/README.md
@ -2,4 +2,4 @@

 ## Setup

-Check out the [config](./config) template file. You may provide it when asked during `jlmkr create` or, if you have the template file stored on your NAS, you may provide it directly by running `jlmkr create --start --config /mnt/tank/path/to/docker/config mydockerjail`.
+Check out the [config](./config) template file. You may provide it when asked during `jlmkr create` or, if you have the template file stored on your NAS, you may provide it directly by running `jlmkr create --start --config /mnt/tank/path/to/docker/config mydockerjail`. If you want the `nvidia-container-toolkit` to be installed, ensure you set `gpu_passthrough_nvidia=1` when creating the jail.
--- a/templates/docker/config
+++ b/templates/docker/config
@ -6,10 +6,10 @@ seccomp=1

 # Use macvlan networking to provide an isolated network namespace,
 # so docker can manage firewall rules
-# Alternatively use --network-bridge=br1 instead of --network-macvlan
+# Alternatively use --network-macvlan=eno1 instead of --network-bridge
 # Ensure to change eno1/br1 to the interface name you want to use
 # You may want to add additional options here, e.g. bind mounts
-systemd_nspawn_user_args=--network-macvlan=eno1
+systemd_nspawn_user_args=--network-bridge=br1
    --resolv-conf=bind-host
    --system-call-filter='add_key keyctl bpf'

@ -29,8 +29,8 @@ release=bookworm

 # Install docker inside the jail:
 # https://docs.docker.com/engine/install/debian/#install-using-the-repository
-# NOTE: this script will run in the host networking namespace and ignores
-# all systemd_nspawn_user_args such as bind mounts
+# Will also install the NVIDIA Container Toolkit if gpu_passthrough_nvidia=1 during initial setup
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
 initial_setup=#!/usr/bin/bash
    set -euo pipefail

@ -43,8 +43,26 @@ initial_setup=#!/usr/bin/bash
    "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian \
    $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
    tee /etc/apt/sources.list.d/docker.list > /dev/null
+    
    apt-get update
    apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+    
+    # The /usr/bin/nvidia-smi will be present when gpu_passthrough_nvidia=1
+    if [ -f /usr/bin/nvidia-smi ]; then
+        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey -o /etc/apt/keyrings/nvidia.asc
+        chmod a+r /etc/apt/keyrings/nvidia.asc
+        curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+        sed 's#deb https://#deb [signed-by=/etc/apt/keyrings/nvidia.asc] https://#g' | \
+        tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+
+        apt-get update
+        apt-get install -y nvidia-container-toolkit
+
+        nvidia-ctk runtime configure --runtime=docker
+        systemctl restart docker
+    fi
+
+    docker info

 # You generally will not need to change the options below
 systemd_run_default_args=--property=KillMode=mixed
--- a/templates/incus/README.md
+++ b/templates/incus/README.md
@ -8,11 +8,10 @@

 Check out the [config](./config) template file. You may provide it when asked during `jlmkr create` or, if you have the template file stored on your NAS, you may provide it directly by running `jlmkr create --start --config /mnt/tank/path/to/incus/config myincusjail`.

-Unfortunately incus doesn't want to install from the `initial_setup` script inside the config file. So we manually finish the setup by running the following after creating and starting the jail:
+We manually finish the setup by running the following after creating and starting the jail:

 ```bash
-jlmkr exec myincusjail bash -c 'apt-get -y install incus incus-ui-canonical &&
-    incus admin init'
+jlmkr exec myincusjail bash -c 'incus admin init'
 ```    

 Follow [First steps with Incus](https://linuxcontainers.org/incus/docs/main/tutorial/first_steps/).
--- a/templates/incus/config
+++ b/templates/incus/config
@ -3,15 +3,16 @@ startup=0
 gpu_passthrough_intel=0
 gpu_passthrough_nvidia=0
 # Turning off seccomp filtering improves performance at the expense of security
-seccomp=1
+# TODO: don't disable seccomp but specify which syscalls should be allowed
+seccomp=0

 # Use macvlan networking to provide an isolated network namespace,
 # so incus can manage firewall rules
-# Alternatively use --network-bridge=br1 instead of --network-macvlan
+# Alternatively use --network-macvlan=eno1 instead of --network-bridge
 # Ensure to change eno1/br1 to the interface name you want to use
 # You may want to add additional options here, e.g. bind mounts
 # TODO: don't use --capability=all but specify only the required capabilities
-systemd_nspawn_user_args=--network-macvlan=eno1
+systemd_nspawn_user_args=--network-bridge=br1
    --resolv-conf=bind-host
    --capability=all
    --bind=/dev/fuse
@ -36,8 +37,6 @@ release=bookworm

 # Install incus according to:
 # https://github.com/zabbly/incus#installation
-# NOTE: this script will run in the host networking namespace and ignores
-# all systemd_nspawn_user_args such as bind mounts
 initial_setup=#!/usr/bin/bash
    set -euo pipefail
    apt-get update && apt-get -y install curl
@ -54,6 +53,7 @@ initial_setup=#!/usr/bin/bash

    EOF'
    apt-get update
+    apt-get -y install incus incus-ui-canonical

 # You generally will not need to change the options below
 systemd_run_default_args=--property=KillMode=mixed
--- a/templates/lxd/README.md
+++ b/templates/lxd/README.md
@ -8,20 +8,7 @@

 Check out the [config](./config) template file. You may provide it when asked during `jlmkr create` or, if you have the template file stored on your NAS, you may provide it directly by running `jlmkr create --start --config /mnt/tank/path/to/lxd/config mylxdjail`.

-Unfortunately snapd doesn't want to install from the `initial_setup` script inside the config file. So we manually finish the setup by running the following after creating and starting the jail:
-
-```bash
-# Repeat listing the jail until you see it has an IPv4 address
-jlmkr list
-
-# Install packages
-jlmkr exec mylxdjail bash -c 'apt-get update &&
-    apt-get install -y --no-install-recommends snapd &&
-    snap install lxd'
-
-```
-
-Choose the `dir` storage backend during `lxd init` and answer `yes` to "Would you like the LXD server to be available over the network?"
+We manually finish the setup by running the command below after creating and starting the jail. Choose the `dir` storage backend during `lxd init` and answer `yes` to "Would you like the LXD server to be available over the network?"

 ```bash
 jlmkr exec mylxdjail bash -c 'lxd init &&
--- a/templates/lxd/config
+++ b/templates/lxd/config
@ -3,11 +3,12 @@ startup=0
 gpu_passthrough_intel=0
 gpu_passthrough_nvidia=0
 # Turning off seccomp filtering improves performance at the expense of security
-seccomp=1
+# TODO: don't disable seccomp but specify which syscalls should be allowed
+seccomp=0

 # Use macvlan networking to provide an isolated network namespace,
 # so lxd can manage firewall rules
-# Alternatively use --network-bridge=br1 instead of --network-macvlan
+# Alternatively use --network-macvlan=eno1 instead of --network-bridge
 # Ensure to change eno1/br1 to the interface name you want to use
 # You may want to add additional options here, e.g. bind mounts
 # TODO: don't use --capability=all but specify only the required capabilities
@ -34,12 +35,13 @@ pre_start_hook=#!/usr/bin/bash
 distro=ubuntu
 release=jammy

-# NOTE: this script will run in the host networking namespace and ignores
-# all systemd_nspawn_user_args such as bind mounts
 initial_setup=#!/usr/bin/bash
    set -euo pipefail
    # https://discuss.linuxcontainers.org/t/snap-inside-privileged-lxd-container/13691/8
    ln -sf /bin/true /usr/local/bin/udevadm
+    apt-get update
+    apt-get install -y --no-install-recommends snapd
+    snap install lxd

 # You generally will not need to change the options below
 systemd_run_default_args=--property=KillMode=mixed
--- a/templates/podman/README.md
+++ b/templates/podman/README.md
@ -46,6 +46,10 @@ usermod --del-subuids 0-4294967295 --del-subgids 0-4294967295 rootless
 # Set a specific range, so it fits inside the number of available UIDs
 usermod --add-subuids 65536-131071 --add-subgids 65536-131071 rootless

+# Add the required capabilities to the `newuidmap` and `newgidmap` binaries
+setcap cap_setuid+eip /usr/bin/newuidmap
+setcap cap_setgid+eip /usr/bin/newgidmap
+
 # Check the assigned range
 cat /etc/subuid
 # Check the available range
@ -120,4 +124,7 @@ Resources mentioning `add_key keyctl bpf`
 Resources mentioning `@keyring`
 - https://github.com/systemd/systemd/issues/17606
 - https://github.com/systemd/systemd/blob/1c62c4fe0b54fb419b875cb2bae82a261518a745/src/shared/seccomp-util.c#L604
-`@keyring` also includes `request_key` but doesn't include `bpf`
+`@keyring` also includes `request_key` but doesn't include `bpf`
+Resources mentioning `cap_setuid+eip`, `cap_setgid+eip`, `newuidmap` and `newgidmap`
+- https://github.com/containers/podman/issues/2788#issuecomment-1016301663
+- https://github.com/containers/podman/issues/12637#issuecomment-996524341
--- a/templates/podman/config
+++ b/templates/podman/config
@ -6,10 +6,10 @@ seccomp=1

 # Use macvlan networking to provide an isolated network namespace,
 # so podman can manage firewall rules
-# Alternatively use --network-bridge=br1 instead of --network-macvlan
+# Alternatively use --network-macvlan=eno1 instead of --network-bridge
 # Ensure to change eno1/br1 to the interface name you want to use
 # You may want to add additional options here, e.g. bind mounts
-systemd_nspawn_user_args=--network-macvlan=eno1
+systemd_nspawn_user_args=--network-bridge=br1
    --resolv-conf=bind-host
    --system-call-filter='add_key keyctl bpf'

@ -28,16 +28,9 @@ distro=fedora
 release=39

 # Install podman inside the jail
-# NOTE: this script will run in the host networking namespace and ignores
-# all systemd_nspawn_user_args such as bind mounts
 initial_setup=#!/usr/bin/bash
    set -euo pipefail
    dnf -y install podman
-    # Add the required capabilities to the `newuidmap` and `newgidmap` binaries
-    # https://github.com/containers/podman/issues/2788#issuecomment-1016301663
-    # https://github.com/containers/podman/issues/12637#issuecomment-996524341
-    setcap cap_setuid+eip /usr/bin/newuidmap
-    setcap cap_setgid+eip /usr/bin/newgidmap

 # You generally will not need to change the options below
 systemd_run_default_args=--property=KillMode=mixed
Author	SHA1	Message	Date
Jip-Hop	6851ad2cd1	Fix podman rootless install instructions Manually add the required capabilities to the `newuidmap` and `newgidmap` binaries	2024-05-08 22:59:03 +02:00
Jip-Hop	03fcb961b7	Log jail name after initial setup	2024-05-08 22:49:28 +02:00
Jip-Hop	2f4a113cf1	Bump version to 1.4.0	2024-05-08 22:25:19 +02:00
Jip-Hop	b849cb787c	Update config templates Updated config templates to benefit from initial_setup running after jail has fully started. Added conditional nvidia-container-toolkit install during initial_setup. Config templates now default to bridge networking.	2024-05-08 22:24:59 +02:00
Jip-Hop	4cd7c54c58	Run initial_setup once jail starts for first time First jail startup will now wait for init system and networking to be ready before running the initial_setup script inside the jail. All systemd_nspawn_user_args are now applied and the initial_setup script will have access to files mounted via bind mounts.	2024-05-08 22:21:06 +02:00
Jip-Hop	93267b6ec6	Formatting	2024-05-08 18:59:58 +02:00