Deprecate docker_compatible and gpu_passthrough (#121)

Remove --property=DeviceAllow= so it won't interfere with DevicePolicy=auto. Added seccomp config option. Deprecated docker_compatible config option. Deprecated gpu_passthrough config option. Removed the docker_compatible question during interactive create. Updated readme and config templates. Closes https://github.com/Jip-Hop/jailmaker/issues/119
2024-04-22 20:50:40 +02:00 · 2024-04-22 20:50:40 +02:00 · 9661f4ff19
parent a7c4b9dbad
commit 9661f4ff19
6 changed files with 95 additions and 71 deletions
--- a/README.md
+++ b/README.md
@ -160,7 +160,7 @@ See [Advanced Networking](./NETWORKING.md) for more.

 ## Docker

-The `jailmaker` script won't install Docker for you, but it can setup the jail with the capabilities required to run docker. You can manually install Docker inside the jail using the [official installation guide](https://docs.docker.com/engine/install/#server) or use [convenience script](https://get.docker.com). Additionally you may use the [docker config template](./templates/docker/README.md).
+Using the [docker config template](./templates/docker/README.md) is recommended if you want to run docker inside the jail. You may of course manually install docker inside a jail, but keep in mind that you then need to add `--system-call-filter='add_key keyctl bpf'` to the jail's `systemd_nspawn_user_args` (or disable seccomp filtering by setting `seccomp=0`). It is [not recommended to use host networking for a jail in which you run docker](https://github.com/Jip-Hop/jailmaker/issues/119). Docker needs to manage iptables rules, which it can safely do in its own networking namespace (when using [bridge or macvlan networking](./NETWORKING.md) for the jail).

 ## Documentation

--- a/jlmkr.py
+++ b/jlmkr.py
@ -4,7 +4,7 @@
 with full access to all files via bind mounts, \
 thanks to systemd-nspawn!"""

-__version__ = "1.1.4"
+__version__ = "1.1.5"

 __disclaimer__ = """USE THIS SCRIPT AT YOUR OWN RISK!
 IT COMES WITHOUT WARRANTY AND IS NOT SUPPORTED BY IXSYSTEMS."""
@ -38,7 +38,10 @@ from textwrap import dedent
 DEFAULT_CONFIG = """startup=0
 gpu_passthrough_intel=0
 gpu_passthrough_nvidia=0
+# The docker_compatible option is deprecated and will be removed in a future release
 docker_compatible=0
+# Turning off seccomp filtering improves performance at the expense of security
+seccomp=1

 # Add additional systemd-nspawn flags
 # E.g. to mount host storage in the jail (--bind-ro for readonly):
@ -47,7 +50,7 @@ docker_compatible=0
 # --network-macvlan=eno1 --resolv-conf=bind-host
 # E.g. bridge networking:
 # --network-bridge=br1 --resolv-conf=bind-host
-# E.g. add capabilities required by docker:
+# E.g. allow syscalls required by docker:
 # --system-call-filter='add_key keyctl bpf'
 systemd_nspawn_user_args=

@ -135,8 +138,7 @@ else:
 DISCLAIMER = f"""{YELLOW}{BOLD}{__disclaimer__}{NORMAL}"""

 # Used in parser getters to indicate the default behavior when a specific
-# option is not found it to raise an exception. Created to enable `None` as
-# a valid fallback value.
+# option is not found. Created to enable `None` as a valid fallback value.
 _UNSET = object()


@ -249,8 +251,8 @@ class KeyValueParser(configparser.ConfigParser):
        super().set(self._section_name, option, value)

    # Return value for specified option key
-    def my_get(self, option):
-        return super().get(self._section_name, option)
+    def my_get(self, option, fallback=_UNSET):
+        return super().get(self._section_name, option, fallback=fallback)

    # Return value converted to boolean for specified option key
    def my_getboolean(self, option, fallback=_UNSET):
@ -302,6 +304,12 @@ def get_jail_rootfs_path(jail_name):
    return os.path.join(get_jail_path(jail_name), JAIL_ROOTFS_NAME)


+# Test intel GPU by decoding mp4 file (output is discarded)
+# Run the commands below in the jail:
+# curl -o bunny.mp4 https://www.w3schools.com/html/mov_bbb.mp4
+# ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 -hwaccel_output_format vaapi -i bunny.mp4 -f null - && echo 'SUCCESS!'
+
+
 def passthrough_intel(gpu_passthrough_intel, systemd_nspawn_additional_args):
    if not gpu_passthrough_intel:
        return
@ -536,6 +544,8 @@ def start_jail(jail_name):
        eprint("Aborting...")
        return 1

+    seccomp = config.my_getboolean("seccomp")
+
    # Handle initial setup
    initial_setup = config.my_get("initial_setup")

@ -596,19 +606,13 @@ def start_jail(jail_name):
        f"--directory={JAIL_ROOTFS_NAME}",
    ]

-    # TODO: split the docker_compatible option into separate options
-    #   - privileged (to disable seccomp, set DevicePolicy=auto and add all capabilities)
-    #   "The bottom line is that using the --privileged flag does not tell the container
-    #   engines to add additional security constraints. The --privileged flag does not add
-    #   any privilege over what the processes launching the containers have."
-    #   "Container engines user namespace is not affected by the --privileged flag"
-    #   Meaning in the context of systemd-nspawn I could have a privileged option,
-    #   which would also apply to jails with --private-users (user namespacing)
-    #   https://www.redhat.com/sysadmin/privileged-flag-container-engines
-    #   - how to call the option to enable ip_forward and bridge-nf-call?
-    #   - add CSV value for preloading kernel modules like linux.kernel_modules in LXC
-
    if config.my_getboolean("docker_compatible"):
+        eprint("WARNING: DEPRECATED OPTION")
+        eprint(
+            "The `docker_compatible` option is deprecated and will be removed in a future release."
+        )
+        eprint("Please refer to the recommended way to run docker in a jail:")
+        eprint("https://github.com/Jip-Hop/jailmaker/tree/main/templates/docker")
        # Enable ip forwarding on the host (docker needs it)
        print(1, file=open("/proc/sys/net/ipv4/ip_forward", "w"))

@ -635,32 +639,8 @@ def start_jail(jail_name):
                )
            )

-        # To properly run docker inside the jail, we need to lift restrictions
-        # Without DevicePolicy=auto images with device nodes may not be pulled
-        # For example docker pull ljishen/sysbench would fail
-        # Fortunately I didn't encounter many images with device nodes...
-        #
-        # Issue: https://github.com/moby/moby/issues/35245
-        #
-        # The systemd-nspawn manual explicitly mentions:
-        # Device nodes may not be created
-        # https://www.freedesktop.org/software/systemd/man/systemd-nspawn.html
-        #
-        # Workaround: https://github.com/kinvolk/kube-spawn/pull/328
-        #
-        # As of 26-3-2024 on TrueNAS-SCALE-23.10.1.1 it seems to no longer be
-        # required to use DevicePolicy=auto
-        # Docker can successfully pull the ljishen/sysbench test image
-        # Running mknod /dev/port c 1 4 manually works too...
-        # Unknown why this suddenly started working...
-        # https://github.com/systemd/systemd/issues/21987
-        #
-        # Use SYSTEMD_SECCOMP=0: https://github.com/systemd/systemd/issues/18370
-
-        systemd_run_additional_args += [
-            "--setenv=SYSTEMD_SECCOMP=0",
-            "--property=DevicePolicy=auto",
-        ]
+        print("The `docker_compatible` option disables seccomp filtering...")
+        seccomp = False

        # Add additional flags required for docker
        systemd_nspawn_additional_args += [
@ -683,22 +663,66 @@ def start_jail(jail_name):
    )

    # Legacy gpu_passthrough config setting
-    # TODO: deprecate this and stop supporting it
    if config.my_getboolean("gpu_passthrough", False):
+        eprint("WARNING: DEPRECATED OPTION")
+        eprint(
+            "The `gpu_passthrough` option is deprecated and will be removed in a future release."
+        )
+        eprint(
+            "Please use `gpu_passthrough_intel` and/or `gpu_passthrough_nvidia` instead."
+        )
        gpu_passthrough_intel = True
        gpu_passthrough_nvidia = True
    else:
        gpu_passthrough_intel = config.my_getboolean("gpu_passthrough_intel")
        gpu_passthrough_nvidia = config.my_getboolean("gpu_passthrough_nvidia")

-    if gpu_passthrough_intel or gpu_passthrough_nvidia:
-        systemd_nspawn_additional_args.append("--property=DeviceAllow=char-drm rw")
-
    passthrough_intel(gpu_passthrough_intel, systemd_nspawn_additional_args)
    passthrough_nvidia(
        gpu_passthrough_nvidia, systemd_nspawn_additional_args, jail_name
    )

+    if seccomp is False:
+        # Disabling seccomp filtering by passing --setenv=SYSTEMD_SECCOMP=0 to systemd-run will improve performance
+        # at the expense of security: it allows syscalls which otherwise would be blocked or would have to be explicitly allowed by passing
+        # --system-call-filter to systemd-nspawn
+        # https://github.com/systemd/systemd/issues/18370
+        #
+        # However, an additional layer of seccomp filtering may be undesirable
+        # For example when using docker to run containers inside the jail created with systemd-nspawn
+        # Even though seccomp filtering is disabled for the systemd-nspawn jail itself, docker can still use seccomp filtering
+        # to restrict the actions available within its containers
+        #
+        # Proof that seccomp can be used inside a jail started with --setenv=SYSTEMD_SECCOMP=0:
+        # Run a command in a docker container which is blocked by the default docker seccomp profile:
+        # 	docker run --rm -it debian:jessie unshare --map-root-user --user sh -c whoami
+        # 	unshare: unshare failed: Operation not permitted
+        # Now run unconfined to show command runs successfully:
+        # 	docker run --rm -it --security-opt seccomp=unconfined debian:jessie unshare --map-root-user --user sh -c whoami
+        # 	root
+
+        systemd_run_additional_args += [
+            "--setenv=SYSTEMD_SECCOMP=0",
+        ]
+
+    # The systemd-nspawn manual explicitly mentions:
+    # Device nodes may not be created
+    # https://www.freedesktop.org/software/systemd/man/systemd-nspawn.html
+    # This means docker images containing device nodes can't be pulled
+    # https://github.com/moby/moby/issues/35245
+    #
+    # The solution is to use DevicePolicy=auto
+    # https://github.com/kinvolk/kube-spawn/pull/328
+    #
+    # DevicePolicy=auto is the default for systemd-run and allows access to all devices
+    # as long as we don't add any --property=DeviceAllow= flags
+    # https://manpages.debian.org/bookworm/systemd/systemd.resource-control.5.en.html
+    #
+    # We can now successfully run:
+    # mknod /dev/port c 1 4
+    # Or pull docker images containing device nodes:
+    # docker pull oraclelinux@sha256:d49469769e4701925d5145c2676d5a10c38c213802cf13270ec3a12c9c84d643
+
    cmd = [
        "systemd-run",
        *shlex.split(config.my_get("systemd_run_default_args")),
@ -1074,19 +1098,6 @@ def interactive_config():

        jail_name = ask_jail_name(jail_name)

-        print(
-            dedent(
-                f"""
-            Docker won't be installed by {COMMAND_NAME}.
-            But it can setup the jail with the capabilities required to run docker.
-            You can turn DOCKER_COMPATIBLE mode on/off post-install.
-        """
-            )
-        )
-
-        agree_with_default(
-            config, "docker_compatible", "Make jail docker compatible right now?"
-        )
        print()
        agree_with_default(
            config, "gpu_passthrough_intel", "Passthrough the intel GPU (if present)?"
@ -1261,11 +1272,16 @@ def create_jail(**kwargs):
            "gpu_passthrough_intel",
            "gpu_passthrough_nvidia",
            "release",
+            "seccomp",
            "startup",
            "systemd_nspawn_user_args",
        ]:
            value = kwargs.pop(option)
-            if value:
+            if (
+                value is not None
+                and len(value)
+                and value is not config.my_get(option, None)
+            ):
                # TODO: this will wipe all systemd_nspawn_user_args from the template...
                # Should there be an option to append them instead?
                print(f"Overriding {option} config value with {value}.")
@ -1641,10 +1657,9 @@ def list_jails():

        config = parse_config_file(get_jail_config_path(jail_name))
        if config:
-            # TODO: also list privileged once this setting is implemented
            jail["startup"] = config.my_getboolean("startup")

-            # TODO: deprecate gpu_passthrough and stop supporting it
+            # TODO: remove gpu_passthrough in future release
            if config.my_getboolean("gpu_passthrough", False):
                jail["gpu_intel"] = True
                jail["gpu_nvidia"] = True
@ -2005,6 +2020,13 @@ def main():
        "--docker_compatible",  #
        type=int,
        choices=[0, 1],
+        help="DEPRECATED",
+    )
+    commands["create"].add_argument(
+        "--seccomp",  #
+        type=int,
+        choices=[0, 1],
+        help="turning off seccomp filtering improves performance at the expense of security",
    )
    commands["create"].add_argument(
        "-c",  #
--- a/templates/docker/config
+++ b/templates/docker/config
@ -1,6 +1,8 @@
 startup=0
 gpu_passthrough_intel=1
 gpu_passthrough_nvidia=0
+# Turning off seccomp filtering improves performance at the expense of security
+seccomp=1

 # Use macvlan networking to provide an isolated network namespace,
 # so docker can manage firewall rules
--- a/templates/incus/config
+++ b/templates/incus/config
@ -2,6 +2,8 @@
 startup=0
 gpu_passthrough_intel=1
 gpu_passthrough_nvidia=0
+# Turning off seccomp filtering improves performance at the expense of security
+seccomp=1

 # Use macvlan networking to provide an isolated network namespace,
 # so incus can manage firewall rules
@ -9,7 +11,6 @@ gpu_passthrough_nvidia=0
 # Ensure to change eno1/br1 to the interface name you want to use
 # You may want to add additional options here, e.g. bind mounts
 # TODO: don't use --capability=all but specify only the required capabilities
-# TODO: or add and use privileged flag?
 systemd_nspawn_user_args=--network-macvlan=eno1
    --resolv-conf=bind-host
    --capability=all
@ -63,8 +64,6 @@ systemd_run_default_args=--property=KillMode=mixed
    --property=TasksMax=infinity
    --collect
    --setenv=SYSTEMD_NSPAWN_LOCK=0
-# TODO: add below if required:
-# --property=DevicePolicy=auto

 systemd_nspawn_default_args=--keep-unit
    --quiet
--- a/templates/lxd/config
+++ b/templates/lxd/config
@ -2,6 +2,8 @@
 startup=0
 gpu_passthrough_intel=1
 gpu_passthrough_nvidia=0
+# Turning off seccomp filtering improves performance at the expense of security
+seccomp=1

 # Use macvlan networking to provide an isolated network namespace,
 # so lxd can manage firewall rules
@ -9,7 +11,6 @@ gpu_passthrough_nvidia=0
 # Ensure to change eno1/br1 to the interface name you want to use
 # You may want to add additional options here, e.g. bind mounts
 # TODO: don't use --capability=all but specify only the required capabilities
-# TODO: or add and use privileged flag?
 systemd_nspawn_user_args=--network-bridge=br1
    --resolv-conf=bind-host
    --capability=all
@ -49,8 +50,6 @@ systemd_run_default_args=--property=KillMode=mixed
    --property=TasksMax=infinity
    --collect
    --setenv=SYSTEMD_NSPAWN_LOCK=0
-# TODO: add below if required:
-# --property=DevicePolicy=auto

 systemd_nspawn_default_args=--keep-unit
    --quiet
--- a/templates/podman/config
+++ b/templates/podman/config
@ -1,6 +1,8 @@
 startup=0
 gpu_passthrough_intel=0
 gpu_passthrough_nvidia=0
+# Turning off seccomp filtering improves performance at the expense of security
+seccomp=1

 # Use macvlan networking to provide an isolated network namespace,
 # so podman can manage firewall rules