From 9661f4ff19b44470319b7e258f562c8076405dda Mon Sep 17 00:00:00 2001
From: Jip-Hop <2871973+Jip-Hop@users.noreply.github.com>
Date: Mon, 22 Apr 2024 20:50:40 +0200
Subject: [PATCH] Deprecate docker_compatible and gpu_passthrough (#121)

Remove --property=DeviceAllow= so it won't interfere with DevicePolicy=auto
Added seccomp config option
Deprecated docker_compatible config option
Deprecated gpu_passthrough config option
Removed the docker_compatible question during interactive create
Updated readme and config templates
Closes https://github.com/Jip-Hop/jailmaker/issues/119
---
 README.md               |   2 +-
 jlmkr.py                | 150 +++++++++++++++++++++++-----------------
 templates/docker/config |   2 +
 templates/incus/config  |   5 +-
 templates/lxd/config    |   5 +-
 templates/podman/config |   2 +
 6 files changed, 95 insertions(+), 71 deletions(-)

diff --git a/README.md b/README.md
index 6245220..67d3beb 100644
--- a/README.md
+++ b/README.md
@@ -160,7 +160,7 @@ See [Advanced Networking](./NETWORKING.md) for more.
 
 ## Docker
 
-The `jailmaker` script won't install Docker for you, but it can setup the jail with the capabilities required to run docker. You can manually install Docker inside the jail using the [official installation guide](https://docs.docker.com/engine/install/#server) or use [convenience script](https://get.docker.com). Additionally you may use the [docker config template](./templates/docker/README.md).
+Using the [docker config template](./templates/docker/README.md) is recommended if you want to run docker inside the jail. You may of course manually install docker inside a jail. But keep in mind that you need to add `--system-call-filter='add_key keyctl bpf'` (or disable seccomp filtering). It is [not recommended to use host networking for a jail in which you run docker](https://github.com/Jip-Hop/jailmaker/issues/119). Docker needs to manage iptables rules, which it can safely do in its own networking namespace (when using [bridge or macvlan networking](./NETWORKING.md) for the jail).
 
 ## Documentation
 
diff --git a/jlmkr.py b/jlmkr.py
index f2fc612..7949e1d 100755
--- a/jlmkr.py
+++ b/jlmkr.py
@@ -4,7 +4,7 @@
 with full access to all files via bind mounts, \
 thanks to systemd-nspawn!"""
 
-__version__ = "1.1.4"
+__version__ = "1.1.5"
 
 __disclaimer__ = """USE THIS SCRIPT AT YOUR OWN RISK!
 IT COMES WITHOUT WARRANTY AND IS NOT SUPPORTED BY IXSYSTEMS."""
@@ -38,7 +38,10 @@ from textwrap import dedent
 DEFAULT_CONFIG = """startup=0
 gpu_passthrough_intel=0
 gpu_passthrough_nvidia=0
+# The docker_compatible option is deprecated and will be removed in a future release
 docker_compatible=0
+# Turning off seccomp filtering improves performance at the expense of security
+seccomp=1
 
 # Add additional systemd-nspawn flags
 # E.g. to mount host storage in the jail (--bind-ro for readonly):
@@ -47,7 +50,7 @@ docker_compatible=0
 # --network-macvlan=eno1 --resolv-conf=bind-host
 # E.g. bridge networking:
 # --network-bridge=br1 --resolv-conf=bind-host
-# E.g. add capabilities required by docker:
+# E.g. allow syscalls required by docker:
 # --system-call-filter='add_key keyctl bpf'
 systemd_nspawn_user_args=
 
@@ -135,8 +138,7 @@ else:
 DISCLAIMER = f"""{YELLOW}{BOLD}{__disclaimer__}{NORMAL}"""
 
 # Used in parser getters to indicate the default behavior when a specific
-# option is not found it to raise an exception. Created to enable `None` as
-# a valid fallback value.
+# option is not found. Created to enable `None` as a valid fallback value.
 _UNSET = object()
 
 
@@ -249,8 +251,8 @@ class KeyValueParser(configparser.ConfigParser):
         super().set(self._section_name, option, value)
 
     # Return value for specified option key
-    def my_get(self, option):
-        return super().get(self._section_name, option)
+    def my_get(self, option, fallback=_UNSET):
+        return super().get(self._section_name, option, fallback=fallback)
 
     # Return value converted to boolean for specified option key
     def my_getboolean(self, option, fallback=_UNSET):
@@ -302,6 +304,12 @@ def get_jail_rootfs_path(jail_name):
     return os.path.join(get_jail_path(jail_name), JAIL_ROOTFS_NAME)
 
 
+# Test intel GPU by decoding mp4 file (output is discarded)
+# Run the commands below in the jail:
+# curl -o bunny.mp4 https://www.w3schools.com/html/mov_bbb.mp4
+# ffmpeg -hwaccel vaapi -hwaccel_device /dev/dri/renderD128 -hwaccel_output_format vaapi -i bunny.mp4 -f null - && echo 'SUCCESS!'
+
+
 def passthrough_intel(gpu_passthrough_intel, systemd_nspawn_additional_args):
     if not gpu_passthrough_intel:
         return
@@ -536,6 +544,8 @@ def start_jail(jail_name):
         eprint("Aborting...")
         return 1
 
+    seccomp = config.my_getboolean("seccomp")
+
     # Handle initial setup
     initial_setup = config.my_get("initial_setup")
 
@@ -596,19 +606,13 @@ def start_jail(jail_name):
         f"--directory={JAIL_ROOTFS_NAME}",
     ]
 
-    # TODO: split the docker_compatible option into separate options
-    #   - privileged (to disable seccomp, set DevicePolicy=auto and add all capabilities)
-    #   "The bottom line is that using the --privileged flag does not tell the container
-    #   engines to add additional security constraints. The --privileged flag does not add
-    #   any privilege over what the processes launching the containers have."
-    #   "Container engines user namespace is not affected by the --privileged flag"
-    #   Meaning in the context of systemd-nspawn I could have a privileged option,
-    #   which would also apply to jails with --private-users (user namespacing)
-    #   https://www.redhat.com/sysadmin/privileged-flag-container-engines
-    #   - how to call the option to enable ip_forward and bridge-nf-call?
-    #   - add CSV value for preloading kernel modules like linux.kernel_modules in LXC
-
     if config.my_getboolean("docker_compatible"):
+        eprint("WARNING: DEPRECATED OPTION")
+        eprint(
+            "The `docker_compatible` option is deprecated and will be removed in a future release."
+        )
+        eprint("Please refer to the recommended way to run docker in a jail:")
+        eprint("https://github.com/Jip-Hop/jailmaker/tree/main/templates/docker")
         # Enable ip forwarding on the host (docker needs it)
         print(1, file=open("/proc/sys/net/ipv4/ip_forward", "w"))
 
@@ -635,32 +639,8 @@ def start_jail(jail_name):
                 )
             )
 
-        # To properly run docker inside the jail, we need to lift restrictions
-        # Without DevicePolicy=auto images with device nodes may not be pulled
-        # For example docker pull ljishen/sysbench would fail
-        # Fortunately I didn't encounter many images with device nodes...
-        #
-        # Issue: https://github.com/moby/moby/issues/35245
-        #
-        # The systemd-nspawn manual explicitly mentions:
-        # Device nodes may not be created
-        # https://www.freedesktop.org/software/systemd/man/systemd-nspawn.html
-        #
-        # Workaround: https://github.com/kinvolk/kube-spawn/pull/328
-        #
-        # As of 26-3-2024 on TrueNAS-SCALE-23.10.1.1 it seems to no longer be
-        # required to use DevicePolicy=auto
-        # Docker can successfully pull the ljishen/sysbench test image
-        # Running mknod /dev/port c 1 4 manually works too...
-        # Unknown why this suddenly started working...
-        # https://github.com/systemd/systemd/issues/21987
-        #
-        # Use SYSTEMD_SECCOMP=0: https://github.com/systemd/systemd/issues/18370
-
-        systemd_run_additional_args += [
-            "--setenv=SYSTEMD_SECCOMP=0",
-            "--property=DevicePolicy=auto",
-        ]
+        print("The `docker_compatible` option disables seccomp filtering...")
+        seccomp = False
 
         # Add additional flags required for docker
         systemd_nspawn_additional_args += [
@@ -683,22 +663,66 @@ def start_jail(jail_name):
     )
 
     # Legacy gpu_passthrough config setting
-    # TODO: deprecate this and stop supporting it
     if config.my_getboolean("gpu_passthrough", False):
+        eprint("WARNING: DEPRECATED OPTION")
+        eprint(
+            "The `gpu_passthrough` option is deprecated and will be removed in a future release."
+        )
+        eprint(
+            "Please use `gpu_passthrough_intel` and/or `gpu_passthrough_nvidia` instead."
+        )
         gpu_passthrough_intel = True
         gpu_passthrough_nvidia = True
     else:
         gpu_passthrough_intel = config.my_getboolean("gpu_passthrough_intel")
         gpu_passthrough_nvidia = config.my_getboolean("gpu_passthrough_nvidia")
 
-    if gpu_passthrough_intel or gpu_passthrough_nvidia:
-        systemd_nspawn_additional_args.append("--property=DeviceAllow=char-drm rw")
-
     passthrough_intel(gpu_passthrough_intel, systemd_nspawn_additional_args)
     passthrough_nvidia(
         gpu_passthrough_nvidia, systemd_nspawn_additional_args, jail_name
     )
 
+    if seccomp is False:
+        # Disabling seccomp filtering by passing --setenv=SYSTEMD_SECCOMP=0 to systemd-run will improve performance
+        # at the expense of security: it allows syscalls which otherwise would be blocked or would have to be explicitly allowed by passing
+        # --system-call-filter to systemd-nspawn
+        # https://github.com/systemd/systemd/issues/18370
+        #
+        # However, and additional layer of seccomp filtering may be undesirable
+        # For example when using docker to run containers inside the jail created with systemd-nspawn
+        # Even though seccomp filtering is disabled for the systemd-nspawn jail itself, docker can still use seccomp filtering
+        # to restrict the actions available within its containers
+        #
+        # Proof that seccomp can be used inside a jail started with --setenv=SYSTEMD_SECCOMP=0:
+        # Run a command in a docker container which is blocked by the default docker seccomp profile:
+        # 	docker run --rm -it debian:jessie unshare --map-root-user --user sh -c whoami
+        # 	unshare: unshare failed: Operation not permitted
+        # Now run unconfined to show command runs successfully:
+        # 	docker run --rm -it --security-opt seccomp=unconfined debian:jessie unshare --map-root-user --user sh -c whoami
+        # 	root
+
+        systemd_run_additional_args += [
+            "--setenv=SYSTEMD_SECCOMP=0",
+        ]
+
+    # The systemd-nspawn manual explicitly mentions:
+    # Device nodes may not be created
+    # https://www.freedesktop.org/software/systemd/man/systemd-nspawn.html
+    # This means docker images containing device nodes can't be pulled
+    # https://github.com/moby/moby/issues/35245
+    #
+    # The solution is to use DevicePolicy=auto
+    # https://github.com/kinvolk/kube-spawn/pull/328
+    #
+    # DevicePolicy=auto is the default for systemd-run and allows access to all devices
+    # as long as we don't add any --property=DeviceAllow= flags
+    # https://manpages.debian.org/bookworm/systemd/systemd.resource-control.5.en.html
+    #
+    # We can now successfully run:
+    # mknod /dev/port c 1 4
+    # Or pull docker images containing device nodes:
+    # docker pull oraclelinux@sha256:d49469769e4701925d5145c2676d5a10c38c213802cf13270ec3a12c9c84d643
+
     cmd = [
         "systemd-run",
         *shlex.split(config.my_get("systemd_run_default_args")),
@@ -1074,19 +1098,6 @@ def interactive_config():
 
         jail_name = ask_jail_name(jail_name)
 
-        print(
-            dedent(
-                f"""
-            Docker won't be installed by {COMMAND_NAME}.
-            But it can setup the jail with the capabilities required to run docker.
-            You can turn DOCKER_COMPATIBLE mode on/off post-install.
-        """
-            )
-        )
-
-        agree_with_default(
-            config, "docker_compatible", "Make jail docker compatible right now?"
-        )
         print()
         agree_with_default(
             config, "gpu_passthrough_intel", "Passthrough the intel GPU (if present)?"
@@ -1261,11 +1272,16 @@ def create_jail(**kwargs):
             "gpu_passthrough_intel",
             "gpu_passthrough_nvidia",
             "release",
+            "seccomp",
             "startup",
             "systemd_nspawn_user_args",
         ]:
             value = kwargs.pop(option)
-            if value:
+            if (
+                value is not None
+                and len(value)
+                and value is not config.my_get(option, None)
+            ):
                 # TODO: this will wipe all systemd_nspawn_user_args from the template...
                 # Should there be an option to append them instead?
                 print(f"Overriding {option} config value with {value}.")
@@ -1641,10 +1657,9 @@ def list_jails():
 
         config = parse_config_file(get_jail_config_path(jail_name))
         if config:
-            # TODO: also list privileged once this setting is implemented
             jail["startup"] = config.my_getboolean("startup")
 
-            # TODO: deprecate gpu_passthrough and stop supporting it
+            # TODO: remove gpu_passthrough in future release
             if config.my_getboolean("gpu_passthrough", False):
                 jail["gpu_intel"] = True
                 jail["gpu_nvidia"] = True
@@ -2005,6 +2020,13 @@ def main():
         "--docker_compatible",  #
         type=int,
         choices=[0, 1],
+        help="DEPRECATED",
+    )
+    commands["create"].add_argument(
+        "--seccomp",  #
+        type=int,
+        choices=[0, 1],
+        help="turning off seccomp filtering improves performance at the expense of security",
     )
     commands["create"].add_argument(
         "-c",  #
diff --git a/templates/docker/config b/templates/docker/config
index c141ec8..e862e4d 100644
--- a/templates/docker/config
+++ b/templates/docker/config
@@ -1,6 +1,8 @@
 startup=0
 gpu_passthrough_intel=1
 gpu_passthrough_nvidia=0
+# Turning off seccomp filtering improves performance at the expense of security
+seccomp=1
 
 # Use macvlan networking to provide an isolated network namespace,
 # so docker can manage firewall rules
diff --git a/templates/incus/config b/templates/incus/config
index 2b82dd8..6a57e0c 100644
--- a/templates/incus/config
+++ b/templates/incus/config
@@ -2,6 +2,8 @@
 startup=0
 gpu_passthrough_intel=1
 gpu_passthrough_nvidia=0
+# Turning off seccomp filtering improves performance at the expense of security
+seccomp=1
 
 # Use macvlan networking to provide an isolated network namespace,
 # so incus can manage firewall rules
@@ -9,7 +11,6 @@ gpu_passthrough_nvidia=0
 # Ensure to change eno1/br1 to the interface name you want to use
 # You may want to add additional options here, e.g. bind mounts
 # TODO: don't use --capability=all but specify only the required capabilities
-# TODO: or add and use privileged flag?
 systemd_nspawn_user_args=--network-macvlan=eno1
     --resolv-conf=bind-host
     --capability=all
@@ -63,8 +64,6 @@ systemd_run_default_args=--property=KillMode=mixed
     --property=TasksMax=infinity
     --collect
     --setenv=SYSTEMD_NSPAWN_LOCK=0
-# TODO: add below if required:
-# --property=DevicePolicy=auto
 
 systemd_nspawn_default_args=--keep-unit
     --quiet
diff --git a/templates/lxd/config b/templates/lxd/config
index 2c1e46e..4cb5e19 100644
--- a/templates/lxd/config
+++ b/templates/lxd/config
@@ -2,6 +2,8 @@
 startup=0
 gpu_passthrough_intel=1
 gpu_passthrough_nvidia=0
+# Turning off seccomp filtering improves performance at the expense of security
+seccomp=1
 
 # Use macvlan networking to provide an isolated network namespace,
 # so lxd can manage firewall rules
@@ -9,7 +11,6 @@ gpu_passthrough_nvidia=0
 # Ensure to change eno1/br1 to the interface name you want to use
 # You may want to add additional options here, e.g. bind mounts
 # TODO: don't use --capability=all but specify only the required capabilities
-# TODO: or add and use privileged flag?
 systemd_nspawn_user_args=--network-bridge=br1
     --resolv-conf=bind-host
     --capability=all
@@ -49,8 +50,6 @@ systemd_run_default_args=--property=KillMode=mixed
     --property=TasksMax=infinity
     --collect
     --setenv=SYSTEMD_NSPAWN_LOCK=0
-# TODO: add below if required:
-# --property=DevicePolicy=auto
 
 systemd_nspawn_default_args=--keep-unit
     --quiet
diff --git a/templates/podman/config b/templates/podman/config
index 4675e07..989c686 100644
--- a/templates/podman/config
+++ b/templates/podman/config
@@ -1,6 +1,8 @@
 startup=0
 gpu_passthrough_intel=0
 gpu_passthrough_nvidia=0
+# Turning off seccomp filtering improves performance at the expense of security
+seccomp=1
 
 # Use macvlan networking to provide an isolated network namespace,
 # so podman can manage firewall rules