Compare commits

...

6 Commits

Author SHA1 Message Date
Jip-Hop 6851ad2cd1 Fix podman rootless install instructions
Manually add the required capabilities to the `newuidmap` and `newgidmap` binaries
2024-05-08 22:59:03 +02:00
Jip-Hop 03fcb961b7 Log jail name after initial setup 2024-05-08 22:49:28 +02:00
Jip-Hop 2f4a113cf1 Bump version to 1.4.0 2024-05-08 22:25:19 +02:00
Jip-Hop b849cb787c Update config templates
Updated config templates to benefit from initial_setup running after jail has fully started.

Added conditional nvidia-container-toolkit install during initial_setup.

Config templates now default to bridge networking.
2024-05-08 22:24:59 +02:00
Jip-Hop 4cd7c54c58 Run initial_setup once jail starts for first time
First jail startup will now wait for init system and networking to be ready before running the initial_setup script inside the jail. All systemd_nspawn_user_args are now applied and the initial_setup script will have access to files mounted via bind mounts.
2024-05-08 22:21:06 +02:00
Jip-Hop 93267b6ec6 Formatting 2024-05-08 18:59:58 +02:00
9 changed files with 145 additions and 129 deletions

186
jlmkr.py
View File

@ -4,7 +4,7 @@
with full access to all files via bind mounts, \
thanks to systemd-nspawn!"""
__version__ = "1.3.0"
__version__ = "1.4.0"
__disclaimer__ = """USE THIS SCRIPT AT YOUR OWN RISK!
IT COMES WITHOUT WARRANTY AND IS NOT SUPPORTED BY IXSYSTEMS."""
@ -43,15 +43,13 @@ docker_compatible=0
# Turning off seccomp filtering improves performance at the expense of security
seccomp=1
# Add additional systemd-nspawn flags
# E.g. to mount host storage in the jail (--bind-ro for readonly):
# --bind='/mnt/pool/dataset:/home' --bind-ro=/etc/certificates
# E.g. macvlan networking:
# --network-macvlan=eno1 --resolv-conf=bind-host
# E.g. bridge networking:
# --network-bridge=br1 --resolv-conf=bind-host
# E.g. allow syscalls required by docker:
# --system-call-filter='add_key keyctl bpf'
# Below you may add additional systemd-nspawn flags behind systemd_nspawn_user_args=
# To mount host storage in the jail, you may add: --bind='/mnt/pool/dataset:/home'
# To readonly mount host storage, you may add: --bind-ro=/etc/certificates
# To use macvlan networking add: --network-macvlan=eno1 --resolv-conf=bind-host
# To use bridge networking add: --network-bridge=br1 --resolv-conf=bind-host
# Be sure to change eno1/br1 to the interface name you want to use
# To allow syscalls required by docker add: --system-call-filter='add_key keyctl bpf'
systemd_nspawn_user_args=
# Specify command/script to run on the HOST before starting the jail
@ -73,10 +71,8 @@ post_stop_hook=
distro=debian
release=bookworm
# Specify command/script to run IN THE JAIL before the first start
# Specify command/script to run IN THE JAIL on the first start (once networking is ready in the jail)
# Useful to install packages on top of the base rootfs
# NOTE: this script will run in the host networking namespace and
# ignores all systemd_nspawn_user_args such as bind mounts
initial_setup=
# initial_setup=bash -c 'apt-get update && apt-get -y upgrade'
@ -545,55 +541,6 @@ def start_jail(jail_name):
seccomp = config.my_getboolean("seccomp")
# Handle initial setup
initial_setup = config.my_get("initial_setup")
# Alternative method to setup on first boot:
# https://www.undrground.org/2021/01/25/adding-a-single-run-task-via-systemd/
# If there's no machine-id, then this is the first time the jail is started
if initial_setup and not os.path.exists(
os.path.join(jail_rootfs_path, "etc/machine-id")
):
initial_setup_file = None
if initial_setup.startswith("#!"):
# Write a script file and call that
initial_setup_file = os.path.abspath(
os.path.join(jail_path, ".initial_setup")
)
print(initial_setup, file=open(initial_setup_file, "w"))
stat_chmod(initial_setup_file, 0o700)
cmd = [
"systemd-nspawn",
"-q",
"-D",
jail_rootfs_path,
f"--bind-ro={initial_setup_file}:/root/initial_startup",
"/root/initial_startup",
]
else:
# Run the command directly if it doesn't start with a shebang
cmd = [
"systemd-nspawn",
"-q",
"-D",
jail_rootfs_path,
*shlex.split(initial_setup),
]
returncode = subprocess.run(cmd).returncode
# Cleanup the initial_setup_file
if initial_setup_file:
Path(initial_setup_file).unlink(missing_ok=True)
if returncode != 0:
eprint("Failed to run initial setup:")
eprint(initial_setup)
eprint()
eprint("Abort starting jail.")
return returncode
systemd_run_additional_args = [
f"--unit={SYMLINK_NAME}-{jail_name}",
f"--working-directory=./{jail_path}",
@ -605,6 +552,24 @@ def start_jail(jail_name):
f"--directory={JAIL_ROOTFS_NAME}",
]
# The systemd-nspawn manual explicitly mentions:
# Device nodes may not be created
# https://www.freedesktop.org/software/systemd/man/systemd-nspawn.html
# This means docker images containing device nodes can't be pulled
# https://github.com/moby/moby/issues/35245
#
# The solution is to use DevicePolicy=auto
# https://github.com/kinvolk/kube-spawn/pull/328
#
# DevicePolicy=auto is the default for systemd-run and allows access to all devices
# as long as we don't add any --property=DeviceAllow= flags
# https://manpages.debian.org/bookworm/systemd/systemd.resource-control.5.en.html
#
# We can now successfully run:
# mknod /dev/port c 1 4
# Or pull docker images containing device nodes:
# docker pull oraclelinux@sha256:d49469769e4701925d5145c2676d5a10c38c213802cf13270ec3a12c9c84d643
if config.my_getboolean("docker_compatible"):
eprint("WARNING: DEPRECATED OPTION")
eprint(
@ -704,23 +669,28 @@ def start_jail(jail_name):
"--setenv=SYSTEMD_SECCOMP=0",
]
# The systemd-nspawn manual explicitly mentions:
# Device nodes may not be created
# https://www.freedesktop.org/software/systemd/man/systemd-nspawn.html
# This means docker images containing device nodes can't be pulled
# https://github.com/moby/moby/issues/35245
#
# The solution is to use DevicePolicy=auto
# https://github.com/kinvolk/kube-spawn/pull/328
#
# DevicePolicy=auto is the default for systemd-run and allows access to all devices
# as long as we don't add any --property=DeviceAllow= flags
# https://manpages.debian.org/bookworm/systemd/systemd.resource-control.5.en.html
#
# We can now successfully run:
# mknod /dev/port c 1 4
# Or pull docker images containing device nodes:
# docker pull oraclelinux@sha256:d49469769e4701925d5145c2676d5a10c38c213802cf13270ec3a12c9c84d643
initial_setup = False
# If there's no machine-id, then this is the first time the jail is started
if not os.path.exists(os.path.join(jail_rootfs_path, "etc/machine-id")) and (
initial_setup := config.my_get("initial_setup")
):
if not initial_setup.startswith("#!"):
initial_setup = "#!/bin/sh\n" + initial_setup
initial_setup_file_jailed_path = "/root/jlmkr-initial-setup"
initial_setup_file_host_path = os.path.abspath(
jail_rootfs_path + initial_setup_file_jailed_path
)
# Write a script file to call during initial setup
print(initial_setup, file=open(initial_setup_file_host_path, "w"))
stat_chmod(initial_setup_file_host_path, 0o700)
# Ensure the jail init system is ready before we start the initial_setup
systemd_nspawn_additional_args += [
"--notify-ready=yes",
]
cmd = [
"systemd-run",
@ -755,6 +725,44 @@ def start_jail(jail_name):
)
)
return returncode
# Handle initial setup after jail is up and running (for the first time)
if initial_setup:
print("About to run the initial setup.")
print("Waiting for networking in the jail to be ready.")
print("Please wait (this may take 90s in case of bridge networking)...")
returncode = exec_jail(
jail_name,
[
"--",
"systemd-run",
f"--unit={os.path.basename(initial_setup_file_jailed_path)}",
"--quiet",
"--pipe",
"--wait",
"--service-type=exec",
"--property=After=network-online.target",
"--property=Wants=network-online.target",
initial_setup_file_jailed_path,
],
)
# Cleanup the initial_setup_file_host_path
if initial_setup_file_host_path:
Path(initial_setup_file_host_path).unlink(missing_ok=True)
if returncode != 0:
eprint("Tried to run the following commands inside the jail:")
eprint(initial_setup)
eprint()
eprint(
f"""{RED}{BOLD}Failed to run initial setup... you may want to stop and remove the jail and try again.{NORMAL}"""
)
return returncode
else:
print(f"Done with initial setup of jail {jail_name}!")
return returncode
@ -922,11 +930,12 @@ def get_zfs_dataset(path):
"""
Get ZFS dataset path.
"""
def clean_field(field):
# Put back spaces which were encoded
# https://github.com/openzfs/zfs/issues/11182
return field.replace('\\040', ' ')
return field.replace("\\040", " ")
path = os.path.realpath(path)
with open("/proc/mounts", "r") as f:
for line in f:
@ -1025,11 +1034,14 @@ def get_text_editor():
if editor := os.environ.get(key):
return shutil.which(editor)
return get_from_environ("VISUAL") \
or get_from_environ("EDITOR") \
or shutil.which("editor") \
or shutil.which("/usr/bin/editor") \
return (
get_from_environ("VISUAL")
or get_from_environ("EDITOR")
or shutil.which("editor")
or shutil.which("/usr/bin/editor")
or "nano"
)
def interactive_config():
config = KeyValueParser()
@ -1517,9 +1529,7 @@ def edit_jail(jail_name):
jail_config_path = get_jail_config_path(jail_name)
returncode = subprocess.run(
[get_text_editor(), jail_config_path]
).returncode
returncode = subprocess.run([get_text_editor(), jail_config_path]).returncode
if returncode != 0:
eprint(f"An error occurred while editing {jail_config_path}.")

View File

@ -2,4 +2,4 @@
## Setup
Check out the [config](./config) template file. You may provide it when asked during `jlmkr create` or, if you have the template file stored on your NAS, you may provide it directly by running `jlmkr create --start --config /mnt/tank/path/to/docker/config mydockerjail`.
Check out the [config](./config) template file. You may provide it when asked during `jlmkr create` or, if you have the template file stored on your NAS, you may provide it directly by running `jlmkr create --start --config /mnt/tank/path/to/docker/config mydockerjail`. If you want the `nvidia-container-toolkit` to be installed, ensure you set `gpu_passthrough_nvidia=1` when creating the jail.

View File

@ -6,10 +6,10 @@ seccomp=1
# Use bridge networking to provide an isolated network namespace,
# so docker can manage firewall rules
# Alternatively use --network-bridge=br1 instead of --network-macvlan
# Alternatively use --network-macvlan=eno1 instead of --network-bridge
# Be sure to change eno1/br1 to the interface name you want to use
# You may want to add additional options here, e.g. bind mounts
systemd_nspawn_user_args=--network-macvlan=eno1
systemd_nspawn_user_args=--network-bridge=br1
--resolv-conf=bind-host
--system-call-filter='add_key keyctl bpf'
@ -29,8 +29,8 @@ release=bookworm
# Install docker inside the jail:
# https://docs.docker.com/engine/install/debian/#install-using-the-repository
# NOTE: this script will run in the host networking namespace and ignores
# all systemd_nspawn_user_args such as bind mounts
# Will also install the NVIDIA Container Toolkit if gpu_passthrough_nvidia=1 during initial setup
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
initial_setup=#!/usr/bin/bash
set -euo pipefail
@ -43,8 +43,26 @@ initial_setup=#!/usr/bin/bash
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
tee /etc/apt/sources.list.d/docker.list > /dev/null
apt-get update
apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
# The /usr/bin/nvidia-smi will be present when gpu_passthrough_nvidia=1
if [ -f /usr/bin/nvidia-smi ]; then
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey -o /etc/apt/keyrings/nvidia.asc
chmod a+r /etc/apt/keyrings/nvidia.asc
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/etc/apt/keyrings/nvidia.asc] https://#g' | \
tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
apt-get update
apt-get install -y nvidia-container-toolkit
nvidia-ctk runtime configure --runtime=docker
systemctl restart docker
fi
docker info
# You generally will not need to change the options below
systemd_run_default_args=--property=KillMode=mixed

View File

@ -8,11 +8,10 @@
Check out the [config](./config) template file. You may provide it when asked during `jlmkr create` or, if you have the template file stored on your NAS, you may provide it directly by running `jlmkr create --start --config /mnt/tank/path/to/incus/config myincusjail`.
Unfortunately incus doesn't want to install from the `initial_setup` script inside the config file. So we manually finish the setup by running the following after creating and starting the jail:
We manually finish the setup by running the following after creating and starting the jail:
```bash
jlmkr exec myincusjail bash -c 'apt-get -y install incus incus-ui-canonical &&
incus admin init'
jlmkr exec myincusjail bash -c 'incus admin init'
```
Follow [First steps with Incus](https://linuxcontainers.org/incus/docs/main/tutorial/first_steps/).

View File

@ -3,15 +3,16 @@ startup=0
gpu_passthrough_intel=0
gpu_passthrough_nvidia=0
# Turning off seccomp filtering improves performance at the expense of security
seccomp=1
# TODO: don't disable seccomp but specify which syscalls should be allowed
seccomp=0
# Use bridge networking to provide an isolated network namespace,
# so incus can manage firewall rules
# Alternatively use --network-bridge=br1 instead of --network-macvlan
# Alternatively use --network-macvlan=eno1 instead of --network-bridge
# Be sure to change eno1/br1 to the interface name you want to use
# You may want to add additional options here, e.g. bind mounts
# TODO: don't use --capability=all but specify only the required capabilities
systemd_nspawn_user_args=--network-macvlan=eno1
systemd_nspawn_user_args=--network-bridge=br1
--resolv-conf=bind-host
--capability=all
--bind=/dev/fuse
@ -36,8 +37,6 @@ release=bookworm
# Install incus according to:
# https://github.com/zabbly/incus#installation
# NOTE: this script will run in the host networking namespace and ignores
# all systemd_nspawn_user_args such as bind mounts
initial_setup=#!/usr/bin/bash
set -euo pipefail
apt-get update && apt-get -y install curl
@ -54,6 +53,7 @@ initial_setup=#!/usr/bin/bash
EOF'
apt-get update
apt-get -y install incus incus-ui-canonical
# You generally will not need to change the options below
systemd_run_default_args=--property=KillMode=mixed

View File

@ -8,20 +8,7 @@
Check out the [config](./config) template file. You may provide it when asked during `jlmkr create` or, if you have the template file stored on your NAS, you may provide it directly by running `jlmkr create --start --config /mnt/tank/path/to/lxd/config mylxdjail`.
Unfortunately snapd doesn't want to install from the `initial_setup` script inside the config file. So we manually finish the setup by running the following after creating and starting the jail:
```bash
# Repeat listing the jail until you see it has an IPv4 address
jlmkr list
# Install packages
jlmkr exec mylxdjail bash -c 'apt-get update &&
apt-get install -y --no-install-recommends snapd &&
snap install lxd'
```
Choose the `dir` storage backend during `lxd init` and answer `yes` to "Would you like the LXD server to be available over the network?"
We manually finish the setup by running the command below after creating and starting the jail. Choose the `dir` storage backend during `lxd init` and answer `yes` to "Would you like the LXD server to be available over the network?"
```bash
jlmkr exec mylxdjail bash -c 'lxd init &&

View File

@ -3,11 +3,12 @@ startup=0
gpu_passthrough_intel=0
gpu_passthrough_nvidia=0
# Turning off seccomp filtering improves performance at the expense of security
seccomp=1
# TODO: don't disable seccomp but specify which syscalls should be allowed
seccomp=0
# Use bridge networking to provide an isolated network namespace,
# so lxd can manage firewall rules
# Alternatively use --network-bridge=br1 instead of --network-macvlan
# Alternatively use --network-macvlan=eno1 instead of --network-bridge
# Be sure to change eno1/br1 to the interface name you want to use
# You may want to add additional options here, e.g. bind mounts
# TODO: don't use --capability=all but specify only the required capabilities
@ -34,12 +35,13 @@ pre_start_hook=#!/usr/bin/bash
distro=ubuntu
release=jammy
# NOTE: this script will run in the host networking namespace and ignores
# all systemd_nspawn_user_args such as bind mounts
initial_setup=#!/usr/bin/bash
set -euo pipefail
# https://discuss.linuxcontainers.org/t/snap-inside-privileged-lxd-container/13691/8
ln -sf /bin/true /usr/local/bin/udevadm
apt-get update
apt-get install -y --no-install-recommends snapd
snap install lxd
# You generally will not need to change the options below
systemd_run_default_args=--property=KillMode=mixed

View File

@ -46,6 +46,10 @@ usermod --del-subuids 0-4294967295 --del-subgids 0-4294967295 rootless
# Set a specific range, so it fits inside the number of available UIDs
usermod --add-subuids 65536-131071 --add-subgids 65536-131071 rootless
# Add the required capabilities to the `newuidmap` and `newgidmap` binaries
setcap cap_setuid+eip /usr/bin/newuidmap
setcap cap_setgid+eip /usr/bin/newgidmap
# Check the assigned range
cat /etc/subuid
# Check the available range
@ -120,4 +124,7 @@ Resources mentioning `add_key keyctl bpf`
Resources mentioning `@keyring`
- https://github.com/systemd/systemd/issues/17606
- https://github.com/systemd/systemd/blob/1c62c4fe0b54fb419b875cb2bae82a261518a745/src/shared/seccomp-util.c#L604
`@keyring` also includes `request_key` but doesn't include `bpf`
`@keyring` also includes `request_key` but doesn't include `bpf`
Resources mentioning `cap_setuid+eip`, `cap_setgid+eip`, `newuidmap` and `newgidmap`
- https://github.com/containers/podman/issues/2788#issuecomment-1016301663
- https://github.com/containers/podman/issues/12637#issuecomment-996524341

View File

@ -6,10 +6,10 @@ seccomp=1
# Use bridge networking to provide an isolated network namespace,
# so podman can manage firewall rules
# Alternatively use --network-bridge=br1 instead of --network-macvlan
# Alternatively use --network-macvlan=eno1 instead of --network-bridge
# Be sure to change eno1/br1 to the interface name you want to use
# You may want to add additional options here, e.g. bind mounts
systemd_nspawn_user_args=--network-macvlan=eno1
systemd_nspawn_user_args=--network-bridge=br1
--resolv-conf=bind-host
--system-call-filter='add_key keyctl bpf'
@ -28,16 +28,9 @@ distro=fedora
release=39
# Install podman inside the jail
# NOTE: this script will run in the host networking namespace and ignores
# all systemd_nspawn_user_args such as bind mounts
initial_setup=#!/usr/bin/bash
set -euo pipefail
dnf -y install podman
# Add the required capabilities to the `newuidmap` and `newgidmap` binaries
# https://github.com/containers/podman/issues/2788#issuecomment-1016301663
# https://github.com/containers/podman/issues/12637#issuecomment-996524341
setcap cap_setuid+eip /usr/bin/newuidmap
setcap cap_setgid+eip /usr/bin/newgidmap
# You generally will not need to change the options below
systemd_run_default_args=--property=KillMode=mixed