Commit | Line | Data |
---|---|---|
c1f6a0c2 DT |
1 | ;;; GNU Guix --- Functional package management for GNU |
2 | ;;; Copyright © 2015 David Thompson <davet@gnu.org> | |
af76c020 | 3 | ;;; Copyright © 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org> |
c1f6a0c2 DT |
4 | ;;; |
5 | ;;; This file is part of GNU Guix. | |
6 | ;;; | |
7 | ;;; GNU Guix is free software; you can redistribute it and/or modify it | |
8 | ;;; under the terms of the GNU General Public License as published by | |
9 | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
10 | ;;; your option) any later version. | |
11 | ;;; | |
12 | ;;; GNU Guix is distributed in the hope that it will be useful, but | |
13 | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | ;;; GNU General Public License for more details. | |
16 | ;;; | |
17 | ;;; You should have received a copy of the GNU General Public License | |
18 | ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. | |
19 | ||
20 | (define-module (gnu build linux-container) | |
21 | #:use-module (ice-9 format) | |
22 | #:use-module (ice-9 match) | |
b7d48312 | 23 | #:use-module (ice-9 rdelim) |
c1f6a0c2 | 24 | #:use-module (srfi srfi-98) |
c1f6a0c2 DT |
25 | #:use-module (guix build utils) |
26 | #:use-module (guix build syscalls) | |
5970e8e2 | 27 | #:use-module (gnu system file-systems) ;<file-system> |
c1f6a0c2 | 28 | #:use-module ((gnu build file-systems) #:select (mount-file-system)) |
b7d48312 DT |
29 | #:export (user-namespace-supported? |
30 | unprivileged-user-namespace-supported? | |
31 | setgroups-supported? | |
32 | %namespaces | |
c1f6a0c2 DT |
33 | run-container |
34 | call-with-container | |
c90db25f LC |
35 | container-excursion |
36 | container-excursion*)) | |
c1f6a0c2 | 37 | |
b7d48312 DT |
38 | (define (user-namespace-supported?) |
39 | "Return #t if user namespaces are supported on this system." | |
40 | (file-exists? "/proc/self/ns/user")) | |
41 | ||
42 | (define (unprivileged-user-namespace-supported?) | |
43 | "Return #t if user namespaces can be created by unprivileged users." | |
44 | (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone")) | |
45 | (if (file-exists? userns-file) | |
c5184468 | 46 | (eqv? #\1 (call-with-input-file userns-file read-char)) |
b7d48312 DT |
47 | #t))) |
48 | ||
49 | (define (setgroups-supported?) | |
50 | "Return #t if the setgroups proc file, introduced in Linux-libre 3.19, | |
51 | exists." | |
52 | (file-exists? "/proc/self/setgroups")) | |
53 | ||
c1f6a0c2 DT |
54 | (define %namespaces |
55 | '(mnt pid ipc uts user net)) | |
56 | ||
57 | (define (call-with-clean-exit thunk) | |
58 | "Apply THUNK, but exit with a status code of 1 if it fails." | |
59 | (dynamic-wind | |
60 | (const #t) | |
a72ccbc2 DT |
61 | (lambda () |
62 | (thunk) | |
95aa64bc LC |
63 | |
64 | ;; XXX: Somehow we sometimes get EBADF from write(2) or close(2) upon | |
65 | ;; exit (coming from fd finalizers) when used by the Shepherd. To work | |
66 | ;; around that, exit forcefully so fd finalizers don't have a chance to | |
67 | ;; run and fail. | |
68 | (primitive-_exit 0)) | |
c1f6a0c2 | 69 | (lambda () |
95aa64bc | 70 | (primitive-_exit 1)))) |
c1f6a0c2 DT |
71 | |
72 | (define (purify-environment) | |
73 | "Unset all environment variables." | |
74 | (for-each unsetenv | |
75 | (match (get-environment-variables) | |
76 | (((names . _) ...) names)))) | |
77 | ||
78 | ;; The container setup procedure closely resembles that of the Docker | |
79 | ;; specification: | |
80 | ;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md | |
81 | (define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?) | |
5970e8e2 LC |
82 | "Mount the essential file systems and the those in MOUNTS, a list of |
83 | <file-system> objects, relative to ROOT; then make ROOT the new root directory | |
84 | for the process." | |
c1f6a0c2 DT |
85 | (define (scope dir) |
86 | (string-append root dir)) | |
87 | ||
4949ada9 DT |
88 | (define (touch file-name) |
89 | (call-with-output-file file-name (const #t))) | |
90 | ||
c1f6a0c2 DT |
91 | (define (bind-mount src dest) |
92 | (mount src dest "none" MS_BIND)) | |
93 | ||
94 | ;; Like mount, but creates the mount point if it doesn't exist. | |
95 | (define* (mount* source target type #:optional (flags 0) options | |
96 | #:key (update-mtab? #f)) | |
97 | (mkdir-p target) | |
98 | (mount source target type flags options #:update-mtab? update-mtab?)) | |
99 | ||
100 | ;; The container's file system is completely ephemeral, sans directories | |
101 | ;; bind-mounted from the host. | |
102 | (mount "none" root "tmpfs") | |
103 | ||
104 | ;; A proc mount requires a new pid namespace. | |
105 | (when mount-/proc? | |
106 | (mount* "none" (scope "/proc") "proc" | |
107 | (logior MS_NOEXEC MS_NOSUID MS_NODEV))) | |
108 | ||
109 | ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in | |
110 | ;; the current network namespace. | |
111 | (when mount-/sys? | |
112 | (mount* "none" (scope "/sys") "sysfs" | |
113 | (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY))) | |
114 | ||
115 | (mount* "none" (scope "/dev") "tmpfs" | |
116 | (logior MS_NOEXEC MS_STRICTATIME) | |
117 | "mode=755") | |
118 | ||
119 | ;; Create essential device nodes via bind-mounting them from the | |
120 | ;; host, because a process within a user namespace cannot create | |
121 | ;; device nodes. | |
122 | (for-each (lambda (device) | |
123 | (when (file-exists? device) | |
124 | ;; Create the mount point file. | |
4949ada9 | 125 | (touch (scope device)) |
c1f6a0c2 DT |
126 | (bind-mount device (scope device)))) |
127 | '("/dev/null" | |
128 | "/dev/zero" | |
129 | "/dev/full" | |
130 | "/dev/random" | |
131 | "/dev/urandom" | |
132 | "/dev/tty" | |
c1f6a0c2 DT |
133 | "/dev/fuse")) |
134 | ||
935e79af LC |
135 | ;; Mount a new devpts instance on /dev/pts. |
136 | (when (file-exists? "/dev/ptmx") | |
137 | (mount* "none" (scope "/dev/pts") "devpts" 0 | |
138 | "newinstance,mode=0620") | |
139 | (symlink "/dev/pts/ptmx" (scope "/dev/ptmx"))) | |
140 | ||
4949ada9 | 141 | ;; Setup the container's /dev/console by bind mounting the pseudo-terminal |
168aba29 LC |
142 | ;; associated with standard input when there is one. |
143 | (let* ((in (current-input-port)) | |
144 | (tty (catch 'system-error | |
145 | (lambda () | |
146 | ;; This call throws if IN does not correspond to a tty. | |
147 | ;; This is more reliable than 'isatty?'. | |
148 | (ttyname in)) | |
149 | (const #f))) | |
150 | (console (scope "/dev/console"))) | |
151 | (when tty | |
4949ada9 DT |
152 | (touch console) |
153 | (chmod console #o600) | |
168aba29 | 154 | (bind-mount tty console))) |
4949ada9 | 155 | |
c1f6a0c2 DT |
156 | ;; Setup standard input/output/error. |
157 | (symlink "/proc/self/fd" (scope "/dev/fd")) | |
158 | (symlink "/proc/self/fd/0" (scope "/dev/stdin")) | |
159 | (symlink "/proc/self/fd/1" (scope "/dev/stdout")) | |
160 | (symlink "/proc/self/fd/2" (scope "/dev/stderr")) | |
161 | ||
162 | ;; Mount user-specified file systems. | |
5970e8e2 | 163 | (for-each (lambda (file-system) |
1c65cca5 | 164 | (mount-file-system file-system #:root root)) |
c1f6a0c2 DT |
165 | mounts) |
166 | ||
167 | ;; Jail the process inside the container's root file system. | |
168 | (let ((put-old (string-append root "/real-root"))) | |
169 | (mkdir put-old) | |
170 | (pivot-root root put-old) | |
171 | (chdir "/") | |
172 | (umount "real-root" MNT_DETACH) | |
173 | (rmdir "real-root"))) | |
174 | ||
af76c020 LC |
175 | (define* (initialize-user-namespace pid host-uids |
176 | #:key (guest-uid 0) (guest-gid 0)) | |
831bc146 | 177 | "Configure the user namespace for PID. HOST-UIDS specifies the number of |
af76c020 LC |
178 | host user identifiers to map into the user namespace. GUEST-UID and GUEST-GID |
179 | specify the first UID (respectively GID) that host UIDs (respectively GIDs) | |
180 | map to in the namespace." | |
c1f6a0c2 DT |
181 | (define proc-dir |
182 | (string-append "/proc/" (number->string pid))) | |
183 | ||
184 | (define (scope file) | |
185 | (string-append proc-dir file)) | |
186 | ||
831bc146 DT |
187 | (let ((uid (getuid)) |
188 | (gid (getgid))) | |
c1f6a0c2 DT |
189 | |
190 | ;; Only root can write to the gid map without first disabling the | |
191 | ;; setgroups syscall. | |
192 | (unless (and (zero? uid) (zero? gid)) | |
193 | (call-with-output-file (scope "/setgroups") | |
194 | (lambda (port) | |
195 | (display "deny" port)))) | |
196 | ||
197 | ;; Map the user/group that created the container to the root user | |
198 | ;; within the container. | |
199 | (call-with-output-file (scope "/uid_map") | |
200 | (lambda (port) | |
af76c020 | 201 | (format port "~d ~d ~d" guest-uid uid host-uids))) |
c1f6a0c2 DT |
202 | (call-with-output-file (scope "/gid_map") |
203 | (lambda (port) | |
af76c020 | 204 | (format port "~d ~d ~d" guest-gid gid host-uids))))) |
c1f6a0c2 DT |
205 | |
206 | (define (namespaces->bit-mask namespaces) | |
207 | "Return the number suitable for the 'flags' argument of 'clone' that | |
208 | corresponds to the symbols in NAMESPACES." | |
ee78d024 | 209 | ;; Use the same flags as fork(3) in addition to the namespace flags. |
35b50a75 | 210 | (apply logior SIGCHLD |
c1f6a0c2 DT |
211 | (map (match-lambda |
212 | ('mnt CLONE_NEWNS) | |
213 | ('uts CLONE_NEWUTS) | |
214 | ('ipc CLONE_NEWIPC) | |
215 | ('user CLONE_NEWUSER) | |
216 | ('pid CLONE_NEWPID) | |
217 | ('net CLONE_NEWNET)) | |
218 | namespaces))) | |
219 | ||
af76c020 LC |
220 | (define* (run-container root mounts namespaces host-uids thunk |
221 | #:key (guest-uid 0) (guest-gid 0)) | |
c1f6a0c2 | 222 | "Run THUNK in a new container process and return its PID. ROOT specifies |
5970e8e2 LC |
223 | the root directory for the container. MOUNTS is a list of <file-system> |
224 | objects that specify file systems to mount inside the container. NAMESPACES | |
c1f6a0c2 | 225 | is a list of symbols that correspond to the possible Linux namespaces: mnt, |
af76c020 LC |
226 | ipc, uts, user, and net. |
227 | ||
228 | HOST-UIDS specifies the number of host user identifiers to map into the user | |
229 | namespace. GUEST-UID and GUEST-GID specify the first UID (respectively GID) | |
230 | that host UIDs (respectively GIDs) map to in the namespace." | |
c1f6a0c2 DT |
231 | ;; The parent process must initialize the user namespace for the child |
232 | ;; before it can boot. To negotiate this, a pipe is used such that the | |
233 | ;; child process blocks until the parent writes to it. | |
c06f6db7 LC |
234 | (match (socketpair PF_UNIX SOCK_STREAM 0) |
235 | ((child . parent) | |
c1f6a0c2 DT |
236 | (let ((flags (namespaces->bit-mask namespaces))) |
237 | (match (clone flags) | |
238 | (0 | |
239 | (call-with-clean-exit | |
240 | (lambda () | |
c06f6db7 | 241 | (close-port parent) |
c1f6a0c2 | 242 | ;; Wait for parent to set things up. |
c06f6db7 | 243 | (match (read child) |
4c14d4ea | 244 | ('ready |
4c14d4ea LC |
245 | (purify-environment) |
246 | (when (memq 'mnt namespaces) | |
c06f6db7 LC |
247 | (catch #t |
248 | (lambda () | |
249 | (mount-file-systems root mounts | |
250 | #:mount-/proc? (memq 'pid namespaces) | |
251 | #:mount-/sys? (memq 'net | |
252 | namespaces))) | |
253 | (lambda args | |
254 | ;; Forward the exception to the parent process. | |
36c4917c LC |
255 | ;; FIXME: SRFI-35 conditions and non-trivial objects |
256 | ;; cannot be 'read' so they shouldn't be written as is. | |
c06f6db7 LC |
257 | (write args child) |
258 | (primitive-exit 3)))) | |
4c14d4ea | 259 | ;; TODO: Manage capabilities. |
c06f6db7 LC |
260 | (write 'ready child) |
261 | (close-port child) | |
4c14d4ea LC |
262 | (thunk)) |
263 | (_ ;parent died or something | |
264 | (primitive-exit 2)))))) | |
c1f6a0c2 | 265 | (pid |
c06f6db7 | 266 | (close-port child) |
c1f6a0c2 | 267 | (when (memq 'user namespaces) |
af76c020 LC |
268 | (initialize-user-namespace pid host-uids |
269 | #:guest-uid guest-uid | |
270 | #:guest-gid guest-gid)) | |
c1f6a0c2 | 271 | ;; TODO: Initialize cgroups. |
c06f6db7 LC |
272 | (write 'ready parent) |
273 | (newline parent) | |
274 | ||
275 | ;; Check whether the child process' setup phase succeeded. | |
276 | (let ((message (read parent))) | |
277 | (close-port parent) | |
278 | (match message | |
279 | ('ready ;success | |
280 | pid) | |
281 | (((? symbol? key) args ...) ;exception | |
282 | (apply throw key args)) | |
283 | (_ ;unexpected termination | |
284 | #f))))))))) | |
c1f6a0c2 | 285 | |
40c369b2 LC |
286 | ;; FIXME: This is copied from (guix utils), which we cannot use because it |
287 | ;; would pull (guix config) and all. | |
288 | (define (call-with-temporary-directory proc) | |
289 | "Call PROC with a name of a temporary directory; close the directory and | |
290 | delete it when leaving the dynamic extent of this call." | |
291 | (let* ((directory (or (getenv "TMPDIR") "/tmp")) | |
292 | (template (string-append directory "/guix-directory.XXXXXX")) | |
293 | (tmp-dir (mkdtemp! template))) | |
294 | (dynamic-wind | |
295 | (const #t) | |
296 | (lambda () | |
297 | (proc tmp-dir)) | |
298 | (lambda () | |
299 | (false-if-exception (delete-file-recursively tmp-dir)))))) | |
300 | ||
831bc146 | 301 | (define* (call-with-container mounts thunk #:key (namespaces %namespaces) |
d236cd16 LC |
302 | (host-uids 1) (guest-uid 0) (guest-gid 0) |
303 | (process-spawned-hook (const #t))) | |
304 | "Run THUNK in a new container process and return its exit status; call | |
305 | PROCESS-SPAWNED-HOOK with the PID of the new process that has been spawned. | |
5970e8e2 LC |
306 | MOUNTS is a list of <file-system> objects that specify file systems to mount |
307 | inside the container. NAMESPACES is a list of symbols corresponding to | |
c1f6a0c2 | 308 | the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By |
af76c020 LC |
309 | default, all namespaces are used. |
310 | ||
311 | HOST-UIDS is the number of host user identifiers to map into the container's | |
312 | user namespace, if there is one. By default, only a single uid/gid, that of | |
313 | the current user, is mapped into the container. The host user that creates | |
314 | the container is the root user (uid/gid 0) within the container. Only root | |
315 | can map more than a single uid/gid. | |
316 | ||
317 | GUEST-UID and GUEST-GID specify the first UID (respectively GID) that host | |
318 | UIDs (respectively GIDs) map to in the namespace. | |
c1f6a0c2 DT |
319 | |
320 | Note that if THUNK needs to load any additional Guile modules, the relevant | |
321 | module files must be present in one of the mappings in MOUNTS and the Guile | |
322 | load path must be adjusted as needed." | |
323 | (call-with-temporary-directory | |
324 | (lambda (root) | |
af76c020 LC |
325 | (let ((pid (run-container root mounts namespaces host-uids thunk |
326 | #:guest-uid guest-uid | |
327 | #:guest-gid guest-gid))) | |
c1f6a0c2 DT |
328 | ;; Catch SIGINT and kill the container process. |
329 | (sigaction SIGINT | |
330 | (lambda (signum) | |
331 | (false-if-exception | |
332 | (kill pid SIGKILL)))) | |
333 | ||
d236cd16 | 334 | (process-spawned-hook pid) |
c1f6a0c2 DT |
335 | (match (waitpid pid) |
336 | ((_ . status) status)))))) | |
337 | ||
338 | (define (container-excursion pid thunk) | |
339 | "Run THUNK as a child process within the namespaces of process PID and | |
340 | return the exit status." | |
341 | (define (namespace-file pid namespace) | |
342 | (string-append "/proc/" (number->string pid) "/ns/" namespace)) | |
343 | ||
344 | (match (primitive-fork) | |
345 | (0 | |
346 | (call-with-clean-exit | |
347 | (lambda () | |
348 | (for-each (lambda (ns) | |
7fee5b53 LC |
349 | (let ((source (namespace-file (getpid) ns)) |
350 | (target (namespace-file pid ns))) | |
351 | ;; Joining the namespace that the process already | |
352 | ;; belongs to would throw an error so avoid that. | |
353 | ;; XXX: This /proc interface leads to TOCTTOU. | |
354 | (unless (string=? (readlink source) (readlink target)) | |
355 | (call-with-input-file source | |
356 | (lambda (current-ns-port) | |
357 | (call-with-input-file target | |
358 | (lambda (new-ns-port) | |
359 | (setns (fileno new-ns-port) 0)))))))) | |
c1f6a0c2 DT |
360 | ;; It's important that the user namespace is joined first, |
361 | ;; so that the user will have the privileges to join the | |
362 | ;; other namespaces. Furthermore, it's important that the | |
363 | ;; mount namespace is joined last, otherwise the /proc mount | |
364 | ;; point would no longer be accessible. | |
365 | '("user" "ipc" "uts" "net" "pid" "mnt")) | |
366 | (purify-environment) | |
367 | (chdir "/") | |
368 | (thunk)))) | |
369 | (pid | |
370 | (match (waitpid pid) | |
371 | ((_ . status) | |
372 | (status:exit-val status)))))) | |
c90db25f LC |
373 | |
374 | (define (container-excursion* pid thunk) | |
375 | "Like 'container-excursion', but return the return value of THUNK." | |
376 | (match (pipe) | |
377 | ((in . out) | |
378 | (match (container-excursion pid | |
379 | (lambda () | |
380 | (close-port in) | |
95aa64bc LC |
381 | (write (thunk) out) |
382 | (close-port out))) | |
c90db25f LC |
383 | (0 |
384 | (close-port out) | |
385 | (let ((result (read in))) | |
386 | (close-port in) | |
387 | result)) | |
388 | (_ ;maybe PID died already | |
389 | (close-port out) | |
390 | (close-port in) | |
391 | #f))))) |