tests: nfs: Improve "nfs-root-fs".
[jackhill/guix/guix.git] / gnu / build / linux-container.scm
1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2015 David Thompson <davet@gnu.org>
3 ;;; Copyright © 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
4 ;;;
5 ;;; This file is part of GNU Guix.
6 ;;;
7 ;;; GNU Guix is free software; you can redistribute it and/or modify it
8 ;;; under the terms of the GNU General Public License as published by
9 ;;; the Free Software Foundation; either version 3 of the License, or (at
10 ;;; your option) any later version.
11 ;;;
12 ;;; GNU Guix is distributed in the hope that it will be useful, but
13 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;;; GNU General Public License for more details.
16 ;;;
17 ;;; You should have received a copy of the GNU General Public License
18 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19
20 (define-module (gnu build linux-container)
21 #:use-module (ice-9 format)
22 #:use-module (ice-9 match)
23 #:use-module (ice-9 rdelim)
24 #:use-module (srfi srfi-98)
25 #:use-module (guix build utils)
26 #:use-module (guix build syscalls)
27 #:use-module (gnu system file-systems) ;<file-system>
28 #:use-module ((gnu build file-systems) #:select (mount-file-system))
29 #:export (user-namespace-supported?
30 unprivileged-user-namespace-supported?
31 setgroups-supported?
32 %namespaces
33 run-container
34 call-with-container
35 container-excursion
36 container-excursion*))
37
38 (define (user-namespace-supported?)
39 "Return #t if user namespaces are supported on this system."
40 (file-exists? "/proc/self/ns/user"))
41
42 (define (unprivileged-user-namespace-supported?)
43 "Return #t if user namespaces can be created by unprivileged users."
44 (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone"))
45 (if (file-exists? userns-file)
46 (eqv? #\1 (call-with-input-file userns-file read-char))
47 #t)))
48
49 (define (setgroups-supported?)
50 "Return #t if the setgroups proc file, introduced in Linux-libre 3.19,
51 exists."
52 (file-exists? "/proc/self/setgroups"))
53
54 (define %namespaces
55 '(mnt pid ipc uts user net))
56
57 (define (call-with-clean-exit thunk)
58 "Apply THUNK, but exit with a status code of 1 if it fails."
59 (dynamic-wind
60 (const #t)
61 (lambda ()
62 (thunk)
63
64 ;; XXX: Somehow we sometimes get EBADF from write(2) or close(2) upon
65 ;; exit (coming from fd finalizers) when used by the Shepherd. To work
66 ;; around that, exit forcefully so fd finalizers don't have a chance to
67 ;; run and fail.
68 (primitive-_exit 0))
69 (lambda ()
70 (primitive-_exit 1))))
71
72 (define (purify-environment)
73 "Unset all environment variables."
74 (for-each unsetenv
75 (match (get-environment-variables)
76 (((names . _) ...) names))))
77
78 ;; The container setup procedure closely resembles that of the Docker
79 ;; specification:
80 ;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md
81 (define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?)
82 "Mount the essential file systems and the those in MOUNTS, a list of
83 <file-system> objects, relative to ROOT; then make ROOT the new root directory
84 for the process."
85 (define (scope dir)
86 (string-append root dir))
87
88 (define (touch file-name)
89 (call-with-output-file file-name (const #t)))
90
91 (define (bind-mount src dest)
92 (mount src dest "none" MS_BIND))
93
94 ;; Like mount, but creates the mount point if it doesn't exist.
95 (define* (mount* source target type #:optional (flags 0) options
96 #:key (update-mtab? #f))
97 (mkdir-p target)
98 (mount source target type flags options #:update-mtab? update-mtab?))
99
100 ;; The container's file system is completely ephemeral, sans directories
101 ;; bind-mounted from the host.
102 (mount "none" root "tmpfs")
103
104 ;; A proc mount requires a new pid namespace.
105 (when mount-/proc?
106 (mount* "none" (scope "/proc") "proc"
107 (logior MS_NOEXEC MS_NOSUID MS_NODEV)))
108
109 ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in
110 ;; the current network namespace.
111 (when mount-/sys?
112 (mount* "none" (scope "/sys") "sysfs"
113 (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY)))
114
115 (mount* "none" (scope "/dev") "tmpfs"
116 (logior MS_NOEXEC MS_STRICTATIME)
117 "mode=755")
118
119 ;; Create essential device nodes via bind-mounting them from the
120 ;; host, because a process within a user namespace cannot create
121 ;; device nodes.
122 (for-each (lambda (device)
123 (when (file-exists? device)
124 ;; Create the mount point file.
125 (touch (scope device))
126 (bind-mount device (scope device))))
127 '("/dev/null"
128 "/dev/zero"
129 "/dev/full"
130 "/dev/random"
131 "/dev/urandom"
132 "/dev/tty"
133 "/dev/fuse"))
134
135 ;; Mount a new devpts instance on /dev/pts.
136 (when (file-exists? "/dev/ptmx")
137 (mount* "none" (scope "/dev/pts") "devpts" 0
138 "newinstance,mode=0620")
139 (symlink "/dev/pts/ptmx" (scope "/dev/ptmx")))
140
141 ;; Setup the container's /dev/console by bind mounting the pseudo-terminal
142 ;; associated with standard input when there is one.
143 (let* ((in (current-input-port))
144 (tty (catch 'system-error
145 (lambda ()
146 ;; This call throws if IN does not correspond to a tty.
147 ;; This is more reliable than 'isatty?'.
148 (ttyname in))
149 (const #f)))
150 (console (scope "/dev/console")))
151 (when tty
152 (touch console)
153 (chmod console #o600)
154 (bind-mount tty console)))
155
156 ;; Setup standard input/output/error.
157 (symlink "/proc/self/fd" (scope "/dev/fd"))
158 (symlink "/proc/self/fd/0" (scope "/dev/stdin"))
159 (symlink "/proc/self/fd/1" (scope "/dev/stdout"))
160 (symlink "/proc/self/fd/2" (scope "/dev/stderr"))
161
162 ;; Mount user-specified file systems.
163 (for-each (lambda (file-system)
164 (mount-file-system file-system #:root root))
165 mounts)
166
167 ;; Jail the process inside the container's root file system.
168 (let ((put-old (string-append root "/real-root")))
169 (mkdir put-old)
170 (pivot-root root put-old)
171 (chdir "/")
172 (umount "real-root" MNT_DETACH)
173 (rmdir "real-root")))
174
175 (define* (initialize-user-namespace pid host-uids
176 #:key (guest-uid 0) (guest-gid 0))
177 "Configure the user namespace for PID. HOST-UIDS specifies the number of
178 host user identifiers to map into the user namespace. GUEST-UID and GUEST-GID
179 specify the first UID (respectively GID) that host UIDs (respectively GIDs)
180 map to in the namespace."
181 (define proc-dir
182 (string-append "/proc/" (number->string pid)))
183
184 (define (scope file)
185 (string-append proc-dir file))
186
187 (let ((uid (getuid))
188 (gid (getgid)))
189
190 ;; Only root can write to the gid map without first disabling the
191 ;; setgroups syscall.
192 (unless (and (zero? uid) (zero? gid))
193 (call-with-output-file (scope "/setgroups")
194 (lambda (port)
195 (display "deny" port))))
196
197 ;; Map the user/group that created the container to the root user
198 ;; within the container.
199 (call-with-output-file (scope "/uid_map")
200 (lambda (port)
201 (format port "~d ~d ~d" guest-uid uid host-uids)))
202 (call-with-output-file (scope "/gid_map")
203 (lambda (port)
204 (format port "~d ~d ~d" guest-gid gid host-uids)))))
205
206 (define (namespaces->bit-mask namespaces)
207 "Return the number suitable for the 'flags' argument of 'clone' that
208 corresponds to the symbols in NAMESPACES."
209 ;; Use the same flags as fork(3) in addition to the namespace flags.
210 (apply logior SIGCHLD
211 (map (match-lambda
212 ('mnt CLONE_NEWNS)
213 ('uts CLONE_NEWUTS)
214 ('ipc CLONE_NEWIPC)
215 ('user CLONE_NEWUSER)
216 ('pid CLONE_NEWPID)
217 ('net CLONE_NEWNET))
218 namespaces)))
219
220 (define* (run-container root mounts namespaces host-uids thunk
221 #:key (guest-uid 0) (guest-gid 0))
222 "Run THUNK in a new container process and return its PID. ROOT specifies
223 the root directory for the container. MOUNTS is a list of <file-system>
224 objects that specify file systems to mount inside the container. NAMESPACES
225 is a list of symbols that correspond to the possible Linux namespaces: mnt,
226 ipc, uts, user, and net.
227
228 HOST-UIDS specifies the number of host user identifiers to map into the user
229 namespace. GUEST-UID and GUEST-GID specify the first UID (respectively GID)
230 that host UIDs (respectively GIDs) map to in the namespace."
231 ;; The parent process must initialize the user namespace for the child
232 ;; before it can boot. To negotiate this, a pipe is used such that the
233 ;; child process blocks until the parent writes to it.
234 (match (socketpair PF_UNIX SOCK_STREAM 0)
235 ((child . parent)
236 (let ((flags (namespaces->bit-mask namespaces)))
237 (match (clone flags)
238 (0
239 (call-with-clean-exit
240 (lambda ()
241 (close-port parent)
242 ;; Wait for parent to set things up.
243 (match (read child)
244 ('ready
245 (purify-environment)
246 (when (and (memq 'mnt namespaces)
247 (not (string=? root "/")))
248 (catch #t
249 (lambda ()
250 (mount-file-systems root mounts
251 #:mount-/proc? (memq 'pid namespaces)
252 #:mount-/sys? (memq 'net
253 namespaces)))
254 (lambda args
255 ;; Forward the exception to the parent process.
256 ;; FIXME: SRFI-35 conditions and non-trivial objects
257 ;; cannot be 'read' so they shouldn't be written as is.
258 (write args child)
259 (primitive-exit 3))))
260 ;; TODO: Manage capabilities.
261 (write 'ready child)
262 (close-port child)
263 (thunk))
264 (_ ;parent died or something
265 (primitive-exit 2))))))
266 (pid
267 (close-port child)
268 (when (memq 'user namespaces)
269 (initialize-user-namespace pid host-uids
270 #:guest-uid guest-uid
271 #:guest-gid guest-gid))
272 ;; TODO: Initialize cgroups.
273 (write 'ready parent)
274 (newline parent)
275
276 ;; Check whether the child process' setup phase succeeded.
277 (let ((message (read parent)))
278 (close-port parent)
279 (match message
280 ('ready ;success
281 pid)
282 (((? symbol? key) args ...) ;exception
283 (apply throw key args))
284 (_ ;unexpected termination
285 #f)))))))))
286
287 ;; FIXME: This is copied from (guix utils), which we cannot use because it
288 ;; would pull (guix config) and all.
289 (define (call-with-temporary-directory proc)
290 "Call PROC with a name of a temporary directory; close the directory and
291 delete it when leaving the dynamic extent of this call."
292 (let* ((directory (or (getenv "TMPDIR") "/tmp"))
293 (template (string-append directory "/guix-directory.XXXXXX"))
294 (tmp-dir (mkdtemp! template)))
295 (dynamic-wind
296 (const #t)
297 (lambda ()
298 (proc tmp-dir))
299 (lambda ()
300 (false-if-exception (delete-file-recursively tmp-dir))))))
301
302 (define* (call-with-container mounts thunk #:key (namespaces %namespaces)
303 (host-uids 1) (guest-uid 0) (guest-gid 0)
304 (process-spawned-hook (const #t)))
305 "Run THUNK in a new container process and return its exit status; call
306 PROCESS-SPAWNED-HOOK with the PID of the new process that has been spawned.
307 MOUNTS is a list of <file-system> objects that specify file systems to mount
308 inside the container. NAMESPACES is a list of symbols corresponding to
309 the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By
310 default, all namespaces are used.
311
312 HOST-UIDS is the number of host user identifiers to map into the container's
313 user namespace, if there is one. By default, only a single uid/gid, that of
314 the current user, is mapped into the container. The host user that creates
315 the container is the root user (uid/gid 0) within the container. Only root
316 can map more than a single uid/gid.
317
318 GUEST-UID and GUEST-GID specify the first UID (respectively GID) that host
319 UIDs (respectively GIDs) map to in the namespace.
320
321 Note that if THUNK needs to load any additional Guile modules, the relevant
322 module files must be present in one of the mappings in MOUNTS and the Guile
323 load path must be adjusted as needed."
324 (call-with-temporary-directory
325 (lambda (root)
326 (let ((pid (run-container root mounts namespaces host-uids thunk
327 #:guest-uid guest-uid
328 #:guest-gid guest-gid)))
329 ;; Catch SIGINT and kill the container process.
330 (sigaction SIGINT
331 (lambda (signum)
332 (false-if-exception
333 (kill pid SIGKILL))))
334
335 (process-spawned-hook pid)
336 (match (waitpid pid)
337 ((_ . status) status))))))
338
339 (define (container-excursion pid thunk)
340 "Run THUNK as a child process within the namespaces of process PID and
341 return the exit status."
342 (define (namespace-file pid namespace)
343 (string-append "/proc/" (number->string pid) "/ns/" namespace))
344
345 (match (primitive-fork)
346 (0
347 (call-with-clean-exit
348 (lambda ()
349 (for-each (lambda (ns)
350 (let ((source (namespace-file (getpid) ns))
351 (target (namespace-file pid ns)))
352 ;; Joining the namespace that the process already
353 ;; belongs to would throw an error so avoid that.
354 ;; XXX: This /proc interface leads to TOCTTOU.
355 (unless (string=? (readlink source) (readlink target))
356 (call-with-input-file source
357 (lambda (current-ns-port)
358 (call-with-input-file target
359 (lambda (new-ns-port)
360 (setns (fileno new-ns-port) 0))))))))
361 ;; It's important that the user namespace is joined first,
362 ;; so that the user will have the privileges to join the
363 ;; other namespaces. Furthermore, it's important that the
364 ;; mount namespace is joined last, otherwise the /proc mount
365 ;; point would no longer be accessible.
366 '("user" "ipc" "uts" "net" "pid" "mnt"))
367 (purify-environment)
368 (chdir "/")
369 (thunk))))
370 (pid
371 (match (waitpid pid)
372 ((_ . status)
373 (status:exit-val status))))))
374
375 (define (container-excursion* pid thunk)
376 "Like 'container-excursion', but return the return value of THUNK."
377 (match (pipe)
378 ((in . out)
379 (match (container-excursion pid
380 (lambda ()
381 (close-port in)
382 (write (thunk) out)
383 (close-port out)))
384 (0
385 (close-port out)
386 (let ((result (read in)))
387 (close-port in)
388 result))
389 (_ ;maybe PID died already
390 (close-port out)
391 (close-port in)
392 #f)))))