file-systems: 'mount-file-system' preserves source flags for bind mounts.
[jackhill/guix/guix.git] / gnu / build / linux-container.scm
CommitLineData
c1f6a0c2
DT
1;;; GNU Guix --- Functional package management for GNU
2;;; Copyright © 2015 David Thompson <davet@gnu.org>
af76c020 3;;; Copyright © 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
c1f6a0c2
DT
4;;;
5;;; This file is part of GNU Guix.
6;;;
7;;; GNU Guix is free software; you can redistribute it and/or modify it
8;;; under the terms of the GNU General Public License as published by
9;;; the Free Software Foundation; either version 3 of the License, or (at
10;;; your option) any later version.
11;;;
12;;; GNU Guix is distributed in the hope that it will be useful, but
13;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15;;; GNU General Public License for more details.
16;;;
17;;; You should have received a copy of the GNU General Public License
18;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19
20(define-module (gnu build linux-container)
21 #:use-module (ice-9 format)
22 #:use-module (ice-9 match)
b7d48312 23 #:use-module (ice-9 rdelim)
c1f6a0c2 24 #:use-module (srfi srfi-98)
c1f6a0c2
DT
25 #:use-module (guix build utils)
26 #:use-module (guix build syscalls)
5970e8e2 27 #:use-module (gnu system file-systems) ;<file-system>
c1f6a0c2 28 #:use-module ((gnu build file-systems) #:select (mount-file-system))
b7d48312
DT
29 #:export (user-namespace-supported?
30 unprivileged-user-namespace-supported?
31 setgroups-supported?
32 %namespaces
c1f6a0c2
DT
33 run-container
34 call-with-container
c90db25f
LC
35 container-excursion
36 container-excursion*))
c1f6a0c2 37
b7d48312
DT
38(define (user-namespace-supported?)
39 "Return #t if user namespaces are supported on this system."
40 (file-exists? "/proc/self/ns/user"))
41
42(define (unprivileged-user-namespace-supported?)
43 "Return #t if user namespaces can be created by unprivileged users."
44 (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone"))
45 (if (file-exists? userns-file)
c5184468 46 (eqv? #\1 (call-with-input-file userns-file read-char))
329fa5bd 47 #t)))
b7d48312
DT
48
49(define (setgroups-supported?)
50 "Return #t if the setgroups proc file, introduced in Linux-libre 3.19,
51exists."
52 (file-exists? "/proc/self/setgroups"))
53
c1f6a0c2
DT
54(define %namespaces
55 '(mnt pid ipc uts user net))
56
57(define (call-with-clean-exit thunk)
58 "Apply THUNK, but exit with a status code of 1 if it fails."
59 (dynamic-wind
60 (const #t)
a72ccbc2
DT
61 (lambda ()
62 (thunk)
95aa64bc
LC
63
64 ;; XXX: Somehow we sometimes get EBADF from write(2) or close(2) upon
65 ;; exit (coming from fd finalizers) when used by the Shepherd. To work
66 ;; around that, exit forcefully so fd finalizers don't have a chance to
67 ;; run and fail.
68 (primitive-_exit 0))
c1f6a0c2 69 (lambda ()
95aa64bc 70 (primitive-_exit 1))))
c1f6a0c2
DT
71
72(define (purify-environment)
73 "Unset all environment variables."
74 (for-each unsetenv
75 (match (get-environment-variables)
76 (((names . _) ...) names))))
77
78;; The container setup procedure closely resembles that of the Docker
79;; specification:
80;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md
81(define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?)
5970e8e2
LC
82 "Mount the essential file systems and the those in MOUNTS, a list of
83<file-system> objects, relative to ROOT; then make ROOT the new root directory
84for the process."
c1f6a0c2
DT
85 (define (scope dir)
86 (string-append root dir))
87
4949ada9
DT
88 (define (touch file-name)
89 (call-with-output-file file-name (const #t)))
90
c1f6a0c2
DT
91 (define (bind-mount src dest)
92 (mount src dest "none" MS_BIND))
93
94 ;; Like mount, but creates the mount point if it doesn't exist.
95 (define* (mount* source target type #:optional (flags 0) options
96 #:key (update-mtab? #f))
97 (mkdir-p target)
98 (mount source target type flags options #:update-mtab? update-mtab?))
99
100 ;; The container's file system is completely ephemeral, sans directories
101 ;; bind-mounted from the host.
102 (mount "none" root "tmpfs")
103
104 ;; A proc mount requires a new pid namespace.
105 (when mount-/proc?
106 (mount* "none" (scope "/proc") "proc"
107 (logior MS_NOEXEC MS_NOSUID MS_NODEV)))
108
109 ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in
110 ;; the current network namespace.
111 (when mount-/sys?
112 (mount* "none" (scope "/sys") "sysfs"
113 (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY)))
114
115 (mount* "none" (scope "/dev") "tmpfs"
116 (logior MS_NOEXEC MS_STRICTATIME)
117 "mode=755")
118
119 ;; Create essential device nodes via bind-mounting them from the
120 ;; host, because a process within a user namespace cannot create
121 ;; device nodes.
122 (for-each (lambda (device)
123 (when (file-exists? device)
124 ;; Create the mount point file.
4949ada9 125 (touch (scope device))
c1f6a0c2
DT
126 (bind-mount device (scope device))))
127 '("/dev/null"
128 "/dev/zero"
129 "/dev/full"
130 "/dev/random"
131 "/dev/urandom"
132 "/dev/tty"
c1f6a0c2
DT
133 "/dev/fuse"))
134
935e79af
LC
135 ;; Mount a new devpts instance on /dev/pts.
136 (when (file-exists? "/dev/ptmx")
137 (mount* "none" (scope "/dev/pts") "devpts" 0
138 "newinstance,mode=0620")
139 (symlink "/dev/pts/ptmx" (scope "/dev/ptmx")))
140
4949ada9 141 ;; Setup the container's /dev/console by bind mounting the pseudo-terminal
168aba29
LC
142 ;; associated with standard input when there is one.
143 (let* ((in (current-input-port))
144 (tty (catch 'system-error
145 (lambda ()
146 ;; This call throws if IN does not correspond to a tty.
147 ;; This is more reliable than 'isatty?'.
148 (ttyname in))
149 (const #f)))
150 (console (scope "/dev/console")))
151 (when tty
4949ada9
DT
152 (touch console)
153 (chmod console #o600)
168aba29 154 (bind-mount tty console)))
4949ada9 155
c1f6a0c2
DT
156 ;; Setup standard input/output/error.
157 (symlink "/proc/self/fd" (scope "/dev/fd"))
158 (symlink "/proc/self/fd/0" (scope "/dev/stdin"))
159 (symlink "/proc/self/fd/1" (scope "/dev/stdout"))
160 (symlink "/proc/self/fd/2" (scope "/dev/stderr"))
161
162 ;; Mount user-specified file systems.
5970e8e2 163 (for-each (lambda (file-system)
1c65cca5 164 (mount-file-system file-system #:root root))
c1f6a0c2
DT
165 mounts)
166
167 ;; Jail the process inside the container's root file system.
168 (let ((put-old (string-append root "/real-root")))
169 (mkdir put-old)
170 (pivot-root root put-old)
171 (chdir "/")
172 (umount "real-root" MNT_DETACH)
e7481835
JL
173 (rmdir "real-root")
174 (chmod "/" #o755)))
c1f6a0c2 175
af76c020
LC
176(define* (initialize-user-namespace pid host-uids
177 #:key (guest-uid 0) (guest-gid 0))
831bc146 178 "Configure the user namespace for PID. HOST-UIDS specifies the number of
af76c020
LC
179host user identifiers to map into the user namespace. GUEST-UID and GUEST-GID
180specify the first UID (respectively GID) that host UIDs (respectively GIDs)
181map to in the namespace."
c1f6a0c2
DT
182 (define proc-dir
183 (string-append "/proc/" (number->string pid)))
184
185 (define (scope file)
186 (string-append proc-dir file))
187
831bc146
DT
188 (let ((uid (getuid))
189 (gid (getgid)))
c1f6a0c2
DT
190
191 ;; Only root can write to the gid map without first disabling the
192 ;; setgroups syscall.
193 (unless (and (zero? uid) (zero? gid))
194 (call-with-output-file (scope "/setgroups")
195 (lambda (port)
196 (display "deny" port))))
197
198 ;; Map the user/group that created the container to the root user
199 ;; within the container.
200 (call-with-output-file (scope "/uid_map")
201 (lambda (port)
af76c020 202 (format port "~d ~d ~d" guest-uid uid host-uids)))
c1f6a0c2
DT
203 (call-with-output-file (scope "/gid_map")
204 (lambda (port)
af76c020 205 (format port "~d ~d ~d" guest-gid gid host-uids)))))
c1f6a0c2
DT
206
207(define (namespaces->bit-mask namespaces)
208 "Return the number suitable for the 'flags' argument of 'clone' that
209corresponds to the symbols in NAMESPACES."
ee78d024 210 ;; Use the same flags as fork(3) in addition to the namespace flags.
35b50a75 211 (apply logior SIGCHLD
c1f6a0c2
DT
212 (map (match-lambda
213 ('mnt CLONE_NEWNS)
214 ('uts CLONE_NEWUTS)
215 ('ipc CLONE_NEWIPC)
216 ('user CLONE_NEWUSER)
217 ('pid CLONE_NEWPID)
218 ('net CLONE_NEWNET))
219 namespaces)))
220
af76c020
LC
221(define* (run-container root mounts namespaces host-uids thunk
222 #:key (guest-uid 0) (guest-gid 0))
c1f6a0c2 223 "Run THUNK in a new container process and return its PID. ROOT specifies
5970e8e2
LC
224the root directory for the container. MOUNTS is a list of <file-system>
225objects that specify file systems to mount inside the container. NAMESPACES
c1f6a0c2 226is a list of symbols that correspond to the possible Linux namespaces: mnt,
af76c020
LC
227ipc, uts, user, and net.
228
229HOST-UIDS specifies the number of host user identifiers to map into the user
230namespace. GUEST-UID and GUEST-GID specify the first UID (respectively GID)
231that host UIDs (respectively GIDs) map to in the namespace."
c1f6a0c2
DT
232 ;; The parent process must initialize the user namespace for the child
233 ;; before it can boot. To negotiate this, a pipe is used such that the
234 ;; child process blocks until the parent writes to it.
c06f6db7
LC
235 (match (socketpair PF_UNIX SOCK_STREAM 0)
236 ((child . parent)
c1f6a0c2
DT
237 (let ((flags (namespaces->bit-mask namespaces)))
238 (match (clone flags)
239 (0
240 (call-with-clean-exit
241 (lambda ()
c06f6db7 242 (close-port parent)
c1f6a0c2 243 ;; Wait for parent to set things up.
c06f6db7 244 (match (read child)
4c14d4ea 245 ('ready
4c14d4ea 246 (purify-environment)
b3a83f1e
MO
247 (when (and (memq 'mnt namespaces)
248 (not (string=? root "/")))
c06f6db7
LC
249 (catch #t
250 (lambda ()
251 (mount-file-systems root mounts
252 #:mount-/proc? (memq 'pid namespaces)
253 #:mount-/sys? (memq 'net
254 namespaces)))
255 (lambda args
256 ;; Forward the exception to the parent process.
36c4917c
LC
257 ;; FIXME: SRFI-35 conditions and non-trivial objects
258 ;; cannot be 'read' so they shouldn't be written as is.
c06f6db7
LC
259 (write args child)
260 (primitive-exit 3))))
4c14d4ea 261 ;; TODO: Manage capabilities.
c06f6db7
LC
262 (write 'ready child)
263 (close-port child)
4c14d4ea
LC
264 (thunk))
265 (_ ;parent died or something
266 (primitive-exit 2))))))
c1f6a0c2 267 (pid
c06f6db7 268 (close-port child)
c1f6a0c2 269 (when (memq 'user namespaces)
af76c020
LC
270 (initialize-user-namespace pid host-uids
271 #:guest-uid guest-uid
272 #:guest-gid guest-gid))
c1f6a0c2 273 ;; TODO: Initialize cgroups.
c06f6db7
LC
274 (write 'ready parent)
275 (newline parent)
276
277 ;; Check whether the child process' setup phase succeeded.
278 (let ((message (read parent)))
279 (close-port parent)
280 (match message
281 ('ready ;success
282 pid)
283 (((? symbol? key) args ...) ;exception
284 (apply throw key args))
285 (_ ;unexpected termination
286 #f)))))))))
c1f6a0c2 287
40c369b2
LC
288;; FIXME: This is copied from (guix utils), which we cannot use because it
289;; would pull (guix config) and all.
290(define (call-with-temporary-directory proc)
291 "Call PROC with a name of a temporary directory; close the directory and
292delete it when leaving the dynamic extent of this call."
293 (let* ((directory (or (getenv "TMPDIR") "/tmp"))
294 (template (string-append directory "/guix-directory.XXXXXX"))
295 (tmp-dir (mkdtemp! template)))
296 (dynamic-wind
297 (const #t)
298 (lambda ()
299 (proc tmp-dir))
300 (lambda ()
301 (false-if-exception (delete-file-recursively tmp-dir))))))
302
831bc146 303(define* (call-with-container mounts thunk #:key (namespaces %namespaces)
d236cd16
LC
304 (host-uids 1) (guest-uid 0) (guest-gid 0)
305 (process-spawned-hook (const #t)))
306 "Run THUNK in a new container process and return its exit status; call
307PROCESS-SPAWNED-HOOK with the PID of the new process that has been spawned.
5970e8e2
LC
308MOUNTS is a list of <file-system> objects that specify file systems to mount
309inside the container. NAMESPACES is a list of symbols corresponding to
c1f6a0c2 310the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By
af76c020
LC
311default, all namespaces are used.
312
313HOST-UIDS is the number of host user identifiers to map into the container's
314user namespace, if there is one. By default, only a single uid/gid, that of
315the current user, is mapped into the container. The host user that creates
316the container is the root user (uid/gid 0) within the container. Only root
317can map more than a single uid/gid.
318
319GUEST-UID and GUEST-GID specify the first UID (respectively GID) that host
320UIDs (respectively GIDs) map to in the namespace.
c1f6a0c2
DT
321
322Note that if THUNK needs to load any additional Guile modules, the relevant
323module files must be present in one of the mappings in MOUNTS and the Guile
324load path must be adjusted as needed."
325 (call-with-temporary-directory
326 (lambda (root)
af76c020
LC
327 (let ((pid (run-container root mounts namespaces host-uids thunk
328 #:guest-uid guest-uid
329 #:guest-gid guest-gid)))
c1f6a0c2
DT
330 ;; Catch SIGINT and kill the container process.
331 (sigaction SIGINT
332 (lambda (signum)
333 (false-if-exception
334 (kill pid SIGKILL))))
335
d236cd16 336 (process-spawned-hook pid)
c1f6a0c2
DT
337 (match (waitpid pid)
338 ((_ . status) status))))))
339
340(define (container-excursion pid thunk)
341 "Run THUNK as a child process within the namespaces of process PID and
342return the exit status."
343 (define (namespace-file pid namespace)
344 (string-append "/proc/" (number->string pid) "/ns/" namespace))
345
346 (match (primitive-fork)
347 (0
348 (call-with-clean-exit
349 (lambda ()
350 (for-each (lambda (ns)
7fee5b53
LC
351 (let ((source (namespace-file (getpid) ns))
352 (target (namespace-file pid ns)))
353 ;; Joining the namespace that the process already
354 ;; belongs to would throw an error so avoid that.
355 ;; XXX: This /proc interface leads to TOCTTOU.
356 (unless (string=? (readlink source) (readlink target))
357 (call-with-input-file source
358 (lambda (current-ns-port)
359 (call-with-input-file target
360 (lambda (new-ns-port)
361 (setns (fileno new-ns-port) 0))))))))
c1f6a0c2
DT
362 ;; It's important that the user namespace is joined first,
363 ;; so that the user will have the privileges to join the
364 ;; other namespaces. Furthermore, it's important that the
365 ;; mount namespace is joined last, otherwise the /proc mount
366 ;; point would no longer be accessible.
367 '("user" "ipc" "uts" "net" "pid" "mnt"))
368 (purify-environment)
369 (chdir "/")
370 (thunk))))
371 (pid
372 (match (waitpid pid)
373 ((_ . status)
374 (status:exit-val status))))))
c90db25f
LC
375
376(define (container-excursion* pid thunk)
377 "Like 'container-excursion', but return the return value of THUNK."
378 (match (pipe)
379 ((in . out)
380 (match (container-excursion pid
381 (lambda ()
382 (close-port in)
95aa64bc
LC
383 (write (thunk) out)
384 (close-port out)))
c90db25f
LC
385 (0
386 (close-port out)
387 (let ((result (read in)))
388 (close-port in)
389 result))
390 (_ ;maybe PID died already
391 (close-port out)
392 (close-port in)
393 #f)))))