gnu: python-pkginfo: Update to 1.4.2.
[jackhill/guix/guix.git] / gnu / build / linux-container.scm
1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2015 David Thompson <davet@gnu.org>
3 ;;; Copyright © 2017, 2018 Ludovic Courtès <ludo@gnu.org>
4 ;;;
5 ;;; This file is part of GNU Guix.
6 ;;;
7 ;;; GNU Guix is free software; you can redistribute it and/or modify it
8 ;;; under the terms of the GNU General Public License as published by
9 ;;; the Free Software Foundation; either version 3 of the License, or (at
10 ;;; your option) any later version.
11 ;;;
12 ;;; GNU Guix is distributed in the hope that it will be useful, but
13 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;;; GNU General Public License for more details.
16 ;;;
17 ;;; You should have received a copy of the GNU General Public License
18 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19
20 (define-module (gnu build linux-container)
21 #:use-module (ice-9 format)
22 #:use-module (ice-9 match)
23 #:use-module (ice-9 rdelim)
24 #:use-module (srfi srfi-98)
25 #:use-module (guix utils)
26 #:use-module (guix build utils)
27 #:use-module (guix build syscalls)
28 #:use-module (gnu system file-systems) ;<file-system>
29 #:use-module ((gnu build file-systems) #:select (mount-file-system))
30 #:export (user-namespace-supported?
31 unprivileged-user-namespace-supported?
32 setgroups-supported?
33 %namespaces
34 run-container
35 call-with-container
36 container-excursion
37 container-excursion*))
38
39 (define (user-namespace-supported?)
40 "Return #t if user namespaces are supported on this system."
41 (file-exists? "/proc/self/ns/user"))
42
43 (define (unprivileged-user-namespace-supported?)
44 "Return #t if user namespaces can be created by unprivileged users."
45 (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone"))
46 (if (file-exists? userns-file)
47 (eqv? #\1 (call-with-input-file userns-file read-char))
48 #t)))
49
50 (define (setgroups-supported?)
51 "Return #t if the setgroups proc file, introduced in Linux-libre 3.19,
52 exists."
53 (file-exists? "/proc/self/setgroups"))
54
55 (define %namespaces
56 '(mnt pid ipc uts user net))
57
58 (define (call-with-clean-exit thunk)
59 "Apply THUNK, but exit with a status code of 1 if it fails."
60 (dynamic-wind
61 (const #t)
62 (lambda ()
63 (thunk)
64
65 ;; XXX: Somehow we sometimes get EBADF from write(2) or close(2) upon
66 ;; exit (coming from fd finalizers) when used by the Shepherd. To work
67 ;; around that, exit forcefully so fd finalizers don't have a chance to
68 ;; run and fail.
69 (primitive-_exit 0))
70 (lambda ()
71 (primitive-_exit 1))))
72
73 (define (purify-environment)
74 "Unset all environment variables."
75 (for-each unsetenv
76 (match (get-environment-variables)
77 (((names . _) ...) names))))
78
79 ;; The container setup procedure closely resembles that of the Docker
80 ;; specification:
81 ;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md
82 (define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?)
83 "Mount the essential file systems and the those in MOUNTS, a list of
84 <file-system> objects, relative to ROOT; then make ROOT the new root directory
85 for the process."
86 (define (scope dir)
87 (string-append root dir))
88
89 (define (touch file-name)
90 (call-with-output-file file-name (const #t)))
91
92 (define (bind-mount src dest)
93 (mount src dest "none" MS_BIND))
94
95 ;; Like mount, but creates the mount point if it doesn't exist.
96 (define* (mount* source target type #:optional (flags 0) options
97 #:key (update-mtab? #f))
98 (mkdir-p target)
99 (mount source target type flags options #:update-mtab? update-mtab?))
100
101 ;; The container's file system is completely ephemeral, sans directories
102 ;; bind-mounted from the host.
103 (mount "none" root "tmpfs")
104
105 ;; A proc mount requires a new pid namespace.
106 (when mount-/proc?
107 (mount* "none" (scope "/proc") "proc"
108 (logior MS_NOEXEC MS_NOSUID MS_NODEV)))
109
110 ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in
111 ;; the current network namespace.
112 (when mount-/sys?
113 (mount* "none" (scope "/sys") "sysfs"
114 (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY)))
115
116 (mount* "none" (scope "/dev") "tmpfs"
117 (logior MS_NOEXEC MS_STRICTATIME)
118 "mode=755")
119
120 ;; Create essential device nodes via bind-mounting them from the
121 ;; host, because a process within a user namespace cannot create
122 ;; device nodes.
123 (for-each (lambda (device)
124 (when (file-exists? device)
125 ;; Create the mount point file.
126 (touch (scope device))
127 (bind-mount device (scope device))))
128 '("/dev/null"
129 "/dev/zero"
130 "/dev/full"
131 "/dev/random"
132 "/dev/urandom"
133 "/dev/tty"
134 "/dev/ptmx"
135 "/dev/fuse"))
136
137 ;; Setup the container's /dev/console by bind mounting the pseudo-terminal
138 ;; associated with standard input when there is one.
139 (let* ((in (current-input-port))
140 (tty (catch 'system-error
141 (lambda ()
142 ;; This call throws if IN does not correspond to a tty.
143 ;; This is more reliable than 'isatty?'.
144 (ttyname in))
145 (const #f)))
146 (console (scope "/dev/console")))
147 (when tty
148 (touch console)
149 (chmod console #o600)
150 (bind-mount tty console)))
151
152 ;; Setup standard input/output/error.
153 (symlink "/proc/self/fd" (scope "/dev/fd"))
154 (symlink "/proc/self/fd/0" (scope "/dev/stdin"))
155 (symlink "/proc/self/fd/1" (scope "/dev/stdout"))
156 (symlink "/proc/self/fd/2" (scope "/dev/stderr"))
157
158 ;; Mount user-specified file systems.
159 (for-each (lambda (file-system)
160 (mount-file-system file-system #:root root))
161 mounts)
162
163 ;; Jail the process inside the container's root file system.
164 (let ((put-old (string-append root "/real-root")))
165 (mkdir put-old)
166 (pivot-root root put-old)
167 (chdir "/")
168 (umount "real-root" MNT_DETACH)
169 (rmdir "real-root")))
170
171 (define (initialize-user-namespace pid host-uids)
172 "Configure the user namespace for PID. HOST-UIDS specifies the number of
173 host user identifiers to map into the user namespace."
174 (define proc-dir
175 (string-append "/proc/" (number->string pid)))
176
177 (define (scope file)
178 (string-append proc-dir file))
179
180 (let ((uid (getuid))
181 (gid (getgid)))
182
183 ;; Only root can write to the gid map without first disabling the
184 ;; setgroups syscall.
185 (unless (and (zero? uid) (zero? gid))
186 (call-with-output-file (scope "/setgroups")
187 (lambda (port)
188 (display "deny" port))))
189
190 ;; Map the user/group that created the container to the root user
191 ;; within the container.
192 (call-with-output-file (scope "/uid_map")
193 (lambda (port)
194 (format port "0 ~d ~d" uid host-uids)))
195 (call-with-output-file (scope "/gid_map")
196 (lambda (port)
197 (format port "0 ~d ~d" gid host-uids)))))
198
199 (define (namespaces->bit-mask namespaces)
200 "Return the number suitable for the 'flags' argument of 'clone' that
201 corresponds to the symbols in NAMESPACES."
202 ;; Use the same flags as fork(3) in addition to the namespace flags.
203 (apply logior SIGCHLD
204 (map (match-lambda
205 ('mnt CLONE_NEWNS)
206 ('uts CLONE_NEWUTS)
207 ('ipc CLONE_NEWIPC)
208 ('user CLONE_NEWUSER)
209 ('pid CLONE_NEWPID)
210 ('net CLONE_NEWNET))
211 namespaces)))
212
213 (define (run-container root mounts namespaces host-uids thunk)
214 "Run THUNK in a new container process and return its PID. ROOT specifies
215 the root directory for the container. MOUNTS is a list of <file-system>
216 objects that specify file systems to mount inside the container. NAMESPACES
217 is a list of symbols that correspond to the possible Linux namespaces: mnt,
218 ipc, uts, user, and net. HOST-UIDS specifies the number of
219 host user identifiers to map into the user namespace."
220 ;; The parent process must initialize the user namespace for the child
221 ;; before it can boot. To negotiate this, a pipe is used such that the
222 ;; child process blocks until the parent writes to it.
223 (match (socketpair PF_UNIX SOCK_STREAM 0)
224 ((child . parent)
225 (let ((flags (namespaces->bit-mask namespaces)))
226 (match (clone flags)
227 (0
228 (call-with-clean-exit
229 (lambda ()
230 (close-port parent)
231 ;; Wait for parent to set things up.
232 (match (read child)
233 ('ready
234 (purify-environment)
235 (when (memq 'mnt namespaces)
236 (catch #t
237 (lambda ()
238 (mount-file-systems root mounts
239 #:mount-/proc? (memq 'pid namespaces)
240 #:mount-/sys? (memq 'net
241 namespaces)))
242 (lambda args
243 ;; Forward the exception to the parent process.
244 ;; FIXME: SRFI-35 conditions and non-trivial objects
245 ;; cannot be 'read' so they shouldn't be written as is.
246 (write args child)
247 (primitive-exit 3))))
248 ;; TODO: Manage capabilities.
249 (write 'ready child)
250 (close-port child)
251 (thunk))
252 (_ ;parent died or something
253 (primitive-exit 2))))))
254 (pid
255 (close-port child)
256 (when (memq 'user namespaces)
257 (initialize-user-namespace pid host-uids))
258 ;; TODO: Initialize cgroups.
259 (write 'ready parent)
260 (newline parent)
261
262 ;; Check whether the child process' setup phase succeeded.
263 (let ((message (read parent)))
264 (close-port parent)
265 (match message
266 ('ready ;success
267 pid)
268 (((? symbol? key) args ...) ;exception
269 (apply throw key args))
270 (_ ;unexpected termination
271 #f)))))))))
272
273 (define* (call-with-container mounts thunk #:key (namespaces %namespaces)
274 (host-uids 1))
275 "Run THUNK in a new container process and return its exit status.
276 MOUNTS is a list of <file-system> objects that specify file systems to mount
277 inside the container. NAMESPACES is a list of symbols corresponding to
278 the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By
279 default, all namespaces are used. HOST-UIDS is the number of host user
280 identifiers to map into the container's user namespace, if there is one. By
281 default, only a single uid/gid, that of the current user, is mapped into the
282 container. The host user that creates the container is the root user (uid/gid
283 0) within the container. Only root can map more than a single uid/gid.
284
285 Note that if THUNK needs to load any additional Guile modules, the relevant
286 module files must be present in one of the mappings in MOUNTS and the Guile
287 load path must be adjusted as needed."
288 (call-with-temporary-directory
289 (lambda (root)
290 (let ((pid (run-container root mounts namespaces host-uids thunk)))
291 ;; Catch SIGINT and kill the container process.
292 (sigaction SIGINT
293 (lambda (signum)
294 (false-if-exception
295 (kill pid SIGKILL))))
296
297 (match (waitpid pid)
298 ((_ . status) status))))))
299
300 (define (container-excursion pid thunk)
301 "Run THUNK as a child process within the namespaces of process PID and
302 return the exit status."
303 (define (namespace-file pid namespace)
304 (string-append "/proc/" (number->string pid) "/ns/" namespace))
305
306 (match (primitive-fork)
307 (0
308 (call-with-clean-exit
309 (lambda ()
310 (for-each (lambda (ns)
311 (let ((source (namespace-file (getpid) ns))
312 (target (namespace-file pid ns)))
313 ;; Joining the namespace that the process already
314 ;; belongs to would throw an error so avoid that.
315 ;; XXX: This /proc interface leads to TOCTTOU.
316 (unless (string=? (readlink source) (readlink target))
317 (call-with-input-file source
318 (lambda (current-ns-port)
319 (call-with-input-file target
320 (lambda (new-ns-port)
321 (setns (fileno new-ns-port) 0))))))))
322 ;; It's important that the user namespace is joined first,
323 ;; so that the user will have the privileges to join the
324 ;; other namespaces. Furthermore, it's important that the
325 ;; mount namespace is joined last, otherwise the /proc mount
326 ;; point would no longer be accessible.
327 '("user" "ipc" "uts" "net" "pid" "mnt"))
328 (purify-environment)
329 (chdir "/")
330 (thunk))))
331 (pid
332 (match (waitpid pid)
333 ((_ . status)
334 (status:exit-val status))))))
335
336 (define (container-excursion* pid thunk)
337 "Like 'container-excursion', but return the return value of THUNK."
338 (match (pipe)
339 ((in . out)
340 (match (container-excursion pid
341 (lambda ()
342 (close-port in)
343 (write (thunk) out)
344 (close-port out)))
345 (0
346 (close-port out)
347 (let ((result (read in)))
348 (close-port in)
349 result))
350 (_ ;maybe PID died already
351 (close-port out)
352 (close-port in)
353 #f)))))