Merge branch 'master' into core-updates
[jackhill/guix/guix.git] / gnu / build / linux-container.scm
CommitLineData
c1f6a0c2
DT
1;;; GNU Guix --- Functional package management for GNU
2;;; Copyright © 2015 David Thompson <davet@gnu.org>
c90db25f 3;;; Copyright © 2017 Ludovic Courtès <ludo@gnu.org>
c1f6a0c2
DT
4;;;
5;;; This file is part of GNU Guix.
6;;;
7;;; GNU Guix is free software; you can redistribute it and/or modify it
8;;; under the terms of the GNU General Public License as published by
9;;; the Free Software Foundation; either version 3 of the License, or (at
10;;; your option) any later version.
11;;;
12;;; GNU Guix is distributed in the hope that it will be useful, but
13;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15;;; GNU General Public License for more details.
16;;;
17;;; You should have received a copy of the GNU General Public License
18;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19
20(define-module (gnu build linux-container)
21 #:use-module (ice-9 format)
22 #:use-module (ice-9 match)
b7d48312 23 #:use-module (ice-9 rdelim)
c1f6a0c2
DT
24 #:use-module (srfi srfi-98)
25 #:use-module (guix utils)
26 #:use-module (guix build utils)
27 #:use-module (guix build syscalls)
5970e8e2 28 #:use-module (gnu system file-systems) ;<file-system>
c1f6a0c2 29 #:use-module ((gnu build file-systems) #:select (mount-file-system))
b7d48312
DT
30 #:export (user-namespace-supported?
31 unprivileged-user-namespace-supported?
32 setgroups-supported?
33 %namespaces
c1f6a0c2
DT
34 run-container
35 call-with-container
c90db25f
LC
36 container-excursion
37 container-excursion*))
c1f6a0c2 38
b7d48312
DT
39(define (user-namespace-supported?)
40 "Return #t if user namespaces are supported on this system."
41 (file-exists? "/proc/self/ns/user"))
42
43(define (unprivileged-user-namespace-supported?)
44 "Return #t if user namespaces can be created by unprivileged users."
45 (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone"))
46 (if (file-exists? userns-file)
c5184468 47 (eqv? #\1 (call-with-input-file userns-file read-char))
b7d48312
DT
48 #t)))
49
50(define (setgroups-supported?)
51 "Return #t if the setgroups proc file, introduced in Linux-libre 3.19,
52exists."
53 (file-exists? "/proc/self/setgroups"))
54
c1f6a0c2
DT
55(define %namespaces
56 '(mnt pid ipc uts user net))
57
58(define (call-with-clean-exit thunk)
59 "Apply THUNK, but exit with a status code of 1 if it fails."
60 (dynamic-wind
61 (const #t)
a72ccbc2
DT
62 (lambda ()
63 (thunk)
64 (primitive-exit 0))
c1f6a0c2
DT
65 (lambda ()
66 (primitive-exit 1))))
67
68(define (purify-environment)
69 "Unset all environment variables."
70 (for-each unsetenv
71 (match (get-environment-variables)
72 (((names . _) ...) names))))
73
74;; The container setup procedure closely resembles that of the Docker
75;; specification:
76;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md
77(define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?)
5970e8e2
LC
78 "Mount the essential file systems and the those in MOUNTS, a list of
79<file-system> objects, relative to ROOT; then make ROOT the new root directory
80for the process."
c1f6a0c2
DT
81 (define (scope dir)
82 (string-append root dir))
83
4949ada9
DT
84 (define (touch file-name)
85 (call-with-output-file file-name (const #t)))
86
c1f6a0c2
DT
87 (define (bind-mount src dest)
88 (mount src dest "none" MS_BIND))
89
90 ;; Like mount, but creates the mount point if it doesn't exist.
91 (define* (mount* source target type #:optional (flags 0) options
92 #:key (update-mtab? #f))
93 (mkdir-p target)
94 (mount source target type flags options #:update-mtab? update-mtab?))
95
96 ;; The container's file system is completely ephemeral, sans directories
97 ;; bind-mounted from the host.
98 (mount "none" root "tmpfs")
99
100 ;; A proc mount requires a new pid namespace.
101 (when mount-/proc?
102 (mount* "none" (scope "/proc") "proc"
103 (logior MS_NOEXEC MS_NOSUID MS_NODEV)))
104
105 ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in
106 ;; the current network namespace.
107 (when mount-/sys?
108 (mount* "none" (scope "/sys") "sysfs"
109 (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY)))
110
111 (mount* "none" (scope "/dev") "tmpfs"
112 (logior MS_NOEXEC MS_STRICTATIME)
113 "mode=755")
114
115 ;; Create essential device nodes via bind-mounting them from the
116 ;; host, because a process within a user namespace cannot create
117 ;; device nodes.
118 (for-each (lambda (device)
119 (when (file-exists? device)
120 ;; Create the mount point file.
4949ada9 121 (touch (scope device))
c1f6a0c2
DT
122 (bind-mount device (scope device))))
123 '("/dev/null"
124 "/dev/zero"
125 "/dev/full"
126 "/dev/random"
127 "/dev/urandom"
128 "/dev/tty"
129 "/dev/ptmx"
130 "/dev/fuse"))
131
4949ada9 132 ;; Setup the container's /dev/console by bind mounting the pseudo-terminal
168aba29
LC
133 ;; associated with standard input when there is one.
134 (let* ((in (current-input-port))
135 (tty (catch 'system-error
136 (lambda ()
137 ;; This call throws if IN does not correspond to a tty.
138 ;; This is more reliable than 'isatty?'.
139 (ttyname in))
140 (const #f)))
141 (console (scope "/dev/console")))
142 (when tty
4949ada9
DT
143 (touch console)
144 (chmod console #o600)
168aba29 145 (bind-mount tty console)))
4949ada9 146
c1f6a0c2
DT
147 ;; Setup standard input/output/error.
148 (symlink "/proc/self/fd" (scope "/dev/fd"))
149 (symlink "/proc/self/fd/0" (scope "/dev/stdin"))
150 (symlink "/proc/self/fd/1" (scope "/dev/stdout"))
151 (symlink "/proc/self/fd/2" (scope "/dev/stderr"))
152
153 ;; Mount user-specified file systems.
5970e8e2 154 (for-each (lambda (file-system)
1c65cca5 155 (mount-file-system file-system #:root root))
c1f6a0c2
DT
156 mounts)
157
158 ;; Jail the process inside the container's root file system.
159 (let ((put-old (string-append root "/real-root")))
160 (mkdir put-old)
161 (pivot-root root put-old)
162 (chdir "/")
163 (umount "real-root" MNT_DETACH)
164 (rmdir "real-root")))
165
831bc146
DT
166(define (initialize-user-namespace pid host-uids)
167 "Configure the user namespace for PID. HOST-UIDS specifies the number of
168host user identifiers to map into the user namespace."
c1f6a0c2
DT
169 (define proc-dir
170 (string-append "/proc/" (number->string pid)))
171
172 (define (scope file)
173 (string-append proc-dir file))
174
831bc146
DT
175 (let ((uid (getuid))
176 (gid (getgid)))
c1f6a0c2
DT
177
178 ;; Only root can write to the gid map without first disabling the
179 ;; setgroups syscall.
180 (unless (and (zero? uid) (zero? gid))
181 (call-with-output-file (scope "/setgroups")
182 (lambda (port)
183 (display "deny" port))))
184
185 ;; Map the user/group that created the container to the root user
186 ;; within the container.
187 (call-with-output-file (scope "/uid_map")
188 (lambda (port)
831bc146 189 (format port "0 ~d ~d" uid host-uids)))
c1f6a0c2
DT
190 (call-with-output-file (scope "/gid_map")
191 (lambda (port)
831bc146 192 (format port "0 ~d ~d" gid host-uids)))))
c1f6a0c2
DT
193
194(define (namespaces->bit-mask namespaces)
195 "Return the number suitable for the 'flags' argument of 'clone' that
196corresponds to the symbols in NAMESPACES."
ee78d024 197 ;; Use the same flags as fork(3) in addition to the namespace flags.
35b50a75 198 (apply logior SIGCHLD
c1f6a0c2
DT
199 (map (match-lambda
200 ('mnt CLONE_NEWNS)
201 ('uts CLONE_NEWUTS)
202 ('ipc CLONE_NEWIPC)
203 ('user CLONE_NEWUSER)
204 ('pid CLONE_NEWPID)
205 ('net CLONE_NEWNET))
206 namespaces)))
207
831bc146 208(define (run-container root mounts namespaces host-uids thunk)
c1f6a0c2 209 "Run THUNK in a new container process and return its PID. ROOT specifies
5970e8e2
LC
210the root directory for the container. MOUNTS is a list of <file-system>
211objects that specify file systems to mount inside the container. NAMESPACES
c1f6a0c2 212is a list of symbols that correspond to the possible Linux namespaces: mnt,
831bc146
DT
213ipc, uts, user, and net. HOST-UIDS specifies the number of
214host user identifiers to map into the user namespace."
c1f6a0c2
DT
215 ;; The parent process must initialize the user namespace for the child
216 ;; before it can boot. To negotiate this, a pipe is used such that the
217 ;; child process blocks until the parent writes to it.
c06f6db7
LC
218 (match (socketpair PF_UNIX SOCK_STREAM 0)
219 ((child . parent)
c1f6a0c2
DT
220 (let ((flags (namespaces->bit-mask namespaces)))
221 (match (clone flags)
222 (0
223 (call-with-clean-exit
224 (lambda ()
c06f6db7 225 (close-port parent)
c1f6a0c2 226 ;; Wait for parent to set things up.
c06f6db7 227 (match (read child)
4c14d4ea 228 ('ready
4c14d4ea
LC
229 (purify-environment)
230 (when (memq 'mnt namespaces)
c06f6db7
LC
231 (catch #t
232 (lambda ()
233 (mount-file-systems root mounts
234 #:mount-/proc? (memq 'pid namespaces)
235 #:mount-/sys? (memq 'net
236 namespaces)))
237 (lambda args
238 ;; Forward the exception to the parent process.
36c4917c
LC
239 ;; FIXME: SRFI-35 conditions and non-trivial objects
240 ;; cannot be 'read' so they shouldn't be written as is.
c06f6db7
LC
241 (write args child)
242 (primitive-exit 3))))
4c14d4ea 243 ;; TODO: Manage capabilities.
c06f6db7
LC
244 (write 'ready child)
245 (close-port child)
4c14d4ea
LC
246 (thunk))
247 (_ ;parent died or something
248 (primitive-exit 2))))))
c1f6a0c2 249 (pid
c06f6db7 250 (close-port child)
c1f6a0c2 251 (when (memq 'user namespaces)
831bc146 252 (initialize-user-namespace pid host-uids))
c1f6a0c2 253 ;; TODO: Initialize cgroups.
c06f6db7
LC
254 (write 'ready parent)
255 (newline parent)
256
257 ;; Check whether the child process' setup phase succeeded.
258 (let ((message (read parent)))
259 (close-port parent)
260 (match message
261 ('ready ;success
262 pid)
263 (((? symbol? key) args ...) ;exception
264 (apply throw key args))
265 (_ ;unexpected termination
266 #f)))))))))
c1f6a0c2 267
831bc146
DT
268(define* (call-with-container mounts thunk #:key (namespaces %namespaces)
269 (host-uids 1))
c1f6a0c2 270 "Run THUNK in a new container process and return its exit status.
5970e8e2
LC
271MOUNTS is a list of <file-system> objects that specify file systems to mount
272inside the container. NAMESPACES is a list of symbols corresponding to
c1f6a0c2 273the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By
831bc146
DT
274default, all namespaces are used. HOST-UIDS is the number of host user
275identifiers to map into the container's user namespace, if there is one. By
276default, only a single uid/gid, that of the current user, is mapped into the
277container. The host user that creates the container is the root user (uid/gid
2780) within the container. Only root can map more than a single uid/gid.
c1f6a0c2
DT
279
280Note that if THUNK needs to load any additional Guile modules, the relevant
281module files must be present in one of the mappings in MOUNTS and the Guile
282load path must be adjusted as needed."
283 (call-with-temporary-directory
284 (lambda (root)
831bc146 285 (let ((pid (run-container root mounts namespaces host-uids thunk)))
c1f6a0c2
DT
286 ;; Catch SIGINT and kill the container process.
287 (sigaction SIGINT
288 (lambda (signum)
289 (false-if-exception
290 (kill pid SIGKILL))))
291
292 (match (waitpid pid)
293 ((_ . status) status))))))
294
295(define (container-excursion pid thunk)
296 "Run THUNK as a child process within the namespaces of process PID and
297return the exit status."
298 (define (namespace-file pid namespace)
299 (string-append "/proc/" (number->string pid) "/ns/" namespace))
300
301 (match (primitive-fork)
302 (0
303 (call-with-clean-exit
304 (lambda ()
305 (for-each (lambda (ns)
7fee5b53
LC
306 (let ((source (namespace-file (getpid) ns))
307 (target (namespace-file pid ns)))
308 ;; Joining the namespace that the process already
309 ;; belongs to would throw an error so avoid that.
310 ;; XXX: This /proc interface leads to TOCTTOU.
311 (unless (string=? (readlink source) (readlink target))
312 (call-with-input-file source
313 (lambda (current-ns-port)
314 (call-with-input-file target
315 (lambda (new-ns-port)
316 (setns (fileno new-ns-port) 0))))))))
c1f6a0c2
DT
317 ;; It's important that the user namespace is joined first,
318 ;; so that the user will have the privileges to join the
319 ;; other namespaces. Furthermore, it's important that the
320 ;; mount namespace is joined last, otherwise the /proc mount
321 ;; point would no longer be accessible.
322 '("user" "ipc" "uts" "net" "pid" "mnt"))
323 (purify-environment)
324 (chdir "/")
325 (thunk))))
326 (pid
327 (match (waitpid pid)
328 ((_ . status)
329 (status:exit-val status))))))
c90db25f
LC
330
331(define (container-excursion* pid thunk)
332 "Like 'container-excursion', but return the return value of THUNK."
333 (match (pipe)
334 ((in . out)
335 (match (container-excursion pid
336 (lambda ()
337 (close-port in)
338 (write (thunk) out)))
339 (0
340 (close-port out)
341 (let ((result (read in)))
342 (close-port in)
343 result))
344 (_ ;maybe PID died already
345 (close-port out)
346 (close-port in)
347 #f)))))