gnu: python-3.4: Update to 3.4.5.
[jackhill/guix/guix.git] / gnu / build / linux-container.scm
1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2015 David Thompson <davet@gnu.org>
3 ;;;
4 ;;; This file is part of GNU Guix.
5 ;;;
6 ;;; GNU Guix is free software; you can redistribute it and/or modify it
7 ;;; under the terms of the GNU General Public License as published by
8 ;;; the Free Software Foundation; either version 3 of the License, or (at
9 ;;; your option) any later version.
10 ;;;
11 ;;; GNU Guix is distributed in the hope that it will be useful, but
12 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;;; GNU General Public License for more details.
15 ;;;
16 ;;; You should have received a copy of the GNU General Public License
17 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
18
19 (define-module (gnu build linux-container)
20 #:use-module (ice-9 format)
21 #:use-module (ice-9 match)
22 #:use-module (ice-9 rdelim)
23 #:use-module (srfi srfi-98)
24 #:use-module (guix utils)
25 #:use-module (guix build utils)
26 #:use-module (guix build syscalls)
27 #:use-module ((gnu build file-systems) #:select (mount-file-system))
28 #:export (user-namespace-supported?
29 unprivileged-user-namespace-supported?
30 setgroups-supported?
31 %namespaces
32 run-container
33 call-with-container
34 container-excursion))
35
36 (define (user-namespace-supported?)
37 "Return #t if user namespaces are supported on this system."
38 (file-exists? "/proc/self/ns/user"))
39
40 (define (unprivileged-user-namespace-supported?)
41 "Return #t if user namespaces can be created by unprivileged users."
42 (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone"))
43 (if (file-exists? userns-file)
44 (eqv? #\1 (call-with-input-file userns-file read-char))
45 #t)))
46
47 (define (setgroups-supported?)
48 "Return #t if the setgroups proc file, introduced in Linux-libre 3.19,
49 exists."
50 (file-exists? "/proc/self/setgroups"))
51
52 (define %namespaces
53 '(mnt pid ipc uts user net))
54
55 (define (call-with-clean-exit thunk)
56 "Apply THUNK, but exit with a status code of 1 if it fails."
57 (dynamic-wind
58 (const #t)
59 (lambda ()
60 (thunk)
61 (primitive-exit 0))
62 (lambda ()
63 (primitive-exit 1))))
64
65 (define (purify-environment)
66 "Unset all environment variables."
67 (for-each unsetenv
68 (match (get-environment-variables)
69 (((names . _) ...) names))))
70
71 ;; The container setup procedure closely resembles that of the Docker
72 ;; specification:
73 ;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md
74 (define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?)
75 "Mount the essential file systems and the those in the MOUNTS list relative
76 to ROOT, then make ROOT the new root directory for the process."
77 (define (scope dir)
78 (string-append root dir))
79
80 (define (touch file-name)
81 (call-with-output-file file-name (const #t)))
82
83 (define (bind-mount src dest)
84 (mount src dest "none" MS_BIND))
85
86 ;; Like mount, but creates the mount point if it doesn't exist.
87 (define* (mount* source target type #:optional (flags 0) options
88 #:key (update-mtab? #f))
89 (mkdir-p target)
90 (mount source target type flags options #:update-mtab? update-mtab?))
91
92 ;; The container's file system is completely ephemeral, sans directories
93 ;; bind-mounted from the host.
94 (mount "none" root "tmpfs")
95
96 ;; A proc mount requires a new pid namespace.
97 (when mount-/proc?
98 (mount* "none" (scope "/proc") "proc"
99 (logior MS_NOEXEC MS_NOSUID MS_NODEV)))
100
101 ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in
102 ;; the current network namespace.
103 (when mount-/sys?
104 (mount* "none" (scope "/sys") "sysfs"
105 (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY)))
106
107 (mount* "none" (scope "/dev") "tmpfs"
108 (logior MS_NOEXEC MS_STRICTATIME)
109 "mode=755")
110
111 ;; Create essential device nodes via bind-mounting them from the
112 ;; host, because a process within a user namespace cannot create
113 ;; device nodes.
114 (for-each (lambda (device)
115 (when (file-exists? device)
116 ;; Create the mount point file.
117 (touch (scope device))
118 (bind-mount device (scope device))))
119 '("/dev/null"
120 "/dev/zero"
121 "/dev/full"
122 "/dev/random"
123 "/dev/urandom"
124 "/dev/tty"
125 "/dev/ptmx"
126 "/dev/fuse"))
127
128 ;; Setup the container's /dev/console by bind mounting the pseudo-terminal
129 ;; associated with standard input.
130 (let ((in (current-input-port))
131 (console (scope "/dev/console")))
132 (when (isatty? in)
133 (touch console)
134 (chmod console #o600)
135 (bind-mount (ttyname in) console)))
136
137 ;; Setup standard input/output/error.
138 (symlink "/proc/self/fd" (scope "/dev/fd"))
139 (symlink "/proc/self/fd/0" (scope "/dev/stdin"))
140 (symlink "/proc/self/fd/1" (scope "/dev/stdout"))
141 (symlink "/proc/self/fd/2" (scope "/dev/stderr"))
142
143 ;; Mount user-specified file systems.
144 (for-each (lambda (spec)
145 (mount-file-system spec #:root root))
146 mounts)
147
148 ;; Jail the process inside the container's root file system.
149 (let ((put-old (string-append root "/real-root")))
150 (mkdir put-old)
151 (pivot-root root put-old)
152 (chdir "/")
153 (umount "real-root" MNT_DETACH)
154 (rmdir "real-root")))
155
156 (define (initialize-user-namespace pid host-uids)
157 "Configure the user namespace for PID. HOST-UIDS specifies the number of
158 host user identifiers to map into the user namespace."
159 (define proc-dir
160 (string-append "/proc/" (number->string pid)))
161
162 (define (scope file)
163 (string-append proc-dir file))
164
165 (let ((uid (getuid))
166 (gid (getgid)))
167
168 ;; Only root can write to the gid map without first disabling the
169 ;; setgroups syscall.
170 (unless (and (zero? uid) (zero? gid))
171 (call-with-output-file (scope "/setgroups")
172 (lambda (port)
173 (display "deny" port))))
174
175 ;; Map the user/group that created the container to the root user
176 ;; within the container.
177 (call-with-output-file (scope "/uid_map")
178 (lambda (port)
179 (format port "0 ~d ~d" uid host-uids)))
180 (call-with-output-file (scope "/gid_map")
181 (lambda (port)
182 (format port "0 ~d ~d" gid host-uids)))))
183
184 (define (namespaces->bit-mask namespaces)
185 "Return the number suitable for the 'flags' argument of 'clone' that
186 corresponds to the symbols in NAMESPACES."
187 ;; Use the same flags as fork(3) in addition to the namespace flags.
188 (apply logior SIGCHLD
189 (map (match-lambda
190 ('mnt CLONE_NEWNS)
191 ('uts CLONE_NEWUTS)
192 ('ipc CLONE_NEWIPC)
193 ('user CLONE_NEWUSER)
194 ('pid CLONE_NEWPID)
195 ('net CLONE_NEWNET))
196 namespaces)))
197
198 (define (run-container root mounts namespaces host-uids thunk)
199 "Run THUNK in a new container process and return its PID. ROOT specifies
200 the root directory for the container. MOUNTS is a list of file system specs
201 that specify the mapping of host file systems into the container. NAMESPACES
202 is a list of symbols that correspond to the possible Linux namespaces: mnt,
203 ipc, uts, user, and net. HOST-UIDS specifies the number of
204 host user identifiers to map into the user namespace."
205 ;; The parent process must initialize the user namespace for the child
206 ;; before it can boot. To negotiate this, a pipe is used such that the
207 ;; child process blocks until the parent writes to it.
208 (match (socketpair PF_UNIX SOCK_STREAM 0)
209 ((child . parent)
210 (let ((flags (namespaces->bit-mask namespaces)))
211 (match (clone flags)
212 (0
213 (call-with-clean-exit
214 (lambda ()
215 (close-port parent)
216 ;; Wait for parent to set things up.
217 (match (read child)
218 ('ready
219 (purify-environment)
220 (when (memq 'mnt namespaces)
221 (catch #t
222 (lambda ()
223 (mount-file-systems root mounts
224 #:mount-/proc? (memq 'pid namespaces)
225 #:mount-/sys? (memq 'net
226 namespaces)))
227 (lambda args
228 ;; Forward the exception to the parent process.
229 (write args child)
230 (primitive-exit 3))))
231 ;; TODO: Manage capabilities.
232 (write 'ready child)
233 (close-port child)
234 (thunk))
235 (_ ;parent died or something
236 (primitive-exit 2))))))
237 (pid
238 (close-port child)
239 (when (memq 'user namespaces)
240 (initialize-user-namespace pid host-uids))
241 ;; TODO: Initialize cgroups.
242 (write 'ready parent)
243 (newline parent)
244
245 ;; Check whether the child process' setup phase succeeded.
246 (let ((message (read parent)))
247 (close-port parent)
248 (match message
249 ('ready ;success
250 pid)
251 (((? symbol? key) args ...) ;exception
252 (apply throw key args))
253 (_ ;unexpected termination
254 #f)))))))))
255
256 (define* (call-with-container mounts thunk #:key (namespaces %namespaces)
257 (host-uids 1))
258 "Run THUNK in a new container process and return its exit status.
259 MOUNTS is a list of file system specs that specify the mapping of host file
260 systems into the container. NAMESPACES is a list of symbols corresponding to
261 the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By
262 default, all namespaces are used. HOST-UIDS is the number of host user
263 identifiers to map into the container's user namespace, if there is one. By
264 default, only a single uid/gid, that of the current user, is mapped into the
265 container. The host user that creates the container is the root user (uid/gid
266 0) within the container. Only root can map more than a single uid/gid.
267
268 Note that if THUNK needs to load any additional Guile modules, the relevant
269 module files must be present in one of the mappings in MOUNTS and the Guile
270 load path must be adjusted as needed."
271 (call-with-temporary-directory
272 (lambda (root)
273 (let ((pid (run-container root mounts namespaces host-uids thunk)))
274 ;; Catch SIGINT and kill the container process.
275 (sigaction SIGINT
276 (lambda (signum)
277 (false-if-exception
278 (kill pid SIGKILL))))
279
280 (match (waitpid pid)
281 ((_ . status) status))))))
282
283 (define (container-excursion pid thunk)
284 "Run THUNK as a child process within the namespaces of process PID and
285 return the exit status."
286 (define (namespace-file pid namespace)
287 (string-append "/proc/" (number->string pid) "/ns/" namespace))
288
289 (match (primitive-fork)
290 (0
291 (call-with-clean-exit
292 (lambda ()
293 (for-each (lambda (ns)
294 (call-with-input-file (namespace-file (getpid) ns)
295 (lambda (current-ns-port)
296 (call-with-input-file (namespace-file pid ns)
297 (lambda (new-ns-port)
298 ;; Joining the namespace that the process
299 ;; already belongs to would throw an error.
300 (unless (= (port->fdes current-ns-port)
301 (port->fdes new-ns-port))
302 (setns (port->fdes new-ns-port) 0)))))))
303 ;; It's important that the user namespace is joined first,
304 ;; so that the user will have the privileges to join the
305 ;; other namespaces. Furthermore, it's important that the
306 ;; mount namespace is joined last, otherwise the /proc mount
307 ;; point would no longer be accessible.
308 '("user" "ipc" "uts" "net" "pid" "mnt"))
309 (purify-environment)
310 (chdir "/")
311 (thunk))))
312 (pid
313 (match (waitpid pid)
314 ((_ . status)
315 (status:exit-val status))))))