Commit | Line | Data |
---|---|---|
c1f6a0c2 DT |
1 | ;;; GNU Guix --- Functional package management for GNU |
2 | ;;; Copyright © 2015 David Thompson <davet@gnu.org> | |
c90db25f | 3 | ;;; Copyright © 2017 Ludovic Courtès <ludo@gnu.org> |
c1f6a0c2 DT |
4 | ;;; |
5 | ;;; This file is part of GNU Guix. | |
6 | ;;; | |
7 | ;;; GNU Guix is free software; you can redistribute it and/or modify it | |
8 | ;;; under the terms of the GNU General Public License as published by | |
9 | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
10 | ;;; your option) any later version. | |
11 | ;;; | |
12 | ;;; GNU Guix is distributed in the hope that it will be useful, but | |
13 | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | ;;; GNU General Public License for more details. | |
16 | ;;; | |
17 | ;;; You should have received a copy of the GNU General Public License | |
18 | ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. | |
19 | ||
20 | (define-module (gnu build linux-container) | |
21 | #:use-module (ice-9 format) | |
22 | #:use-module (ice-9 match) | |
b7d48312 | 23 | #:use-module (ice-9 rdelim) |
c1f6a0c2 DT |
24 | #:use-module (srfi srfi-98) |
25 | #:use-module (guix utils) | |
26 | #:use-module (guix build utils) | |
27 | #:use-module (guix build syscalls) | |
5970e8e2 | 28 | #:use-module (gnu system file-systems) ;<file-system> |
c1f6a0c2 | 29 | #:use-module ((gnu build file-systems) #:select (mount-file-system)) |
b7d48312 DT |
30 | #:export (user-namespace-supported? |
31 | unprivileged-user-namespace-supported? | |
32 | setgroups-supported? | |
33 | %namespaces | |
c1f6a0c2 DT |
34 | run-container |
35 | call-with-container | |
c90db25f LC |
36 | container-excursion |
37 | container-excursion*)) | |
c1f6a0c2 | 38 | |
b7d48312 DT |
39 | (define (user-namespace-supported?) |
40 | "Return #t if user namespaces are supported on this system." | |
41 | (file-exists? "/proc/self/ns/user")) | |
42 | ||
43 | (define (unprivileged-user-namespace-supported?) | |
44 | "Return #t if user namespaces can be created by unprivileged users." | |
45 | (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone")) | |
46 | (if (file-exists? userns-file) | |
c5184468 | 47 | (eqv? #\1 (call-with-input-file userns-file read-char)) |
b7d48312 DT |
48 | #t))) |
49 | ||
50 | (define (setgroups-supported?) | |
51 | "Return #t if the setgroups proc file, introduced in Linux-libre 3.19, | |
52 | exists." | |
53 | (file-exists? "/proc/self/setgroups")) | |
54 | ||
c1f6a0c2 DT |
55 | (define %namespaces |
56 | '(mnt pid ipc uts user net)) | |
57 | ||
58 | (define (call-with-clean-exit thunk) | |
59 | "Apply THUNK, but exit with a status code of 1 if it fails." | |
60 | (dynamic-wind | |
61 | (const #t) | |
a72ccbc2 DT |
62 | (lambda () |
63 | (thunk) | |
64 | (primitive-exit 0)) | |
c1f6a0c2 DT |
65 | (lambda () |
66 | (primitive-exit 1)))) | |
67 | ||
68 | (define (purify-environment) | |
69 | "Unset all environment variables." | |
70 | (for-each unsetenv | |
71 | (match (get-environment-variables) | |
72 | (((names . _) ...) names)))) | |
73 | ||
74 | ;; The container setup procedure closely resembles that of the Docker | |
75 | ;; specification: | |
76 | ;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md | |
77 | (define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?) | |
5970e8e2 LC |
78 | "Mount the essential file systems and the those in MOUNTS, a list of |
79 | <file-system> objects, relative to ROOT; then make ROOT the new root directory | |
80 | for the process." | |
c1f6a0c2 DT |
81 | (define (scope dir) |
82 | (string-append root dir)) | |
83 | ||
4949ada9 DT |
84 | (define (touch file-name) |
85 | (call-with-output-file file-name (const #t))) | |
86 | ||
c1f6a0c2 DT |
87 | (define (bind-mount src dest) |
88 | (mount src dest "none" MS_BIND)) | |
89 | ||
90 | ;; Like mount, but creates the mount point if it doesn't exist. | |
91 | (define* (mount* source target type #:optional (flags 0) options | |
92 | #:key (update-mtab? #f)) | |
93 | (mkdir-p target) | |
94 | (mount source target type flags options #:update-mtab? update-mtab?)) | |
95 | ||
96 | ;; The container's file system is completely ephemeral, sans directories | |
97 | ;; bind-mounted from the host. | |
98 | (mount "none" root "tmpfs") | |
99 | ||
100 | ;; A proc mount requires a new pid namespace. | |
101 | (when mount-/proc? | |
102 | (mount* "none" (scope "/proc") "proc" | |
103 | (logior MS_NOEXEC MS_NOSUID MS_NODEV))) | |
104 | ||
105 | ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in | |
106 | ;; the current network namespace. | |
107 | (when mount-/sys? | |
108 | (mount* "none" (scope "/sys") "sysfs" | |
109 | (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY))) | |
110 | ||
111 | (mount* "none" (scope "/dev") "tmpfs" | |
112 | (logior MS_NOEXEC MS_STRICTATIME) | |
113 | "mode=755") | |
114 | ||
115 | ;; Create essential device nodes via bind-mounting them from the | |
116 | ;; host, because a process within a user namespace cannot create | |
117 | ;; device nodes. | |
118 | (for-each (lambda (device) | |
119 | (when (file-exists? device) | |
120 | ;; Create the mount point file. | |
4949ada9 | 121 | (touch (scope device)) |
c1f6a0c2 DT |
122 | (bind-mount device (scope device)))) |
123 | '("/dev/null" | |
124 | "/dev/zero" | |
125 | "/dev/full" | |
126 | "/dev/random" | |
127 | "/dev/urandom" | |
128 | "/dev/tty" | |
129 | "/dev/ptmx" | |
130 | "/dev/fuse")) | |
131 | ||
4949ada9 | 132 | ;; Setup the container's /dev/console by bind mounting the pseudo-terminal |
168aba29 LC |
133 | ;; associated with standard input when there is one. |
134 | (let* ((in (current-input-port)) | |
135 | (tty (catch 'system-error | |
136 | (lambda () | |
137 | ;; This call throws if IN does not correspond to a tty. | |
138 | ;; This is more reliable than 'isatty?'. | |
139 | (ttyname in)) | |
140 | (const #f))) | |
141 | (console (scope "/dev/console"))) | |
142 | (when tty | |
4949ada9 DT |
143 | (touch console) |
144 | (chmod console #o600) | |
168aba29 | 145 | (bind-mount tty console))) |
4949ada9 | 146 | |
c1f6a0c2 DT |
147 | ;; Setup standard input/output/error. |
148 | (symlink "/proc/self/fd" (scope "/dev/fd")) | |
149 | (symlink "/proc/self/fd/0" (scope "/dev/stdin")) | |
150 | (symlink "/proc/self/fd/1" (scope "/dev/stdout")) | |
151 | (symlink "/proc/self/fd/2" (scope "/dev/stderr")) | |
152 | ||
153 | ;; Mount user-specified file systems. | |
5970e8e2 | 154 | (for-each (lambda (file-system) |
1c65cca5 | 155 | (mount-file-system file-system #:root root)) |
c1f6a0c2 DT |
156 | mounts) |
157 | ||
158 | ;; Jail the process inside the container's root file system. | |
159 | (let ((put-old (string-append root "/real-root"))) | |
160 | (mkdir put-old) | |
161 | (pivot-root root put-old) | |
162 | (chdir "/") | |
163 | (umount "real-root" MNT_DETACH) | |
164 | (rmdir "real-root"))) | |
165 | ||
831bc146 DT |
166 | (define (initialize-user-namespace pid host-uids) |
167 | "Configure the user namespace for PID. HOST-UIDS specifies the number of | |
168 | host user identifiers to map into the user namespace." | |
c1f6a0c2 DT |
169 | (define proc-dir |
170 | (string-append "/proc/" (number->string pid))) | |
171 | ||
172 | (define (scope file) | |
173 | (string-append proc-dir file)) | |
174 | ||
831bc146 DT |
175 | (let ((uid (getuid)) |
176 | (gid (getgid))) | |
c1f6a0c2 DT |
177 | |
178 | ;; Only root can write to the gid map without first disabling the | |
179 | ;; setgroups syscall. | |
180 | (unless (and (zero? uid) (zero? gid)) | |
181 | (call-with-output-file (scope "/setgroups") | |
182 | (lambda (port) | |
183 | (display "deny" port)))) | |
184 | ||
185 | ;; Map the user/group that created the container to the root user | |
186 | ;; within the container. | |
187 | (call-with-output-file (scope "/uid_map") | |
188 | (lambda (port) | |
831bc146 | 189 | (format port "0 ~d ~d" uid host-uids))) |
c1f6a0c2 DT |
190 | (call-with-output-file (scope "/gid_map") |
191 | (lambda (port) | |
831bc146 | 192 | (format port "0 ~d ~d" gid host-uids))))) |
c1f6a0c2 DT |
193 | |
194 | (define (namespaces->bit-mask namespaces) | |
195 | "Return the number suitable for the 'flags' argument of 'clone' that | |
196 | corresponds to the symbols in NAMESPACES." | |
ee78d024 | 197 | ;; Use the same flags as fork(3) in addition to the namespace flags. |
35b50a75 | 198 | (apply logior SIGCHLD |
c1f6a0c2 DT |
199 | (map (match-lambda |
200 | ('mnt CLONE_NEWNS) | |
201 | ('uts CLONE_NEWUTS) | |
202 | ('ipc CLONE_NEWIPC) | |
203 | ('user CLONE_NEWUSER) | |
204 | ('pid CLONE_NEWPID) | |
205 | ('net CLONE_NEWNET)) | |
206 | namespaces))) | |
207 | ||
831bc146 | 208 | (define (run-container root mounts namespaces host-uids thunk) |
c1f6a0c2 | 209 | "Run THUNK in a new container process and return its PID. ROOT specifies |
5970e8e2 LC |
210 | the root directory for the container. MOUNTS is a list of <file-system> |
211 | objects that specify file systems to mount inside the container. NAMESPACES | |
c1f6a0c2 | 212 | is a list of symbols that correspond to the possible Linux namespaces: mnt, |
831bc146 DT |
213 | ipc, uts, user, and net. HOST-UIDS specifies the number of |
214 | host user identifiers to map into the user namespace." | |
c1f6a0c2 DT |
215 | ;; The parent process must initialize the user namespace for the child |
216 | ;; before it can boot. To negotiate this, a pipe is used such that the | |
217 | ;; child process blocks until the parent writes to it. | |
c06f6db7 LC |
218 | (match (socketpair PF_UNIX SOCK_STREAM 0) |
219 | ((child . parent) | |
c1f6a0c2 DT |
220 | (let ((flags (namespaces->bit-mask namespaces))) |
221 | (match (clone flags) | |
222 | (0 | |
223 | (call-with-clean-exit | |
224 | (lambda () | |
c06f6db7 | 225 | (close-port parent) |
c1f6a0c2 | 226 | ;; Wait for parent to set things up. |
c06f6db7 | 227 | (match (read child) |
4c14d4ea | 228 | ('ready |
4c14d4ea LC |
229 | (purify-environment) |
230 | (when (memq 'mnt namespaces) | |
c06f6db7 LC |
231 | (catch #t |
232 | (lambda () | |
233 | (mount-file-systems root mounts | |
234 | #:mount-/proc? (memq 'pid namespaces) | |
235 | #:mount-/sys? (memq 'net | |
236 | namespaces))) | |
237 | (lambda args | |
238 | ;; Forward the exception to the parent process. | |
36c4917c LC |
239 | ;; FIXME: SRFI-35 conditions and non-trivial objects |
240 | ;; cannot be 'read' so they shouldn't be written as is. | |
c06f6db7 LC |
241 | (write args child) |
242 | (primitive-exit 3)))) | |
4c14d4ea | 243 | ;; TODO: Manage capabilities. |
c06f6db7 LC |
244 | (write 'ready child) |
245 | (close-port child) | |
4c14d4ea LC |
246 | (thunk)) |
247 | (_ ;parent died or something | |
248 | (primitive-exit 2)))))) | |
c1f6a0c2 | 249 | (pid |
c06f6db7 | 250 | (close-port child) |
c1f6a0c2 | 251 | (when (memq 'user namespaces) |
831bc146 | 252 | (initialize-user-namespace pid host-uids)) |
c1f6a0c2 | 253 | ;; TODO: Initialize cgroups. |
c06f6db7 LC |
254 | (write 'ready parent) |
255 | (newline parent) | |
256 | ||
257 | ;; Check whether the child process' setup phase succeeded. | |
258 | (let ((message (read parent))) | |
259 | (close-port parent) | |
260 | (match message | |
261 | ('ready ;success | |
262 | pid) | |
263 | (((? symbol? key) args ...) ;exception | |
264 | (apply throw key args)) | |
265 | (_ ;unexpected termination | |
266 | #f))))))))) | |
c1f6a0c2 | 267 | |
831bc146 DT |
268 | (define* (call-with-container mounts thunk #:key (namespaces %namespaces) |
269 | (host-uids 1)) | |
c1f6a0c2 | 270 | "Run THUNK in a new container process and return its exit status. |
5970e8e2 LC |
271 | MOUNTS is a list of <file-system> objects that specify file systems to mount |
272 | inside the container. NAMESPACES is a list of symbols corresponding to | |
c1f6a0c2 | 273 | the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By |
831bc146 DT |
274 | default, all namespaces are used. HOST-UIDS is the number of host user |
275 | identifiers to map into the container's user namespace, if there is one. By | |
276 | default, only a single uid/gid, that of the current user, is mapped into the | |
277 | container. The host user that creates the container is the root user (uid/gid | |
278 | 0) within the container. Only root can map more than a single uid/gid. | |
c1f6a0c2 DT |
279 | |
280 | Note that if THUNK needs to load any additional Guile modules, the relevant | |
281 | module files must be present in one of the mappings in MOUNTS and the Guile | |
282 | load path must be adjusted as needed." | |
283 | (call-with-temporary-directory | |
284 | (lambda (root) | |
831bc146 | 285 | (let ((pid (run-container root mounts namespaces host-uids thunk))) |
c1f6a0c2 DT |
286 | ;; Catch SIGINT and kill the container process. |
287 | (sigaction SIGINT | |
288 | (lambda (signum) | |
289 | (false-if-exception | |
290 | (kill pid SIGKILL)))) | |
291 | ||
292 | (match (waitpid pid) | |
293 | ((_ . status) status)))))) | |
294 | ||
295 | (define (container-excursion pid thunk) | |
296 | "Run THUNK as a child process within the namespaces of process PID and | |
297 | return the exit status." | |
298 | (define (namespace-file pid namespace) | |
299 | (string-append "/proc/" (number->string pid) "/ns/" namespace)) | |
300 | ||
301 | (match (primitive-fork) | |
302 | (0 | |
303 | (call-with-clean-exit | |
304 | (lambda () | |
305 | (for-each (lambda (ns) | |
7fee5b53 LC |
306 | (let ((source (namespace-file (getpid) ns)) |
307 | (target (namespace-file pid ns))) | |
308 | ;; Joining the namespace that the process already | |
309 | ;; belongs to would throw an error so avoid that. | |
310 | ;; XXX: This /proc interface leads to TOCTTOU. | |
311 | (unless (string=? (readlink source) (readlink target)) | |
312 | (call-with-input-file source | |
313 | (lambda (current-ns-port) | |
314 | (call-with-input-file target | |
315 | (lambda (new-ns-port) | |
316 | (setns (fileno new-ns-port) 0)))))))) | |
c1f6a0c2 DT |
317 | ;; It's important that the user namespace is joined first, |
318 | ;; so that the user will have the privileges to join the | |
319 | ;; other namespaces. Furthermore, it's important that the | |
320 | ;; mount namespace is joined last, otherwise the /proc mount | |
321 | ;; point would no longer be accessible. | |
322 | '("user" "ipc" "uts" "net" "pid" "mnt")) | |
323 | (purify-environment) | |
324 | (chdir "/") | |
325 | (thunk)))) | |
326 | (pid | |
327 | (match (waitpid pid) | |
328 | ((_ . status) | |
329 | (status:exit-val status)))))) | |
c90db25f LC |
330 | |
331 | (define (container-excursion* pid thunk) | |
332 | "Like 'container-excursion', but return the return value of THUNK." | |
333 | (match (pipe) | |
334 | ((in . out) | |
335 | (match (container-excursion pid | |
336 | (lambda () | |
337 | (close-port in) | |
338 | (write (thunk) out))) | |
339 | (0 | |
340 | (close-port out) | |
341 | (let ((result (read in))) | |
342 | (close-port in) | |
343 | result)) | |
344 | (_ ;maybe PID died already | |
345 | (close-port out) | |
346 | (close-port in) | |
347 | #f))))) |