Commit | Line | Data |
---|---|---|
c1f6a0c2 DT |
1 | ;;; GNU Guix --- Functional package management for GNU |
2 | ;;; Copyright © 2015 David Thompson <davet@gnu.org> | |
af76c020 | 3 | ;;; Copyright © 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org> |
c1f6a0c2 DT |
4 | ;;; |
5 | ;;; This file is part of GNU Guix. | |
6 | ;;; | |
7 | ;;; GNU Guix is free software; you can redistribute it and/or modify it | |
8 | ;;; under the terms of the GNU General Public License as published by | |
9 | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
10 | ;;; your option) any later version. | |
11 | ;;; | |
12 | ;;; GNU Guix is distributed in the hope that it will be useful, but | |
13 | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | ;;; GNU General Public License for more details. | |
16 | ;;; | |
17 | ;;; You should have received a copy of the GNU General Public License | |
18 | ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. | |
19 | ||
20 | (define-module (gnu build linux-container) | |
21 | #:use-module (ice-9 format) | |
22 | #:use-module (ice-9 match) | |
b7d48312 | 23 | #:use-module (ice-9 rdelim) |
c1f6a0c2 | 24 | #:use-module (srfi srfi-98) |
c1f6a0c2 DT |
25 | #:use-module (guix build utils) |
26 | #:use-module (guix build syscalls) | |
5970e8e2 | 27 | #:use-module (gnu system file-systems) ;<file-system> |
c1f6a0c2 | 28 | #:use-module ((gnu build file-systems) #:select (mount-file-system)) |
b7d48312 DT |
29 | #:export (user-namespace-supported? |
30 | unprivileged-user-namespace-supported? | |
31 | setgroups-supported? | |
32 | %namespaces | |
c1f6a0c2 DT |
33 | run-container |
34 | call-with-container | |
c90db25f LC |
35 | container-excursion |
36 | container-excursion*)) | |
c1f6a0c2 | 37 | |
b7d48312 DT |
38 | (define (user-namespace-supported?) |
39 | "Return #t if user namespaces are supported on this system." | |
40 | (file-exists? "/proc/self/ns/user")) | |
41 | ||
42 | (define (unprivileged-user-namespace-supported?) | |
43 | "Return #t if user namespaces can be created by unprivileged users." | |
44 | (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone")) | |
45 | (if (file-exists? userns-file) | |
c5184468 | 46 | (eqv? #\1 (call-with-input-file userns-file read-char)) |
329fa5bd | 47 | #t))) |
b7d48312 DT |
48 | |
49 | (define (setgroups-supported?) | |
50 | "Return #t if the setgroups proc file, introduced in Linux-libre 3.19, | |
51 | exists." | |
52 | (file-exists? "/proc/self/setgroups")) | |
53 | ||
c1f6a0c2 DT |
54 | (define %namespaces |
55 | '(mnt pid ipc uts user net)) | |
56 | ||
57 | (define (call-with-clean-exit thunk) | |
58 | "Apply THUNK, but exit with a status code of 1 if it fails." | |
59 | (dynamic-wind | |
60 | (const #t) | |
a72ccbc2 DT |
61 | (lambda () |
62 | (thunk) | |
95aa64bc LC |
63 | |
64 | ;; XXX: Somehow we sometimes get EBADF from write(2) or close(2) upon | |
65 | ;; exit (coming from fd finalizers) when used by the Shepherd. To work | |
66 | ;; around that, exit forcefully so fd finalizers don't have a chance to | |
67 | ;; run and fail. | |
68 | (primitive-_exit 0)) | |
c1f6a0c2 | 69 | (lambda () |
95aa64bc | 70 | (primitive-_exit 1)))) |
c1f6a0c2 DT |
71 | |
72 | (define (purify-environment) | |
73 | "Unset all environment variables." | |
74 | (for-each unsetenv | |
75 | (match (get-environment-variables) | |
76 | (((names . _) ...) names)))) | |
77 | ||
78 | ;; The container setup procedure closely resembles that of the Docker | |
79 | ;; specification: | |
80 | ;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md | |
81 | (define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?) | |
5970e8e2 LC |
82 | "Mount the essential file systems and the those in MOUNTS, a list of |
83 | <file-system> objects, relative to ROOT; then make ROOT the new root directory | |
84 | for the process." | |
c1f6a0c2 DT |
85 | (define (scope dir) |
86 | (string-append root dir)) | |
87 | ||
4949ada9 DT |
88 | (define (touch file-name) |
89 | (call-with-output-file file-name (const #t))) | |
90 | ||
c1f6a0c2 DT |
91 | (define (bind-mount src dest) |
92 | (mount src dest "none" MS_BIND)) | |
93 | ||
94 | ;; Like mount, but creates the mount point if it doesn't exist. | |
95 | (define* (mount* source target type #:optional (flags 0) options | |
96 | #:key (update-mtab? #f)) | |
97 | (mkdir-p target) | |
98 | (mount source target type flags options #:update-mtab? update-mtab?)) | |
99 | ||
100 | ;; The container's file system is completely ephemeral, sans directories | |
101 | ;; bind-mounted from the host. | |
102 | (mount "none" root "tmpfs") | |
103 | ||
104 | ;; A proc mount requires a new pid namespace. | |
105 | (when mount-/proc? | |
106 | (mount* "none" (scope "/proc") "proc" | |
107 | (logior MS_NOEXEC MS_NOSUID MS_NODEV))) | |
108 | ||
109 | ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in | |
110 | ;; the current network namespace. | |
111 | (when mount-/sys? | |
112 | (mount* "none" (scope "/sys") "sysfs" | |
113 | (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY))) | |
114 | ||
115 | (mount* "none" (scope "/dev") "tmpfs" | |
116 | (logior MS_NOEXEC MS_STRICTATIME) | |
117 | "mode=755") | |
118 | ||
119 | ;; Create essential device nodes via bind-mounting them from the | |
120 | ;; host, because a process within a user namespace cannot create | |
121 | ;; device nodes. | |
122 | (for-each (lambda (device) | |
123 | (when (file-exists? device) | |
124 | ;; Create the mount point file. | |
4949ada9 | 125 | (touch (scope device)) |
c1f6a0c2 DT |
126 | (bind-mount device (scope device)))) |
127 | '("/dev/null" | |
128 | "/dev/zero" | |
129 | "/dev/full" | |
130 | "/dev/random" | |
131 | "/dev/urandom" | |
132 | "/dev/tty" | |
c1f6a0c2 DT |
133 | "/dev/fuse")) |
134 | ||
935e79af LC |
135 | ;; Mount a new devpts instance on /dev/pts. |
136 | (when (file-exists? "/dev/ptmx") | |
137 | (mount* "none" (scope "/dev/pts") "devpts" 0 | |
138 | "newinstance,mode=0620") | |
139 | (symlink "/dev/pts/ptmx" (scope "/dev/ptmx"))) | |
140 | ||
4949ada9 | 141 | ;; Setup the container's /dev/console by bind mounting the pseudo-terminal |
168aba29 LC |
142 | ;; associated with standard input when there is one. |
143 | (let* ((in (current-input-port)) | |
144 | (tty (catch 'system-error | |
145 | (lambda () | |
146 | ;; This call throws if IN does not correspond to a tty. | |
147 | ;; This is more reliable than 'isatty?'. | |
148 | (ttyname in)) | |
149 | (const #f))) | |
150 | (console (scope "/dev/console"))) | |
151 | (when tty | |
4949ada9 DT |
152 | (touch console) |
153 | (chmod console #o600) | |
168aba29 | 154 | (bind-mount tty console))) |
4949ada9 | 155 | |
c1f6a0c2 DT |
156 | ;; Setup standard input/output/error. |
157 | (symlink "/proc/self/fd" (scope "/dev/fd")) | |
158 | (symlink "/proc/self/fd/0" (scope "/dev/stdin")) | |
159 | (symlink "/proc/self/fd/1" (scope "/dev/stdout")) | |
160 | (symlink "/proc/self/fd/2" (scope "/dev/stderr")) | |
161 | ||
162 | ;; Mount user-specified file systems. | |
5970e8e2 | 163 | (for-each (lambda (file-system) |
1c65cca5 | 164 | (mount-file-system file-system #:root root)) |
c1f6a0c2 DT |
165 | mounts) |
166 | ||
167 | ;; Jail the process inside the container's root file system. | |
168 | (let ((put-old (string-append root "/real-root"))) | |
169 | (mkdir put-old) | |
170 | (pivot-root root put-old) | |
171 | (chdir "/") | |
172 | (umount "real-root" MNT_DETACH) | |
e7481835 JL |
173 | (rmdir "real-root") |
174 | (chmod "/" #o755))) | |
c1f6a0c2 | 175 | |
af76c020 LC |
176 | (define* (initialize-user-namespace pid host-uids |
177 | #:key (guest-uid 0) (guest-gid 0)) | |
831bc146 | 178 | "Configure the user namespace for PID. HOST-UIDS specifies the number of |
af76c020 LC |
179 | host user identifiers to map into the user namespace. GUEST-UID and GUEST-GID |
180 | specify the first UID (respectively GID) that host UIDs (respectively GIDs) | |
181 | map to in the namespace." | |
c1f6a0c2 DT |
182 | (define proc-dir |
183 | (string-append "/proc/" (number->string pid))) | |
184 | ||
185 | (define (scope file) | |
186 | (string-append proc-dir file)) | |
187 | ||
831bc146 DT |
188 | (let ((uid (getuid)) |
189 | (gid (getgid))) | |
c1f6a0c2 DT |
190 | |
191 | ;; Only root can write to the gid map without first disabling the | |
192 | ;; setgroups syscall. | |
193 | (unless (and (zero? uid) (zero? gid)) | |
194 | (call-with-output-file (scope "/setgroups") | |
195 | (lambda (port) | |
196 | (display "deny" port)))) | |
197 | ||
198 | ;; Map the user/group that created the container to the root user | |
199 | ;; within the container. | |
200 | (call-with-output-file (scope "/uid_map") | |
201 | (lambda (port) | |
af76c020 | 202 | (format port "~d ~d ~d" guest-uid uid host-uids))) |
c1f6a0c2 DT |
203 | (call-with-output-file (scope "/gid_map") |
204 | (lambda (port) | |
af76c020 | 205 | (format port "~d ~d ~d" guest-gid gid host-uids))))) |
c1f6a0c2 DT |
206 | |
207 | (define (namespaces->bit-mask namespaces) | |
208 | "Return the number suitable for the 'flags' argument of 'clone' that | |
209 | corresponds to the symbols in NAMESPACES." | |
ee78d024 | 210 | ;; Use the same flags as fork(3) in addition to the namespace flags. |
35b50a75 | 211 | (apply logior SIGCHLD |
c1f6a0c2 DT |
212 | (map (match-lambda |
213 | ('mnt CLONE_NEWNS) | |
214 | ('uts CLONE_NEWUTS) | |
215 | ('ipc CLONE_NEWIPC) | |
216 | ('user CLONE_NEWUSER) | |
217 | ('pid CLONE_NEWPID) | |
218 | ('net CLONE_NEWNET)) | |
219 | namespaces))) | |
220 | ||
af76c020 LC |
221 | (define* (run-container root mounts namespaces host-uids thunk |
222 | #:key (guest-uid 0) (guest-gid 0)) | |
c1f6a0c2 | 223 | "Run THUNK in a new container process and return its PID. ROOT specifies |
5970e8e2 LC |
224 | the root directory for the container. MOUNTS is a list of <file-system> |
225 | objects that specify file systems to mount inside the container. NAMESPACES | |
c1f6a0c2 | 226 | is a list of symbols that correspond to the possible Linux namespaces: mnt, |
af76c020 LC |
227 | ipc, uts, user, and net. |
228 | ||
229 | HOST-UIDS specifies the number of host user identifiers to map into the user | |
230 | namespace. GUEST-UID and GUEST-GID specify the first UID (respectively GID) | |
231 | that host UIDs (respectively GIDs) map to in the namespace." | |
c1f6a0c2 DT |
232 | ;; The parent process must initialize the user namespace for the child |
233 | ;; before it can boot. To negotiate this, a pipe is used such that the | |
234 | ;; child process blocks until the parent writes to it. | |
c06f6db7 LC |
235 | (match (socketpair PF_UNIX SOCK_STREAM 0) |
236 | ((child . parent) | |
c1f6a0c2 DT |
237 | (let ((flags (namespaces->bit-mask namespaces))) |
238 | (match (clone flags) | |
239 | (0 | |
240 | (call-with-clean-exit | |
241 | (lambda () | |
c06f6db7 | 242 | (close-port parent) |
c1f6a0c2 | 243 | ;; Wait for parent to set things up. |
c06f6db7 | 244 | (match (read child) |
4c14d4ea | 245 | ('ready |
4c14d4ea | 246 | (purify-environment) |
b3a83f1e MO |
247 | (when (and (memq 'mnt namespaces) |
248 | (not (string=? root "/"))) | |
c06f6db7 LC |
249 | (catch #t |
250 | (lambda () | |
251 | (mount-file-systems root mounts | |
252 | #:mount-/proc? (memq 'pid namespaces) | |
253 | #:mount-/sys? (memq 'net | |
254 | namespaces))) | |
255 | (lambda args | |
256 | ;; Forward the exception to the parent process. | |
36c4917c LC |
257 | ;; FIXME: SRFI-35 conditions and non-trivial objects |
258 | ;; cannot be 'read' so they shouldn't be written as is. | |
c06f6db7 LC |
259 | (write args child) |
260 | (primitive-exit 3)))) | |
4c14d4ea | 261 | ;; TODO: Manage capabilities. |
c06f6db7 LC |
262 | (write 'ready child) |
263 | (close-port child) | |
4c14d4ea LC |
264 | (thunk)) |
265 | (_ ;parent died or something | |
266 | (primitive-exit 2)))))) | |
c1f6a0c2 | 267 | (pid |
c06f6db7 | 268 | (close-port child) |
c1f6a0c2 | 269 | (when (memq 'user namespaces) |
af76c020 LC |
270 | (initialize-user-namespace pid host-uids |
271 | #:guest-uid guest-uid | |
272 | #:guest-gid guest-gid)) | |
c1f6a0c2 | 273 | ;; TODO: Initialize cgroups. |
c06f6db7 LC |
274 | (write 'ready parent) |
275 | (newline parent) | |
276 | ||
277 | ;; Check whether the child process' setup phase succeeded. | |
278 | (let ((message (read parent))) | |
279 | (close-port parent) | |
280 | (match message | |
281 | ('ready ;success | |
282 | pid) | |
283 | (((? symbol? key) args ...) ;exception | |
284 | (apply throw key args)) | |
285 | (_ ;unexpected termination | |
286 | #f))))))))) | |
c1f6a0c2 | 287 | |
40c369b2 LC |
288 | ;; FIXME: This is copied from (guix utils), which we cannot use because it |
289 | ;; would pull (guix config) and all. | |
290 | (define (call-with-temporary-directory proc) | |
291 | "Call PROC with a name of a temporary directory; close the directory and | |
292 | delete it when leaving the dynamic extent of this call." | |
293 | (let* ((directory (or (getenv "TMPDIR") "/tmp")) | |
294 | (template (string-append directory "/guix-directory.XXXXXX")) | |
295 | (tmp-dir (mkdtemp! template))) | |
296 | (dynamic-wind | |
297 | (const #t) | |
298 | (lambda () | |
299 | (proc tmp-dir)) | |
300 | (lambda () | |
301 | (false-if-exception (delete-file-recursively tmp-dir)))))) | |
302 | ||
831bc146 | 303 | (define* (call-with-container mounts thunk #:key (namespaces %namespaces) |
d236cd16 LC |
304 | (host-uids 1) (guest-uid 0) (guest-gid 0) |
305 | (process-spawned-hook (const #t))) | |
306 | "Run THUNK in a new container process and return its exit status; call | |
307 | PROCESS-SPAWNED-HOOK with the PID of the new process that has been spawned. | |
5970e8e2 LC |
308 | MOUNTS is a list of <file-system> objects that specify file systems to mount |
309 | inside the container. NAMESPACES is a list of symbols corresponding to | |
c1f6a0c2 | 310 | the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By |
af76c020 LC |
311 | default, all namespaces are used. |
312 | ||
313 | HOST-UIDS is the number of host user identifiers to map into the container's | |
314 | user namespace, if there is one. By default, only a single uid/gid, that of | |
315 | the current user, is mapped into the container. The host user that creates | |
316 | the container is the root user (uid/gid 0) within the container. Only root | |
317 | can map more than a single uid/gid. | |
318 | ||
319 | GUEST-UID and GUEST-GID specify the first UID (respectively GID) that host | |
320 | UIDs (respectively GIDs) map to in the namespace. | |
c1f6a0c2 DT |
321 | |
322 | Note that if THUNK needs to load any additional Guile modules, the relevant | |
323 | module files must be present in one of the mappings in MOUNTS and the Guile | |
324 | load path must be adjusted as needed." | |
325 | (call-with-temporary-directory | |
326 | (lambda (root) | |
af76c020 LC |
327 | (let ((pid (run-container root mounts namespaces host-uids thunk |
328 | #:guest-uid guest-uid | |
329 | #:guest-gid guest-gid))) | |
c1f6a0c2 DT |
330 | ;; Catch SIGINT and kill the container process. |
331 | (sigaction SIGINT | |
332 | (lambda (signum) | |
333 | (false-if-exception | |
334 | (kill pid SIGKILL)))) | |
335 | ||
d236cd16 | 336 | (process-spawned-hook pid) |
c1f6a0c2 DT |
337 | (match (waitpid pid) |
338 | ((_ . status) status)))))) | |
339 | ||
340 | (define (container-excursion pid thunk) | |
341 | "Run THUNK as a child process within the namespaces of process PID and | |
342 | return the exit status." | |
343 | (define (namespace-file pid namespace) | |
344 | (string-append "/proc/" (number->string pid) "/ns/" namespace)) | |
345 | ||
346 | (match (primitive-fork) | |
347 | (0 | |
348 | (call-with-clean-exit | |
349 | (lambda () | |
350 | (for-each (lambda (ns) | |
7fee5b53 LC |
351 | (let ((source (namespace-file (getpid) ns)) |
352 | (target (namespace-file pid ns))) | |
353 | ;; Joining the namespace that the process already | |
354 | ;; belongs to would throw an error so avoid that. | |
355 | ;; XXX: This /proc interface leads to TOCTTOU. | |
356 | (unless (string=? (readlink source) (readlink target)) | |
357 | (call-with-input-file source | |
358 | (lambda (current-ns-port) | |
359 | (call-with-input-file target | |
360 | (lambda (new-ns-port) | |
361 | (setns (fileno new-ns-port) 0)))))))) | |
c1f6a0c2 DT |
362 | ;; It's important that the user namespace is joined first, |
363 | ;; so that the user will have the privileges to join the | |
364 | ;; other namespaces. Furthermore, it's important that the | |
365 | ;; mount namespace is joined last, otherwise the /proc mount | |
366 | ;; point would no longer be accessible. | |
367 | '("user" "ipc" "uts" "net" "pid" "mnt")) | |
368 | (purify-environment) | |
369 | (chdir "/") | |
370 | (thunk)))) | |
371 | (pid | |
372 | (match (waitpid pid) | |
373 | ((_ . status) | |
374 | (status:exit-val status)))))) | |
c90db25f LC |
375 | |
376 | (define (container-excursion* pid thunk) | |
377 | "Like 'container-excursion', but return the return value of THUNK." | |
378 | (match (pipe) | |
379 | ((in . out) | |
380 | (match (container-excursion pid | |
381 | (lambda () | |
382 | (close-port in) | |
95aa64bc LC |
383 | (write (thunk) out) |
384 | (close-port out))) | |
c90db25f LC |
385 | (0 |
386 | (close-port out) | |
387 | (let ((result (read in))) | |
388 | (close-port in) | |
389 | result)) | |
390 | (_ ;maybe PID died already | |
391 | (close-port out) | |
392 | (close-port in) | |
393 | #f))))) |