Commit | Line | Data |
---|---|---|
c1f6a0c2 DT |
1 | ;;; GNU Guix --- Functional package management for GNU |
2 | ;;; Copyright © 2015 David Thompson <davet@gnu.org> | |
3 | ;;; | |
4 | ;;; This file is part of GNU Guix. | |
5 | ;;; | |
6 | ;;; GNU Guix is free software; you can redistribute it and/or modify it | |
7 | ;;; under the terms of the GNU General Public License as published by | |
8 | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | ;;; your option) any later version. | |
10 | ;;; | |
11 | ;;; GNU Guix is distributed in the hope that it will be useful, but | |
12 | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | ;;; GNU General Public License for more details. | |
15 | ;;; | |
16 | ;;; You should have received a copy of the GNU General Public License | |
17 | ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. | |
18 | ||
19 | (define-module (gnu build linux-container) | |
20 | #:use-module (ice-9 format) | |
21 | #:use-module (ice-9 match) | |
22 | #:use-module (srfi srfi-98) | |
23 | #:use-module (guix utils) | |
24 | #:use-module (guix build utils) | |
25 | #:use-module (guix build syscalls) | |
26 | #:use-module ((gnu build file-systems) #:select (mount-file-system)) | |
27 | #:export (%namespaces | |
28 | run-container | |
29 | call-with-container | |
30 | container-excursion)) | |
31 | ||
32 | (define %namespaces | |
33 | '(mnt pid ipc uts user net)) | |
34 | ||
35 | (define (call-with-clean-exit thunk) | |
36 | "Apply THUNK, but exit with a status code of 1 if it fails." | |
37 | (dynamic-wind | |
38 | (const #t) | |
a72ccbc2 DT |
39 | (lambda () |
40 | (thunk) | |
41 | (primitive-exit 0)) | |
c1f6a0c2 DT |
42 | (lambda () |
43 | (primitive-exit 1)))) | |
44 | ||
45 | (define (purify-environment) | |
46 | "Unset all environment variables." | |
47 | (for-each unsetenv | |
48 | (match (get-environment-variables) | |
49 | (((names . _) ...) names)))) | |
50 | ||
51 | ;; The container setup procedure closely resembles that of the Docker | |
52 | ;; specification: | |
53 | ;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md | |
54 | (define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?) | |
55 | "Mount the essential file systems and the those in the MOUNTS list relative | |
56 | to ROOT, then make ROOT the new root directory for the process." | |
57 | (define (scope dir) | |
58 | (string-append root dir)) | |
59 | ||
4949ada9 DT |
60 | (define (touch file-name) |
61 | (call-with-output-file file-name (const #t))) | |
62 | ||
c1f6a0c2 DT |
63 | (define (bind-mount src dest) |
64 | (mount src dest "none" MS_BIND)) | |
65 | ||
66 | ;; Like mount, but creates the mount point if it doesn't exist. | |
67 | (define* (mount* source target type #:optional (flags 0) options | |
68 | #:key (update-mtab? #f)) | |
69 | (mkdir-p target) | |
70 | (mount source target type flags options #:update-mtab? update-mtab?)) | |
71 | ||
72 | ;; The container's file system is completely ephemeral, sans directories | |
73 | ;; bind-mounted from the host. | |
74 | (mount "none" root "tmpfs") | |
75 | ||
76 | ;; A proc mount requires a new pid namespace. | |
77 | (when mount-/proc? | |
78 | (mount* "none" (scope "/proc") "proc" | |
79 | (logior MS_NOEXEC MS_NOSUID MS_NODEV))) | |
80 | ||
81 | ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in | |
82 | ;; the current network namespace. | |
83 | (when mount-/sys? | |
84 | (mount* "none" (scope "/sys") "sysfs" | |
85 | (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY))) | |
86 | ||
87 | (mount* "none" (scope "/dev") "tmpfs" | |
88 | (logior MS_NOEXEC MS_STRICTATIME) | |
89 | "mode=755") | |
90 | ||
91 | ;; Create essential device nodes via bind-mounting them from the | |
92 | ;; host, because a process within a user namespace cannot create | |
93 | ;; device nodes. | |
94 | (for-each (lambda (device) | |
95 | (when (file-exists? device) | |
96 | ;; Create the mount point file. | |
4949ada9 | 97 | (touch (scope device)) |
c1f6a0c2 DT |
98 | (bind-mount device (scope device)))) |
99 | '("/dev/null" | |
100 | "/dev/zero" | |
101 | "/dev/full" | |
102 | "/dev/random" | |
103 | "/dev/urandom" | |
104 | "/dev/tty" | |
105 | "/dev/ptmx" | |
106 | "/dev/fuse")) | |
107 | ||
4949ada9 DT |
108 | ;; Setup the container's /dev/console by bind mounting the pseudo-terminal |
109 | ;; associated with standard input. | |
110 | (let ((in (current-input-port)) | |
111 | (console (scope "/dev/console"))) | |
112 | (when (isatty? in) | |
113 | (touch console) | |
114 | (chmod console #o600) | |
115 | (bind-mount (ttyname in) console))) | |
116 | ||
c1f6a0c2 DT |
117 | ;; Setup standard input/output/error. |
118 | (symlink "/proc/self/fd" (scope "/dev/fd")) | |
119 | (symlink "/proc/self/fd/0" (scope "/dev/stdin")) | |
120 | (symlink "/proc/self/fd/1" (scope "/dev/stdout")) | |
121 | (symlink "/proc/self/fd/2" (scope "/dev/stderr")) | |
122 | ||
123 | ;; Mount user-specified file systems. | |
124 | (for-each (lambda (spec) | |
125 | (mount-file-system spec #:root root)) | |
126 | mounts) | |
127 | ||
128 | ;; Jail the process inside the container's root file system. | |
129 | (let ((put-old (string-append root "/real-root"))) | |
130 | (mkdir put-old) | |
131 | (pivot-root root put-old) | |
132 | (chdir "/") | |
133 | (umount "real-root" MNT_DETACH) | |
134 | (rmdir "real-root"))) | |
135 | ||
831bc146 DT |
136 | (define (initialize-user-namespace pid host-uids) |
137 | "Configure the user namespace for PID. HOST-UIDS specifies the number of | |
138 | host user identifiers to map into the user namespace." | |
c1f6a0c2 DT |
139 | (define proc-dir |
140 | (string-append "/proc/" (number->string pid))) | |
141 | ||
142 | (define (scope file) | |
143 | (string-append proc-dir file)) | |
144 | ||
831bc146 DT |
145 | (let ((uid (getuid)) |
146 | (gid (getgid))) | |
c1f6a0c2 DT |
147 | |
148 | ;; Only root can write to the gid map without first disabling the | |
149 | ;; setgroups syscall. | |
150 | (unless (and (zero? uid) (zero? gid)) | |
151 | (call-with-output-file (scope "/setgroups") | |
152 | (lambda (port) | |
153 | (display "deny" port)))) | |
154 | ||
155 | ;; Map the user/group that created the container to the root user | |
156 | ;; within the container. | |
157 | (call-with-output-file (scope "/uid_map") | |
158 | (lambda (port) | |
831bc146 | 159 | (format port "0 ~d ~d" uid host-uids))) |
c1f6a0c2 DT |
160 | (call-with-output-file (scope "/gid_map") |
161 | (lambda (port) | |
831bc146 | 162 | (format port "0 ~d ~d" gid host-uids))))) |
c1f6a0c2 DT |
163 | |
164 | (define (namespaces->bit-mask namespaces) | |
165 | "Return the number suitable for the 'flags' argument of 'clone' that | |
166 | corresponds to the symbols in NAMESPACES." | |
ee78d024 | 167 | ;; Use the same flags as fork(3) in addition to the namespace flags. |
35b50a75 | 168 | (apply logior SIGCHLD |
c1f6a0c2 DT |
169 | (map (match-lambda |
170 | ('mnt CLONE_NEWNS) | |
171 | ('uts CLONE_NEWUTS) | |
172 | ('ipc CLONE_NEWIPC) | |
173 | ('user CLONE_NEWUSER) | |
174 | ('pid CLONE_NEWPID) | |
175 | ('net CLONE_NEWNET)) | |
176 | namespaces))) | |
177 | ||
831bc146 | 178 | (define (run-container root mounts namespaces host-uids thunk) |
c1f6a0c2 DT |
179 | "Run THUNK in a new container process and return its PID. ROOT specifies |
180 | the root directory for the container. MOUNTS is a list of file system specs | |
181 | that specify the mapping of host file systems into the container. NAMESPACES | |
182 | is a list of symbols that correspond to the possible Linux namespaces: mnt, | |
831bc146 DT |
183 | ipc, uts, user, and net. HOST-UIDS specifies the number of |
184 | host user identifiers to map into the user namespace." | |
c1f6a0c2 DT |
185 | ;; The parent process must initialize the user namespace for the child |
186 | ;; before it can boot. To negotiate this, a pipe is used such that the | |
187 | ;; child process blocks until the parent writes to it. | |
188 | (match (pipe) | |
189 | ((in . out) | |
190 | (let ((flags (namespaces->bit-mask namespaces))) | |
191 | (match (clone flags) | |
192 | (0 | |
193 | (call-with-clean-exit | |
194 | (lambda () | |
195 | (close out) | |
196 | ;; Wait for parent to set things up. | |
197 | (read in) | |
198 | (close in) | |
199 | (purify-environment) | |
200 | (when (memq 'mnt namespaces) | |
201 | (mount-file-systems root mounts | |
202 | #:mount-/proc? (memq 'pid namespaces) | |
203 | #:mount-/sys? (memq 'net namespaces))) | |
204 | ;; TODO: Manage capabilities. | |
205 | (thunk)))) | |
206 | (pid | |
207 | (when (memq 'user namespaces) | |
831bc146 | 208 | (initialize-user-namespace pid host-uids)) |
c1f6a0c2 DT |
209 | ;; TODO: Initialize cgroups. |
210 | (close in) | |
211 | (write 'ready out) | |
212 | (close out) | |
213 | pid)))))) | |
214 | ||
831bc146 DT |
215 | (define* (call-with-container mounts thunk #:key (namespaces %namespaces) |
216 | (host-uids 1)) | |
c1f6a0c2 DT |
217 | "Run THUNK in a new container process and return its exit status. |
218 | MOUNTS is a list of file system specs that specify the mapping of host file | |
219 | systems into the container. NAMESPACES is a list of symbols corresponding to | |
220 | the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By | |
831bc146 DT |
221 | default, all namespaces are used. HOST-UIDS is the number of host user |
222 | identifiers to map into the container's user namespace, if there is one. By | |
223 | default, only a single uid/gid, that of the current user, is mapped into the | |
224 | container. The host user that creates the container is the root user (uid/gid | |
225 | 0) within the container. Only root can map more than a single uid/gid. | |
c1f6a0c2 DT |
226 | |
227 | Note that if THUNK needs to load any additional Guile modules, the relevant | |
228 | module files must be present in one of the mappings in MOUNTS and the Guile | |
229 | load path must be adjusted as needed." | |
230 | (call-with-temporary-directory | |
231 | (lambda (root) | |
831bc146 | 232 | (let ((pid (run-container root mounts namespaces host-uids thunk))) |
c1f6a0c2 DT |
233 | ;; Catch SIGINT and kill the container process. |
234 | (sigaction SIGINT | |
235 | (lambda (signum) | |
236 | (false-if-exception | |
237 | (kill pid SIGKILL)))) | |
238 | ||
239 | (match (waitpid pid) | |
240 | ((_ . status) status)))))) | |
241 | ||
242 | (define (container-excursion pid thunk) | |
243 | "Run THUNK as a child process within the namespaces of process PID and | |
244 | return the exit status." | |
245 | (define (namespace-file pid namespace) | |
246 | (string-append "/proc/" (number->string pid) "/ns/" namespace)) | |
247 | ||
248 | (match (primitive-fork) | |
249 | (0 | |
250 | (call-with-clean-exit | |
251 | (lambda () | |
252 | (for-each (lambda (ns) | |
253 | (call-with-input-file (namespace-file (getpid) ns) | |
254 | (lambda (current-ns-port) | |
255 | (call-with-input-file (namespace-file pid ns) | |
256 | (lambda (new-ns-port) | |
257 | ;; Joining the namespace that the process | |
258 | ;; already belongs to would throw an error. | |
259 | (unless (= (port->fdes current-ns-port) | |
260 | (port->fdes new-ns-port)) | |
261 | (setns (port->fdes new-ns-port) 0))))))) | |
262 | ;; It's important that the user namespace is joined first, | |
263 | ;; so that the user will have the privileges to join the | |
264 | ;; other namespaces. Furthermore, it's important that the | |
265 | ;; mount namespace is joined last, otherwise the /proc mount | |
266 | ;; point would no longer be accessible. | |
267 | '("user" "ipc" "uts" "net" "pid" "mnt")) | |
268 | (purify-environment) | |
269 | (chdir "/") | |
270 | (thunk)))) | |
271 | (pid | |
272 | (match (waitpid pid) | |
273 | ((_ . status) | |
274 | (status:exit-val status)))))) |