gnu: nestopia-ue: Use system zlib.
[jackhill/guix/guix.git] / gnu / build / linux-container.scm
CommitLineData
c1f6a0c2
DT
1;;; GNU Guix --- Functional package management for GNU
2;;; Copyright © 2015 David Thompson <davet@gnu.org>
3;;;
4;;; This file is part of GNU Guix.
5;;;
6;;; GNU Guix is free software; you can redistribute it and/or modify it
7;;; under the terms of the GNU General Public License as published by
8;;; the Free Software Foundation; either version 3 of the License, or (at
9;;; your option) any later version.
10;;;
11;;; GNU Guix is distributed in the hope that it will be useful, but
12;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14;;; GNU General Public License for more details.
15;;;
16;;; You should have received a copy of the GNU General Public License
17;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
18
19(define-module (gnu build linux-container)
20 #:use-module (ice-9 format)
21 #:use-module (ice-9 match)
22 #:use-module (srfi srfi-98)
23 #:use-module (guix utils)
24 #:use-module (guix build utils)
25 #:use-module (guix build syscalls)
26 #:use-module ((gnu build file-systems) #:select (mount-file-system))
27 #:export (%namespaces
28 run-container
29 call-with-container
30 container-excursion))
31
32(define %namespaces
33 '(mnt pid ipc uts user net))
34
35(define (call-with-clean-exit thunk)
36 "Apply THUNK, but exit with a status code of 1 if it fails."
37 (dynamic-wind
38 (const #t)
a72ccbc2
DT
39 (lambda ()
40 (thunk)
41 (primitive-exit 0))
c1f6a0c2
DT
42 (lambda ()
43 (primitive-exit 1))))
44
45(define (purify-environment)
46 "Unset all environment variables."
47 (for-each unsetenv
48 (match (get-environment-variables)
49 (((names . _) ...) names))))
50
51;; The container setup procedure closely resembles that of the Docker
52;; specification:
53;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md
54(define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?)
55 "Mount the essential file systems and the those in the MOUNTS list relative
56to ROOT, then make ROOT the new root directory for the process."
57 (define (scope dir)
58 (string-append root dir))
59
4949ada9
DT
60 (define (touch file-name)
61 (call-with-output-file file-name (const #t)))
62
c1f6a0c2
DT
63 (define (bind-mount src dest)
64 (mount src dest "none" MS_BIND))
65
66 ;; Like mount, but creates the mount point if it doesn't exist.
67 (define* (mount* source target type #:optional (flags 0) options
68 #:key (update-mtab? #f))
69 (mkdir-p target)
70 (mount source target type flags options #:update-mtab? update-mtab?))
71
72 ;; The container's file system is completely ephemeral, sans directories
73 ;; bind-mounted from the host.
74 (mount "none" root "tmpfs")
75
76 ;; A proc mount requires a new pid namespace.
77 (when mount-/proc?
78 (mount* "none" (scope "/proc") "proc"
79 (logior MS_NOEXEC MS_NOSUID MS_NODEV)))
80
81 ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in
82 ;; the current network namespace.
83 (when mount-/sys?
84 (mount* "none" (scope "/sys") "sysfs"
85 (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY)))
86
87 (mount* "none" (scope "/dev") "tmpfs"
88 (logior MS_NOEXEC MS_STRICTATIME)
89 "mode=755")
90
91 ;; Create essential device nodes via bind-mounting them from the
92 ;; host, because a process within a user namespace cannot create
93 ;; device nodes.
94 (for-each (lambda (device)
95 (when (file-exists? device)
96 ;; Create the mount point file.
4949ada9 97 (touch (scope device))
c1f6a0c2
DT
98 (bind-mount device (scope device))))
99 '("/dev/null"
100 "/dev/zero"
101 "/dev/full"
102 "/dev/random"
103 "/dev/urandom"
104 "/dev/tty"
105 "/dev/ptmx"
106 "/dev/fuse"))
107
4949ada9
DT
108 ;; Setup the container's /dev/console by bind mounting the pseudo-terminal
109 ;; associated with standard input.
110 (let ((in (current-input-port))
111 (console (scope "/dev/console")))
112 (when (isatty? in)
113 (touch console)
114 (chmod console #o600)
115 (bind-mount (ttyname in) console)))
116
c1f6a0c2
DT
117 ;; Setup standard input/output/error.
118 (symlink "/proc/self/fd" (scope "/dev/fd"))
119 (symlink "/proc/self/fd/0" (scope "/dev/stdin"))
120 (symlink "/proc/self/fd/1" (scope "/dev/stdout"))
121 (symlink "/proc/self/fd/2" (scope "/dev/stderr"))
122
123 ;; Mount user-specified file systems.
124 (for-each (lambda (spec)
125 (mount-file-system spec #:root root))
126 mounts)
127
128 ;; Jail the process inside the container's root file system.
129 (let ((put-old (string-append root "/real-root")))
130 (mkdir put-old)
131 (pivot-root root put-old)
132 (chdir "/")
133 (umount "real-root" MNT_DETACH)
134 (rmdir "real-root")))
135
831bc146
DT
136(define (initialize-user-namespace pid host-uids)
137 "Configure the user namespace for PID. HOST-UIDS specifies the number of
138host user identifiers to map into the user namespace."
c1f6a0c2
DT
139 (define proc-dir
140 (string-append "/proc/" (number->string pid)))
141
142 (define (scope file)
143 (string-append proc-dir file))
144
831bc146
DT
145 (let ((uid (getuid))
146 (gid (getgid)))
c1f6a0c2
DT
147
148 ;; Only root can write to the gid map without first disabling the
149 ;; setgroups syscall.
150 (unless (and (zero? uid) (zero? gid))
151 (call-with-output-file (scope "/setgroups")
152 (lambda (port)
153 (display "deny" port))))
154
155 ;; Map the user/group that created the container to the root user
156 ;; within the container.
157 (call-with-output-file (scope "/uid_map")
158 (lambda (port)
831bc146 159 (format port "0 ~d ~d" uid host-uids)))
c1f6a0c2
DT
160 (call-with-output-file (scope "/gid_map")
161 (lambda (port)
831bc146 162 (format port "0 ~d ~d" gid host-uids)))))
c1f6a0c2
DT
163
164(define (namespaces->bit-mask namespaces)
165 "Return the number suitable for the 'flags' argument of 'clone' that
166corresponds to the symbols in NAMESPACES."
ee78d024 167 ;; Use the same flags as fork(3) in addition to the namespace flags.
35b50a75 168 (apply logior SIGCHLD
c1f6a0c2
DT
169 (map (match-lambda
170 ('mnt CLONE_NEWNS)
171 ('uts CLONE_NEWUTS)
172 ('ipc CLONE_NEWIPC)
173 ('user CLONE_NEWUSER)
174 ('pid CLONE_NEWPID)
175 ('net CLONE_NEWNET))
176 namespaces)))
177
831bc146 178(define (run-container root mounts namespaces host-uids thunk)
c1f6a0c2
DT
179 "Run THUNK in a new container process and return its PID. ROOT specifies
180the root directory for the container. MOUNTS is a list of file system specs
181that specify the mapping of host file systems into the container. NAMESPACES
182is a list of symbols that correspond to the possible Linux namespaces: mnt,
831bc146
DT
183ipc, uts, user, and net. HOST-UIDS specifies the number of
184host user identifiers to map into the user namespace."
c1f6a0c2
DT
185 ;; The parent process must initialize the user namespace for the child
186 ;; before it can boot. To negotiate this, a pipe is used such that the
187 ;; child process blocks until the parent writes to it.
188 (match (pipe)
189 ((in . out)
190 (let ((flags (namespaces->bit-mask namespaces)))
191 (match (clone flags)
192 (0
193 (call-with-clean-exit
194 (lambda ()
195 (close out)
196 ;; Wait for parent to set things up.
197 (read in)
198 (close in)
199 (purify-environment)
200 (when (memq 'mnt namespaces)
201 (mount-file-systems root mounts
202 #:mount-/proc? (memq 'pid namespaces)
203 #:mount-/sys? (memq 'net namespaces)))
204 ;; TODO: Manage capabilities.
205 (thunk))))
206 (pid
207 (when (memq 'user namespaces)
831bc146 208 (initialize-user-namespace pid host-uids))
c1f6a0c2
DT
209 ;; TODO: Initialize cgroups.
210 (close in)
211 (write 'ready out)
212 (close out)
213 pid))))))
214
831bc146
DT
215(define* (call-with-container mounts thunk #:key (namespaces %namespaces)
216 (host-uids 1))
c1f6a0c2
DT
217 "Run THUNK in a new container process and return its exit status.
218MOUNTS is a list of file system specs that specify the mapping of host file
219systems into the container. NAMESPACES is a list of symbols corresponding to
220the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By
831bc146
DT
221default, all namespaces are used. HOST-UIDS is the number of host user
222identifiers to map into the container's user namespace, if there is one. By
223default, only a single uid/gid, that of the current user, is mapped into the
224container. The host user that creates the container is the root user (uid/gid
2250) within the container. Only root can map more than a single uid/gid.
c1f6a0c2
DT
226
227Note that if THUNK needs to load any additional Guile modules, the relevant
228module files must be present in one of the mappings in MOUNTS and the Guile
229load path must be adjusted as needed."
230 (call-with-temporary-directory
231 (lambda (root)
831bc146 232 (let ((pid (run-container root mounts namespaces host-uids thunk)))
c1f6a0c2
DT
233 ;; Catch SIGINT and kill the container process.
234 (sigaction SIGINT
235 (lambda (signum)
236 (false-if-exception
237 (kill pid SIGKILL))))
238
239 (match (waitpid pid)
240 ((_ . status) status))))))
241
242(define (container-excursion pid thunk)
243 "Run THUNK as a child process within the namespaces of process PID and
244return the exit status."
245 (define (namespace-file pid namespace)
246 (string-append "/proc/" (number->string pid) "/ns/" namespace))
247
248 (match (primitive-fork)
249 (0
250 (call-with-clean-exit
251 (lambda ()
252 (for-each (lambda (ns)
253 (call-with-input-file (namespace-file (getpid) ns)
254 (lambda (current-ns-port)
255 (call-with-input-file (namespace-file pid ns)
256 (lambda (new-ns-port)
257 ;; Joining the namespace that the process
258 ;; already belongs to would throw an error.
259 (unless (= (port->fdes current-ns-port)
260 (port->fdes new-ns-port))
261 (setns (port->fdes new-ns-port) 0)))))))
262 ;; It's important that the user namespace is joined first,
263 ;; so that the user will have the privileges to join the
264 ;; other namespaces. Furthermore, it's important that the
265 ;; mount namespace is joined last, otherwise the /proc mount
266 ;; point would no longer be accessible.
267 '("user" "ipc" "uts" "net" "pid" "mnt"))
268 (purify-environment)
269 (chdir "/")
270 (thunk))))
271 (pid
272 (match (waitpid pid)
273 ((_ . status)
274 (status:exit-val status))))))