pack: Relocatable wrapper leaves root available to child processes.
[jackhill/guix/guix.git] / gnu / packages / aux-files / run-in-namespace.c
1 /* GNU Guix --- Functional package management for GNU
2 Copyright (C) 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
3
4 This file is part of GNU Guix.
5
6 GNU Guix is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
10
11 GNU Guix is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. */
18
19 /* Make the given @WRAPPED_PROGRAM@ relocatable by executing it in a separate
20 mount namespace where the store is mounted in its right place.
21
22 We would happily do that in Scheme using 'call-with-container'. However,
23 this very program needs to be relocatable, so it needs to be statically
24 linked, which complicates things (Guile's modules can hardly be "linked"
25 into a single executable.) */
26
27 #define _GNU_SOURCE
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <unistd.h>
31 #include <sched.h>
32 #include <sys/mount.h>
33 #include <errno.h>
34 #include <libgen.h>
35 #include <limits.h>
36 #include <string.h>
37 #include <assert.h>
38 #include <sys/stat.h>
39 #include <sys/types.h>
40 #include <sys/wait.h>
41 #include <fcntl.h>
42 #include <dirent.h>
43 #include <sys/syscall.h>
44 #include <sys/prctl.h>
45
/* Whether we're building the ld.so/libfakechroot wrapper.  The test is
   evaluated here, once, so that later '#if HAVE_EXEC_WITH_LOADER' lines never
   make 'defined' appear through macro expansion, which is undefined behavior
   in ISO C.  The macro always expands to 0 or 1.  */
#if (defined PROGRAM_INTERPRETER) && (defined LOADER_AUDIT_MODULE) \
  && (defined FAKECHROOT_LIBRARY)
# define HAVE_EXEC_WITH_LOADER 1
#else
# define HAVE_EXEC_WITH_LOADER 0
#endif

/* The original store, "/gnu/store" by default.  The '@...@' text is a
   placeholder, presumably substituted when this file is instantiated.  */
static const char original_store[] = "@STORE_DIRECTORY@";
53
54
/* Like 'malloc', but abort if 'malloc' returns NULL.  A SIZE of zero is
   rounded up to one byte so that a NULL result always denotes failure.  The
   caller owns the returned memory and must eventually 'free' it.  */
static void *
xmalloc (size_t size)
{
  void *result = malloc (size != 0 ? size : 1);

  assert (result != NULL);
  if (result == NULL)
    /* 'assert' is compiled out under NDEBUG; never hand NULL to callers.  */
    abort ();

  return result;
}
63
/* Return a freshly allocated string made of DIRECTORY, a slash, and FILE.
   Ownership passes to the caller, who must eventually 'free' it.  */
static char *
concat (const char *directory, const char *file)
{
  size_t directory_len = strlen (directory);
  size_t file_len = strlen (file);

  /* One extra byte for the slash and one for the terminating NUL.  */
  char *joined = xmalloc (directory_len + 1 + file_len + 1);

  memcpy (joined, directory, directory_len);
  joined[directory_len] = '/';
  memcpy (joined + directory_len + 1, file, file_len + 1);

  return joined;
}
76
/* Create DIRECTORY and any missing ancestors, each with mode 0700.
   Directories that already exist are not an error; any other 'mkdir' failure
   aborts via 'assert_perror'.  */
static void
mkdir_p (const char *directory)
{
  /* Stop at the root, and also at "." which is what 'dirname' yields once a
     relative path runs out of components; without this second base case the
     recursion would never terminate.  */
  if (strcmp (directory, "/") == 0 || strcmp (directory, ".") == 0)
    return;

  /* 'dirname' may modify its argument, hence the stack-allocated copy.  */
  char *parent = dirname (strdupa (directory));
  mkdir_p (parent);

  if (mkdir (directory, 0700) < 0 && errno != EEXIST)
    assert_perror (errno);
}
89
/* Recursively delete DIRECTORY and everything below it.  A DIRECTORY that
   does not exist is silently ignored; any other failure aborts via
   'assert_perror'.  */
static void
rm_rf (const char *directory)
{
  DIR *stream = opendir (directory);
  if (stream == NULL)
    {
      /* Never hand a NULL stream to 'readdir'.  A missing directory is
         tolerated, just like the 'rmdir' call below tolerates ENOENT.  */
      if (errno != ENOENT)
        assert_perror (errno);
      return;
    }

  for (struct dirent *entry = readdir (stream);
       entry != NULL;
       entry = readdir (stream))
    {
      if (strcmp (entry->d_name, ".") == 0
          || strcmp (entry->d_name, "..") == 0)
        continue;

      char *full = concat (directory, entry->d_name);

      if (unlink (full) < 0)
        {
          if (errno == EISDIR)
            /* FULL is a directory: recurse (we expect a shallow directory
               structure so there's little risk of stack overflow.)  */
            rm_rf (full);
          else
            assert_perror (errno);
        }

      free (full);
    }

  closedir (stream);

  if (rmdir (directory) < 0 && errno != ENOENT)
    assert_perror (errno);
}
125
/* Make TARGET a read-only, recursive bind-mount of SOURCE.  Take into account
   ENTRY's type, which corresponds to SOURCE: a directory mount point is
   created for directories, an empty file for anything else.  Return zero on
   success and non-zero on failure, with 'errno' set by the failing call.  */
static int
bind_mount (const char *source, const struct dirent *entry,
            const char *target)
{
  if (entry->d_type == DT_DIR)
    {
      int err = mkdir (target, 0700);
      if (err != 0)
        return err;
    }
  else
    {
      /* With O_CREAT, 'open' requires an explicit mode argument; omitting
         it leaves the new file's permissions indeterminate.  */
      int fd = open (target, O_WRONLY | O_CREAT, 0600);
      if (fd >= 0)
        close (fd);
      /* If the mount point could not be created, 'mount' below fails and
         reports the error to the caller.  */
    }

  return mount (source, target, "none",
                MS_BIND | MS_REC | MS_RDONLY, NULL);
}
144
145 #if HAVE_EXEC_WITH_LOADER
146
/* Create TARGET as a symbolic link pointing to SOURCE and return the result
   of 'symlink'.  ENTRY is not examined; it is accepted so that this function
   has the same signature as 'bind_mount'.  */
static int
make_symlink (const char *source, const struct dirent *entry,
              const char *target)
{
  (void) entry;

  int status = symlink (source, target);
  return status;
}
154
155 #endif
156
/* Mirror with FIRMLINK all the top-level entries in SOURCE to TARGET.
   Symbolic links are re-created with the same contents; every other entry is
   handed to FIRMLINK, which receives the source file name, the directory
   entry, and the name to create under TARGET, and returns zero on
   success.  */
static void
mirror_directory (const char *source, const char *target,
                  int (* firmlink) (const char *, const struct dirent *,
                                    const char *))
{
  DIR *stream = opendir (source);
  if (stream == NULL)
    {
      /* Never hand a NULL stream to 'readdir'.  */
      assert_perror (errno);
      return;
    }

  for (struct dirent *entry = readdir (stream);
       entry != NULL;
       entry = readdir (stream))
    {
      /* XXX: Some file systems may not report a useful 'd_type'.  Ignore them
         for now.  */
      assert (entry->d_type != DT_UNKNOWN);

      if (strcmp (entry->d_name, ".") == 0
          || strcmp (entry->d_name, "..") == 0)
        continue;

      char *abs_source = concat (source, entry->d_name);
      char *new_entry = concat (target, entry->d_name);

      if (entry->d_type == DT_LNK)
        {
          /* Named so that it does not shadow the TARGET parameter.  */
          char link_target[PATH_MAX];

          /* 'readlink' does not NUL-terminate, hence the '- 1' and the
             explicit terminator below.  Unreadable or empty links are
             skipped.  */
          ssize_t length = readlink (abs_source, link_target,
                                     sizeof link_target - 1);
          if (length > 0)
            {
              link_target[length] = '\0';
              if (symlink (link_target, new_entry) < 0)
                assert_perror (errno);
            }
        }
      else
        {
          /* Create the mount point.  */
          int err = firmlink (abs_source, entry, new_entry);

          /* A failure is fatal only for non-directory entries; a directory
             that cannot be mirrored is skipped.
             NOTE(review): the comment that used to sit here said the
             opposite (keep going when a non-directory entry fails) --
             confirm which behavior is intended.  */
          if (err != 0 && entry->d_type != DT_DIR)
            assert_perror (errno);
        }

      /* Release the names on every path; the symlink branch used to leak
         them.  */
      free (new_entry);
      free (abs_source);
    }

  closedir (stream);
}
212
/* Write a one-entry user/group ID map for process PID to /proc/PID/FILE,
   mapping ID to itself.  See user_namespaces(7).  Failure to open or write
   the file aborts via 'assert_perror'.  */
static void
write_id_map (pid_t pid, const char *file, int id)
{
  char mapping[100];
  char proc_path[100];

  /* A single-entry map: ID inside the namespace is ID outside, for a range
     of one.  */
  int mapping_len = snprintf (mapping, sizeof mapping, "%d %d 1\n", id, id);
  snprintf (proc_path, sizeof proc_path, "/proc/%d/%s", pid, file);

  int map_fd = open (proc_path, O_WRONLY);
  if (map_fd < 0)
    assert_perror (errno);

  if (write (map_fd, mapping, mapping_len) < 0)
    assert_perror (errno);

  close (map_fd);
}
235
/* Write "deny" to /proc/PID/setgroups so that setgroups(2) is disallowed for
   process PID.  Failure to open or write the file aborts via
   'assert_perror'.  */
static void
disallow_setgroups (pid_t pid)
{
  static const char verdict[] = "deny";
  char setgroups_path[100];

  snprintf (setgroups_path, sizeof setgroups_path,
            "/proc/%d/setgroups", pid);

  int setgroups_fd = open (setgroups_path, O_WRONLY);
  if (setgroups_fd < 0)
    assert_perror (errno);

  /* The terminating NUL is written as well, five bytes in total.  */
  if (write (setgroups_fd, verdict, sizeof verdict) < 0)
    assert_perror (errno);

  close (setgroups_fd);
}
254
255 /* Run the wrapper program in a separate mount user namespace. Return only
256 upon failure. */
257 static void
258 exec_in_user_namespace (const char *store, int argc, char *argv[])
259 {
260 /* Spawn @WRAPPED_PROGRAM@ in a separate namespace where STORE is
261 bind-mounted in the right place. */
262 int err, is_tmpfs;
263 char *new_root = mkdtemp (strdup ("/tmp/guix-exec-XXXXXX"));
264 char *new_store = concat (new_root, original_store);
265 char *cwd = get_current_dir_name ();
266
267 /* Become the new parent of grand-children when their parent dies. */
268 prctl (PR_SET_CHILD_SUBREAPER, 1);
269
270 /* Optionally, make NEW_ROOT a tmpfs. That way, if we have to leave it
271 behind because there are sub-processes still running when this wrapper
272 exits, it's OK. */
273 err = mount ("none", new_root, "tmpfs", 0, NULL);
274 is_tmpfs = (err == 0);
275
276 /* Create a child with separate namespaces and set up bind-mounts from
277 there. That way, bind-mounts automatically disappear when the child
278 exits, which simplifies cleanup for the parent. Note: clone is more
279 convenient than fork + unshare since the parent can directly write
280 the child uid_map/gid_map files. */
281 pid_t child = syscall (SYS_clone, SIGCHLD | CLONE_NEWNS | CLONE_NEWUSER,
282 NULL, NULL, NULL);
283 switch (child)
284 {
285 case 0:
286 /* Note: Due to <https://bugzilla.kernel.org/show_bug.cgi?id=183461>
287 we cannot make NEW_ROOT a tmpfs (which would have saved the need
288 for 'rm_rf'.) */
289 mirror_directory ("/", new_root, bind_mount);
290 mkdir_p (new_store);
291 err = mount (store, new_store, "none", MS_BIND | MS_REC | MS_RDONLY,
292 NULL);
293 if (err < 0)
294 assert_perror (errno);
295
296 chdir (new_root);
297 err = chroot (new_root);
298 if (err < 0)
299 assert_perror (errno);
300
301 /* Change back to where we were before chroot'ing. */
302 chdir (cwd);
303
304 int err = execv ("@WRAPPED_PROGRAM@", argv);
305 if (err < 0)
306 assert_perror (errno);
307 break;
308
309 case -1:
310 /* Failure: user namespaces not supported. */
311 fprintf (stderr, "%s: error: 'clone' failed: %m\n", argv[0]);
312 rm_rf (new_root);
313 free (new_root);
314 break;
315
316 default:
317 {
318 /* Map the current user/group ID in the child's namespace (the
319 default is to get the "overflow UID", i.e., the UID of
320 "nobody"). We must first disallow 'setgroups' for that
321 process. */
322 disallow_setgroups (child);
323 write_id_map (child, "uid_map", getuid ());
324 write_id_map (child, "gid_map", getgid ());
325
326 int status, status_other;
327 waitpid (child, &status, 0);
328
329 chdir ("/"); /* avoid EBUSY */
330 if (is_tmpfs)
331 {
332 /* NEW_ROOT lives on in child processes and we no longer need it
333 to exist as an empty directory in the global namespace. */
334 umount (new_root);
335 rmdir (new_root);
336 }
337 /* Check whether there are child processes left. If there are none,
338 we can remove NEW_ROOT just fine. Conversely, if there are
339 processes left (for example because this wrapper's child forked),
340 we have to leave NEW_ROOT behind so that those processes can still
341 access their root file system (XXX). */
342 else if (waitpid (-1 , &status_other, WNOHANG) == -1)
343 rm_rf (new_root);
344
345 free (new_root);
346
347 if (WIFEXITED (status))
348 exit (WEXITSTATUS (status));
349 else
350 /* Abnormal termination cannot really be reproduced, so exit
351 with 255. */
352 exit (255);
353 }
354 }
355 }
356
357 \f
358 #ifdef PROOT_PROGRAM
359
360 /* Execute the wrapped program with PRoot, passing it ARGC and ARGV, and
361 "bind-mounting" STORE in the right place. */
362 static void
363 exec_with_proot (const char *store, int argc, char *argv[])
364 {
365 int proot_specific_argc = 4;
366 int proot_argc = argc + proot_specific_argc;
367 char *proot_argv[proot_argc + 1], *proot;
368 char bind_spec[strlen (store) + 1 + sizeof original_store];
369
370 strcpy (bind_spec, store);
371 strcat (bind_spec, ":");
372 strcat (bind_spec, original_store);
373
374 proot = concat (store, PROOT_PROGRAM);
375
376 proot_argv[0] = proot;
377 proot_argv[1] = "-b";
378 proot_argv[2] = bind_spec;
379 proot_argv[3] = "@WRAPPED_PROGRAM@";
380
381 for (int i = 0; i < argc; i++)
382 proot_argv[i + proot_specific_argc] = argv[i + 1];
383
384 proot_argv[proot_argc] = NULL;
385
386 /* Seccomp support seems to invariably lead to segfaults; disable it by
387 default. */
388 setenv ("PROOT_NO_SECCOMP", "1", 0);
389
390 int err = execv (proot, proot_argv);
391 if (err < 0)
392 assert_perror (errno);
393 }
394
395 #endif
396
397 \f
398 #if HAVE_EXEC_WITH_LOADER
399
400 /* Traverse PATH, a NULL-terminated string array, and return a colon-separated
401 search path where each item of PATH has been relocated to STORE. The
402 result is malloc'd. */
403 static char *
404 relocated_search_path (const char *path[], const char *store)
405 {
406 char *new_path;
407 size_t size = 0;
408
409 for (size_t i = 0; path[i] != NULL; i++)
410 size += strlen (store) + strlen (path[i]) + 1; /* upper bound */
411
412 new_path = xmalloc (size + 1);
413 new_path[0] = '\0';
414
415 for (size_t i = 0; path[i] != NULL; i++)
416 {
417 if (strncmp (path[i], original_store,
418 sizeof original_store - 1) == 0)
419 {
420 strcat (new_path, store);
421 strcat (new_path, path[i] + sizeof original_store - 1);
422 }
423 else
424 strcat (new_path, path[i]); /* possibly $ORIGIN */
425
426 strcat (new_path, ":");
427 }
428
429 new_path[strlen (new_path) - 1] = '\0'; /* Remove trailing colon. */
430
431 return new_path;
432 }
433
/* Join PATH1 and PATH2 with a colon in between.  When PATH1 is empty, PATH2
   itself is returned (not a copy); otherwise the result is malloc'd.  The
   result is thus only potentially malloc'd.  */
static char *
concat_paths (const char *path1, const char *path2)
{
  /* Nothing to prepend: hand back PATH2 as is.  */
  if (*path1 == '\0')
    return (char *) path2;

  size_t len1 = strlen (path1);
  size_t len2 = strlen (path2);

  /* One extra byte for the colon and one for the terminating NUL.  */
  char *joined = xmalloc (len1 + 1 + len2 + 1);

  memcpy (joined, path1, len1);
  joined[len1] = ':';
  memcpy (joined + len1 + 1, path2, len2 + 1);

  return joined;
}
450
451 /* Execute the wrapped program by invoking the loader (ld.so) directly,
452 passing it the audit module and preloading libfakechroot.so. */
453 static void
454 exec_with_loader (const char *store, int argc, char *argv[])
455 {
456 static const char *audit_library_path[] = LOADER_AUDIT_RUNPATH;
457 char *loader = concat (store,
458 PROGRAM_INTERPRETER + sizeof original_store);
459 size_t loader_specific_argc = 8;
460 size_t loader_argc = argc + loader_specific_argc;
461 char *loader_argv[loader_argc + 1];
462 loader_argv[0] = argv[0];
463 loader_argv[1] = "--audit";
464 loader_argv[2] = concat (store,
465 LOADER_AUDIT_MODULE + sizeof original_store);
466
467 /* The audit module depends on libc.so and libgcc_s.so so honor
468 AUDIT_LIBRARY_PATH. Additionally, honor $LD_LIBRARY_PATH if set. */
469 loader_argv[3] = "--library-path";
470 loader_argv[4] =
471 concat_paths (getenv ("LD_LIBRARY_PATH") ?: "",
472 relocated_search_path (audit_library_path, store));
473
474 loader_argv[5] = "--preload";
475 loader_argv[6] = concat (store,
476 FAKECHROOT_LIBRARY + sizeof original_store);
477 loader_argv[7] = concat (store,
478 "@WRAPPED_PROGRAM@" + sizeof original_store);
479
480 for (size_t i = 0; i < argc; i++)
481 loader_argv[i + loader_specific_argc] = argv[i + 1];
482
483 loader_argv[loader_argc] = NULL;
484
485 /* Set up the root directory. */
486 int err;
487 char *new_root = mkdtemp (strdup ("/tmp/guix-exec-XXXXXX"));
488 mirror_directory ("/", new_root, make_symlink);
489
490 /* 'mirror_directory' created a symlink for the ancestor of ORIGINAL_STORE,
491 typically "/gnu". Remove that entry so we can create NEW_STORE
492 below. */
493 const char *slash = strchr (original_store + 1, '/');
494 const char *top = slash != NULL
495 ? strndupa (original_store, slash - original_store)
496 : original_store;
497 char *new_store_top = concat (new_root, top);
498 unlink (new_store_top);
499
500 /* Now create the store under NEW_ROOT. */
501 char *new_store = concat (new_root, original_store);
502 char *new_store_parent = dirname (strdup (new_store));
503 mkdir_p (new_store_parent);
504 err = symlink (store, new_store);
505 if (err < 0)
506 assert_perror (errno);
507
508 #ifdef GCONV_DIRECTORY
509 /* Tell libc where to find its gconv modules. This is necessary because
510 gconv uses non-interposable 'open' calls. */
511 char *gconv_path = concat (store,
512 GCONV_DIRECTORY + sizeof original_store);
513 setenv ("GCONV_PATH", gconv_path, 1);
514 free (gconv_path);
515 #endif
516
517 setenv ("FAKECHROOT_BASE", new_root, 1);
518
519 /* Become the new parent of grand-children when their parent dies. */
520 prctl (PR_SET_CHILD_SUBREAPER, 1);
521
522 pid_t child = fork ();
523 switch (child)
524 {
525 case 0:
526 err = execv (loader, loader_argv);
527 if (err < 0)
528 assert_perror (errno);
529 exit (EXIT_FAILURE);
530 break;
531
532 case -1:
533 assert_perror (errno);
534 exit (EXIT_FAILURE);
535 break;
536
537 default:
538 {
539 int status, status_other;
540 waitpid (child, &status, 0);
541
542 /* If there are child processes still running, leave NEW_ROOT around
543 so they can still access it. XXX: In that case NEW_ROOT is left
544 behind. */
545 if (waitpid (-1 , &status_other, WNOHANG) == -1)
546 {
547 chdir ("/"); /* avoid EBUSY */
548 rm_rf (new_root);
549 }
550
551 free (new_root);
552 close (2); /* flushing stderr should be silent */
553
554 if (WIFEXITED (status))
555 exit (WEXITSTATUS (status));
556 else
557 /* Abnormal termination cannot really be reproduced, so exit
558 with 255. */
559 exit (255);
560 }
561 }
562 }
563
564 #endif
565
566 \f
/* Execution engines.  */

/* One way of running the wrapped program, selectable by name.  */
struct engine
{
  /* Name by which the engine is looked up.  */
  const char *name;

  /* Entry point, called with the relocated store directory followed by the
     wrapper's own ARGC and ARGV.  */
  void (* exec) (const char *store, int argc, char **argv);
};
574
/* Switch stderr to full buffering, backed by a static 4096-byte buffer, so
   that what is written to it is held back until the stream is flushed or the
   buffer fills up.  */
static void
buffer_stderr (void)
{
  enum { STDERR_BUFFER_SIZE = 4096 };
  static char storage[STDERR_BUFFER_SIZE];

  (void) setvbuf (stderr, storage, _IOFBF, STDERR_BUFFER_SIZE);
}
581
582 /* The default engine: choose a robust method. */
583 static void
584 exec_default (const char *store, int argc, char *argv[])
585 {
586 /* Buffer stderr so that nothing's displayed if 'exec_in_user_namespace'
587 fails but 'exec_with_proot' works. */
588 buffer_stderr ();
589
590 exec_in_user_namespace (store, argc, argv);
591 #ifdef PROOT_PROGRAM
592 exec_with_proot (store, argc, argv);
593 #endif
594 }
595
596 /* The "performance" engine: choose performance over robustness. */
597 static void
598 exec_performance (const char *store, int argc, char *argv[])
599 {
600 buffer_stderr ();
601
602 exec_in_user_namespace (store, argc, argv);
603 #if HAVE_EXEC_WITH_LOADER
604 exec_with_loader (store, argc, argv);
605 #endif
606 }
607
608 /* List of supported engines. */
609 static const struct engine engines[] =
610 {
611 { "default", exec_default },
612 { "performance", exec_performance },
613 { "userns", exec_in_user_namespace },
614 #ifdef PROOT_PROGRAM
615 { "proot", exec_with_proot },
616 #endif
617 #if HAVE_EXEC_WITH_LOADER
618 { "fakechroot", exec_with_loader },
619 #endif
620 { NULL, NULL }
621 };
622
623 /* Return the "execution engine" to use. */
624 static const struct engine *
625 execution_engine (void)
626 {
627 const char *str = getenv ("GUIX_EXECUTION_ENGINE");
628
629 if (str == NULL)
630 str = "default";
631
632 try:
633 for (const struct engine *engine = engines;
634 engine->name != NULL;
635 engine++)
636 {
637 if (strcmp (engine->name, str) == 0)
638 return engine;
639 }
640
641 fprintf (stderr, "%s: unsupported Guix execution engine; ignoring\n",
642 str);
643 str = "default";
644 goto try;
645 }
646
647 \f
648 int
649 main (int argc, char *argv[])
650 {
651 ssize_t size;
652 char self[PATH_MAX];
653 size = readlink ("/proc/self/exe", self, sizeof self - 1);
654 assert (size > 0);
655
656 /* SELF is something like "/home/ludo/.local/gnu/store/…-foo/bin/ls" and we
657 want to extract "/home/ludo/.local/gnu/store". */
658 size_t index = strlen (self)
659 - strlen (WRAPPER_PROGRAM) + strlen (original_store);
660 char *store = strdup (self);
661 store[index] = '\0';
662
663 struct stat statbuf;
664
665 /* If STORE is already at the "right" place, we can execute
666 @WRAPPED_PROGRAM@ right away. This is not just an optimization: it's
667 needed when running one of these wrappers from within an unshare'd
668 namespace, because 'unshare' fails with EPERM in that context. */
669 if (strcmp (store, original_store) != 0
670 && lstat ("@WRAPPED_PROGRAM@", &statbuf) != 0)
671 {
672 const struct engine *engine = execution_engine ();
673 engine->exec (store, argc, argv);
674
675 /* If we reach this point, that's because ENGINE failed to do the
676 job. */
677 fprintf (stderr, "\
678 This may be because \"user namespaces\" are not supported on this system.\n\
679 Consequently, we cannot run '@WRAPPED_PROGRAM@',\n\
680 unless you move it to the '@STORE_DIRECTORY@' directory.\n\
681 \n\
682 Please refer to the 'guix pack' documentation for more information.\n");
683 return EXIT_FAILURE;
684 }
685
686 /* The executable is available under @STORE_DIRECTORY@, so we can now
687 execute it. */
688 int err = execv ("@WRAPPED_PROGRAM@", argv);
689 if (err < 0)
690 assert_perror (errno);
691
692 return EXIT_FAILURE;
693 }