1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2014, 2015, 2016, 2017 Ludovic Courtès <ludo@gnu.org>
3 ;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
5 ;;; This file is part of GNU Guix.
7 ;;; GNU Guix is free software; you can redistribute it and/or modify it
8 ;;; under the terms of the GNU General Public License as published by
9 ;;; the Free Software Foundation; either version 3 of the License, or (at
10 ;;; your option) any later version.
12 ;;; GNU Guix is distributed in the hope that it will be useful, but
13 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;;; GNU General Public License for more details.
17 ;;; You should have received a copy of the GNU General Public License
18 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
20 (define-module (guix scripts offload)
21 #:use-module (ssh key)
22 #:use-module (ssh auth)
23 #:use-module (ssh session)
24 #:use-module (ssh channel)
25 #:use-module (ssh popen)
26 #:use-module (ssh dist)
27 #:use-module (ssh dist node)
28 #:use-module (ssh version)
29 #:use-module (guix config)
30 #:use-module (guix records)
31 #:use-module (guix ssh)
32 #:use-module (guix store)
33 #:use-module (guix derivations)
34 #:use-module ((guix serialization)
35 #:select (nar-error? nar-error-file))
36 #:use-module (guix nar)
37 #:use-module (guix utils)
38 #:use-module ((guix build syscalls)
39 #:select (fcntl-flock set-thread-name))
40 #:use-module ((guix build utils) #:select (which mkdir-p))
41 #:use-module (guix ui)
42 #:use-module (srfi srfi-1)
43 #:use-module (srfi srfi-11)
44 #:use-module (srfi srfi-26)
45 #:use-module (srfi srfi-34)
46 #:use-module (srfi srfi-35)
47 #:use-module (ice-9 popen)
48 #:use-module (ice-9 rdelim)
49 #:use-module (ice-9 match)
50 #:use-module (ice-9 regex)
51 #:use-module (ice-9 format)
52 #:use-module (ice-9 binary-ports)
53 #:export (build-machine
59 ;;; Attempt to offload builds to the machines listed in
60 ;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
61 ;;; retrieving the build output(s) over SSH upon success.
63 ;;; This command should not be used directly; instead, it is called on-demand
64 ;;; by the daemon, unless it was started with '--no-build-hook' or a client
65 ;;; inhibited build hooks.
70 (define-record-type* <build-machine>
71 build-machine make-build-machine
73 (name build-machine-name) ; string
74 (port build-machine-port ; number
76 (system build-machine-system) ; string
77 (user build-machine-user) ; string
78 (private-key build-machine-private-key ; file name
79 (default (user-openssh-private-key)))
80 (host-key build-machine-host-key) ; string
81 (compression build-machine-compression ; string
82 (default "zlib@openssh.com,zlib"))
83 (compression-level build-machine-compression-level ;integer
85 (daemon-socket build-machine-daemon-socket ; string
86 (default "/var/guix/daemon-socket/socket"))
87 (parallel-builds build-machine-parallel-builds ; number
89 (speed build-machine-speed ; inexact real
91 (features build-machine-features ; list of strings
94 (define-record-type* <build-requirements>
95 build-requirements make-build-requirements
97 (system build-requirements-system) ; string
98 (features build-requirements-features ; list of strings
(define %machine-file
  ;; Default file name of the machine description file: "machines.scm" in
  ;; the Guix configuration directory.
  (let ((directory %config-directory))
    (string-append directory "/machines.scm")))
105 (define (user-openssh-private-key)
106 "Return the user's default SSH private key, or #f if it could not be
108 (and=> (getenv "HOME")
109 (cut string-append <> "/.ssh/id_rsa")))
112 ;; Module in which the machine description file is loaded.
113 (let ((module (make-fresh-user-module)))
114 (module-use! module (resolve-interface '(guix scripts offload)))
117 (define* (build-machines #:optional (file %machine-file))
118 "Read the list of build machines from FILE and return it."
121 ;; Avoid ABI incompatibility with the <build-machine> record.
122 ;; (set! %fresh-auto-compile #t)
124 (save-module-excursion
126 (set-current-module %user-module)
127 (primitive-load file))))
130 (('system-error . rest)
131 (let ((err (system-error-errno args)))
132 ;; Silently ignore missing file since this is a common case.
135 (leave (G_ "failed to open machine file '~a': ~a~%")
136 file (strerror err)))))
137 (('syntax-error proc message properties form . rest)
138 (let ((loc (source-properties->location properties)))
139 (leave (G_ "~a: ~a~%")
140 (location->string loc) message)))
142 (leave (G_ "failed to load machine file '~a': ~s~%")
145 (define (host-key->type+key host-key)
146 "Destructure HOST-KEY, an OpenSSH host key string, and return two values:
147 its key type as a symbol, and the actual base64-encoded string."
148 (define (type->symbol type)
149 (and (string-prefix? "ssh-" type)
150 (string->symbol (string-drop type 4))))
152 (match (string-tokenize host-key)
154 (values (type->symbol type) key))
156 (values (type->symbol type) key))))
158 (define (private-key-from-file* file)
159 "Like 'private-key-from-file', but raise an error that 'with-error-handling'
160 can interpret meaningfully."
161 (catch 'guile-ssh-error
163 (private-key-from-file file))
164 (lambda (key proc str . rest)
166 (&message (message (format #f (G_ "failed to load SSH \
167 private key from '~a': ~a")
170 (define (open-ssh-session machine)
171 "Open an SSH session for MACHINE and return it. Throw an error on failure."
172 (let ((private (private-key-from-file* (build-machine-private-key machine)))
173 (public (public-key-from-file
174 (string-append (build-machine-private-key machine)
176 (session (make-session #:user (build-machine-user machine)
177 #:host (build-machine-name machine)
178 #:port (build-machine-port machine)
179 #:timeout 10 ;seconds
180 ;; #:log-verbosity 'protocol
181 #:identity (build-machine-private-key machine)
183 ;; By default libssh reads ~/.ssh/known_hosts
184 ;; and uses that to adjust its choice of cipher
185 ;; suites, which changes the type of host key
186 ;; that the server sends (RSA vs. Ed25519,
187 ;; etc.). Opt for something reproducible and
188 ;; stateless instead.
189 #:knownhosts "/dev/null"
191 ;; We need lightweight compression when
192 ;; exchanging full archives.
194 (build-machine-compression machine)
196 (build-machine-compression-level machine))))
197 (match (connect! session)
199 ;; Authenticate the server. XXX: Guile-SSH 0.10.1 doesn't know about
200 ;; ed25519 keys and 'get-key-type' returns #f in that case.
201 (let-values (((server) (get-server-public-key session))
202 ((type key) (host-key->type+key
203 (build-machine-host-key machine))))
204 (unless (and (or (not (get-key-type server))
205 (eq? (get-key-type server) type))
206 (string=? (public-key->string server) key))
207 ;; Key mismatch: something's wrong. XXX: It could be that the server
208 ;; provided its Ed25519 key when we were expecting its RSA key.
209 (leave (G_ "server at '~a' returned host key '~a' of type '~a' \
210 instead of '~a' of type '~a'~%")
211 (build-machine-name machine)
212 (public-key->string server) (get-key-type server)
215 (let ((auth (userauth-public-key! session private)))
216 (unless (eq? 'success auth)
217 (disconnect! session)
218 (leave (G_ "SSH public key authentication failed for '~a': ~a~%")
219 (build-machine-name machine) (get-error session))))
223 ;; Connection failed or timeout expired.
224 (leave (G_ "failed to connect to '~a': ~a~%")
225 (build-machine-name machine) (get-error session))))))
232 (define (lock-file file)
233 "Wait and acquire an exclusive lock on FILE. Return an open port."
234 (mkdir-p (dirname file))
235 (let ((port (open-file file "w0")))
236 (fcntl-flock port 'write-lock)
239 (define (unlock-file lock)
241 (fcntl-flock lock 'unlock)
245 (define-syntax-rule (with-file-lock file exp ...)
246 "Wait to acquire a lock on FILE and evaluate EXP in that context."
247 (let ((port (lock-file file)))
254 (unlock-file port)))))
256 (define-syntax-rule (with-machine-lock machine hint exp ...)
257 "Wait to acquire MACHINE's exclusive lock for HINT, and evaluate EXP in that
259 (with-file-lock (machine-lock-file machine hint)
(define (machine-slot-file machine slot)
  "Return the file name of MACHINE's file for SLOT, a number.  The result is
the state directory followed by \"/offload/\", MACHINE's name, \"/\", and SLOT
written as a decimal number."
  ;; For each machine we have a bunch of files representing each build slot.
  ;; When choosing a build machine, we attempt to get an exclusive lock on one
  ;; of these; if we fail, that means all the build slots are already taken.
  ;; Inspired by Nix's build-remote.pl.
  ;;
  ;; A single 'string-append' call is enough here; wrapping it in a second,
  ;; one-argument 'string-append' only copied the string a second time.
  (string-append %state-directory "/offload/"
                 (build-machine-name machine)
                 "/" (number->string slot)))
273 (define (acquire-build-slot machine)
274 "Attempt to acquire a build slot on MACHINE. Return the port representing
275 the slot, or #f if none is available.
277 This mechanism allows us to set a hard limit on the number of simultaneous
278 connections allowed to MACHINE."
279 (mkdir-p (dirname (machine-slot-file machine 0)))
280 (with-machine-lock machine 'slots
282 (let ((port (open-file (machine-slot-file machine slot)
286 (fcntl-flock port 'write-lock #:wait? #f)
288 (format (current-error-port)
289 "process ~a acquired build slot '~a'~%"
290 (getpid) (port-filename port))
293 ;; PORT is already locked by another process.
296 (iota (build-machine-parallel-builds machine)))))
298 (define (release-build-slot slot)
299 "Release SLOT, a build slot as returned by 'acquire-build-slot'."
307 (define (build-log-port)
308 "Return the default port where build logs should be sent. The default is
309 file descriptor 4, which is open by the daemon before running the offload
311 (let ((port (fdopen 4 "w0")))
312 ;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
313 (set-port-revealed! port 1)
316 (define* (transfer-and-offload drv machine
320 (max-silent-time 3600)
323 "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of
324 INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
327 (open-ssh-session machine))
330 (connect-to-remote-daemon session
331 (build-machine-daemon-socket machine)))
333 (set-build-options store
334 #:print-build-trace print-build-trace?
335 #:max-silent-time max-silent-time
336 #:timeout build-timeout)
338 ;; Protect DRV from garbage collection.
339 (add-temp-root store (derivation-file-name drv))
342 (send-files local (cons (derivation-file-name drv) inputs) store
343 #:log-port (current-output-port)))
344 (format (current-error-port) "offloading '~a' to '~a'...~%"
345 (derivation-file-name drv) (build-machine-name machine))
346 (format (current-error-port) "@ build-remote ~a ~a~%"
347 (derivation-file-name drv) (build-machine-name machine))
349 (guard (c ((nix-protocol-error? c)
350 (format (current-error-port)
351 (G_ "derivation '~a' offloaded to '~a' failed: ~a~%")
352 (derivation-file-name drv)
353 (build-machine-name machine)
354 (nix-protocol-error-message c))
355 ;; Use exit code 100 for a permanent build failure. The daemon
356 ;; interprets other non-zero codes as transient build failures.
357 (primitive-exit 100)))
358 (parameterize ((current-build-output-port (build-log-port)))
359 (build-derivations store (list drv))))
361 (retrieve-files* outputs store
363 ;; We cannot use the 'import-paths' RPC here because we
364 ;; already hold the locks for FILES.
367 (restore-file-set port
368 #:log-port (current-error-port)
371 (format (current-error-port) "done with offloaded '~a'~%"
372 (derivation-file-name drv)))
379 (define (machine-matches? machine requirements)
380 "Return #t if MACHINE matches REQUIREMENTS."
381 (and (string=? (build-requirements-system requirements)
382 (build-machine-system machine))
384 (build-requirements-features requirements)
385 (build-machine-features machine))))
387 (define (machine-load machine)
388 "Return the load of MACHINE, divided by the number of parallel builds
389 allowed on MACHINE. Return +∞ if MACHINE is unreachable."
390 ;; Note: This procedure is costly since it creates a new SSH session.
391 (match (false-if-exception (open-ssh-session machine))
392 ((? session? session)
393 (let* ((pipe (open-remote-pipe* session OPEN_READ
394 "cat" "/proc/loadavg"))
395 (line (read-line pipe)))
397 (disconnect! session)
399 (if (eof-object? line)
400 +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
401 (match (string-tokenize line)
402 ((one five fifteen . x)
403 (let* ((raw (string->number five))
404 (jobs (build-machine-parallel-builds machine))
405 (normalized (/ raw jobs)))
406 (format (current-error-port) "load on machine '~a' is ~s\
408 (build-machine-name machine) raw normalized)
411 +inf.0))))) ;something's fishy about MACHINE, so avoid it
413 +inf.0))) ;failed to connect to MACHINE, so avoid it
(define (machine-lock-file machine hint)
  "Return the name of the lock file of MACHINE for HINT, a symbol.  The name
is made of the state directory, \"/offload/\", MACHINE's name, a dot, HINT, and
the \".lock\" suffix."
  (let ((machine-name (build-machine-name machine))
        (hint-name    (symbol->string hint)))
    (string-append %state-directory "/offload/"
                   machine-name "." hint-name ".lock")))
(define (machine-choice-lock-file)
  "Return the file name of the lock that is taken while a build machine is
being chosen; it lives under the \"offload\" sub-directory of the state
directory."
  (let ((relative-name "/offload/machine-choice.lock"))
    (string-append %state-directory relative-name)))
(define (random-seed)
  "Return an integer obtained by XOR-ing the PID of the current process with
the seconds part of the current time of day."
  (let ((pid     (getpid))
        (seconds (car (gettimeofday))))
    (logxor pid seconds)))
429 (let ((state (seed->random-state (random-seed))))
431 "Return LST shuffled (using the Fisher-Yates algorithm.)"
432 (define vec (list->vector lst))
433 (let loop ((result '())
434 (i (vector-length vec)))
437 (let* ((j (random i state))
438 (val (vector-ref vec j)))
439 (vector-set! vec j (vector-ref vec (- i 1)))
440 (loop (cons val result) (- i 1))))))))
442 (define (choose-build-machine machines)
443 "Return two values: the best machine among MACHINES and its build
444 slot (which must later be released with 'release-build-slot'), or #f and #f."
446 ;; Proceed like this:
447 ;; 1. Acquire the global machine-choice lock.
448 ;; 2. For all MACHINES, attempt to acquire a build slot, and filter out
449 ;; those machines for which we failed.
450 ;; 3. Choose the best machine among those that are left.
451 ;; 4. Release the previously-acquired build slots of the other machines.
452 ;; 5. Release the global machine-choice lock.
454 (with-file-lock (machine-choice-lock-file)
455 (define machines+slots
456 (filter-map (lambda (machine)
457 (let ((slot (acquire-build-slot machine)))
458 (and slot (list machine slot))))
461 (define (undecorate pred)
467 (pred machine1 machine2)))))))
(define (machine-faster? m1 m2)
  ;; Return #t when the speed of M1 is strictly greater than that of M2.
  (let ((speed1 (build-machine-speed m1))
        (speed2 (build-machine-speed m2)))
    (< speed2 speed1)))
474 (let loop ((machines+slots
475 (sort machines+slots (undecorate machine-faster?))))
476 (match machines+slots
477 (((best slot) others ...)
478 ;; Return the best machine unless it's already overloaded.
479 ;; Note: We call 'machine-load' only as a last resort because it is
480 ;; too costly to call it once for every machine.
481 (if (< (machine-load best) 2.)
483 (((machines slots) ...)
484 ;; Release slots from the uninteresting machines.
485 (for-each release-build-slot slots)
487 ;; The caller must keep SLOT to protect it from GC and to
488 ;; eventually release it.
491 ;; BEST is overloaded, so try the next one.
492 (release-build-slot slot)
497 (define* (process-request wants-local? system drv features
499 print-build-trace? (max-silent-time 3600)
501 "Process a request to build DRV."
502 (let* ((local? (and wants-local? (string=? system (%current-system))))
503 (reqs (build-requirements
505 (features features)))
506 (candidates (filter (cut machine-matches? <> reqs)
510 ;; We'll never be able to match REQS.
511 (display "# decline\n"))
513 (let-values (((machine slot)
514 (choose-build-machine candidates)))
519 ;; Offload DRV to MACHINE.
520 (display "# accept\n")
521 (let ((inputs (string-tokenize (read-line)))
522 (outputs (string-tokenize (read-line))))
523 (transfer-and-offload drv machine
526 #:max-silent-time max-silent-time
527 #:build-timeout build-timeout
529 print-build-trace?)))
531 (release-build-slot slot)))
533 ;; Not now, all the machines are busy.
534 (display "# postpone\n")))))))
538 ;;; Installation tests.
541 (define (assert-node-repl node name)
542 "Bail out if NODE is not running Guile."
543 (match (node-guile-version node)
545 (leave (G_ "Guile could not be started on '~a'~%")
548 ;; Note: The version string already contains the word "Guile".
549 (info (G_ "'~a' is running ~a~%")
550 name (node-guile-version node)))))
552 (define (assert-node-has-guix node name)
553 "Bail out if NODE lacks the (guix) module, or if its daemon is not running."
554 (match (node-eval node
558 (add-text-to-store store "test"
559 "Hello, build machine!"))))
561 (info (G_ "Guix is usable on '~a' (test returned ~s)~%")
564 (leave (G_ "failed to use Guix module on '~a' (test returned ~s)~%")
567 (define %random-state
569 (seed->random-state (logxor (getpid) (car (gettimeofday))))))
(define* (nonce #:optional (name (gethostname)))
  "Return a string made of NAME, a dash, and a random non-negative integer
below 1000000 drawn using %RANDOM-STATE.  NAME defaults to the name of the
local host."
  (let ((number (random 1000000 (force %random-state))))
    (string-append name "-" (number->string number))))
575 (define (assert-node-can-import node name daemon-socket)
576 "Bail out if NODE refuses to import our archives."
577 (let ((session (node-session node)))
579 (let* ((item (add-text-to-store store "export-test" (nonce)))
580 (remote (connect-to-remote-daemon session daemon-socket)))
582 (send-files local (list item) remote))
584 (if (valid-path? remote item)
585 (info (G_ "'~a' successfully imported '~a'~%")
587 (leave (G_ "'~a' was not properly imported on '~a'~%")
590 (define (assert-node-can-export node name daemon-socket)
591 "Bail out if we cannot import signed archives from NODE."
592 (let* ((session (node-session node))
593 (remote (connect-to-remote-daemon session daemon-socket))
594 (item (add-text-to-store remote "import-test" (nonce name))))
596 (if (and (retrieve-files store (list item) remote)
597 (valid-path? store item))
598 (info (G_ "successfully imported '~a' from '~a'~%")
600 (leave (G_ "failed to import '~a' from '~a'~%")
603 (define (check-machine-availability machine-file pred)
604 "Check that each machine matching PRED in MACHINE-FILE is usable as a build
(define (build-machine=? m1 m2)
  ;; M1 and M2 are considered the same machine when they have the same name
  ;; and the same port.  Ports are compared only once the names match.
  (if (string=? (build-machine-name m1) (build-machine-name m2))
      (= (build-machine-port m1) (build-machine-port m2))
      #f))
610 ;; A given build machine may appear several times (e.g., once for
611 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
612 (let ((machines (filter pred
613 (delete-duplicates (build-machines machine-file)
615 (info (G_ "testing ~a build machines defined in '~a'...~%")
616 (length machines) machine-file)
617 (let* ((names (map build-machine-name machines))
618 (sockets (map build-machine-daemon-socket machines))
619 (sessions (map open-ssh-session machines))
620 (nodes (map make-node sessions)))
621 (for-each assert-node-repl nodes names)
622 (for-each assert-node-has-guix nodes names)
623 (for-each assert-node-can-import nodes names sockets)
624 (for-each assert-node-can-export nodes names sockets))))
626 (define (check-machine-status machine-file pred)
627 "Print the load of each machine matching PRED in MACHINE-FILE."
(define (build-machine=? m1 m2)
  ;; Return true when M1 and M2 have both the same name and the same port.
  ;; A name mismatch settles the answer without looking at the ports.
  (cond ((not (string=? (build-machine-name m1) (build-machine-name m2)))
         #f)
        (else
         (= (build-machine-port m1) (build-machine-port m2)))))
632 ;; A given build machine may appear several times (e.g., once for
633 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
634 (let ((machines (filter pred
635 (delete-duplicates (build-machines machine-file)
637 (info (G_ "getting status of ~a build machines defined in '~a'...~%")
638 (length machines) machine-file)
639 (for-each (lambda (machine)
640 (let* ((node (make-node (open-ssh-session machine)))
641 (uts (node-eval node '(uname))))
642 (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\
643 host name: ~a~% normalized load: ~a~%"
644 (build-machine-name machine)
645 (utsname:sysname uts) (utsname:release uts)
646 (utsname:machine uts)
647 (utsname:nodename uts)
648 (parameterize ((current-error-port (%make-void-port "rw+")))
649 (machine-load machine)))))
657 (define (guix-offload . args)
658 (define request-line-rx
659 ;; The request format. See 'tryBuildHook' method in build.cc.
660 (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))
663 (char-set-complement (char-set #\,)))
665 ;; Make sure $HOME really corresponds to the current user. This is
666 ;; necessary since lsh uses that to determine the location of the yarrow
667 ;; seed file, and fails if it's owned by someone else.
668 (and=> (passwd:dir (getpw (getuid)))
669 (cut setenv "HOME" <>))
671 ;; We rely on protocol-level compression from libssh to optimize large data
672 ;; transfers. Warn if it's missing.
673 (unless (zlib-support?)
674 (warning (G_ "Guile-SSH lacks zlib support"))
675 (warning (G_ "data transfers will *not* be compressed!")))
678 ((system max-silent-time print-build-trace? build-timeout)
679 (let ((max-silent-time (string->number max-silent-time))
680 (build-timeout (string->number build-timeout))
681 (print-build-trace? (string=? print-build-trace? "1")))
682 (set-thread-name "guix offload")
683 (parameterize ((%current-system system))
684 (let loop ((line (read-line)))
685 (unless (eof-object? line)
686 (cond ((regexp-exec request-line-rx line)
690 (process-request (equal? (match:substring match 1) "1")
691 (match:substring match 2) ; system
692 (read-derivation-from-file
693 (match:substring match 3))
695 (match:substring match 4) not-coma)
696 #:print-build-trace? print-build-trace?
697 #:max-silent-time max-silent-time
698 #:build-timeout build-timeout))))
700 (leave (G_ "invalid request line: ~s~%") line)))
701 (loop (read-line)))))))
704 (let-values (((file pred)
708 (compose (cut string-match regexp <>)
709 build-machine-name)))
710 ((file) (values file (const #t)))
711 (() (values %machine-file (const #t)))
712 (x (leave (G_ "wrong number of arguments~%"))))))
713 (check-machine-availability (or file %machine-file) pred))))
716 (let-values (((file pred)
720 (compose (cut string-match regexp <>)
721 build-machine-name)))
722 ((file) (values file (const #t)))
723 (() (values %machine-file (const #t)))
724 (x (leave (G_ "wrong number of arguments~%"))))))
725 (check-machine-status (or file %machine-file) pred))))
727 (show-version-and-exit "guix offload"))
729 (format #t (G_ "Usage: guix offload SYSTEM PRINT-BUILD-TRACE
730 Process build offload requests written on the standard input, possibly
731 offloading builds to the machines listed in '~a'.~%")
734 This tool is meant to be used internally by 'guix-daemon'.\n"))
735 (show-bug-report-information))
737 (leave (G_ "invalid arguments: ~{~s ~}~%") x))))
740 ;;; eval: (put 'with-machine-lock 'scheme-indent-function 2)
741 ;;; eval: (put 'with-file-lock 'scheme-indent-function 1)
742 ;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
745 ;;; offload.scm ends here