1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2014, 2015, 2016, 2017 Ludovic Courtès <ludo@gnu.org>
4 ;;; This file is part of GNU Guix.
6 ;;; GNU Guix is free software; you can redistribute it and/or modify it
7 ;;; under the terms of the GNU General Public License as published by
8 ;;; the Free Software Foundation; either version 3 of the License, or (at
9 ;;; your option) any later version.
11 ;;; GNU Guix is distributed in the hope that it will be useful, but
12 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;;; GNU General Public License for more details.
16 ;;; You should have received a copy of the GNU General Public License
17 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19 (define-module (guix scripts offload)
20 #:use-module (ssh key)
21 #:use-module (ssh auth)
22 #:use-module (ssh session)
23 #:use-module (ssh channel)
24 #:use-module (ssh popen)
25 #:use-module (ssh dist)
26 #:use-module (ssh dist node)
27 #:use-module (ssh version)
28 #:use-module (guix config)
29 #:use-module (guix records)
30 #:use-module (guix ssh)
31 #:use-module (guix store)
32 #:use-module (guix derivations)
33 #:use-module ((guix serialization)
34 #:select (nar-error? nar-error-file))
35 #:use-module (guix nar)
36 #:use-module (guix utils)
37 #:use-module ((guix build syscalls) #:select (fcntl-flock))
38 #:use-module ((guix build utils) #:select (which mkdir-p))
39 #:use-module (guix ui)
40 #:use-module (srfi srfi-1)
41 #:use-module (srfi srfi-11)
42 #:use-module (srfi srfi-26)
43 #:use-module (srfi srfi-34)
44 #:use-module (srfi srfi-35)
45 #:use-module (ice-9 popen)
46 #:use-module (ice-9 rdelim)
47 #:use-module (ice-9 match)
48 #:use-module (ice-9 regex)
49 #:use-module (ice-9 format)
50 #:use-module (ice-9 binary-ports)
51 #:export (build-machine
57 ;;; Attempt to offload builds to the machines listed in
58 ;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
59 ;;; retrieving the build output(s) over SSH upon success.
61 ;;; This command should not be used directly; instead, it is called on-demand
62 ;;; by the daemon, unless it was started with '--no-build-hook' or a client
63 ;;; inhibited build hooks.
68 (define-record-type* <build-machine>
69 build-machine make-build-machine
71 (name build-machine-name) ; string
72 (port build-machine-port ; number
74 (system build-machine-system) ; string
75 (user build-machine-user) ; string
76 (private-key build-machine-private-key ; file name
77 (default (user-openssh-private-key)))
78 (host-key build-machine-host-key) ; string
79 (compression build-machine-compression ; string
80 (default "zlib@openssh.com,zlib"))
81 (compression-level build-machine-compression-level ;integer
83 (daemon-socket build-machine-daemon-socket ; string
84 (default "/var/guix/daemon-socket/socket"))
85 (parallel-builds build-machine-parallel-builds ; number
87 (speed build-machine-speed ; inexact real
89 (features build-machine-features ; list of strings
92 (define-record-type* <build-requirements>
93 build-requirements make-build-requirements
95 (system build-requirements-system) ; string
96 (features build-requirements-features ; list of strings
100 ;; File that lists machines available as build slaves.
101 (string-append %config-directory "/machines.scm"))
103 (define (user-openssh-private-key)
104 "Return the user's default SSH private key, or #f if it could not be
106 (and=> (getenv "HOME")
107 (cut string-append <> "/.ssh/id_rsa")))
110 ;; Module in which the machine description file is loaded.
111 (let ((module (make-fresh-user-module)))
112 (module-use! module (resolve-interface '(guix scripts offload)))
115 (define* (build-machines #:optional (file %machine-file))
116 "Read the list of build machines from FILE and return it."
119 ;; Avoid ABI incompatibility with the <build-machine> record.
120 (set! %fresh-auto-compile #t)
122 (save-module-excursion
124 (set-current-module %user-module)
125 (primitive-load file))))
128 (('system-error . rest)
129 (let ((err (system-error-errno args)))
130 ;; Silently ignore missing file since this is a common case.
133 (leave (G_ "failed to open machine file '~a': ~a~%")
134 file (strerror err)))))
135 (('syntax-error proc message properties form . rest)
136 (let ((loc (source-properties->location properties)))
137 (leave (G_ "~a: ~a~%")
138 (location->string loc) message)))
140 (leave (G_ "failed to load machine file '~a': ~s~%")
143 (define (host-key->type+key host-key)
144 "Destructure HOST-KEY, an OpenSSH host key string, and return two values:
145 its key type as a symbol, and the actual base64-encoded string."
146 (define (type->symbol type)
147 (and (string-prefix? "ssh-" type)
148 (string->symbol (string-drop type 4))))
150 (match (string-tokenize host-key)
152 (values (type->symbol type) key))
154 (values (type->symbol type) key))))
156 (define (private-key-from-file* file)
157 "Like 'private-key-from-file', but raise an error that 'with-error-handling'
158 can interpret meaningfully."
159 (catch 'guile-ssh-error
161 (private-key-from-file file))
162 (lambda (key proc str . rest)
164 (&message (message (format #f (G_ "failed to load SSH \
165 private key from '~a': ~a")
168 (define (open-ssh-session machine)
169 "Open an SSH session for MACHINE and return it. Throw an error on failure."
170 (let ((private (private-key-from-file* (build-machine-private-key machine)))
171 (public (public-key-from-file
172 (string-append (build-machine-private-key machine)
174 (session (make-session #:user (build-machine-user machine)
175 #:host (build-machine-name machine)
176 #:port (build-machine-port machine)
177 #:timeout 10 ;seconds
178 ;; #:log-verbosity 'protocol
179 #:identity (build-machine-private-key machine)
181 ;; By default libssh reads ~/.ssh/known_hosts
182 ;; and uses that to adjust its choice of cipher
183 ;; suites, which changes the type of host key
184 ;; that the server sends (RSA vs. Ed25519,
185 ;; etc.). Opt for something reproducible and
186 ;; stateless instead.
187 #:knownhosts "/dev/null"
189 ;; We need lightweight compression when
190 ;; exchanging full archives.
192 (build-machine-compression machine)
194 (build-machine-compression-level machine))))
195 (match (connect! session)
197 ;; Authenticate the server. XXX: Guile-SSH 0.10.1 doesn't know about
198 ;; ed25519 keys and 'get-key-type' returns #f in that case.
199 (let-values (((server) (get-server-public-key session))
200 ((type key) (host-key->type+key
201 (build-machine-host-key machine))))
202 (unless (and (or (not (get-key-type server))
203 (eq? (get-key-type server) type))
204 (string=? (public-key->string server) key))
205 ;; Key mismatch: something's wrong. XXX: It could be that the server
206 ;; provided its Ed25519 key when we where expecting its RSA key.
207 (leave (G_ "server at '~a' returned host key '~a' of type '~a' \
208 instead of '~a' of type '~a'~%")
209 (build-machine-name machine)
210 (public-key->string server) (get-key-type server)
213 (let ((auth (userauth-public-key! session private)))
214 (unless (eq? 'success auth)
215 (disconnect! session)
216 (leave (G_ "SSH public key authentication failed for '~a': ~a~%")
217 (build-machine-name machine) (get-error session))))
221 ;; Connection failed or timeout expired.
222 (leave (G_ "failed to connect to '~a': ~a~%")
223 (build-machine-name machine) (get-error session))))))
230 (define (lock-file file)
231 "Wait and acquire an exclusive lock on FILE. Return an open port."
232 (mkdir-p (dirname file))
233 (let ((port (open-file file "w0")))
234 (fcntl-flock port 'write-lock)
237 (define (unlock-file lock)
239 (fcntl-flock lock 'unlock)
243 (define-syntax-rule (with-file-lock file exp ...)
244 "Wait to acquire a lock on FILE and evaluate EXP in that context."
245 (let ((port (lock-file file)))
252 (unlock-file port)))))
254 (define-syntax-rule (with-machine-lock machine hint exp ...)
255 "Wait to acquire MACHINE's exclusive lock for HINT, and evaluate EXP in that
257 (with-file-lock (machine-lock-file machine hint)
261 (define (machine-slot-file machine slot)
262 "Return the file name of MACHINE's file for SLOT."
263 ;; For each machine we have a bunch of files representing each build slot.
264 ;; When choosing a build machine, we attempt to get an exclusive lock on one
265 ;; of these; if we fail, that means all the build slots are already taken.
266 ;; Inspired by Nix's build-remote.pl.
267 (string-append (string-append %state-directory "/offload/"
268 (build-machine-name machine)
269 "/" (number->string slot))))
271 (define (acquire-build-slot machine)
272 "Attempt to acquire a build slot on MACHINE. Return the port representing
273 the slot, or #f if none is available.
275 This mechanism allows us to set a hard limit on the number of simultaneous
276 connections allowed to MACHINE."
277 (mkdir-p (dirname (machine-slot-file machine 0)))
278 (with-machine-lock machine 'slots
280 (let ((port (open-file (machine-slot-file machine slot)
284 (fcntl-flock port 'write-lock #:wait? #f)
286 (format (current-error-port)
287 "process ~a acquired build slot '~a'~%"
288 (getpid) (port-filename port))
291 ;; PORT is already locked by another process.
294 (iota (build-machine-parallel-builds machine)))))
296 (define (release-build-slot slot)
297 "Release SLOT, a build slot as returned as by 'acquire-build-slot'."
305 (define (build-log-port)
306 "Return the default port where build logs should be sent. The default is
307 file descriptor 4, which is open by the daemon before running the offload
309 (let ((port (fdopen 4 "w0")))
310 ;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
311 (set-port-revealed! port 1)
314 (define* (transfer-and-offload drv machine
318 (max-silent-time 3600)
321 "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of
322 INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
325 (open-ssh-session machine))
328 (connect-to-remote-daemon session
329 (build-machine-daemon-socket machine)))
331 (set-build-options store
332 #:print-build-trace print-build-trace?
333 #:max-silent-time max-silent-time
334 #:timeout build-timeout)
336 ;; Protect DRV from garbage collection.
337 (add-temp-root store (derivation-file-name drv))
340 (send-files local (cons (derivation-file-name drv) inputs) store
341 #:log-port (current-output-port)))
342 (format (current-error-port) "offloading '~a' to '~a'...~%"
343 (derivation-file-name drv) (build-machine-name machine))
344 (format (current-error-port) "@ build-remote ~a ~a~%"
345 (derivation-file-name drv) (build-machine-name machine))
347 (guard (c ((nix-protocol-error? c)
348 (format (current-error-port)
349 (G_ "derivation '~a' offloaded to '~a' failed: ~a~%")
350 (derivation-file-name drv)
351 (build-machine-name machine)
352 (nix-protocol-error-message c))
353 ;; Use exit code 100 for a permanent build failure. The daemon
354 ;; interprets other non-zero codes as transient build failures.
355 (primitive-exit 100)))
356 (parameterize ((current-build-output-port (build-log-port)))
357 (build-derivations store (list drv))))
359 (retrieve-files* outputs store)
360 (format (current-error-port) "done with offloaded '~a'~%"
361 (derivation-file-name drv)))
363 (define (retrieve-files* files remote)
364 "Retrieve FILES from REMOTE and import them using 'restore-file-set'."
365 (let-values (((port count)
366 (file-retrieval-port files remote)))
367 (format #t (N_ "retrieving ~a store item from '~a'...~%"
368 "retrieving ~a store items from '~a'...~%" count)
369 count (remote-store-host remote))
371 ;; We cannot use the 'import-paths' RPC here because we already
372 ;; hold the locks for FILES.
373 (let ((result (restore-file-set port
374 #:log-port (current-error-port)
384 (define (machine-matches? machine requirements)
385 "Return #t if MACHINE matches REQUIREMENTS."
386 (and (string=? (build-requirements-system requirements)
387 (build-machine-system machine))
389 (build-requirements-features requirements)
390 (build-machine-features machine))))
392 (define (machine-load machine)
393 "Return the load of MACHINE, divided by the number of parallel builds
394 allowed on MACHINE. Return +∞ if MACHINE is unreachable."
395 ;; Note: This procedure is costly since it creates a new SSH session.
396 (match (false-if-exception (open-ssh-session machine))
397 ((? session? session)
398 (let* ((pipe (open-remote-pipe* session OPEN_READ
399 "cat" "/proc/loadavg"))
400 (line (read-line pipe)))
403 (if (eof-object? line)
404 +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
405 (match (string-tokenize line)
406 ((one five fifteen . x)
407 (let* ((raw (string->number five))
408 (jobs (build-machine-parallel-builds machine))
409 (normalized (/ raw jobs)))
410 (format (current-error-port) "load on machine '~a' is ~s\
412 (build-machine-name machine) raw normalized)
415 +inf.0))))) ;something's fishy about MACHINE, so avoid it
417 +inf.0))) ;failed to connect to MACHINE, so avoid it
419 (define (machine-lock-file machine hint)
420 "Return the name of MACHINE's lock file for HINT."
421 (string-append %state-directory "/offload/"
422 (build-machine-name machine)
423 "." (symbol->string hint) ".lock"))
425 (define (machine-choice-lock-file)
426 "Return the name of the file used as a lock when choosing a build machine."
427 (string-append %state-directory "/offload/machine-choice.lock"))
431 ;; List of acquired build slots (open ports).
434 (define (choose-build-machine machines)
435 "Return the best machine among MACHINES, or #f."
437 ;; Proceed like this:
438 ;; 1. Acquire the global machine-choice lock.
439 ;; 2. For all MACHINES, attempt to acquire a build slot, and filter out
440 ;; those machines for which we failed.
441 ;; 3. Choose the best machine among those that are left.
442 ;; 4. Release the previously-acquired build slots of the other machines.
443 ;; 5. Release the global machine-choice lock.
445 (with-file-lock (machine-choice-lock-file)
446 (define machines+slots+loads
447 (filter-map (lambda (machine)
448 ;; Call 'machine-load' from here to make sure it is called
449 ;; only once per machine (it is expensive).
450 (let ((slot (acquire-build-slot machine)))
452 (list machine slot (machine-load machine)))))
455 (define (undecorate pred)
458 ((machine1 slot1 load1)
460 ((machine2 slot2 load2)
461 (pred machine1 load1 machine2 load2)))))))
463 (define (machine-less-loaded-or-faster? m1 l1 m2 l2)
464 ;; Return #t if M1 is either less loaded or faster than M2, with L1
465 ;; being the load of M1 and L2 the load of M2. (This relation defines a
466 ;; total order on machines.)
467 (> (/ (build-machine-speed m1) (+ 1 l1))
468 (/ (build-machine-speed m2) (+ 1 l2))))
470 (let loop ((machines+slots+loads
471 (sort machines+slots+loads
472 (undecorate machine-less-loaded-or-faster?))))
473 (match machines+slots+loads
474 (((best slot load) others ...)
475 ;; Return the best machine unless it's already overloaded.
478 (((machines slots loads) ...)
479 ;; Release slots from the uninteresting machines.
480 (for-each release-build-slot slots)
482 ;; Prevent SLOT from being GC'd.
483 (set! %slots (cons slot %slots))
486 ;; BEST is overloaded, so try the next one.
487 (release-build-slot slot)
491 (define* (process-request wants-local? system drv features
493 print-build-trace? (max-silent-time 3600)
495 "Process a request to build DRV."
496 (let* ((local? (and wants-local? (string=? system (%current-system))))
497 (reqs (build-requirements
499 (features features)))
500 (candidates (filter (cut machine-matches? <> reqs)
504 ;; We'll never be able to match REQS.
505 (display "# decline\n"))
507 (let ((machine (choose-build-machine candidates)))
510 ;; Offload DRV to MACHINE.
511 (display "# accept\n")
512 (let ((inputs (string-tokenize (read-line)))
513 (outputs (string-tokenize (read-line))))
514 (transfer-and-offload drv machine
517 #:max-silent-time max-silent-time
518 #:build-timeout build-timeout
519 #:print-build-trace? print-build-trace?)))
521 ;; Not now, all the machines are busy.
522 (display "# postpone\n")))))))
526 ;;; Installation tests.
529 (define (assert-node-repl node name)
530 "Bail out if NODE is not running Guile."
531 (match (node-guile-version node)
533 (leave (G_ "Guile could not be started on '~a'~%")
536 ;; Note: The version string already contains the word "Guile".
537 (info (G_ "'~a' is running ~a~%")
538 name (node-guile-version node)))))
540 (define (assert-node-has-guix node name)
541 "Bail out if NODE lacks the (guix) module, or if its daemon is not running."
542 (match (node-eval node
546 (add-text-to-store store "test"
547 "Hello, build machine!"))))
549 (info (G_ "Guix is usable on '~a' (test returned ~s)~%")
552 (leave (G_ "failed to use Guix module on '~a' (test returned ~s)~%")
555 (define %random-state
557 (seed->random-state (logxor (getpid) (car (gettimeofday))))))
559 (define* (nonce #:optional (name (gethostname)))
560 (string-append name "-"
561 (number->string (random 1000000 (force %random-state)))))
563 (define (assert-node-can-import node name daemon-socket)
564 "Bail out if NODE refuses to import our archives."
565 (let ((session (node-session node)))
567 (let* ((item (add-text-to-store store "export-test" (nonce)))
568 (remote (connect-to-remote-daemon session daemon-socket)))
570 (send-files local (list item) remote))
572 (if (valid-path? remote item)
573 (info (G_ "'~a' successfully imported '~a'~%")
575 (leave (G_ "'~a' was not properly imported on '~a'~%")
578 (define (assert-node-can-export node name daemon-socket)
579 "Bail out if we cannot import signed archives from NODE."
580 (let* ((session (node-session node))
581 (remote (connect-to-remote-daemon session daemon-socket))
582 (item (add-text-to-store remote "import-test" (nonce name))))
584 (if (and (retrieve-files store (list item) remote)
585 (valid-path? store item))
586 (info (G_ "successfully imported '~a' from '~a'~%")
588 (leave (G_ "failed to import '~a' from '~a'~%")
591 (define (check-machine-availability machine-file pred)
592 "Check that each machine matching PRED in MACHINE-FILE is usable as a build
594 (define (build-machine=? m1 m2)
595 (and (string=? (build-machine-name m1) (build-machine-name m2))
596 (= (build-machine-port m1) (build-machine-port m2))))
598 ;; A given build machine may appear several times (e.g., once for
599 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
600 (let ((machines (filter pred
601 (delete-duplicates (build-machines machine-file)
603 (info (G_ "testing ~a build machines defined in '~a'...~%")
604 (length machines) machine-file)
605 (let* ((names (map build-machine-name machines))
606 (sockets (map build-machine-daemon-socket machines))
607 (sessions (map open-ssh-session machines))
608 (nodes (map make-node sessions)))
609 (for-each assert-node-repl nodes names)
610 (for-each assert-node-has-guix nodes names)
611 (for-each assert-node-can-import nodes names sockets)
612 (for-each assert-node-can-export nodes names sockets))))
619 (define (guix-offload . args)
620 (define request-line-rx
621 ;; The request format. See 'tryBuildHook' method in build.cc.
622 (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))
625 (char-set-complement (char-set #\,)))
627 ;; Make sure $HOME really corresponds to the current user. This is
628 ;; necessary since lsh uses that to determine the location of the yarrow
629 ;; seed file, and fails if it's owned by someone else.
630 (and=> (passwd:dir (getpw (getuid)))
631 (cut setenv "HOME" <>))
633 ;; We rely on protocol-level compression from libssh to optimize large data
634 ;; transfers. Warn if it's missing.
635 (unless (zlib-support?)
636 (warning (G_ "Guile-SSH lacks zlib support"))
637 (warning (G_ "data transfers will *not* be compressed!")))
640 ((system max-silent-time print-build-trace? build-timeout)
641 (let ((max-silent-time (string->number max-silent-time))
642 (build-timeout (string->number build-timeout))
643 (print-build-trace? (string=? print-build-trace? "1")))
644 (parameterize ((%current-system system))
645 (let loop ((line (read-line)))
646 (unless (eof-object? line)
647 (cond ((regexp-exec request-line-rx line)
651 (process-request (equal? (match:substring match 1) "1")
652 (match:substring match 2) ; system
653 (call-with-input-file
654 (match:substring match 3)
657 (match:substring match 4) not-coma)
658 #:print-build-trace? print-build-trace?
659 #:max-silent-time max-silent-time
660 #:build-timeout build-timeout))))
662 (leave (G_ "invalid request line: ~s~%") line)))
663 (loop (read-line)))))))
666 (let-values (((file pred)
670 (compose (cut string-match regexp <>)
671 build-machine-name)))
672 ((file) (values file (const #t)))
673 (() (values %machine-file (const #t)))
674 (x (leave (G_ "wrong number of arguments~%"))))))
675 (check-machine-availability (or file %machine-file) pred))))
677 (show-version-and-exit "guix offload"))
679 (format #t (G_ "Usage: guix offload SYSTEM PRINT-BUILD-TRACE
680 Process build offload requests written on the standard input, possibly
681 offloading builds to the machines listed in '~a'.~%")
684 This tool is meant to be used internally by 'guix-daemon'.\n"))
685 (show-bug-report-information))
687 (leave (G_ "invalid arguments: ~{~s ~}~%") x))))
690 ;;; eval: (put 'with-machine-lock 'scheme-indent-function 2)
691 ;;; eval: (put 'with-file-lock 'scheme-indent-function 1)
692 ;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
695 ;;; offload.scm ends here