environment: Create /etc/group in containers.
[jackhill/guix/guix.git] / guix / scripts / offload.scm
1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
3 ;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
4 ;;;
5 ;;; This file is part of GNU Guix.
6 ;;;
7 ;;; GNU Guix is free software; you can redistribute it and/or modify it
8 ;;; under the terms of the GNU General Public License as published by
9 ;;; the Free Software Foundation; either version 3 of the License, or (at
10 ;;; your option) any later version.
11 ;;;
12 ;;; GNU Guix is distributed in the hope that it will be useful, but
13 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;;; GNU General Public License for more details.
16 ;;;
17 ;;; You should have received a copy of the GNU General Public License
18 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19
20 (define-module (guix scripts offload)
21 #:use-module (ssh key)
22 #:use-module (ssh auth)
23 #:use-module (ssh session)
24 #:use-module (ssh channel)
25 #:use-module (ssh popen)
26 #:use-module (ssh version)
27 #:use-module (guix config)
28 #:use-module (guix records)
29 #:use-module (guix ssh)
30 #:use-module (guix store)
31 #:use-module (guix inferior)
32 #:use-module (guix derivations)
33 #:use-module ((guix serialization)
34 #:select (nar-error? nar-error-file))
35 #:use-module (guix nar)
36 #:use-module (guix utils)
37 #:use-module ((guix build syscalls)
38 #:select (fcntl-flock set-thread-name))
39 #:use-module ((guix build utils) #:select (which mkdir-p))
40 #:use-module (guix ui)
41 #:use-module (srfi srfi-1)
42 #:use-module (srfi srfi-11)
43 #:use-module (srfi srfi-26)
44 #:use-module (srfi srfi-34)
45 #:use-module (srfi srfi-35)
46 #:use-module (ice-9 popen)
47 #:use-module (ice-9 rdelim)
48 #:use-module (ice-9 match)
49 #:use-module (ice-9 regex)
50 #:use-module (ice-9 format)
51 #:use-module (ice-9 binary-ports)
52 #:export (build-machine
53 build-requirements
54 guix-offload))
55
56 ;;; Commentary:
57 ;;;
58 ;;; Attempt to offload builds to the machines listed in
59 ;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
60 ;;; retrieving the build output(s) over SSH upon success.
61 ;;;
62 ;;; This command should not be used directly; instead, it is called on-demand
63 ;;; by the daemon, unless it was started with '--no-build-hook' or a client
64 ;;; inhibited build hooks.
65 ;;;
66 ;;; Code:
67
68
;; A <build-machine> describes one remote host that builds can be offloaded
;; to.  The machine file loaded by 'build-machines' is expected to evaluate to
;; a list of such records.  Fields without a default value are mandatory.
(define-record-type* <build-machine>
  build-machine make-build-machine
  build-machine?
  (name            build-machine-name)            ; string; host given to SSH
  (port            build-machine-port             ; number; SSH port
                   (default 22))
  (system          build-machine-system)          ; string; matched against requests
  (user            build-machine-user)            ; string; SSH user
  (private-key     build-machine-private-key      ; file name
                   (default (user-openssh-private-key)))
  (host-key        build-machine-host-key)        ; string; see 'host-key->type+key'
  (compression     build-machine-compression      ; string
                   (default "zlib@openssh.com,zlib"))
  (compression-level build-machine-compression-level ;integer
                     (default 3))
  (daemon-socket   build-machine-daemon-socket    ; string
                   (default "/var/guix/daemon-socket/socket"))
  (parallel-builds build-machine-parallel-builds  ; number; count of build slots
                   (default 1))
  (speed           build-machine-speed            ; inexact real; higher is preferred
                   (default 1.0))
  (features        build-machine-features         ; list of strings
                   (default '())))
92
;; What a build request needs from a machine; compared against
;; <build-machine> records by 'machine-matches?'.
(define-record-type* <build-requirements>
  build-requirements make-build-requirements
  build-requirements?
  (system          build-requirements-system)     ; string
  (features        build-requirements-features    ; list of strings
                   (default '())))
99
(define %machine-file
  ;; Default file listing the machines that builds may be offloaded to; it
  ;; lives in the configuration directory.
  (string-append %config-directory "/machines.scm"))
103
(define (user-openssh-private-key)
  "Return the file name of the user's default SSH private key, derived from
the HOME environment variable, or #f when HOME is unset."
  (let ((home (getenv "HOME")))
    (and home
         (string-append home "/.ssh/id_rsa"))))
109
(define %user-module
  ;; Fresh module in which the machine description file is evaluated.  It
  ;; uses this module's public interface so that 'build-machine' is in scope
  ;; there.
  (let ((fresh     (make-fresh-user-module))
        (interface (resolve-interface '(guix scripts offload))))
    (module-use! fresh interface)
    fresh))
115
(define* (build-machines #:optional (file %machine-file))
  "Load FILE in '%user-module' and return the list of <build-machine> records
it evaluates to.  Return the empty list when FILE does not exist or does not
evaluate to such a list; exit with an error message on any other failure."
  (define (load-machines)
    ;; Avoid ABI incompatibility with the <build-machine> record.
    ;; (set! %fresh-auto-compile #t)
    (save-module-excursion
     (lambda ()
       (set-current-module %user-module)
       (let ((result (primitive-load file)))
         (match result
           (((? build-machine? machines) ...)
            machines)
           (_
            ;; Instead of crashing, assume the empty list.
            (warning (G_ "'~a' did not return a list of build machines; \
ignoring it~%")
                     file)
            '()))))))

  (define (handle-failure . args)
    ;; ARGS is the complete throw: the key followed by its arguments.
    (match args
      (('system-error . _)
       (let ((errno (system-error-errno args)))
         ;; Silently ignore missing file since this is a common case.
         (cond ((= ENOENT errno)
                '())
               (else
                (leave (G_ "failed to open machine file '~a': ~a~%")
                       file (strerror errno))))))
      (('syntax-error proc message properties form . _)
       (leave (G_ "~a: ~a~%")
              (location->string (source-properties->location properties))
              message))
      (_
       (leave (G_ "failed to load machine file '~a': ~s~%")
              file args))))

  (catch #t load-machines handle-failure))
151
(define (host-key->type+key host-key)
  "Destructure HOST-KEY, an OpenSSH host key string of the form \"TYPE KEY
[COMMENT]\", and return two values: its key type as a symbol (or #f when TYPE
lacks the \"ssh-\" prefix), and the actual base64-encoded string.  Raise a
'&message' condition when HOST-KEY has fewer than two fields."
  (define (type->symbol type)
    (and (string-prefix? "ssh-" type)
         (string->symbol (string-drop type 4))))

  (match (string-tokenize host-key)
    ;; Anything after the second field is the free-form comment, which may
    ;; itself contain white space; ignore it.
    ((type key . _)
     (values (type->symbol type) key))
    (_
     ;; Report malformed input in a way 'with-error-handling' can display,
     ;; rather than with a bare pattern-matching failure.
     (raise (condition
             (&message
              (message (format #f (G_ "invalid SSH host key: ~s")
                               host-key))))))))
164
(define (private-key-from-file* file)
  "Load the SSH private key stored in FILE, like 'private-key-from-file', but
turn a 'guile-ssh-error' into a '&message' condition that
'with-error-handling' can interpret meaningfully."
  (define (report-failure key proc str . rest)
    ;; STR is the error description carried by the 'guile-ssh-error' throw.
    (raise (condition
            (&message (message (format #f (G_ "failed to load SSH \
private key from '~a': ~a")
                                       file str))))))

  (catch 'guile-ssh-error
    (cut private-key-from-file file)
    report-failure))
176
(define (open-ssh-session machine)
  "Connect to MACHINE over SSH, compare the key the server presents with
MACHINE's recorded host key, authenticate with MACHINE's private key, and
return the session.  Exit with an error message if any of these steps fails."
  (define (check-host-key session)
    ;; Authenticate the server.  XXX: Guile-SSH 0.10.1 doesn't know about
    ;; ed25519 keys and 'get-key-type' returns #f in that case, which is why
    ;; an unknown type is not treated as a mismatch.
    (let-values (((server)   (get-server-public-key session))
                 ((type key) (host-key->type+key
                              (build-machine-host-key machine))))
      (let ((server-type (get-key-type server)))
        (when (or (and server-type (not (eq? server-type type)))
                  (not (string=? (public-key->string server) key)))
          ;; Key mismatch: something's wrong.  XXX: It could be that the
          ;; server provided its Ed25519 key when we were expecting its RSA
          ;; key.
          (leave (G_ "server at '~a' returned host key '~a' of type '~a' \
instead of '~a' of type '~a'~%")
                 (build-machine-name machine)
                 (public-key->string server) server-type
                 key type)))))

  (define (authenticate session private)
    ;; Public-key authentication; the session is closed before bailing out.
    (let ((outcome (userauth-public-key! session private)))
      (unless (eq? 'success outcome)
        (disconnect! session)
        (leave (G_ "SSH public key authentication failed for '~a': ~a~%")
               (build-machine-name machine) (get-error session)))))

  (let ((private (private-key-from-file* (build-machine-private-key machine)))
        ;; NOTE(review): PUBLIC is not referenced below; loading it only has
        ;; the effect of failing early when the '.pub' file cannot be read.
        ;; Confirm whether that is intended.
        (public  (public-key-from-file
                  (string-append (build-machine-private-key machine)
                                 ".pub")))
        (session (make-session #:user (build-machine-user machine)
                               #:host (build-machine-name machine)
                               #:port (build-machine-port machine)
                               #:timeout 10       ;seconds
                               ;; #:log-verbosity 'protocol
                               #:identity (build-machine-private-key machine)

                               ;; By default libssh reads ~/.ssh/known_hosts
                               ;; and uses that to adjust its choice of cipher
                               ;; suites, which changes the type of host key
                               ;; that the server sends (RSA vs. Ed25519,
                               ;; etc.).  Opt for something reproducible and
                               ;; stateless instead.
                               #:knownhosts "/dev/null"

                               ;; We need lightweight compression when
                               ;; exchanging full archives.
                               #:compression
                               (build-machine-compression machine)
                               #:compression-level
                               (build-machine-compression-level machine))))
    (match (connect! session)
      ('ok
       (check-host-key session)
       (authenticate session private)
       session)
      (_
       ;; Connection failed or timeout expired.
       (leave (G_ "failed to connect to '~a': ~a~%")
              (build-machine-name machine) (get-error session))))))
233
234 \f
235 ;;;
236 ;;; Synchronization.
237 ;;;
238
(define (lock-file file)
  "Create FILE's parent directories as needed, open FILE for writing, and block
until an exclusive lock on it is obtained.  Return the open port, which holds
the lock."
  (let ((directory (dirname file)))
    (mkdir-p directory)
    (let ((lock (open-file file "w0")))
      (fcntl-flock lock 'write-lock)
      lock)))
245
(define (unlock-file lock)
  "Release the lock held on LOCK, a port as returned by 'lock-file', close
LOCK, and return #t."
  (fcntl-flock lock 'unlock)
  (close-port lock)
  #t)
251
(define-syntax-rule (with-file-lock file exp ...)
  "Wait to acquire a lock on FILE, evaluate EXP... while holding it, and
release the lock when leaving EXP..., whether normally or not."
  (let ((lock (lock-file file)))
    (dynamic-wind
      (const #t)
      (lambda ()
        exp ...)
      (lambda ()
        (unlock-file lock)))))
262
(define (machine-slot-file machine slot)
  "Return the name of the lock file that stands for build slot number SLOT of
MACHINE."
  ;; Each machine has one file per build slot.  Choosing a machine means
  ;; taking an exclusive lock on one of these files; when none can be locked,
  ;; all the slots are taken.  Inspired by Nix's build-remote.pl.
  (string-append %state-directory "/offload/"
                 (build-machine-name machine)
                 "/" (number->string slot)))
272
(define (acquire-build-slot machine)
  "Try to lock one of MACHINE's build slot files without blocking.  Return the
port holding the lock, or #f when every slot is already taken.

This puts a hard limit on the number of simultaneous connections to MACHINE."
  (define (try-slot slot)
    ;; Return the locked port for SLOT, or #f if someone else holds it.
    (let ((port (open-file (machine-slot-file machine slot)
                           "w0")))
      (catch 'flock-error
        (lambda ()
          (fcntl-flock port 'write-lock #:wait? #f)
          ;; Got it!
          (format (current-error-port)
                  "process ~a acquired build slot '~a'~%"
                  (getpid) (port-filename port))
          port)
        (lambda _
          ;; PORT is already locked by another process.
          (close-port port)
          #f))))

  (mkdir-p (dirname (machine-slot-file machine 0)))

  ;; When several 'guix offload' processes run in parallel, there's a race
  ;; among them, but since they try the slots in the same order, we're fine.
  (any try-slot
       (iota (build-machine-parallel-builds machine))))
299
(define (release-build-slot slot)
  "Release SLOT, a build slot as returned by 'acquire-build-slot', by closing
its port."
  (close-port slot))
303
304 \f
305 ;;;
306 ;;; Offloading.
307 ;;;
308
(define (build-log-port)
  "Return the default port where build logs should be sent: an unbuffered
output port on file descriptor 4, which the daemon opens before running the
offload hook."
  (let ((port (fdopen 4 "w0")))
    ;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
    (set-port-revealed! port 1)
    port))
317
(define (node-guile-version node)
  "Return the result of evaluating '(version)' in NODE, an inferior."
  (inferior-eval '(version) node))
320
(define (node-free-disk-space node)
  "Ask NODE, an inferior, for the free disk space, in bytes, of the file
system holding the store directory, and return it."
  ;; The store prefix is resolved locally and spliced into the expression
  ;; that NODE evaluates.
  (let ((store-directory (%store-prefix)))
    (inferior-eval `(begin
                      (use-modules (guix build syscalls))
                      (free-disk-space ,store-directory))
                   node)))
327
(define* (transfer-and-offload drv machine
                               #:key
                               (inputs '())
                               (outputs '())
                               (max-silent-time 3600)
                               build-timeout
                               print-build-trace?)
  "Offload DRV to MACHINE.  Prior to the actual offloading, transfer all of
INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
MACHINE, then close the connections to MACHINE.

MAX-SILENT-TIME, BUILD-TIMEOUT and PRINT-BUILD-TRACE? are passed on as build
options to MACHINE's daemon.  When the remote build fails, the process exits:
with code 1 if MACHINE appears to be short on disk space, with code 100
otherwise."
  (define session
    (open-ssh-session machine))

  (define store
    (connect-to-remote-daemon session
                              (build-machine-daemon-socket machine)))

  (set-build-options store
                     #:print-build-trace print-build-trace?
                     #:max-silent-time max-silent-time
                     #:timeout build-timeout)

  ;; Protect DRV from garbage collection.
  (add-temp-root store (derivation-file-name drv))

  (with-store local
    (send-files local (cons (derivation-file-name drv) inputs) store
                #:log-port (current-output-port)))
  (format (current-error-port) "offloading '~a' to '~a'...~%"
          (derivation-file-name drv) (build-machine-name machine))
  (format (current-error-port) "@ build-remote ~a ~a~%"
          (derivation-file-name drv) (build-machine-name machine))

  (guard (c ((store-protocol-error? c)
             (format (current-error-port)
                     (G_ "derivation '~a' offloaded to '~a' failed: ~a~%")
                     (derivation-file-name drv)
                     (build-machine-name machine)
                     (store-protocol-error-message c))
             ;; Both steps are best-effort: SPACE is #f if the inferior could
             ;; not be started or queried.
             (let* ((inferior (false-if-exception (remote-inferior session)))
                    (space    (false-if-exception
                               (node-free-disk-space inferior))))

               (when inferior
                 (close-inferior inferior))

               ;; Use exit code 100 for a permanent build failure.  The daemon
               ;; interprets other non-zero codes as transient build failures.
               (if (and space (< space (* 10 (expt 2 20))))
                   (begin
                     (format (current-error-port)
                             (G_ "build failure may have been caused by lack \
of free disk space on '~a'~%")
                             (build-machine-name machine))
                     (primitive-exit 1))
                   (primitive-exit 100)))))
    (parameterize ((current-build-output-port (build-log-port)))
      (build-derivations store (list drv))))

  (retrieve-files* outputs store

                   ;; We cannot use the 'import-paths' RPC here because we
                   ;; already hold the locks for FILES.
                   #:import
                   (lambda (port)
                     (restore-file-set port
                                       #:log-port (current-error-port)
                                       #:lock? #f)))

  ;; This process serves one request after another, so release the remote
  ;; daemon connection and the SSH session instead of letting them
  ;; accumulate until exit.
  (close-connection store)
  (disconnect! session)

  (format (current-error-port) "done with offloaded '~a'~%"
          (derivation-file-name drv)))
399
400 \f
401 ;;;
402 ;;; Scheduling.
403 ;;;
404
(define (machine-matches? machine requirements)
  "Return true if MACHINE builds for the system named in REQUIREMENTS and
provides every feature that REQUIREMENTS lists."
  (let ((wanted-system   (build-requirements-system requirements))
        (wanted-features (build-requirements-features requirements)))
    (and (string=? wanted-system (build-machine-system machine))
         (lset<= string=?
                 wanted-features
                 (build-machine-features machine)))))
412
(define %minimum-disk-space
  ;; Free disk space, in bytes, below which a build machine is not offloaded
  ;; to, since the build would be bound to run out of space.
  (* 100 1024 1024))                              ;100 MiB
418
(define (node-load node)
  "Return the load on NODE, an inferior: the first field of its
/proc/loadavg, as a number.  Return +∞ if NODE is misbehaving, i.e. if it
gives no answer or an answer that cannot be parsed."
  (let ((line (inferior-eval '(begin
                                (use-modules (ice-9 rdelim))
                                (call-with-input-file "/proc/loadavg"
                                  read-string))
                             node)))
    (if (eof-object? line)
        +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
        (match (string-tokenize line)
          ((one five fifteen . _)
           ;; 'string->number' returns #f on garbage; callers compare the
           ;; result with '<', so never let #f escape.
           (or (string->number one) +inf.0))
          (_
           +inf.0)))))
433
(define (normalized-load machine load)
  "Divide LOAD by the number of parallel builds of MACHINE, log both values
on the error port, and return the quotient.  When LOAD is not a rational
number (for instance +∞), return it unchanged without logging anything."
  (if (rational? load)
      (let* ((jobs       (build-machine-parallel-builds machine))
             (normalized (/ load jobs)))
        ;; Keep the message on one line so that the space before the
        ;; parenthesis cannot get lost in a string continuation.
        (format (current-error-port)
                "load on machine '~a' is ~s (normalized: ~s)~%"
                (build-machine-name machine) load normalized)
        normalized)
      load))
444
(define (random-seed)
  "Return an integer seed that mixes the process ID with the current time in
seconds."
  (let ((pid (getpid))
        (now (car (gettimeofday))))
    (logxor pid now)))
447
(define shuffle
  ;; The random state is created once, when this definition is evaluated, and
  ;; shared by all calls.
  (let ((state (seed->random-state (random-seed))))
    (lambda (lst)
      "Return a list with the elements of LST in random order (using the
Fisher-Yates algorithm.)"
      (let ((pool (list->vector lst)))
        ;; REMAINING is the number of elements not yet drawn; they occupy the
        ;; first REMAINING cells of POOL.  Each step draws one at random and
        ;; moves the last undrawn element into the hole it leaves.
        (do ((remaining (vector-length pool) (- remaining 1))
             (shuffled '()
                       (let* ((pick (random remaining state))
                              (item (vector-ref pool pick)))
                         (vector-set! pool pick
                                      (vector-ref pool (- remaining 1)))
                         (cons item shuffled))))
            ((zero? remaining) shuffled))))))
461
(define (choose-build-machine machines)
  "Return two values: the best machine among MACHINES and its build
slot (which must later be released with 'release-build-slot'), or #f and #f."

  ;; Proceed like this:
  ;;   1. For all MACHINES, attempt to acquire a build slot, and filter out
  ;;      those machines for which we failed.
  ;;   2. Choose the best machine among those that are left.
  ;;   3. Release the previously-acquired build slots of the other machines.

  (define machines+slots
    ;; List of (MACHINE SLOT) pairs, in random order so that machines of
    ;; equal speed are not always tried in the same order.
    (filter-map (lambda (machine)
                  (let ((slot (acquire-build-slot machine)))
                    (and slot (list machine slot))))
                (shuffle machines)))

  (define (undecorate pred)
    ;; Lift PRED, a predicate on two machines, to (MACHINE SLOT) pairs.
    (match-lambda*
      (((machine1 slot1) (machine2 slot2))
       (pred machine1 machine2))))

  (define (machine-faster? m1 m2)
    ;; Return #t if M1 is faster than M2.
    (> (build-machine-speed m1)
       (build-machine-speed m2)))

  (let loop ((candidates
              (sort machines+slots (undecorate machine-faster?))))
    (match candidates
      (()
       (values #f #f))
      (((best slot) others ...)
       ;; Return the best machine unless it's already overloaded.
       ;; Note: We call 'node-load' only as a last resort because it is
       ;; too costly to call it once for every machine.
       (let* ((session (false-if-exception (open-ssh-session best)))
              (node    (and session (remote-inferior session)))
              (load    (and node (normalized-load best (node-load node))))
              (space   (and node (node-free-disk-space node))))
         (when node (close-inferior node))
         (when session (disconnect! session))
         (cond ((and node (< load 2.) (>= space %minimum-disk-space))
                (match others
                  (((machines slots) ...)
                   ;; Release slots from the uninteresting machines.
                   (for-each release-build-slot slots)

                   ;; The caller must keep SLOT to protect it from GC and to
                   ;; eventually release it.
                   (values best slot))))
               (else
                ;; BEST is unsuitable, so try the next one.
                (when (and space (< space %minimum-disk-space))
                  (format (current-error-port)
                          "skipping machine '~a' because it is low \
on disk space (~,2f MiB free)~%"
                          (build-machine-name best)
                          (/ space (expt 2 20) 1.)))
                (release-build-slot slot)
                (loop others))))))))
525
(define (call-with-timeout timeout drv thunk)
  "Call THUNK; when TIMEOUT is a number, exit with an error message if THUNK
has not returned after TIMEOUT seconds.  When TIMEOUT is not a number (e.g.,
#f), simply call THUNK.  DRV is the derivation named in the error message."
  (define (on-alarm . _)
    ;; The exit code here will be 1, which guix-daemon will interpret as a
    ;; transient failure.
    (leave (G_ "timeout expired while offloading '~a'~%")
           (derivation-file-name drv)))

  (cond ((not (number? timeout))
         (thunk))
        (else
         (dynamic-wind
           (lambda ()
             ;; Arm the alarm on the way in...
             (sigaction SIGALRM on-alarm)
             (alarm timeout))
           thunk
           (lambda ()
             ;; ... and cancel it on the way out.
             (alarm 0))))))
544
(define-syntax-rule (with-timeout timeout drv exp ...)
  "Evaluate EXP... and leave after TIMEOUT seconds if EXP hasn't completed.
If TIMEOUT is #f, simply evaluate EXP...  DRV is passed on to
'call-with-timeout', which names it in the timeout message."
  (call-with-timeout timeout drv (lambda () exp ...)))
549
(define* (process-request wants-local? system drv features
                          #:key
                          print-build-trace? (max-silent-time 3600)
                          build-timeout)
  "Process a request to build DRV for SYSTEM on a machine that provides
FEATURES, a list of strings.  Write the decision on the standard output:
\"# decline\" when no machine in the machine file matches, \"# postpone\" when
all matching machines are busy, or \"# accept\", in which case the lists of
inputs and outputs are then read from the standard input and DRV is
offloaded.  WANTS-LOCAL? is accepted for the caller's convenience but does not
influence the decision."
  (let* ((reqs       (build-requirements
                      (system system)
                      (features features)))
         (candidates (filter (cut machine-matches? <> reqs)
                             (build-machines))))
    (match candidates
      (()
       ;; We'll never be able to match REQS.
       (display "# decline\n"))
      (_
       (let-values (((machine slot)
                     (choose-build-machine candidates)))
         (if machine
             (dynamic-wind
               (const #f)
               (lambda ()
                 ;; Offload DRV to MACHINE.
                 (display "# accept\n")
                 (let ((inputs  (string-tokenize (read-line)))
                       (outputs (string-tokenize (read-line))))
                   ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can
                   ;; be issues with the connection or deadlocks that could
                   ;; lead the 'guix offload' process to remain stuck forever.
                   ;; To avoid that, install a timeout here as well.
                   (with-timeout build-timeout drv
                     (transfer-and-offload drv machine
                                           #:inputs inputs
                                           #:outputs outputs
                                           #:max-silent-time max-silent-time
                                           #:build-timeout build-timeout
                                           #:print-build-trace?
                                           print-build-trace?))))
               (lambda ()
                 ;; Always give the slot back, even on a non-local exit.
                 (release-build-slot slot)))

             ;; Not now, all the machines are busy.
             (display "# postpone\n")))))))
593
594 \f
595 ;;;
596 ;;; Installation tests.
597 ;;;
598
(define (assert-node-repl node name)
  "Bail out if NODE is not running Guile; otherwise report the Guile version
that NODE, the inferior running on the machine called NAME, returned."
  (match (node-guile-version node)
    (#f
     (report-guile-error name))
    ((? string? version)
     ;; Report the version we already have rather than asking NODE again.
     (info (G_ "'~a' is running GNU Guile ~a~%")
           name version))))
607
(define (assert-node-has-guix node name)
  "Bail out if NODE is not an inferior, or if we fail to use the (guix) module
in it, or if its daemon is not running.  NAME is the machine name used in
messages."
  (define module-probe
    ;; Evaluates to 'alright in NODE if (guix) can be loaded there.
    '(begin
       (use-modules (guix))
       (and add-text-to-store 'alright)))

  (define daemon-probe
    ;; Evaluates to a store file name in NODE if its daemon answers.
    '(begin
       (use-modules (guix))
       (with-store store
         (add-text-to-store store "test"
                            "Hello, build machine!"))))

  (unless (inferior? node)
    (leave (G_ "failed to run 'guix repl' on '~a'~%") name))

  (match (inferior-eval module-probe node)
    ('alright #t)
    (_ (report-module-error name)))

  (match (inferior-eval daemon-probe node)
    ((? string? str)
     (info (G_ "Guix is usable on '~a' (test returned ~s)~%")
           name str))
    (other
     (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%")
            name other))))
633
(define %random-state
  ;; Promise of a random state seeded from the process ID and the current
  ;; time; the seed is only computed when the promise is first forced.
  (delay
    (seed->random-state (random-seed))))
637
(define* (nonce #:optional (name (gethostname)))
  "Return a string made of NAME (the local host name by default), a dash, and
a random integer below one million."
  (let ((suffix (random 1000000 (force %random-state))))
    (string-append name "-" (number->string suffix))))
641
(define (assert-node-can-import session node name daemon-socket)
  "Bail out if NODE refuses to import our archives: add a fresh text file to
the local store, send it over SESSION to the daemon listening on DAEMON-SOCKET,
and check that it is valid there.  NAME is the machine name used in messages."
  (with-store store
    (let* ((item   (add-text-to-store store "export-test" (nonce)))
           (remote (connect-to-remote-daemon session daemon-socket)))
      (with-store local
        (send-files local (list item) remote))

      (cond ((valid-path? remote item)
             (info (G_ "'~a' successfully imported '~a'~%")
                   name item))
            (else
             (leave (G_ "'~a' was not properly imported on '~a'~%")
                    item name))))))
655
(define (assert-node-can-export session node name daemon-socket)
  "Bail out if we cannot import signed archives from NODE: add a fresh text
file to the remote store reached over SESSION at DAEMON-SOCKET, retrieve it,
and check that it is valid locally.  NAME is the machine name used in
messages."
  (let* ((remote (connect-to-remote-daemon session daemon-socket))
         (item   (add-text-to-store remote "import-test" (nonce name))))
    (with-store store
      (let ((imported? (and (retrieve-files store (list item) remote)
                            (valid-path? store item))))
        (if imported?
            (info (G_ "successfully imported '~a' from '~a'~%")
                  item name)
            (leave (G_ "failed to import '~a' from '~a'~%")
                   item name))))))
667
(define (check-machine-availability machine-file pred)
  "Check that each machine matching PRED in MACHINE-FILE is usable as a build
machine: that it runs Guix and Guile, and that store items can be sent to it
and retrieved from it."
  (define (same-machine? m1 m2)
    ;; Two entries denote the same host when both name and port agree.
    (and (string=? (build-machine-name m1) (build-machine-name m2))
         (= (build-machine-port m1) (build-machine-port m2))))

  (define machines
    ;; A given build machine may appear several times (e.g., once for
    ;; "x86_64-linux" and a second time for "i686-linux"); test them only
    ;; once.
    (filter pred
            (delete-duplicates (build-machines machine-file)
                               same-machine?)))

  (info (G_ "testing ~a build machines defined in '~a'...~%")
        (length machines) machine-file)
  (let* ((names    (map build-machine-name machines))
         (sockets  (map build-machine-daemon-socket machines))
         (sessions (map open-ssh-session machines))
         (nodes    (map remote-inferior sessions)))
    ;; Run each check on every machine before moving on to the next check.
    (for-each assert-node-has-guix nodes names)
    (for-each assert-node-repl nodes names)
    (for-each assert-node-can-import sessions nodes names sockets)
    (for-each assert-node-can-export sessions nodes names sockets)
    (for-each close-inferior nodes)
    (for-each disconnect! sessions)))
692
(define (check-machine-status machine-file pred)
  "Print the status of each machine matching PRED in MACHINE-FILE: kernel,
architecture, host name, normalized load, free disk space, and the difference
between its clock and ours.  Warn about machines on which 'guix repl' cannot
be run and about machines whose clock is behind."
  (define (build-machine=? m1 m2)
    (and (string=? (build-machine-name m1) (build-machine-name m2))
         (= (build-machine-port m1) (build-machine-port m2))))

  ;; A given build machine may appear several times (e.g., once for
  ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
  (let ((machines (filter pred
                          (delete-duplicates (build-machines machine-file)
                                             build-machine=?))))
    (info (G_ "getting status of ~a build machines defined in '~a'...~%")
          (length machines) machine-file)
    (for-each (lambda (machine)
                (define session
                  (open-ssh-session machine))

                (match (remote-inferior session)
                  (#f
                   (warning (G_ "failed to run 'guix repl' on machine '~a'~%")
                            (build-machine-name machine)))
                  ((? inferior? inferior)
                   ;; NOW is taken before querying the machine, so TIME can
                   ;; only be smaller if the remote clock is really behind.
                   (let ((now (car (gettimeofday))))
                     (match (inferior-eval '(list (uname)
                                                  (car (gettimeofday)))
                                           inferior)
                       ((uts time)
                        (when (< time now)
                          ;; Build machine clocks must not be behind as this
                          ;; could cause timestamp issues.
                          (warning (G_ "machine '~a' is ~a seconds behind~%")
                                   (build-machine-name machine)
                                   (- now time)))

                        (let ((load (node-load inferior))
                              (free (node-free-disk-space inferior)))
                          (close-inferior inferior)
                          ;; Every detail line starts with the same
                          ;; indentation, written out explicitly so that it
                          ;; does not depend on string continuation rules.
                          (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\
 host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%\
 time difference: ~a s~%"
                                  (build-machine-name machine)
                                  (utsname:sysname uts) (utsname:release uts)
                                  (utsname:machine uts)
                                  (utsname:nodename uts)
                                  (normalized-load machine load)
                                  (/ free (expt 2 20) 1.)
                                  (- time now))))))))

                (disconnect! session))
              machines)))
743
744 \f
745 ;;;
746 ;;; Entry point.
747 ;;;
748
(define (guix-offload . args)
  "Entry point of 'guix offload'.  ARGS is the list of command-line arguments:
either the four strings SYSTEM, MAX-SILENT-TIME, PRINT-BUILD-TRACE? and
BUILD-TIMEOUT, in which case offload requests are read from the standard input
until end of file; or \"test\" or \"status\", optionally followed by a machine
file name and a regexp selecting machines by name; or \"--version\" or
\"--help\"."
  (define request-line-rx
    ;; The request format.  See 'tryBuildHook' method in build.cc.  The dot
    ;; of the ".drv" extension is escaped so that it only matches a dot.
    (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+\\.drv) ([[:graph:]]*)"))

  (define not-comma
    (char-set-complement (char-set #\,)))

  (define (file+predicate rest)
    ;; Parse REST, the arguments that follow "test" or "status", and return
    ;; two values: the machine file name and a predicate on build machines.
    (match rest
      ((file regexp)
       (values file
               (compose (cut string-match regexp <>)
                        build-machine-name)))
      ((file) (values file (const #t)))
      (()     (values %machine-file (const #t)))
      (_      (leave (G_ "wrong number of arguments~%")))))

  ;; Make sure $HOME really corresponds to the current user.  This is
  ;; necessary since lsh uses that to determine the location of the yarrow
  ;; seed file, and fails if it's owned by someone else.
  (and=> (passwd:dir (getpw (getuid)))
         (cut setenv "HOME" <>))

  ;; We rely on protocol-level compression from libssh to optimize large data
  ;; transfers.  Warn if it's missing.
  (unless (zlib-support?)
    (warning (G_ "Guile-SSH lacks zlib support"))
    (warning (G_ "data transfers will *not* be compressed!")))

  (match args
    ((system max-silent-time print-build-trace? build-timeout)
     (let ((max-silent-time    (string->number max-silent-time))
           (build-timeout      (string->number build-timeout))
           (print-build-trace? (string=? print-build-trace? "1")))
       (set-thread-name "guix offload")
       (parameterize ((%current-system system))
         (let loop ((line (read-line)))
           (unless (eof-object? line)
             (cond ((regexp-exec request-line-rx line)
                    =>
                    (lambda (request)
                      (with-error-handling
                        (process-request (equal? (match:substring request 1) "1")
                                         (match:substring request 2) ; system
                                         (read-derivation-from-file
                                          (match:substring request 3))
                                         (string-tokenize
                                          (match:substring request 4) not-comma)
                                         #:print-build-trace? print-build-trace?
                                         #:max-silent-time max-silent-time
                                         #:build-timeout build-timeout))))
                   (else
                    (leave (G_ "invalid request line: ~s~%") line)))
             (loop (read-line)))))))
    (("test" rest ...)
     (with-error-handling
       (let-values (((file pred) (file+predicate rest)))
         (check-machine-availability file pred))))
    (("status" rest ...)
     (with-error-handling
       (let-values (((file pred) (file+predicate rest)))
         (check-machine-status file pred))))
    (("--version")
     (show-version-and-exit "guix offload"))
    (("--help")
     ;; List all four positional arguments that the first clause expects.
     (format #t (G_ "Usage: guix offload SYSTEM MAX-SILENT-TIME \
PRINT-BUILD-TRACE? BUILD-TIMEOUT
Process build offload requests written on the standard input, possibly
offloading builds to the machines listed in '~a'.~%")
             %machine-file)
     (display (G_ "
This tool is meant to be used internally by 'guix-daemon'.\n"))
     (show-bug-report-information))
    (x
     (leave (G_ "invalid arguments: ~{~s ~}~%") x))))
830
831 ;;; Local Variables:
832 ;;; eval: (put 'with-file-lock 'scheme-indent-function 1)
833 ;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
834 ;;; eval: (put 'with-timeout 'scheme-indent-function 2)
835 ;;; End:
836
837 ;;; offload.scm ends here