offload: Adjust 'test' and 'status' to the latest changes.
[jackhill/guix/guix.git] / guix / scripts / offload.scm
1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2014, 2015, 2016, 2017, 2018 Ludovic Courtès <ludo@gnu.org>
3 ;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
4 ;;;
5 ;;; This file is part of GNU Guix.
6 ;;;
7 ;;; GNU Guix is free software; you can redistribute it and/or modify it
8 ;;; under the terms of the GNU General Public License as published by
9 ;;; the Free Software Foundation; either version 3 of the License, or (at
10 ;;; your option) any later version.
11 ;;;
12 ;;; GNU Guix is distributed in the hope that it will be useful, but
13 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;;; GNU General Public License for more details.
16 ;;;
17 ;;; You should have received a copy of the GNU General Public License
18 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19
20 (define-module (guix scripts offload)
21 #:use-module (ssh key)
22 #:use-module (ssh auth)
23 #:use-module (ssh session)
24 #:use-module (ssh channel)
25 #:use-module (ssh popen)
26 #:use-module (ssh version)
27 #:use-module (guix config)
28 #:use-module (guix records)
29 #:use-module (guix ssh)
30 #:use-module (guix store)
31 #:use-module (guix inferior)
32 #:use-module (guix derivations)
33 #:use-module ((guix serialization)
34 #:select (nar-error? nar-error-file))
35 #:use-module (guix nar)
36 #:use-module (guix utils)
37 #:use-module ((guix build syscalls)
38 #:select (fcntl-flock set-thread-name))
39 #:use-module ((guix build utils) #:select (which mkdir-p))
40 #:use-module (guix ui)
41 #:use-module (srfi srfi-1)
42 #:use-module (srfi srfi-11)
43 #:use-module (srfi srfi-26)
44 #:use-module (srfi srfi-34)
45 #:use-module (srfi srfi-35)
46 #:use-module (ice-9 popen)
47 #:use-module (ice-9 rdelim)
48 #:use-module (ice-9 match)
49 #:use-module (ice-9 regex)
50 #:use-module (ice-9 format)
51 #:use-module (ice-9 binary-ports)
52 #:export (build-machine
53 build-requirements
54 guix-offload))
55
56 ;;; Commentary:
57 ;;;
58 ;;; Attempt to offload builds to the machines listed in
59 ;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
60 ;;; retrieving the build output(s) over SSH upon success.
61 ;;;
62 ;;; This command should not be used directly; instead, it is called on-demand
63 ;;; by the daemon, unless it was started with '--no-build-hook' or a client
64 ;;; inhibited build hooks.
65 ;;;
66 ;;; Code:
67
68
;; Description of a machine that builds may be offloaded to, as written by
;; the user in the machines file.  Fields that carry a default may be omitted.
(define-record-type* <build-machine>
  build-machine make-build-machine
  build-machine?
  ;; Host name passed to the SSH session (string).
  (name              build-machine-name)
  ;; SSH port (number).
  (port              build-machine-port
                     (default 22))
  ;; System type the machine builds for (string).
  (system            build-machine-system)
  ;; User name used for the SSH connection (string).
  (user              build-machine-user)
  ;; File name of the SSH private key used to authenticate.
  (private-key       build-machine-private-key
                     (default (user-openssh-private-key)))
  ;; OpenSSH host key of the machine (string).
  (host-key          build-machine-host-key)
  ;; Compression methods requested for the SSH session (string).
  (compression       build-machine-compression
                     (default "zlib@openssh.com,zlib"))
  ;; Compression level requested for the SSH session (integer).
  (compression-level build-machine-compression-level
                     (default 3))
  ;; File name of the daemon socket on the machine (string).
  (daemon-socket     build-machine-daemon-socket
                     (default "/var/guix/daemon-socket/socket"))
  ;; Number of build slots available on the machine (number).
  (parallel-builds   build-machine-parallel-builds
                     (default 1))
  ;; Relative speed, used to rank candidate machines (inexact real).
  (speed             build-machine-speed
                     (default 1.0))
  ;; Features supported by the machine (list of strings).
  (features          build-machine-features
                     (default '())))
92
;; What a build needs from the machine it is offloaded to.
(define-record-type* <build-requirements>
  build-requirements make-build-requirements
  build-requirements?
  ;; System type the build must run on (string).
  (system   build-requirements-system)
  ;; Features the machine must provide (list of strings).
  (features build-requirements-features
            (default '())))
99
(define %machine-file
  ;; Name of the file listing the machines that builds can be offloaded to.
  (string-append %config-directory "/" "machines.scm"))
103
(define (user-openssh-private-key)
  "Return the file name of the user's default SSH private key, derived from
the 'HOME' environment variable, or #f when 'HOME' is unset."
  (let ((home (getenv "HOME")))
    (and home
         (string-append home "/.ssh/id_rsa"))))
109
(define %user-module
  ;; Fresh module in which the machine description file is evaluated.  It
  ;; uses the public interface of (guix scripts offload).
  (let* ((fresh     (make-fresh-user-module))
         (interface (resolve-interface '(guix scripts offload))))
    (module-use! fresh interface)
    fresh))
115
(define* (build-machines #:optional (file %machine-file))
  "Load FILE in '%user-module' and return the list of <build-machine> records
it evaluates to.  Return the empty list when FILE does not exist or does not
evaluate to a list of build machines; report any other failure with 'leave'."
  (define (load-machines)
    ;; Note: forcing fresh auto-compilation, to avoid ABI incompatibility
    ;; with the <build-machine> record, is currently disabled:
    ;; (set! %fresh-auto-compile #t)
    (save-module-excursion
     (lambda ()
       (set-current-module %user-module)
       (let ((result (primitive-load file)))
         (if (and (list? result) (every build-machine? result))
             result
             (begin
               ;; Instead of crashing, assume the empty list.
               (warning (G_ "'~a' did not return a list of build machines; \
ignoring it~%")
                        file)
               '()))))))

  (define (handle-failure . args)
    ;; ARGS is the exception key followed by its arguments.
    (match args
      (('system-error . _)
       (let ((errno (system-error-errno args)))
         ;; A missing file is the common case, so stay silent about it.
         (if (= errno ENOENT)
             '()
             (leave (G_ "failed to open machine file '~a': ~a~%")
                    file (strerror errno)))))
      (('syntax-error proc message properties form . _)
       (leave (G_ "~a: ~a~%")
              (location->string (source-properties->location properties))
              message))
      (_
       (leave (G_ "failed to load machine file '~a': ~s~%")
              file args))))

  (catch #t load-machines handle-failure))
151
(define (host-key->type+key host-key)
  "Destructure HOST-KEY, an OpenSSH host key string of the form \"TYPE KEY
[COMMENT]\", and return two values: its key type as a symbol with the \"ssh-\"
prefix removed (or #f when TYPE lacks that prefix), and the base64-encoded key
string.  COMMENT is ignored and may contain spaces.  Report an error with
'leave' when HOST-KEY has fewer than two fields."
  (define (type->symbol type)
    (and (string-prefix? "ssh-" type)
         (string->symbol (string-drop type 4))))

  (match (string-tokenize host-key)
    ((type key . _)                               ;ignore the comment, if any
     (values (type->symbol type) key))
    (_
     ;; Fail with a readable message rather than a match error.
     (leave (G_ "invalid host key: ~s~%") host-key))))
164
(define (private-key-from-file* file)
  "Load the SSH private key stored in FILE like 'private-key-from-file' does,
but turn a 'guile-ssh-error' exception into a '&message' condition that
'with-error-handling' can interpret meaningfully."
  (define (report-failure key proc str . rest)
    ;; STR is the message carried by the Guile-SSH exception.
    (raise (condition
            (&message
             (message (format #f (G_ "failed to load SSH \
private key from '~a': ~a")
                              file str))))))

  (catch 'guile-ssh-error
    (lambda ()
      (private-key-from-file file))
    report-failure))
176
(define (open-ssh-session machine)
  "Open an SSH session for MACHINE, check the server's host key against
MACHINE's 'host-key' field, authenticate with MACHINE's private key, and
return the session.  Report the problem with 'leave' when the connection, the
host key check, or the authentication fails."
  ;; Only the private key is needed for 'userauth-public-key!', so the
  ;; matching '.pub' file is not loaded: it may legitimately be missing.
  (let ((private (private-key-from-file* (build-machine-private-key machine)))
        (session (make-session #:user (build-machine-user machine)
                               #:host (build-machine-name machine)
                               #:port (build-machine-port machine)
                               #:timeout 10       ;seconds
                               ;; #:log-verbosity 'protocol
                               #:identity (build-machine-private-key machine)

                               ;; By default libssh reads ~/.ssh/known_hosts
                               ;; and uses that to adjust its choice of cipher
                               ;; suites, which changes the type of host key
                               ;; that the server sends (RSA vs. Ed25519,
                               ;; etc.).  Opt for something reproducible and
                               ;; stateless instead.
                               #:knownhosts "/dev/null"

                               ;; We need lightweight compression when
                               ;; exchanging full archives.
                               #:compression
                               (build-machine-compression machine)
                               #:compression-level
                               (build-machine-compression-level machine))))
    (match (connect! session)
      ('ok
       ;; Authenticate the server.  XXX: Guile-SSH 0.10.1 doesn't know about
       ;; ed25519 keys and 'get-key-type' returns #f in that case.
       (let-values (((server)   (get-server-public-key session))
                    ((type key) (host-key->type+key
                                 (build-machine-host-key machine))))
         (unless (and (or (not (get-key-type server))
                          (eq? (get-key-type server) type))
                      (string=? (public-key->string server) key))
           ;; Key mismatch: something's wrong.  XXX: It could be that the
           ;; server provided its Ed25519 key when we were expecting its RSA
           ;; key.
           (leave (G_ "server at '~a' returned host key '~a' of type '~a' \
instead of '~a' of type '~a'~%")
                  (build-machine-name machine)
                  (public-key->string server) (get-key-type server)
                  key type)))

       ;; Authenticate ourselves to the server.
       (let ((auth (userauth-public-key! session private)))
         (unless (eq? 'success auth)
           (disconnect! session)
           (leave (G_ "SSH public key authentication failed for '~a': ~a~%")
                  (build-machine-name machine) (get-error session))))

       session)
      (x
       ;; Connection failed or timeout expired.
       (leave (G_ "failed to connect to '~a': ~a~%")
              (build-machine-name machine) (get-error session))))))
233
234 \f
235 ;;;
236 ;;; Synchronization.
237 ;;;
238
(define (lock-file file)
  "Create FILE's parent directory if needed, open FILE for writing, block
until an exclusive lock on it is acquired, and return the open port."
  (define directory (dirname file))

  (mkdir-p directory)
  (let ((lock (open-file file "w0")))
    (fcntl-flock lock 'write-lock)
    lock))
245
(define (unlock-file lock)
  "Release the lock held on the port LOCK, close LOCK, and return #t."
  (begin
    (fcntl-flock lock 'unlock)
    (close-port lock))
  #t)
251
(define-syntax-rule (with-file-lock file exp ...)
  "Wait until the lock on FILE is acquired, evaluate EXP... while holding it,
and release it when leaving the dynamic extent of EXP..."
  (let ((lock (lock-file file)))
    (dynamic-wind
      (const #t)
      (lambda () exp ...)
      (lambda () (unlock-file lock)))))
262
(define-syntax with-machine-lock
  (syntax-rules ()
    "Evaluate EXP... while holding the exclusive lock of MACHINE for HINT,
waiting for that lock if necessary."
    ((_ machine hint exp ...)
     (with-file-lock (machine-lock-file machine hint)
       exp ...))))
268
269
(define (machine-slot-file machine slot)
  "Return the name of the file that represents build slot number SLOT of
MACHINE."
  ;; Every machine gets one file per build slot.  Choosing a machine means
  ;; trying to lock one of these files exclusively; when that fails for all
  ;; of them, every build slot is taken.  Inspired by Nix's build-remote.pl.
  (let ((directory (string-append %state-directory "/offload/"
                                  (build-machine-name machine))))
    (string-append directory "/" (number->string slot))))
279
(define (acquire-build-slot machine)
  "Try to lock one of MACHINE's build slot files without blocking.  Return
the open port of the first slot that could be locked, or #f when all of them
are already taken.

The number of slots is MACHINE's 'parallel-builds' value, which puts a hard
limit on the number of simultaneous connections to MACHINE."
  (define (try-slot slot)
    ;; Return a locked port for SLOT, or #f if another process holds it.
    (let ((port (open-file (machine-slot-file machine slot)
                           "w0")))
      (catch 'flock-error
        (lambda ()
          (fcntl-flock port 'write-lock #:wait? #f)
          ;; Got it!
          (format (current-error-port)
                  "process ~a acquired build slot '~a'~%"
                  (getpid) (port-filename port))
          port)
        (lambda _
          ;; PORT is already locked by another process.
          (close-port port)
          #f))))

  (mkdir-p (dirname (machine-slot-file machine 0)))
  (with-machine-lock machine 'slots
    (any try-slot
         (iota (build-machine-parallel-builds machine)))))
304
(define (release-build-slot slot)
  "Release SLOT, a build slot port as returned by 'acquire-build-slot', by
closing it."
  (close-port slot))
308
309 \f
310 ;;;
311 ;;; Offloading.
312 ;;;
313
(define (build-log-port)
  "Return an output port for file descriptor 4, the default destination of
build logs; the daemon opens that descriptor before running the offload hook."
  (define log-port (fdopen 4 "w0"))

  ;; Mark LOG-PORT as revealed so that file descriptor 4 is not closed when
  ;; the port gets garbage-collected.
  (set-port-revealed! log-port 1)
  log-port)
322
(define (node-guile-version node)
  "Return the result of evaluating '(version)' in the inferior NODE."
  (let ((expression '(version)))
    (inferior-eval expression node)))
325
(define (node-free-disk-space node)
  "Evaluate 'free-disk-space' for the store directory in the inferior NODE
and return the result, the free disk space in bytes."
  (let ((expression
         (list 'begin
               '(use-modules (guix build syscalls))
               (list 'free-disk-space (%store-prefix)))))
    (inferior-eval expression node)))
332
(define* (transfer-and-offload drv machine
                               #:key
                               (inputs '())
                               (outputs '())
                               (max-silent-time 3600)
                               build-timeout
                               print-build-trace?)
  "Offload DRV to MACHINE.  Prior to the actual offloading, transfer all of
INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
MACHINE, then close the connection to MACHINE.

MAX-SILENT-TIME, BUILD-TIMEOUT and PRINT-BUILD-TRACE? are passed as build
options to the remote daemon.  When the build fails, exit the process with
code 100 (permanent failure), or with code 1 (transient failure) when MACHINE
appears to be short on disk space."
  (define low-disk-space-threshold
    ;; Below this amount of free space on MACHINE, a build failure is
    ;; attributed to lack of disk space.
    (* 10 (expt 2 20)))                           ;10 MiB

  (define session
    (open-ssh-session machine))

  (define store
    (connect-to-remote-daemon session
                              (build-machine-daemon-socket machine)))

  (set-build-options store
                     #:print-build-trace print-build-trace?
                     #:max-silent-time max-silent-time
                     #:timeout build-timeout)

  ;; Protect DRV from garbage collection.
  (add-temp-root store (derivation-file-name drv))

  (with-store local
    (send-files local (cons (derivation-file-name drv) inputs) store
                #:log-port (current-output-port)))
  (format (current-error-port) "offloading '~a' to '~a'...~%"
          (derivation-file-name drv) (build-machine-name machine))
  (format (current-error-port) "@ build-remote ~a ~a~%"
          (derivation-file-name drv) (build-machine-name machine))

  (guard (c ((nix-protocol-error? c)
             (format (current-error-port)
                     (G_ "derivation '~a' offloaded to '~a' failed: ~a~%")
                     (derivation-file-name drv)
                     (build-machine-name machine)
                     (nix-protocol-error-message c))
             (let* ((inferior (false-if-exception (remote-inferior session)))
                    (space    (false-if-exception
                               (node-free-disk-space inferior))))

               (when inferior
                 (close-inferior inferior))

               ;; Use exit code 100 for a permanent build failure.  The daemon
               ;; interprets other non-zero codes as transient build failures.
               (if (and space (< space low-disk-space-threshold))
                   (begin
                     (format (current-error-port)
                             (G_ "build failure may have been caused by lack \
of free disk space on '~a'~%")
                             (build-machine-name machine))
                     (primitive-exit 1))
                   (primitive-exit 100)))))
    (parameterize ((current-build-output-port (build-log-port)))
      (build-derivations store (list drv))))

  (retrieve-files* outputs store

                   ;; We cannot use the 'import-paths' RPC here because we
                   ;; already hold the locks for FILES.
                   #:import
                   (lambda (port)
                     (restore-file-set port
                                       #:log-port (current-error-port)
                                       #:lock? #f)))

  ;; This process may go on to handle further requests, so do not leave the
  ;; daemon connection and the SSH session to MACHINE open.
  (close-connection store)
  (disconnect! session)

  (format (current-error-port) "done with offloaded '~a'~%"
          (derivation-file-name drv)))
404
405 \f
406 ;;;
407 ;;; Scheduling.
408 ;;;
409
(define (machine-matches? machine requirements)
  "Return true if MACHINE has the system type demanded by REQUIREMENTS and
provides every feature REQUIREMENTS lists."
  (let ((wanted-system   (build-requirements-system requirements))
        (wanted-features (build-requirements-features requirements)))
    (and (string=? wanted-system (build-machine-system machine))
         (lset<= string=?
                 wanted-features
                 (build-machine-features machine)))))
417
(define %minimum-disk-space
  ;; Free disk space, in bytes, that a build machine must have for a build to
  ;; be offloaded to it.  Machines below this are skipped since they are
  ;; bound to run out of disk space.
  (* 100 1024 1024))                              ;100 MiB
423
(define (node-load node)
  "Return the load on NODE, taken from the first field of its /proc/loadavg.
Return +∞ if NODE is misbehaving: no answer, too few fields, or a first field
that is not a number."
  (let ((line (inferior-eval '(begin
                                (use-modules (ice-9 rdelim))
                                (call-with-input-file "/proc/loadavg"
                                  read-string))
                             node)))
    (if (eof-object? line)
        +inf.0    ;MACHINE does not respond, so assume it is infinitely loaded
        (match (string-tokenize line)
          ((one five fifteen . x)
           ;; 'string->number' returns #f on garbage; callers compare the
           ;; result numerically, so treat that as infinite load too.
           (or (string->number one) +inf.0))
          (x
           +inf.0)))))
438
(define (normalized-load machine load)
  "Return LOAD divided by MACHINE's number of parallel builds, logging both
values to the error port.  When LOAD is not a rational number, return it
unchanged."
  (cond ((rational? load)
         (let* ((jobs       (build-machine-parallel-builds machine))
                (normalized (/ load jobs)))
           (format (current-error-port) "load on machine '~a' is ~s\
 (normalized: ~s)~%"
                   (build-machine-name machine) load normalized)
           normalized))
        (else
         load)))
449
(define (machine-lock-file machine hint)
  "Return the name of the lock file of MACHINE for HINT, a symbol."
  (let ((prefix (string-append %state-directory "/offload/"
                               (build-machine-name machine)))
        (suffix (string-append "." (symbol->string hint) ".lock")))
    (string-append prefix suffix)))
455
(define (machine-choice-lock-file)
  "Return the name of the lock file held while a build machine is being
chosen."
  (string-append %state-directory "/offload/" "machine-choice.lock"))
459
(define (random-seed)
  "Return an integer seed made from the process ID and the current time in
seconds."
  (let ((now (gettimeofday)))
    (logxor (getpid) (car now))))
462
(define shuffle
  (let ((state (seed->random-state (random-seed))))
    (lambda (lst)
      "Return a list with the elements of LST in random order (Fisher-Yates
algorithm)."
      (let* ((vec   (list->vector lst))
             (total (vector-length vec)))
        ;; Walk I from TOTAL down to 1.  At each step pick a random element
        ;; among the first I, and move the element at index I-1 into the
        ;; hole so that the first I-1 elements are the ones still to pick.
        (fold (lambda (i result)
                (let* ((j      (random i state))
                       (picked (vector-ref vec j)))
                  (vector-set! vec j (vector-ref vec (- i 1)))
                  (cons picked result)))
              '()
              (iota total total -1))))))
476
(define (choose-build-machine machines)
  "Return two values: the best machine among MACHINES and its build
slot (which must later be released with 'release-build-slot'), or #f and #f."

  ;; Proceed like this:
  ;;   1. Acquire the global machine-choice lock.
  ;;   2. For all MACHINES, attempt to acquire a build slot, and filter out
  ;;      those machines for which we failed.
  ;;   3. Choose the best machine among those that are left.
  ;;   4. Release the previously-acquired build slots of the other machines.
  ;;   5. Release the global machine-choice lock.

  (define (machine+slot machine)
    ;; Return a (MACHINE SLOT) list, or #f if MACHINE has no free slot.
    (let ((slot (acquire-build-slot machine)))
      (and slot (list machine slot))))

  (define (faster? entry1 entry2)
    ;; Return #t if the machine of ENTRY1 is faster than that of ENTRY2,
    ;; where both are (MACHINE SLOT) lists.
    (> (build-machine-speed (first entry1))
       (build-machine-speed (first entry2))))

  (with-file-lock (machine-choice-lock-file)
    (let loop ((candidates (sort (filter-map machine+slot (shuffle machines))
                                 faster?)))
      (match candidates
        (()
         (values #f #f))
        (((best slot) others ...)
         ;; Return the best machine unless it's already overloaded.
         ;; Note: We call 'node-load' only as a last resort because it is
         ;; too costly to call it once for every machine.
         (let* ((session (false-if-exception (open-ssh-session best)))
                (node    (and session (remote-inferior session)))
                (load    (and node (normalized-load best (node-load node))))
                (space   (and node (node-free-disk-space node))))
           (when node (close-inferior node))
           (when session (disconnect! session))
           (cond ((and node (< load 2.) (>= space %minimum-disk-space))
                  ;; Release slots from the uninteresting machines.
                  (for-each release-build-slot (map second others))

                  ;; The caller must keep SLOT to protect it from GC and to
                  ;; eventually release it.
                  (values best slot))
                 (else
                  ;; BEST is unsuitable, so try the next one.
                  (when (and space (< space %minimum-disk-space))
                    (format (current-error-port)
                            "skipping machine '~a' because it is low \
on disk space (~,2f MiB free)~%"
                            (build-machine-name best)
                            (/ space (expt 2 20) 1.)))
                  (release-build-slot slot)
                  (loop others)))))))))
543
(define (call-with-timeout timeout drv thunk)
  "Call THUNK and return its result.  When TIMEOUT is a number, arm an alarm
for TIMEOUT seconds first; if it fires, call 'leave' with a message that
mentions DRV, the derivation being built.  When TIMEOUT is not a number, just
call THUNK."
  (define (on-alarm . _)
    ;; The exit code here will be 1, which guix-daemon will interpret as a
    ;; transient failure.
    (leave (G_ "timeout expired while offloading '~a'~%")
           (derivation-file-name drv)))

  (define (arm)
    (sigaction SIGALRM on-alarm)
    (alarm timeout))

  (define (disarm)
    (alarm 0))

  (if (not (number? timeout))
      (thunk)
      (dynamic-wind arm thunk disarm)))
562
(define-syntax with-timeout
  (syntax-rules ()
    "Evaluate EXP... under 'call-with-timeout': leave after TIMEOUT seconds
if EXP... has not completed, mentioning DRV in the message.  If TIMEOUT is
#f, simply evaluate EXP..."
    ((_ timeout drv exp ...)
     (call-with-timeout timeout drv
                        (lambda ()
                          exp ...)))))
567
(define* (process-request wants-local? system drv features
                          #:key
                          print-build-trace? (max-silent-time 3600)
                          build-timeout)
  "Process a request to build DRV for SYSTEM on a machine that provides
FEATURES, a list of strings.

The answer is written to the current output port: \"# decline\" when no
machine matches, \"# postpone\" when every matching machine is busy or
unsuitable, and \"# accept\" otherwise.  After accepting, read the list of
inputs and the list of outputs from the current input port, one line each,
and offload the build.  WANTS-LOCAL? is currently ignored."
  (let* ((reqs       (build-requirements
                      (system system)
                      (features features)))
         (candidates (filter (cut machine-matches? <> reqs)
                             (build-machines))))
    (match candidates
      (()
       ;; We'll never be able to match REQS.
       (display "# decline\n"))
      ((x ...)
       (let-values (((machine slot)
                     (choose-build-machine candidates)))
         (if machine
             (dynamic-wind
               (const #f)
               (lambda ()
                 ;; Offload DRV to MACHINE.
                 (display "# accept\n")
                 (let ((inputs  (string-tokenize (read-line)))
                       (outputs (string-tokenize (read-line))))
                   ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can
                   ;; be issues with the connection or deadlocks that could
                   ;; lead the 'guix offload' process to remain stuck forever.
                   ;; To avoid that, install a timeout here as well.
                   (with-timeout build-timeout drv
                     (transfer-and-offload drv machine
                                           #:inputs inputs
                                           #:outputs outputs
                                           #:max-silent-time max-silent-time
                                           #:build-timeout build-timeout
                                           #:print-build-trace?
                                           print-build-trace?))))
               (lambda ()
                 ;; Give the slot back whether or not the build succeeded.
                 (release-build-slot slot)))

             ;; Not now, all the machines are busy.
             (display "# postpone\n")))))))
611
612 \f
613 ;;;
614 ;;; Installation tests.
615 ;;;
616
(define (assert-node-repl node name)
  "Bail out if NODE, the inferior running on the machine called NAME, does
not report a Guile version; otherwise print that version."
  (match (node-guile-version node)
    (#f
     (report-guile-error name))
    ((? string? version)
     ;; Use the value we already have instead of asking NODE a second time.
     (info (G_ "'~a' is running GNU Guile ~a~%")
           name version))))
625
(define (assert-node-has-guix node name)
  "Bail out if NODE is not an inferior, if we fail to use the (guix) module
in it, or if we cannot add a file to the store through its daemon.  NAME is
the machine name used in messages."
  (unless (inferior? node)
    (leave (G_ "failed to run 'guix repl' on '~a'~%") name))

  ;; First make sure (guix) can be loaded and provides 'add-text-to-store'.
  (let ((answer (inferior-eval '(begin
                                  (use-modules (guix))
                                  (and add-text-to-store 'alright))
                               node)))
    (unless (eq? 'alright answer)
      (report-module-error name)))

  ;; Then make sure the daemon on NODE answers.
  (let ((result (inferior-eval '(begin
                                  (use-modules (guix))
                                  (with-store store
                                    (add-text-to-store store "test"
                                                       "Hello, build machine!")))
                               node)))
    (if (string? result)
        (info (G_ "Guix is usable on '~a' (test returned ~s)~%")
              name result)
        (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%")
               name result))))
651
(define %random-state
  ;; Promise of the random state used by 'nonce'.  It is seeded on first use
  ;; with 'random-seed' rather than with a second copy of the same formula.
  (delay
    (seed->random-state (random-seed))))
655
(define* (nonce #:optional (name (gethostname)))
  "Return a string made of NAME, a dash, and a random integer below 1000000.
NAME defaults to the local host name."
  (let* ((state  (force %random-state))
         (number (random 1000000 state)))
    (string-append name "-" (number->string number))))
659
(define (assert-node-can-import session node name daemon-socket)
  "Bail out if the machine called NAME refuses to import our archives.  This
adds a test file to the local store, sends it over SESSION to the daemon
listening on DAEMON-SOCKET, and checks that it is then a valid path there.
NODE is unused."
  (with-store store
    (let* ((item   (add-text-to-store store "export-test" (nonce)))
           (remote (connect-to-remote-daemon session daemon-socket)))
      ;; STORE is idle at this point, so reuse it instead of opening a
      ;; second connection to the local daemon.
      (send-files store (list item) remote)

      (if (valid-path? remote item)
          (info (G_ "'~a' successfully imported '~a'~%")
                name item)
          (leave (G_ "'~a' was not properly imported on '~a'~%")
                 item name)))))
673
(define (assert-node-can-export session node name daemon-socket)
  "Bail out if we cannot import signed archives from the machine called NAME.
This adds a test file to the store of the daemon reached over SESSION at
DAEMON-SOCKET, retrieves it, and checks that it is then valid locally."
  (define remote
    (connect-to-remote-daemon session daemon-socket))

  (define item
    (add-text-to-store remote "import-test" (nonce name)))

  (with-store store
    (cond ((and (retrieve-files store (list item) remote)
                (valid-path? store item))
           (info (G_ "successfully imported '~a' from '~a'~%")
                 item name))
          (else
           (leave (G_ "failed to import '~a' from '~a'~%")
                  item name)))))
685
(define (check-machine-availability machine-file pred)
  "Check that each machine matching PRED in MACHINE-FILE is usable as a build
machine: it runs Guix and Guile, and archives can be sent to it and retrieved
from it."
  (define (build-machine=? m1 m2)
    ;; Two entries denote the same machine when name and port agree.
    (and (= (build-machine-port m1) (build-machine-port m2))
         (string=? (build-machine-name m1) (build-machine-name m2))))

  (define machines
    ;; A given build machine may appear several times (e.g., once for
    ;; "x86_64-linux" and a second time for "i686-linux"); test them only
    ;; once.
    (filter pred
            (delete-duplicates (build-machines machine-file)
                               build-machine=?)))

  (info (G_ "testing ~a build machines defined in '~a'...~%")
        (length machines) machine-file)
  (let* ((names    (map build-machine-name machines))
         (sockets  (map build-machine-daemon-socket machines))
         (sessions (map open-ssh-session machines))
         (nodes    (map remote-inferior sessions)))
    (for-each assert-node-has-guix nodes names)
    (for-each assert-node-repl nodes names)
    (for-each assert-node-can-import sessions nodes names sockets)
    (for-each assert-node-can-export sessions nodes names sockets)

    ;; Done testing: tear everything down.
    (for-each close-inferior nodes)
    (for-each disconnect! sessions)))
710
(define (check-machine-status machine-file pred)
  "Print the kernel, architecture, host name, normalized load and free disk
space of each machine matching PRED in MACHINE-FILE."
  (define (build-machine=? m1 m2)
    ;; Two entries denote the same machine when name and port agree.
    (and (= (build-machine-port m1) (build-machine-port m2))
         (string=? (build-machine-name m1) (build-machine-name m2))))

  (define (report-status machine)
    ;; Connect to MACHINE, print its status, and disconnect.
    (let ((session (open-ssh-session machine)))
      (match (remote-inferior session)
        (#f
         (warning (G_ "failed to run 'guix repl' on machine '~a'~%")
                  (build-machine-name machine)))
        ((? inferior? inferior)
         (let* ((uts  (inferior-eval '(uname) inferior))
                (load (node-load inferior))
                (free (node-free-disk-space inferior)))
           (close-inferior inferior)
           (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\
 host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%"
                   (build-machine-name machine)
                   (utsname:sysname uts) (utsname:release uts)
                   (utsname:machine uts)
                   (utsname:nodename uts)
                   (normalized-load machine load)
                   (/ free (expt 2 20) 1.)))))

      (disconnect! session)))

  (define machines
    ;; A given build machine may appear several times (e.g., once for
    ;; "x86_64-linux" and a second time for "i686-linux"); test them only
    ;; once.
    (filter pred
            (delete-duplicates (build-machines machine-file)
                               build-machine=?)))

  (info (G_ "getting status of ~a build machines defined in '~a'...~%")
        (length machines) machine-file)
  (for-each report-status machines))
748
749 \f
750 ;;;
751 ;;; Entry point.
752 ;;;
753
(define (guix-offload . args)
  "Entry point of 'guix offload'.  ARGS is the list of command-line
arguments, in one of these forms:

  SYSTEM MAX-SILENT-TIME PRINT-BUILD-TRACE? BUILD-TIMEOUT
      read build requests from the standard input and process them;
  \"test\" [FILE [REGEXP]]
      check that the machines listed in FILE are usable;
  \"status\" [FILE [REGEXP]]
      print the status of the machines listed in FILE;
  \"--version\" or \"--help\".

FILE defaults to '%machine-file'; REGEXP restricts the check to machines
whose name matches it."
  (define request-line-rx
    ;; The request format.  See 'tryBuildHook' method in build.cc.
    (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))

  (define not-comma
    (char-set-complement (char-set #\,)))

  (define (machine-file+predicate rest)
    ;; Parse REST, the arguments that follow "test" or "status", and return
    ;; two values: the machine file name and a predicate on machines.
    (match rest
      ((file regexp)
       (values file
               (compose (cut string-match regexp <>)
                        build-machine-name)))
      ((file) (values file (const #t)))
      (()     (values %machine-file (const #t)))
      (_      (leave (G_ "wrong number of arguments~%")))))

  (define (run-check check rest)
    ;; Call CHECK with the machine file and predicate denoted by REST.
    (with-error-handling
      (let-values (((file pred) (machine-file+predicate rest)))
        (check (or file %machine-file) pred))))

  ;; Make sure $HOME really corresponds to the current user.  This is
  ;; necessary since lsh uses that to determine the location of the yarrow
  ;; seed file, and fails if it's owned by someone else.
  ;; NOTE(review): this file now uses Guile-SSH; the lsh rationale looks
  ;; historical -- confirm before removing.
  (and=> (passwd:dir (getpw (getuid)))
         (cut setenv "HOME" <>))

  ;; We rely on protocol-level compression from libssh to optimize large data
  ;; transfers.  Warn if it's missing.
  (unless (zlib-support?)
    (warning (G_ "Guile-SSH lacks zlib support"))
    (warning (G_ "data transfers will *not* be compressed!")))

  (match args
    ((system max-silent-time print-build-trace? build-timeout)
     (let ((max-silent-time    (string->number max-silent-time))
           (build-timeout      (string->number build-timeout))
           (print-build-trace? (string=? print-build-trace? "1")))
       (set-thread-name "guix offload")
       (parameterize ((%current-system system))
         ;; Process one request per input line until end of file.
         (let loop ((line (read-line)))
           (unless (eof-object? line)
             (cond ((regexp-exec request-line-rx line)
                    =>
                    (lambda (match)
                      (with-error-handling
                        (process-request (equal? (match:substring match 1) "1")
                                         (match:substring match 2) ; system
                                         (read-derivation-from-file
                                          (match:substring match 3))
                                         (string-tokenize
                                          (match:substring match 4) not-comma)
                                         #:print-build-trace? print-build-trace?
                                         #:max-silent-time max-silent-time
                                         #:build-timeout build-timeout))))
                   (else
                    (leave (G_ "invalid request line: ~s~%") line)))
             (loop (read-line)))))))
    (("test" rest ...)
     (run-check check-machine-availability rest))
    (("status" rest ...)
     (run-check check-machine-status rest))
    (("--version")
     (show-version-and-exit "guix offload"))
    (("--help")
     ;; The usage line lists the four arguments matched above.
     (format #t (G_ "Usage: guix offload SYSTEM MAX-SILENT-TIME PRINT-BUILD-TRACE? BUILD-TIMEOUT
Process build offload requests written on the standard input, possibly
offloading builds to the machines listed in '~a'.~%")
             %machine-file)
     (display (G_ "
This tool is meant to be used internally by 'guix-daemon'.\n"))
     (show-bug-report-information))
    (x
     (leave (G_ "invalid arguments: ~{~s ~}~%") x))))
835
836 ;;; Local Variables:
837 ;;; eval: (put 'with-machine-lock 'scheme-indent-function 2)
838 ;;; eval: (put 'with-file-lock 'scheme-indent-function 1)
839 ;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
840 ;;; eval: (put 'with-timeout 'scheme-indent-function 2)
841 ;;; End:
842
843 ;;; offload.scm ends here