copy, offload: Explicitly close SSH channels and sessions.
[jackhill/guix/guix.git] / guix / scripts / offload.scm
1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
3 ;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
4 ;;; Copyright © 2020 Maxim Cournoyer <maxim.cournoyer@gmail.com>
5 ;;;
6 ;;; This file is part of GNU Guix.
7 ;;;
8 ;;; GNU Guix is free software; you can redistribute it and/or modify it
9 ;;; under the terms of the GNU General Public License as published by
10 ;;; the Free Software Foundation; either version 3 of the License, or (at
11 ;;; your option) any later version.
12 ;;;
13 ;;; GNU Guix is distributed in the hope that it will be useful, but
14 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;;; GNU General Public License for more details.
17 ;;;
18 ;;; You should have received a copy of the GNU General Public License
19 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
20
21 (define-module (guix scripts offload)
22 #:use-module (ssh key)
23 #:use-module (ssh auth)
24 #:use-module (ssh session)
25 #:use-module (ssh channel)
26 #:use-module (ssh popen)
27 #:use-module (ssh version)
28 #:use-module (guix config)
29 #:use-module (guix records)
30 #:use-module (guix ssh)
31 #:use-module (guix store)
32 #:use-module (guix inferior)
33 #:use-module (guix derivations)
34 #:use-module ((guix serialization)
35 #:select (nar-error? nar-error-file))
36 #:use-module (guix nar)
37 #:use-module ((guix utils) #:select (%current-system))
38 #:use-module ((guix build syscalls)
39 #:select (fcntl-flock set-thread-name))
40 #:use-module ((guix build utils) #:select (which mkdir-p))
41 #:use-module (guix ui)
42 #:use-module (guix diagnostics)
43 #:use-module (srfi srfi-1)
44 #:use-module (srfi srfi-11)
45 #:use-module (srfi srfi-26)
46 #:use-module (srfi srfi-34)
47 #:use-module (srfi srfi-35)
48 #:use-module (ice-9 popen)
49 #:use-module (ice-9 rdelim)
50 #:use-module (ice-9 match)
51 #:use-module (ice-9 regex)
52 #:use-module (ice-9 format)
53 #:use-module (ice-9 binary-ports)
54 #:export (build-machine
55 build-requirements
56 guix-offload))
57
58 ;;; Commentary:
59 ;;;
60 ;;; Attempt to offload builds to the machines listed in
61 ;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
62 ;;; retrieving the build output(s) over SSH upon success.
63 ;;;
64 ;;; This command should not be used directly; instead, it is called on-demand
65 ;;; by the daemon, unless it was started with '--no-offload' or a client
66 ;;; inhibited build hooks.
67 ;;;
68 ;;; Code:
69
70 (define-record-type* <build-machine>
71 build-machine make-build-machine
72 build-machine?
73 (name build-machine-name) ; string
74 (port build-machine-port ; number
75 (default 22))
76 (systems %build-machine-systems ; list of strings
77 (default #f)) ; drop default after system is removed
78 (system %build-machine-system ; deprecated
79 (default #f))
80 (user build-machine-user) ; string
81 (private-key build-machine-private-key ; file name
82 (default (user-openssh-private-key)))
83 (host-key build-machine-host-key) ; string
84 (compression build-machine-compression ; string
85 (default "zlib@openssh.com,zlib"))
86 (compression-level build-machine-compression-level ;integer
87 (default 3))
88 (daemon-socket build-machine-daemon-socket ; string
89 (default "/var/guix/daemon-socket/socket"))
90 (parallel-builds build-machine-parallel-builds ; number
91 (default 1))
92 (speed build-machine-speed ; inexact real
93 (default 1.0))
94 (features build-machine-features ; list of strings
95 (default '())))
96
97 ;;; Deprecated.
98 (define (build-machine-system machine)
99 (warning (G_ "The 'system' field is deprecated, \
100 please use 'systems' instead.~%"))
101 (%build-machine-system machine))
102
103 ;;; TODO: Remove after the deprecated 'system' field is removed.
104 (define (build-machine-systems machine)
105 (or (%build-machine-systems machine)
106 (list (build-machine-system machine))
107 (leave (G_ "The build-machine object lacks a value for its 'systems'
108 field."))))
109
110 (define-record-type* <build-requirements>
111 build-requirements make-build-requirements
112 build-requirements?
113 (system build-requirements-system) ; string
114 (features build-requirements-features ; list of strings
115 (default '())))
116
117 (define %machine-file
118 ;; File that lists machines available as build slaves.
119 (string-append %config-directory "/machines.scm"))
120
121 (define (user-openssh-private-key)
122 "Return the user's default SSH private key, or #f if it could not be
123 determined."
124 (and=> (getenv "HOME")
125 (cut string-append <> "/.ssh/id_rsa")))
126
127 (define %user-module
128 ;; Module in which the machine description file is loaded.
129 (let ((module (make-fresh-user-module)))
130 (module-use! module (resolve-interface '(guix scripts offload)))
131 module))
132
133 (define* (build-machines #:optional (file %machine-file))
134 "Read the list of build machines from FILE and return it."
135 (catch #t
136 (lambda ()
137 ;; Avoid ABI incompatibility with the <build-machine> record.
138 ;; (set! %fresh-auto-compile #t)
139
140 (save-module-excursion
141 (lambda ()
142 (set-current-module %user-module)
143 (match (primitive-load file)
144 (((? build-machine? machines) ...)
145 machines)
146 (_
147 ;; Instead of crashing, assume the empty list.
148 (warning (G_ "'~a' did not return a list of build machines; \
149 ignoring it~%")
150 file)
151 '())))))
152 (lambda args
153 (match args
154 (('system-error . rest)
155 (let ((err (system-error-errno args)))
156 ;; Silently ignore missing file since this is a common case.
157 (if (= ENOENT err)
158 '()
159 (leave (G_ "failed to open machine file '~a': ~a~%")
160 file (strerror err)))))
161 (('syntax-error proc message properties form . rest)
162 (let ((loc (source-properties->location properties)))
163 (leave (G_ "~a: ~a~%")
164 (location->string loc) message)))
165 (x
166 (leave (G_ "failed to load machine file '~a': ~s~%")
167 file args))))))
168
169 (define (private-key-from-file* file)
170 "Like 'private-key-from-file', but raise an error that 'with-error-handling'
171 can interpret meaningfully."
172 (catch 'guile-ssh-error
173 (lambda ()
174 (private-key-from-file file))
175 (lambda (key proc str . rest)
176 (raise (formatted-message (G_ "failed to load SSH \
177 private key from '~a': ~a")
178 file str)))))
179
180 (define* (open-ssh-session machine #:optional (max-silent-time -1))
181 "Open an SSH session for MACHINE and return it. Throw an error on failure."
182 (let ((private (private-key-from-file* (build-machine-private-key machine)))
183 (public (public-key-from-file
184 (string-append (build-machine-private-key machine)
185 ".pub")))
186 (session (make-session #:user (build-machine-user machine)
187 #:host (build-machine-name machine)
188 #:port (build-machine-port machine)
189 #:timeout 10 ;initial timeout (seconds)
190 ;; #:log-verbosity 'protocol
191 #:identity (build-machine-private-key machine)
192
193 ;; By default libssh reads ~/.ssh/known_hosts
194 ;; and uses that to adjust its choice of cipher
195 ;; suites, which changes the type of host key
196 ;; that the server sends (RSA vs. Ed25519,
197 ;; etc.). Opt for something reproducible and
198 ;; stateless instead.
199 #:knownhosts "/dev/null"
200
201 ;; We need lightweight compression when
202 ;; exchanging full archives.
203 #:compression
204 (build-machine-compression machine)
205 #:compression-level
206 (build-machine-compression-level machine))))
207 (match (connect! session)
208 ('ok
209 ;; Make sure the server's key is what we expect.
210 (authenticate-server* session (build-machine-host-key machine))
211
212 (let ((auth (userauth-public-key! session private)))
213 (unless (eq? 'success auth)
214 (disconnect! session)
215 (leave (G_ "SSH public key authentication failed for '~a': ~a~%")
216 (build-machine-name machine) (get-error session))))
217
218 ;; From then on use MAX-SILENT-TIME as the absolute timeout when
219 ;; reading from or write to a channel for this session.
220 (session-set! session 'timeout max-silent-time)
221
222 session)
223 (x
224 ;; Connection failed or timeout expired.
225 (leave (G_ "failed to connect to '~a': ~a~%")
226 (build-machine-name machine) (get-error session))))))
227
228 \f
229 ;;;
230 ;;; Synchronization.
231 ;;;
232
233 (define (machine-slot-file machine slot)
234 "Return the file name of MACHINE's file for SLOT."
235 ;; For each machine we have a bunch of files representing each build slot.
236 ;; When choosing a build machine, we attempt to get an exclusive lock on one
237 ;; of these; if we fail, that means all the build slots are already taken.
238 ;; Inspired by Nix's build-remote.pl.
239 (string-append (string-append %state-directory "/offload/"
240 (build-machine-name machine) ":"
241 (number->string (build-machine-port machine))
242 "/" (number->string slot))))
243
244 (define (acquire-build-slot machine)
245 "Attempt to acquire a build slot on MACHINE. Return the port representing
246 the slot, or #f if none is available.
247
248 This mechanism allows us to set a hard limit on the number of simultaneous
249 connections allowed to MACHINE."
250 (mkdir-p (dirname (machine-slot-file machine 0)))
251
252 ;; When several 'guix offload' processes run in parallel, there's a race
253 ;; among them, but since they try the slots in the same order, we're fine.
254 (any (lambda (slot)
255 (let ((port (open-file (machine-slot-file machine slot)
256 "w0")))
257 (catch 'flock-error
258 (lambda ()
259 (fcntl-flock port 'write-lock #:wait? #f)
260 ;; Got it!
261 (format (current-error-port)
262 "process ~a acquired build slot '~a'~%"
263 (getpid) (port-filename port))
264 port)
265 (lambda args
266 ;; PORT is already locked by another process.
267 (close-port port)
268 #f))))
269 (iota (build-machine-parallel-builds machine))))
270
271 (define (release-build-slot slot)
272 "Release SLOT, a build slot as returned as by 'acquire-build-slot'."
273 (close-port slot))
274
275 \f
276 ;;;
277 ;;; Offloading.
278 ;;;
279
280 (define (build-log-port)
281 "Return the default port where build logs should be sent. The default is
282 file descriptor 4, which is open by the daemon before running the offload
283 hook."
284 (let ((port (fdopen 4 "w0")))
285 ;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
286 (set-port-revealed! port 1)
287 port))
288
289 (define (node-guile-version node)
290 (inferior-eval '(version) node))
291
292 (define (node-free-disk-space node)
293 "Return the free disk space, in bytes, in NODE's store."
294 (inferior-eval `(begin
295 (use-modules (guix build syscalls))
296 (free-disk-space ,(%store-prefix)))
297 node))
298
299 (define* (transfer-and-offload drv machine
300 #:key
301 (inputs '())
302 (outputs '())
303 (max-silent-time 3600)
304 build-timeout
305 print-build-trace?)
306 "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of
307 INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
308 MACHINE."
309 (define session
310 (open-ssh-session machine max-silent-time))
311
312 (define store
313 (connect-to-remote-daemon session
314 (build-machine-daemon-socket machine)))
315
316 (set-build-options store
317 #:print-build-trace print-build-trace?
318 #:max-silent-time max-silent-time
319 #:timeout build-timeout)
320
321 ;; Protect DRV from garbage collection.
322 (add-temp-root store (derivation-file-name drv))
323
324 (with-store local
325 (send-files local (cons (derivation-file-name drv) inputs) store
326 #:log-port (current-output-port)))
327 (format (current-error-port) "offloading '~a' to '~a'...~%"
328 (derivation-file-name drv) (build-machine-name machine))
329 (format (current-error-port) "@ build-remote ~a ~a~%"
330 (derivation-file-name drv) (build-machine-name machine))
331
332 (guard (c ((store-protocol-error? c)
333 (format (current-error-port)
334 (G_ "derivation '~a' offloaded to '~a' failed: ~a~%")
335 (derivation-file-name drv)
336 (build-machine-name machine)
337 (store-protocol-error-message c))
338 (let* ((inferior (false-if-exception (remote-inferior session)))
339 (space (false-if-exception
340 (node-free-disk-space inferior))))
341
342 (when inferior
343 (close-inferior inferior))
344
345 ;; Use exit code 100 for a permanent build failure. The daemon
346 ;; interprets other non-zero codes as transient build failures.
347 (if (and space (< space (* 10 (expt 2 20))))
348 (begin
349 (format (current-error-port)
350 (G_ "build failure may have been caused by lack \
351 of free disk space on '~a'~%")
352 (build-machine-name machine))
353 (primitive-exit 1))
354 (primitive-exit 100)))))
355 (parameterize ((current-build-output-port (build-log-port)))
356 (build-derivations store (list drv))))
357
358 (retrieve-files* outputs store
359
360 ;; We cannot use the 'import-paths' RPC here because we
361 ;; already hold the locks for FILES.
362 #:import
363 (lambda (port)
364 (restore-file-set port
365 #:log-port (current-error-port)
366 #:lock? #f)))
367
368 (close-connection store)
369 (disconnect! session)
370 (format (current-error-port) "done with offloaded '~a'~%"
371 (derivation-file-name drv)))
372
373 \f
374 ;;;
375 ;;; Scheduling.
376 ;;;
377
378 (define (machine-matches? machine requirements)
379 "Return #t if MACHINE matches REQUIREMENTS."
380 (and (member (build-requirements-system requirements)
381 (build-machine-systems machine))
382 (lset<= string=?
383 (build-requirements-features requirements)
384 (build-machine-features machine))))
385
386 (define %minimum-disk-space
387 ;; Minimum disk space required on the build machine for a build to be
388 ;; offloaded. This keeps us from offloading to machines that are bound to
389 ;; run out of disk space.
390 (* 100 (expt 2 20))) ;100 MiB
391
392 (define (node-load node)
393 "Return the load on NODE. Return +∞ if NODE is misbehaving."
394 (let ((line (inferior-eval '(begin
395 (use-modules (ice-9 rdelim))
396 (call-with-input-file "/proc/loadavg"
397 read-string))
398 node)))
399 (if (eof-object? line)
400 +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
401 (match (string-tokenize line)
402 ((one five fifteen . x)
403 (string->number one))
404 (x
405 +inf.0)))))
406
407 (define (normalized-load machine load)
408 "Divide LOAD by the number of parallel builds of MACHINE."
409 (if (rational? load)
410 (let* ((jobs (build-machine-parallel-builds machine))
411 (normalized (/ load jobs)))
412 (format (current-error-port) "load on machine '~a' is ~s\
413 (normalized: ~s)~%"
414 (build-machine-name machine) load normalized)
415 normalized)
416 load))
417
418 (define (random-seed)
419 (logxor (getpid) (car (gettimeofday))))
420
421 (define shuffle
422 (let ((state (seed->random-state (random-seed))))
423 (lambda (lst)
424 "Return LST shuffled (using the Fisher-Yates algorithm.)"
425 (define vec (list->vector lst))
426 (let loop ((result '())
427 (i (vector-length vec)))
428 (if (zero? i)
429 result
430 (let* ((j (random i state))
431 (val (vector-ref vec j)))
432 (vector-set! vec j (vector-ref vec (- i 1)))
433 (loop (cons val result) (- i 1))))))))
434
435 (define (choose-build-machine machines)
436 "Return two values: the best machine among MACHINES and its build
437 slot (which must later be released with 'release-build-slot'), or #f and #f."
438
439 ;; Proceed like this:
440 ;; 1. For all MACHINES, attempt to acquire a build slot, and filter out
441 ;; those machines for which we failed.
442 ;; 2. Choose the best machine among those that are left.
443 ;; 3. Release the previously-acquired build slots of the other machines.
444
445 (define machines+slots
446 (filter-map (lambda (machine)
447 (let ((slot (acquire-build-slot machine)))
448 (and slot (list machine slot))))
449 (shuffle machines)))
450
451 (define (undecorate pred)
452 (lambda (a b)
453 (match a
454 ((machine1 slot1)
455 (match b
456 ((machine2 slot2)
457 (pred machine1 machine2)))))))
458
459 (define (machine-faster? m1 m2)
460 ;; Return #t if M1 is faster than M2.
461 (> (build-machine-speed m1)
462 (build-machine-speed m2)))
463
464 (let loop ((machines+slots
465 (sort machines+slots (undecorate machine-faster?))))
466 (match machines+slots
467 (((best slot) others ...)
468 ;; Return the best machine unless it's already overloaded.
469 ;; Note: We call 'node-load' only as a last resort because it is
470 ;; too costly to call it once for every machine.
471 (let* ((session (false-if-exception (open-ssh-session best
472 %short-timeout)))
473 (node (and session (remote-inferior session)))
474 (load (and node (normalized-load best (node-load node))))
475 (space (and node (node-free-disk-space node))))
476 (when node (close-inferior node))
477 (when session (disconnect! session))
478 (if (and node (< load 2.) (>= space %minimum-disk-space))
479 (match others
480 (((machines slots) ...)
481 ;; Release slots from the uninteresting machines.
482 (for-each release-build-slot slots)
483
484 ;; The caller must keep SLOT to protect it from GC and to
485 ;; eventually release it.
486 (values best slot)))
487 (begin
488 ;; BEST is unsuitable, so try the next one.
489 (when (and space (< space %minimum-disk-space))
490 (format (current-error-port)
491 "skipping machine '~a' because it is low \
492 on disk space (~,2f MiB free)~%"
493 (build-machine-name best)
494 (/ space (expt 2 20) 1.)))
495 (release-build-slot slot)
496 (loop others)))))
497 (()
498 (values #f #f)))))
499
500 (define (call-with-timeout timeout drv thunk)
501 "Call THUNK and leave after TIMEOUT seconds. If TIMEOUT is #f, simply call
502 THUNK. Use DRV as an indication of what we were building when the timeout
503 expired."
504 (if (number? timeout)
505 (dynamic-wind
506 (lambda ()
507 (sigaction SIGALRM
508 (lambda _
509 ;; The exit code here will be 1, which guix-daemon will
510 ;; interpret as a transient failure.
511 (leave (G_ "timeout expired while offloading '~a'~%")
512 (derivation-file-name drv))))
513 (alarm timeout))
514 thunk
515 (lambda ()
516 (alarm 0)))
517 (thunk)))
518
519 (define-syntax-rule (with-timeout timeout drv exp ...)
520 "Evaluate EXP... and leave after TIMEOUT seconds if EXP hasn't completed.
521 If TIMEOUT is #f, simply evaluate EXP..."
522 (call-with-timeout timeout drv (lambda () exp ...)))
523
524 (define* (process-request wants-local? system drv features
525 #:key
526 print-build-trace? (max-silent-time 3600)
527 build-timeout)
528 "Process a request to build DRV."
529 (let* ((local? (and wants-local? (string=? system (%current-system))))
530 (reqs (build-requirements
531 (system system)
532 (features features)))
533 (candidates (filter (cut machine-matches? <> reqs)
534 (build-machines))))
535 (match candidates
536 (()
537 ;; We'll never be able to match REQS.
538 (display "# decline\n"))
539 ((x ...)
540 (let-values (((machine slot)
541 (choose-build-machine candidates)))
542 (if machine
543 (dynamic-wind
544 (const #f)
545 (lambda ()
546 ;; Offload DRV to MACHINE.
547 (display "# accept\n")
548 (let ((inputs (string-tokenize (read-line)))
549 (outputs (string-tokenize (read-line))))
550 ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can
551 ;; be issues with the connection or deadlocks that could
552 ;; lead the 'guix offload' process to remain stuck forever.
553 ;; To avoid that, install a timeout here as well.
554 (with-timeout build-timeout drv
555 (transfer-and-offload drv machine
556 #:inputs inputs
557 #:outputs outputs
558 #:max-silent-time max-silent-time
559 #:build-timeout build-timeout
560 #:print-build-trace?
561 print-build-trace?))))
562 (lambda ()
563 (release-build-slot slot)))
564
565 ;; Not now, all the machines are busy.
566 (display "# postpone\n")))))))
567
568 \f
569 ;;;
570 ;;; Installation tests.
571 ;;;
572
573 (define %short-timeout
574 ;; Timeout in seconds used on SSH connections where reads and writes
575 ;; shouldn't take long.
576 15)
577
578 (define (assert-node-repl node name)
579 "Bail out if NODE is not running Guile."
580 (match (node-guile-version node)
581 (#f
582 (report-guile-error name))
583 ((? string? version)
584 (info (G_ "'~a' is running GNU Guile ~a~%")
585 name (node-guile-version node)))))
586
587 (define (assert-node-has-guix node name)
588 "Bail out if NODE if #f or if we fail to use the (guix) module, or if its
589 daemon is not running."
590 (unless (inferior? node)
591 (leave (G_ "failed to run 'guix repl' on '~a'~%") name))
592
593 (match (inferior-eval '(begin
594 (use-modules (guix))
595 (and add-text-to-store 'alright))
596 node)
597 ('alright #t)
598 (_ (report-module-error name)))
599
600 (match (inferior-eval '(begin
601 (use-modules (guix))
602 (with-store store
603 (add-text-to-store store "test"
604 "Hello, build machine!")))
605 node)
606 ((? string? str)
607 (info (G_ "Guix is usable on '~a' (test returned ~s)~%")
608 name str))
609 (x
610 (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%")
611 name x))))
612
613 (define %random-state
614 (delay
615 (seed->random-state (logxor (getpid) (car (gettimeofday))))))
616
617 (define* (nonce #:optional (name (gethostname)))
618 (string-append name "-"
619 (number->string (random 1000000 (force %random-state)))))
620
621 (define (assert-node-can-import session node name daemon-socket)
622 "Bail out if NODE refuses to import our archives."
623 (with-store store
624 (let* ((item (add-text-to-store store "export-test" (nonce)))
625 (remote (connect-to-remote-daemon session daemon-socket)))
626 (with-store local
627 (send-files local (list item) remote))
628
629 (if (valid-path? remote item)
630 (info (G_ "'~a' successfully imported '~a'~%")
631 name item)
632 (leave (G_ "'~a' was not properly imported on '~a'~%")
633 item name)))))
634
635 (define (assert-node-can-export session node name daemon-socket)
636 "Bail out if we cannot import signed archives from NODE."
637 (let* ((remote (connect-to-remote-daemon session daemon-socket))
638 (item (add-text-to-store remote "import-test" (nonce name))))
639 (with-store store
640 (if (and (retrieve-files store (list item) remote)
641 (valid-path? store item))
642 (info (G_ "successfully imported '~a' from '~a'~%")
643 item name)
644 (leave (G_ "failed to import '~a' from '~a'~%")
645 item name)))))
646
647 (define (check-machine-availability machine-file pred)
648 "Check that each machine matching PRED in MACHINE-FILE is usable as a build
649 machine."
650 (define (build-machine=? m1 m2)
651 (and (string=? (build-machine-name m1) (build-machine-name m2))
652 (= (build-machine-port m1) (build-machine-port m2))))
653
654 ;; A given build machine may appear several times (e.g., once for
655 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
656 (let ((machines (filter pred
657 (delete-duplicates (build-machines machine-file)
658 build-machine=?))))
659 (info (G_ "testing ~a build machines defined in '~a'...~%")
660 (length machines) machine-file)
661 (let* ((names (map build-machine-name machines))
662 (sockets (map build-machine-daemon-socket machines))
663 (sessions (map (cut open-ssh-session <> %short-timeout) machines))
664 (nodes (map remote-inferior sessions)))
665 (for-each assert-node-has-guix nodes names)
666 (for-each assert-node-repl nodes names)
667 (for-each assert-node-can-import sessions nodes names sockets)
668 (for-each assert-node-can-export sessions nodes names sockets)
669 (for-each close-inferior nodes)
670 (for-each disconnect! sessions))))
671
672 (define (check-machine-status machine-file pred)
673 "Print the load of each machine matching PRED in MACHINE-FILE."
674 (define (build-machine=? m1 m2)
675 (and (string=? (build-machine-name m1) (build-machine-name m2))
676 (= (build-machine-port m1) (build-machine-port m2))))
677
678 ;; A given build machine may appear several times (e.g., once for
679 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
680 (let ((machines (filter pred
681 (delete-duplicates (build-machines machine-file)
682 build-machine=?))))
683 (info (G_ "getting status of ~a build machines defined in '~a'...~%")
684 (length machines) machine-file)
685 (for-each (lambda (machine)
686 (define session
687 (open-ssh-session machine %short-timeout))
688
689 (match (remote-inferior session)
690 (#f
691 (warning (G_ "failed to run 'guix repl' on machine '~a'~%")
692 (build-machine-name machine)))
693 ((? inferior? inferior)
694 (let ((now (car (gettimeofday))))
695 (match (inferior-eval '(list (uname)
696 (car (gettimeofday)))
697 inferior)
698 ((uts time)
699 (when (< time now)
700 ;; Build machine clocks must not be behind as this
701 ;; could cause timestamp issues.
702 (warning (G_ "machine '~a' is ~a seconds behind~%")
703 (build-machine-name machine)
704 (- now time)))
705
706 (let ((load (node-load inferior))
707 (free (node-free-disk-space inferior)))
708 (close-inferior inferior)
709 (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\
710 host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%\
711 time difference: ~a s~%"
712 (build-machine-name machine)
713 (utsname:sysname uts) (utsname:release uts)
714 (utsname:machine uts)
715 (utsname:nodename uts)
716 (normalized-load machine load)
717 (/ free (expt 2 20) 1.)
718 (- time now))))))))
719
720 (disconnect! session))
721 machines)))
722
723 \f
724 ;;;
725 ;;; Entry point.
726 ;;;
727
728 (define (guix-offload . args)
729 (define request-line-rx
730 ;; The request format. See 'tryBuildHook' method in build.cc.
731 (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))
732
733 (define not-coma
734 (char-set-complement (char-set #\,)))
735
736 ;; Make sure $HOME really corresponds to the current user. This is
737 ;; necessary since lsh uses that to determine the location of the yarrow
738 ;; seed file, and fails if it's owned by someone else.
739 (and=> (passwd:dir (getpw (getuid)))
740 (cut setenv "HOME" <>))
741
742 ;; We rely on protocol-level compression from libssh to optimize large data
743 ;; transfers. Warn if it's missing.
744 (unless (zlib-support?)
745 (warning (G_ "Guile-SSH lacks zlib support"))
746 (warning (G_ "data transfers will *not* be compressed!")))
747
748 (match args
749 ((system max-silent-time print-build-trace? build-timeout)
750 (let ((max-silent-time (string->number max-silent-time))
751 (build-timeout (string->number build-timeout))
752 (print-build-trace? (string=? print-build-trace? "1")))
753 (set-thread-name "guix offload")
754 (parameterize ((%current-system system))
755 (let loop ((line (read-line)))
756 (unless (eof-object? line)
757 (cond ((regexp-exec request-line-rx line)
758 =>
759 (lambda (match)
760 (with-error-handling
761 (process-request (equal? (match:substring match 1) "1")
762 (match:substring match 2) ; system
763 (read-derivation-from-file
764 (match:substring match 3))
765 (string-tokenize
766 (match:substring match 4) not-coma)
767 #:print-build-trace? print-build-trace?
768 #:max-silent-time max-silent-time
769 #:build-timeout build-timeout))))
770 (else
771 (leave (G_ "invalid request line: ~s~%") line)))
772 (loop (read-line)))))))
773 (("test" rest ...)
774 (with-error-handling
775 (let-values (((file pred)
776 (match rest
777 ((file regexp)
778 (values file
779 (compose (cut string-match regexp <>)
780 build-machine-name)))
781 ((file) (values file (const #t)))
782 (() (values %machine-file (const #t)))
783 (x (leave (G_ "wrong number of arguments~%"))))))
784 (check-machine-availability (or file %machine-file) pred))))
785 (("status" rest ...)
786 (with-error-handling
787 (let-values (((file pred)
788 (match rest
789 ((file regexp)
790 (values file
791 (compose (cut string-match regexp <>)
792 build-machine-name)))
793 ((file) (values file (const #t)))
794 (() (values %machine-file (const #t)))
795 (x (leave (G_ "wrong number of arguments~%"))))))
796 (check-machine-status (or file %machine-file) pred))))
797 (("--version")
798 (show-version-and-exit "guix offload"))
799 (("--help")
800 (format #t (G_ "Usage: guix offload SYSTEM MAX-SILENT-TIME \
801 PRINT-BUILD-TRACE? BUILD-TIMEOUT
802 Process build offload requests written on the standard input, possibly
803 offloading builds to the machines listed in '~a'.~%")
804 %machine-file)
805 (display (G_ "
806 This tool is meant to be used internally by 'guix-daemon'.\n"))
807 (show-bug-report-information))
808 (x
809 (leave (G_ "invalid arguments: ~{~s ~}~%") x))))
810
811 ;;; Local Variables:
812 ;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
813 ;;; eval: (put 'with-timeout 'scheme-indent-function 2)
814 ;;; End:
815
816 ;;; offload.scm ends here