copy, offload: Explicitly close SSH channels and sessions.
[jackhill/guix/guix.git] / guix / scripts / offload.scm
CommitLineData
49e6291a 1;;; GNU Guix --- Functional package management for GNU
d51bfe24 2;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
dafc3daf 3;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
4b5a6fbc 4;;; Copyright © 2020 Maxim Cournoyer <maxim.cournoyer@gmail.com>
49e6291a
LC
5;;;
6;;; This file is part of GNU Guix.
7;;;
8;;; GNU Guix is free software; you can redistribute it and/or modify it
9;;; under the terms of the GNU General Public License as published by
10;;; the Free Software Foundation; either version 3 of the License, or (at
11;;; your option) any later version.
12;;;
13;;; GNU Guix is distributed in the hope that it will be useful, but
14;;; WITHOUT ANY WARRANTY; without even the implied warranty of
15;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16;;; GNU General Public License for more details.
17;;;
18;;; You should have received a copy of the GNU General Public License
19;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
20
21(define-module (guix scripts offload)
21531add
LC
22 #:use-module (ssh key)
23 #:use-module (ssh auth)
24 #:use-module (ssh session)
25 #:use-module (ssh channel)
cf283dd9 26 #:use-module (ssh popen)
0b724753 27 #:use-module (ssh version)
49e6291a
LC
28 #:use-module (guix config)
29 #:use-module (guix records)
987a29ba 30 #:use-module (guix ssh)
49e6291a 31 #:use-module (guix store)
ed7b4437 32 #:use-module (guix inferior)
49e6291a 33 #:use-module (guix derivations)
2535635f
LC
34 #:use-module ((guix serialization)
35 #:select (nar-error? nar-error-file))
49e6291a 36 #:use-module (guix nar)
d51bfe24 37 #:use-module ((guix utils) #:select (%current-system))
8902d0f2
LC
38 #:use-module ((guix build syscalls)
39 #:select (fcntl-flock set-thread-name))
f326fef8 40 #:use-module ((guix build utils) #:select (which mkdir-p))
49e6291a 41 #:use-module (guix ui)
d51bfe24 42 #:use-module (guix diagnostics)
49e6291a 43 #:use-module (srfi srfi-1)
6c41cce0 44 #:use-module (srfi srfi-11)
49e6291a
LC
45 #:use-module (srfi srfi-26)
46 #:use-module (srfi srfi-34)
47 #:use-module (srfi srfi-35)
48 #:use-module (ice-9 popen)
49 #:use-module (ice-9 rdelim)
50 #:use-module (ice-9 match)
51 #:use-module (ice-9 regex)
52 #:use-module (ice-9 format)
2535635f 53 #:use-module (ice-9 binary-ports)
49e6291a
LC
54 #:export (build-machine
55 build-requirements
56 guix-offload))
57
58;;; Commentary:
59;;;
60;;; Attempt to offload builds to the machines listed in
61;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
62;;; retrieving the build output(s) over SSH upon success.
63;;;
64;;; This command should not be used directly; instead, it is called on-demand
dc209d5a 65;;; by the daemon, unless it was started with '--no-offload' or a client
49e6291a
LC
66;;; inhibited build hooks.
67;;;
68;;; Code:
69
49e6291a
LC
70(define-record-type* <build-machine>
71 build-machine make-build-machine
72 build-machine?
73 (name build-machine-name) ; string
cecd72d5
LC
74 (port build-machine-port ; number
75 (default 22))
4b5a6fbc
MC
76 (systems %build-machine-systems ; list of strings
77 (default #f)) ; drop default after system is removed
78 (system %build-machine-system ; deprecated
79 (default #f))
49e6291a
LC
80 (user build-machine-user) ; string
81 (private-key build-machine-private-key ; file name
21531add
LC
82 (default (user-openssh-private-key)))
83 (host-key build-machine-host-key) ; string
1d48cf94
LC
84 (compression build-machine-compression ; string
85 (default "zlib@openssh.com,zlib"))
86 (compression-level build-machine-compression-level ;integer
87 (default 3))
cf283dd9
LC
88 (daemon-socket build-machine-daemon-socket ; string
89 (default "/var/guix/daemon-socket/socket"))
49e6291a
LC
90 (parallel-builds build-machine-parallel-builds ; number
91 (default 1))
92 (speed build-machine-speed ; inexact real
93 (default 1.0))
94 (features build-machine-features ; list of strings
95 (default '())))
96
4b5a6fbc
MC
97;;; Deprecated.
98(define (build-machine-system machine)
99 (warning (G_ "The 'system' field is deprecated, \
100please use 'systems' instead.~%"))
101 (%build-machine-system machine))
102
103;;; TODO: Remove after the deprecated 'system' field is removed.
104(define (build-machine-systems machine)
105 (or (%build-machine-systems machine)
106 (list (build-machine-system machine))
107 (leave (G_ "The build-machine object lacks a value for its 'systems'
108field."))))
109
49e6291a
LC
110(define-record-type* <build-requirements>
111 build-requirements make-build-requirements
112 build-requirements?
113 (system build-requirements-system) ; string
114 (features build-requirements-features ; list of strings
115 (default '())))
116
117(define %machine-file
118 ;; File that lists machines available as build slaves.
119 (string-append %config-directory "/machines.scm"))
120
21531add
LC
121(define (user-openssh-private-key)
122 "Return the user's default SSH private key, or #f if it could not be
49e6291a
LC
123determined."
124 (and=> (getenv "HOME")
21531add 125 (cut string-append <> "/.ssh/id_rsa")))
49e6291a
LC
126
127(define %user-module
128 ;; Module in which the machine description file is loaded.
129 (let ((module (make-fresh-user-module)))
130 (module-use! module (resolve-interface '(guix scripts offload)))
131 module))
132
133(define* (build-machines #:optional (file %machine-file))
134 "Read the list of build machines from FILE and return it."
135 (catch #t
136 (lambda ()
137 ;; Avoid ABI incompatibility with the <build-machine> record.
e2721a05 138 ;; (set! %fresh-auto-compile #t)
49e6291a
LC
139
140 (save-module-excursion
141 (lambda ()
142 (set-current-module %user-module)
a9a685cc
LC
143 (match (primitive-load file)
144 (((? build-machine? machines) ...)
145 machines)
146 (_
147 ;; Instead of crashing, assume the empty list.
148 (warning (G_ "'~a' did not return a list of build machines; \
149ignoring it~%")
150 file)
151 '())))))
49e6291a
LC
152 (lambda args
153 (match args
e465d9e1 154 (('system-error . rest)
49e6291a
LC
155 (let ((err (system-error-errno args)))
156 ;; Silently ignore missing file since this is a common case.
157 (if (= ENOENT err)
158 '()
69daee23 159 (leave (G_ "failed to open machine file '~a': ~a~%")
92cb2e28 160 file (strerror err)))))
c1202fb1
LC
161 (('syntax-error proc message properties form . rest)
162 (let ((loc (source-properties->location properties)))
69daee23 163 (leave (G_ "~a: ~a~%")
c1202fb1 164 (location->string loc) message)))
e465d9e1 165 (x
69daee23 166 (leave (G_ "failed to load machine file '~a': ~s~%")
92cb2e28 167 file args))))))
49e6291a 168
21531add
LC
169(define (private-key-from-file* file)
170 "Like 'private-key-from-file', but raise an error that 'with-error-handling'
171can interpret meaningfully."
172 (catch 'guile-ssh-error
173 (lambda ()
174 (private-key-from-file file))
175 (lambda (key proc str . rest)
d51bfe24 176 (raise (formatted-message (G_ "failed to load SSH \
21531add 177private key from '~a': ~a")
d51bfe24 178 file str)))))
21531add 179
00d73219 180(define* (open-ssh-session machine #:optional (max-silent-time -1))
21531add
LC
181 "Open an SSH session for MACHINE and return it. Throw an error on failure."
182 (let ((private (private-key-from-file* (build-machine-private-key machine)))
183 (public (public-key-from-file
184 (string-append (build-machine-private-key machine)
185 ".pub")))
186 (session (make-session #:user (build-machine-user machine)
187 #:host (build-machine-name machine)
188 #:port (build-machine-port machine)
00d73219 189 #:timeout 10 ;initial timeout (seconds)
21531add
LC
190 ;; #:log-verbosity 'protocol
191 #:identity (build-machine-private-key machine)
192
bd834577
LC
193 ;; By default libssh reads ~/.ssh/known_hosts
194 ;; and uses that to adjust its choice of cipher
195 ;; suites, which changes the type of host key
196 ;; that the server sends (RSA vs. Ed25519,
197 ;; etc.). Opt for something reproducible and
198 ;; stateless instead.
199 #:knownhosts "/dev/null"
200
21531add
LC
201 ;; We need lightweight compression when
202 ;; exchanging full archives.
1d48cf94
LC
203 #:compression
204 (build-machine-compression machine)
205 #:compression-level
206 (build-machine-compression-level machine))))
74afca5d
LC
207 (match (connect! session)
208 ('ok
114dcb42
LC
209 ;; Make sure the server's key is what we expect.
210 (authenticate-server* session (build-machine-host-key machine))
74afca5d
LC
211
212 (let ((auth (userauth-public-key! session private)))
213 (unless (eq? 'success auth)
214 (disconnect! session)
69daee23 215 (leave (G_ "SSH public key authentication failed for '~a': ~a~%")
74afca5d
LC
216 (build-machine-name machine) (get-error session))))
217
00d73219
LC
218 ;; From then on use MAX-SILENT-TIME as the absolute timeout when
219 ;; reading from or write to a channel for this session.
220 (session-set! session 'timeout max-silent-time)
221
74afca5d
LC
222 session)
223 (x
224 ;; Connection failed or timeout expired.
69daee23 225 (leave (G_ "failed to connect to '~a': ~a~%")
74afca5d 226 (build-machine-name machine) (get-error session))))))
21531add 227
59f704df
LC
228\f
229;;;
230;;; Synchronization.
231;;;
232
59f704df
LC
233(define (machine-slot-file machine slot)
234 "Return the file name of MACHINE's file for SLOT."
235 ;; For each machine we have a bunch of files representing each build slot.
236 ;; When choosing a build machine, we attempt to get an exclusive lock on one
237 ;; of these; if we fail, that means all the build slots are already taken.
238 ;; Inspired by Nix's build-remote.pl.
239 (string-append (string-append %state-directory "/offload/"
a3af06ad
LC
240 (build-machine-name machine) ":"
241 (number->string (build-machine-port machine))
59f704df
LC
242 "/" (number->string slot))))
243
244(define (acquire-build-slot machine)
245 "Attempt to acquire a build slot on MACHINE. Return the port representing
246the slot, or #f if none is available.
247
248This mechanism allows us to set a hard limit on the number of simultaneous
249connections allowed to MACHINE."
250 (mkdir-p (dirname (machine-slot-file machine 0)))
0ef595b9
LC
251
252 ;; When several 'guix offload' processes run in parallel, there's a race
253 ;; among them, but since they try the slots in the same order, we're fine.
254 (any (lambda (slot)
255 (let ((port (open-file (machine-slot-file machine slot)
256 "w0")))
257 (catch 'flock-error
258 (lambda ()
259 (fcntl-flock port 'write-lock #:wait? #f)
260 ;; Got it!
261 (format (current-error-port)
262 "process ~a acquired build slot '~a'~%"
263 (getpid) (port-filename port))
264 port)
265 (lambda args
266 ;; PORT is already locked by another process.
267 (close-port port)
268 #f))))
269 (iota (build-machine-parallel-builds machine))))
59f704df
LC
270
271(define (release-build-slot slot)
272 "Release SLOT, a build slot as returned as by 'acquire-build-slot'."
273 (close-port slot))
274
275\f
276;;;
277;;; Offloading.
278;;;
279
d81195bf
LC
280(define (build-log-port)
281 "Return the default port where build logs should be sent. The default is
282file descriptor 4, which is open by the daemon before running the offload
283hook."
284 (let ((port (fdopen 4 "w0")))
285 ;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
286 (set-port-revealed! port 1)
287 port))
288
ed7b4437
LC
289(define (node-guile-version node)
290 (inferior-eval '(version) node))
291
63b0c3ea
LC
292(define (node-free-disk-space node)
293 "Return the free disk space, in bytes, in NODE's store."
ed7b4437
LC
294 (inferior-eval `(begin
295 (use-modules (guix build syscalls))
296 (free-disk-space ,(%store-prefix)))
297 node))
63b0c3ea 298
88da0b68
LC
299(define* (transfer-and-offload drv machine
300 #:key
301 (inputs '())
302 (outputs '())
303 (max-silent-time 3600)
714084e6 304 build-timeout
88da0b68
LC
305 print-build-trace?)
306 "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of
307INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
308MACHINE."
9e76eed3 309 (define session
00d73219 310 (open-ssh-session machine max-silent-time))
9e76eed3 311
cf283dd9
LC
312 (define store
313 (connect-to-remote-daemon session
314 (build-machine-daemon-socket machine)))
315
316 (set-build-options store
317 #:print-build-trace print-build-trace?
318 #:max-silent-time max-silent-time
319 #:timeout build-timeout)
320
321 ;; Protect DRV from garbage collection.
322 (add-temp-root store (derivation-file-name drv))
323
987a29ba
LC
324 (with-store local
325 (send-files local (cons (derivation-file-name drv) inputs) store
326 #:log-port (current-output-port)))
cf283dd9
LC
327 (format (current-error-port) "offloading '~a' to '~a'...~%"
328 (derivation-file-name drv) (build-machine-name machine))
329 (format (current-error-port) "@ build-remote ~a ~a~%"
330 (derivation-file-name drv) (build-machine-name machine))
331
f9e8a123 332 (guard (c ((store-protocol-error? c)
cf283dd9 333 (format (current-error-port)
69daee23 334 (G_ "derivation '~a' offloaded to '~a' failed: ~a~%")
cf283dd9
LC
335 (derivation-file-name drv)
336 (build-machine-name machine)
f9e8a123 337 (store-protocol-error-message c))
ed7b4437
LC
338 (let* ((inferior (false-if-exception (remote-inferior session)))
339 (space (false-if-exception
340 (node-free-disk-space inferior))))
341
342 (when inferior
343 (close-inferior inferior))
b96e05ae
LC
344
345 ;; Use exit code 100 for a permanent build failure. The daemon
346 ;; interprets other non-zero codes as transient build failures.
347 (if (and space (< space (* 10 (expt 2 20))))
348 (begin
349 (format (current-error-port)
350 (G_ "build failure may have been caused by lack \
351of free disk space on '~a'~%")
352 (build-machine-name machine))
353 (primitive-exit 1))
354 (primitive-exit 100)))))
0237d797
LC
355 (parameterize ((current-build-output-port (build-log-port)))
356 (build-derivations store (list drv))))
cf283dd9 357
d06d54e3
LC
358 (retrieve-files* outputs store
359
360 ;; We cannot use the 'import-paths' RPC here because we
361 ;; already hold the locks for FILES.
362 #:import
363 (lambda (port)
364 (restore-file-set port
365 #:log-port (current-error-port)
366 #:lock? #f)))
367
61fe9ced
LC
368 (close-connection store)
369 (disconnect! session)
cf283dd9
LC
370 (format (current-error-port) "done with offloaded '~a'~%"
371 (derivation-file-name drv)))
372
59f704df
LC
373\f
374;;;
375;;; Scheduling.
376;;;
377
49e6291a
LC
378(define (machine-matches? machine requirements)
379 "Return #t if MACHINE matches REQUIREMENTS."
4b5a6fbc
MC
380 (and (member (build-requirements-system requirements)
381 (build-machine-systems machine))
49e6291a
LC
382 (lset<= string=?
383 (build-requirements-features requirements)
384 (build-machine-features machine))))
385
63b0c3ea
LC
386(define %minimum-disk-space
387 ;; Minimum disk space required on the build machine for a build to be
388 ;; offloaded. This keeps us from offloading to machines that are bound to
389 ;; run out of disk space.
390 (* 100 (expt 2 20))) ;100 MiB
391
bbe66a53
LC
392(define (node-load node)
393 "Return the load on NODE. Return +∞ if NODE is misbehaving."
ed7b4437
LC
394 (let ((line (inferior-eval '(begin
395 (use-modules (ice-9 rdelim))
396 (call-with-input-file "/proc/loadavg"
397 read-string))
398 node)))
bbe66a53
LC
399 (if (eof-object? line)
400 +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
401 (match (string-tokenize line)
402 ((one five fifteen . x)
403 (string->number one))
404 (x
405 +inf.0)))))
406
407(define (normalized-load machine load)
408 "Divide LOAD by the number of parallel builds of MACHINE."
409 (if (rational? load)
410 (let* ((jobs (build-machine-parallel-builds machine))
411 (normalized (/ load jobs)))
412 (format (current-error-port) "load on machine '~a' is ~s\
165f4b2a 413 (normalized: ~s)~%"
bbe66a53
LC
414 (build-machine-name machine) load normalized)
415 normalized)
416 load))
165f4b2a 417
d8e89b1c
LC
418(define (random-seed)
419 (logxor (getpid) (car (gettimeofday))))
420
421(define shuffle
422 (let ((state (seed->random-state (random-seed))))
423 (lambda (lst)
424 "Return LST shuffled (using the Fisher-Yates algorithm.)"
425 (define vec (list->vector lst))
426 (let loop ((result '())
427 (i (vector-length vec)))
428 (if (zero? i)
429 result
430 (let* ((j (random i state))
431 (val (vector-ref vec j)))
432 (vector-set! vec j (vector-ref vec (- i 1)))
433 (loop (cons val result) (- i 1))))))))
434
88da0b68 435(define (choose-build-machine machines)
84620dd0
LC
436 "Return two values: the best machine among MACHINES and its build
437slot (which must later be released with 'release-build-slot'), or #f and #f."
d652b851
LC
438
439 ;; Proceed like this:
7f4d102c 440 ;; 1. For all MACHINES, attempt to acquire a build slot, and filter out
d652b851 441 ;; those machines for which we failed.
7f4d102c
LC
442 ;; 2. Choose the best machine among those that are left.
443 ;; 3. Release the previously-acquired build slots of the other machines.
444
445 (define machines+slots
446 (filter-map (lambda (machine)
447 (let ((slot (acquire-build-slot machine)))
448 (and slot (list machine slot))))
449 (shuffle machines)))
450
451 (define (undecorate pred)
452 (lambda (a b)
453 (match a
454 ((machine1 slot1)
455 (match b
456 ((machine2 slot2)
457 (pred machine1 machine2)))))))
458
459 (define (machine-faster? m1 m2)
460 ;; Return #t if M1 is faster than M2.
461 (> (build-machine-speed m1)
462 (build-machine-speed m2)))
463
464 (let loop ((machines+slots
465 (sort machines+slots (undecorate machine-faster?))))
466 (match machines+slots
467 (((best slot) others ...)
468 ;; Return the best machine unless it's already overloaded.
469 ;; Note: We call 'node-load' only as a last resort because it is
470 ;; too costly to call it once for every machine.
00d73219
LC
471 (let* ((session (false-if-exception (open-ssh-session best
472 %short-timeout)))
7f4d102c
LC
473 (node (and session (remote-inferior session)))
474 (load (and node (normalized-load best (node-load node))))
475 (space (and node (node-free-disk-space node))))
476 (when node (close-inferior node))
477 (when session (disconnect! session))
478 (if (and node (< load 2.) (>= space %minimum-disk-space))
479 (match others
480 (((machines slots) ...)
481 ;; Release slots from the uninteresting machines.
482 (for-each release-build-slot slots)
483
484 ;; The caller must keep SLOT to protect it from GC and to
485 ;; eventually release it.
486 (values best slot)))
487 (begin
488 ;; BEST is unsuitable, so try the next one.
489 (when (and space (< space %minimum-disk-space))
490 (format (current-error-port)
491 "skipping machine '~a' because it is low \
63b0c3ea 492on disk space (~,2f MiB free)~%"
7f4d102c
LC
493 (build-machine-name best)
494 (/ space (expt 2 20) 1.)))
495 (release-build-slot slot)
496 (loop others)))))
497 (()
498 (values #f #f)))))
49e6291a 499
a708de15
LC
500(define (call-with-timeout timeout drv thunk)
501 "Call THUNK and leave after TIMEOUT seconds. If TIMEOUT is #f, simply call
502THUNK. Use DRV as an indication of what we were building when the timeout
503expired."
504 (if (number? timeout)
505 (dynamic-wind
506 (lambda ()
507 (sigaction SIGALRM
508 (lambda _
509 ;; The exit code here will be 1, which guix-daemon will
510 ;; interpret as a transient failure.
511 (leave (G_ "timeout expired while offloading '~a'~%")
512 (derivation-file-name drv))))
513 (alarm timeout))
514 thunk
515 (lambda ()
516 (alarm 0)))
517 (thunk)))
518
519(define-syntax-rule (with-timeout timeout drv exp ...)
520 "Evaluate EXP... and leave after TIMEOUT seconds if EXP hasn't completed.
521If TIMEOUT is #f, simply evaluate EXP..."
522 (call-with-timeout timeout drv (lambda () exp ...)))
523
49e6291a
LC
524(define* (process-request wants-local? system drv features
525 #:key
526 print-build-trace? (max-silent-time 3600)
714084e6 527 build-timeout)
49e6291a 528 "Process a request to build DRV."
88da0b68
LC
529 (let* ((local? (and wants-local? (string=? system (%current-system))))
530 (reqs (build-requirements
531 (system system)
532 (features features)))
533 (candidates (filter (cut machine-matches? <> reqs)
534 (build-machines))))
535 (match candidates
536 (()
537 ;; We'll never be able to match REQS.
538 (display "# decline\n"))
ba97e454 539 ((x ...)
84620dd0
LC
540 (let-values (((machine slot)
541 (choose-build-machine candidates)))
88da0b68 542 (if machine
84620dd0
LC
543 (dynamic-wind
544 (const #f)
545 (lambda ()
546 ;; Offload DRV to MACHINE.
547 (display "# accept\n")
548 (let ((inputs (string-tokenize (read-line)))
549 (outputs (string-tokenize (read-line))))
a708de15
LC
550 ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can
551 ;; be issues with the connection or deadlocks that could
552 ;; lead the 'guix offload' process to remain stuck forever.
553 ;; To avoid that, install a timeout here as well.
554 (with-timeout build-timeout drv
555 (transfer-and-offload drv machine
556 #:inputs inputs
557 #:outputs outputs
558 #:max-silent-time max-silent-time
559 #:build-timeout build-timeout
560 #:print-build-trace?
561 print-build-trace?))))
84620dd0
LC
562 (lambda ()
563 (release-build-slot slot)))
88da0b68
LC
564
565 ;; Not now, all the machines are busy.
566 (display "# postpone\n")))))))
49e6291a 567
49e6291a 568\f
aebaee95
LC
569;;;
570;;; Installation tests.
571;;;
572
00d73219
LC
573(define %short-timeout
574 ;; Timeout in seconds used on SSH connections where reads and writes
575 ;; shouldn't take long.
576 15)
577
aebaee95
LC
578(define (assert-node-repl node name)
579 "Bail out if NODE is not running Guile."
580 (match (node-guile-version node)
581 (#f
4eb0f9ae 582 (report-guile-error name))
aebaee95 583 ((? string? version)
ed7b4437 584 (info (G_ "'~a' is running GNU Guile ~a~%")
aebaee95
LC
585 name (node-guile-version node)))))
586
587(define (assert-node-has-guix node name)
10b2834f
LC
588 "Bail out if NODE if #f or if we fail to use the (guix) module, or if its
589daemon is not running."
590 (unless (inferior? node)
591 (leave (G_ "failed to run 'guix repl' on '~a'~%") name))
592
593 (match (inferior-eval '(begin
594 (use-modules (guix))
595 (and add-text-to-store 'alright))
596 node)
597 ('alright #t)
598 (_ (report-module-error name)))
599
600 (match (inferior-eval '(begin
601 (use-modules (guix))
602 (with-store store
603 (add-text-to-store store "test"
604 "Hello, build machine!")))
605 node)
606 ((? string? str)
607 (info (G_ "Guix is usable on '~a' (test returned ~s)~%")
608 name str))
609 (x
610 (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%")
611 name x))))
aebaee95
LC
612
613(define %random-state
614 (delay
615 (seed->random-state (logxor (getpid) (car (gettimeofday))))))
616
987a29ba
LC
617(define* (nonce #:optional (name (gethostname)))
618 (string-append name "-"
aebaee95
LC
619 (number->string (random 1000000 (force %random-state)))))
620
ed7b4437 621(define (assert-node-can-import session node name daemon-socket)
aebaee95 622 "Bail out if NODE refuses to import our archives."
ed7b4437
LC
623 (with-store store
624 (let* ((item (add-text-to-store store "export-test" (nonce)))
625 (remote (connect-to-remote-daemon session daemon-socket)))
626 (with-store local
627 (send-files local (list item) remote))
628
629 (if (valid-path? remote item)
630 (info (G_ "'~a' successfully imported '~a'~%")
631 name item)
632 (leave (G_ "'~a' was not properly imported on '~a'~%")
633 item name)))))
634
635(define (assert-node-can-export session node name daemon-socket)
aebaee95 636 "Bail out if we cannot import signed archives from NODE."
ed7b4437 637 (let* ((remote (connect-to-remote-daemon session daemon-socket))
987a29ba 638 (item (add-text-to-store remote "import-test" (nonce name))))
aebaee95 639 (with-store store
987a29ba 640 (if (and (retrieve-files store (list item) remote)
aebaee95 641 (valid-path? store item))
69daee23 642 (info (G_ "successfully imported '~a' from '~a'~%")
aebaee95 643 item name)
69daee23 644 (leave (G_ "failed to import '~a' from '~a'~%")
aebaee95
LC
645 item name)))))
646
27991c97
LC
647(define (check-machine-availability machine-file pred)
648 "Check that each machine matching PRED in MACHINE-FILE is usable as a build
649machine."
2b513387
LC
650 (define (build-machine=? m1 m2)
651 (and (string=? (build-machine-name m1) (build-machine-name m2))
652 (= (build-machine-port m1) (build-machine-port m2))))
653
654 ;; A given build machine may appear several times (e.g., once for
655 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
27991c97
LC
656 (let ((machines (filter pred
657 (delete-duplicates (build-machines machine-file)
658 build-machine=?))))
69daee23 659 (info (G_ "testing ~a build machines defined in '~a'...~%")
aebaee95
LC
660 (length machines) machine-file)
661 (let* ((names (map build-machine-name machines))
662 (sockets (map build-machine-daemon-socket machines))
00d73219 663 (sessions (map (cut open-ssh-session <> %short-timeout) machines))
ed7b4437 664 (nodes (map remote-inferior sessions)))
aebaee95 665 (for-each assert-node-has-guix nodes names)
10b2834f 666 (for-each assert-node-repl nodes names)
ed7b4437
LC
667 (for-each assert-node-can-import sessions nodes names sockets)
668 (for-each assert-node-can-export sessions nodes names sockets)
669 (for-each close-inferior nodes)
670 (for-each disconnect! sessions))))
aebaee95 671
dafc3daf
RW
672(define (check-machine-status machine-file pred)
673 "Print the load of each machine matching PRED in MACHINE-FILE."
674 (define (build-machine=? m1 m2)
675 (and (string=? (build-machine-name m1) (build-machine-name m2))
676 (= (build-machine-port m1) (build-machine-port m2))))
677
678 ;; A given build machine may appear several times (e.g., once for
679 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
680 (let ((machines (filter pred
681 (delete-duplicates (build-machines machine-file)
682 build-machine=?))))
683 (info (G_ "getting status of ~a build machines defined in '~a'...~%")
684 (length machines) machine-file)
685 (for-each (lambda (machine)
10b2834f 686 (define session
00d73219 687 (open-ssh-session machine %short-timeout))
10b2834f
LC
688
689 (match (remote-inferior session)
690 (#f
691 (warning (G_ "failed to run 'guix repl' on machine '~a'~%")
692 (build-machine-name machine)))
693 ((? inferior? inferior)
02ec889e
LC
694 (let ((now (car (gettimeofday))))
695 (match (inferior-eval '(list (uname)
696 (car (gettimeofday)))
697 inferior)
698 ((uts time)
699 (when (< time now)
700 ;; Build machine clocks must not be behind as this
701 ;; could cause timestamp issues.
702 (warning (G_ "machine '~a' is ~a seconds behind~%")
703 (build-machine-name machine)
704 (- now time)))
705
706 (let ((load (node-load inferior))
707 (free (node-free-disk-space inferior)))
708 (close-inferior inferior)
709 (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\
710 host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%\
711 time difference: ~a s~%"
712 (build-machine-name machine)
713 (utsname:sysname uts) (utsname:release uts)
714 (utsname:machine uts)
715 (utsname:nodename uts)
716 (normalized-load machine load)
717 (/ free (expt 2 20) 1.)
718 (- time now))))))))
10b2834f
LC
719
720 (disconnect! session))
dafc3daf
RW
721 machines)))
722
aebaee95 723\f
49e6291a
LC
724;;;
725;;; Entry point.
726;;;
727
728(define (guix-offload . args)
729 (define request-line-rx
730 ;; The request format. See 'tryBuildHook' method in build.cc.
731 (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))
732
733 (define not-coma
734 (char-set-complement (char-set #\,)))
735
736 ;; Make sure $HOME really corresponds to the current user. This is
737 ;; necessary since lsh uses that to determine the location of the yarrow
738 ;; seed file, and fails if it's owned by someone else.
739 (and=> (passwd:dir (getpw (getuid)))
740 (cut setenv "HOME" <>))
741
0b724753
LC
742 ;; We rely on protocol-level compression from libssh to optimize large data
743 ;; transfers. Warn if it's missing.
744 (unless (zlib-support?)
69daee23
LC
745 (warning (G_ "Guile-SSH lacks zlib support"))
746 (warning (G_ "data transfers will *not* be compressed!")))
0b724753 747
49e6291a
LC
748 (match args
749 ((system max-silent-time print-build-trace? build-timeout)
750 (let ((max-silent-time (string->number max-silent-time))
751 (build-timeout (string->number build-timeout))
752 (print-build-trace? (string=? print-build-trace? "1")))
8902d0f2 753 (set-thread-name "guix offload")
49e6291a
LC
754 (parameterize ((%current-system system))
755 (let loop ((line (read-line)))
756 (unless (eof-object? line)
757 (cond ((regexp-exec request-line-rx line)
758 =>
759 (lambda (match)
e8a5db80 760 (with-error-handling
49e6291a
LC
761 (process-request (equal? (match:substring match 1) "1")
762 (match:substring match 2) ; system
015f17e8
LC
763 (read-derivation-from-file
764 (match:substring match 3))
49e6291a
LC
765 (string-tokenize
766 (match:substring match 4) not-coma)
767 #:print-build-trace? print-build-trace?
768 #:max-silent-time max-silent-time
769 #:build-timeout build-timeout))))
770 (else
69daee23 771 (leave (G_ "invalid request line: ~s~%") line)))
49e6291a 772 (loop (read-line)))))))
aebaee95
LC
773 (("test" rest ...)
774 (with-error-handling
27991c97
LC
775 (let-values (((file pred)
776 (match rest
777 ((file regexp)
778 (values file
779 (compose (cut string-match regexp <>)
780 build-machine-name)))
781 ((file) (values file (const #t)))
782 (() (values %machine-file (const #t)))
69daee23 783 (x (leave (G_ "wrong number of arguments~%"))))))
27991c97 784 (check-machine-availability (or file %machine-file) pred))))
dafc3daf
RW
785 (("status" rest ...)
786 (with-error-handling
787 (let-values (((file pred)
788 (match rest
789 ((file regexp)
790 (values file
791 (compose (cut string-match regexp <>)
792 build-machine-name)))
793 ((file) (values file (const #t)))
794 (() (values %machine-file (const #t)))
795 (x (leave (G_ "wrong number of arguments~%"))))))
796 (check-machine-status (or file %machine-file) pred))))
49e6291a
LC
797 (("--version")
798 (show-version-and-exit "guix offload"))
799 (("--help")
54a87b2a
MC
800 (format #t (G_ "Usage: guix offload SYSTEM MAX-SILENT-TIME \
801PRINT-BUILD-TRACE? BUILD-TIMEOUT
49e6291a
LC
802Process build offload requests written on the standard input, possibly
803offloading builds to the machines listed in '~a'.~%")
804 %machine-file)
69daee23 805 (display (G_ "
49e6291a
LC
806This tool is meant to be used internally by 'guix-daemon'.\n"))
807 (show-bug-report-information))
808 (x
69daee23 809 (leave (G_ "invalid arguments: ~{~s ~}~%") x))))
49e6291a 810
f326fef8 811;;; Local Variables:
d81195bf 812;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
a708de15 813;;; eval: (put 'with-timeout 'scheme-indent-function 2)
f326fef8
LC
814;;; End:
815
49e6291a 816;;; offload.scm ends here