gnu: surgescript: Update to 0.5.4.4.
[jackhill/guix/guix.git] / guix / scripts / offload.scm
CommitLineData
49e6291a 1;;; GNU Guix --- Functional package management for GNU
d51bfe24 2;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
dafc3daf 3;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
4b5a6fbc 4;;; Copyright © 2020 Maxim Cournoyer <maxim.cournoyer@gmail.com>
49e6291a
LC
5;;;
6;;; This file is part of GNU Guix.
7;;;
8;;; GNU Guix is free software; you can redistribute it and/or modify it
9;;; under the terms of the GNU General Public License as published by
10;;; the Free Software Foundation; either version 3 of the License, or (at
11;;; your option) any later version.
12;;;
13;;; GNU Guix is distributed in the hope that it will be useful, but
14;;; WITHOUT ANY WARRANTY; without even the implied warranty of
15;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16;;; GNU General Public License for more details.
17;;;
18;;; You should have received a copy of the GNU General Public License
19;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
20
21(define-module (guix scripts offload)
21531add
LC
22 #:use-module (ssh key)
23 #:use-module (ssh auth)
24 #:use-module (ssh session)
25 #:use-module (ssh channel)
cf283dd9 26 #:use-module (ssh popen)
0b724753 27 #:use-module (ssh version)
49e6291a
LC
28 #:use-module (guix config)
29 #:use-module (guix records)
987a29ba 30 #:use-module (guix ssh)
49e6291a 31 #:use-module (guix store)
ed7b4437 32 #:use-module (guix inferior)
49e6291a 33 #:use-module (guix derivations)
2535635f
LC
34 #:use-module ((guix serialization)
35 #:select (nar-error? nar-error-file))
49e6291a 36 #:use-module (guix nar)
d51bfe24 37 #:use-module ((guix utils) #:select (%current-system))
8902d0f2
LC
38 #:use-module ((guix build syscalls)
39 #:select (fcntl-flock set-thread-name))
f326fef8 40 #:use-module ((guix build utils) #:select (which mkdir-p))
49e6291a 41 #:use-module (guix ui)
3794ce93 42 #:use-module (guix scripts)
d51bfe24 43 #:use-module (guix diagnostics)
49e6291a 44 #:use-module (srfi srfi-1)
6c41cce0 45 #:use-module (srfi srfi-11)
49e6291a
LC
46 #:use-module (srfi srfi-26)
47 #:use-module (srfi srfi-34)
48 #:use-module (srfi srfi-35)
49 #:use-module (ice-9 popen)
50 #:use-module (ice-9 rdelim)
51 #:use-module (ice-9 match)
52 #:use-module (ice-9 regex)
53 #:use-module (ice-9 format)
2535635f 54 #:use-module (ice-9 binary-ports)
49e6291a
LC
55 #:export (build-machine
56 build-requirements
57 guix-offload))
58
59;;; Commentary:
60;;;
61;;; Attempt to offload builds to the machines listed in
62;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
63;;; retrieving the build output(s) over SSH upon success.
64;;;
65;;; This command should not be used directly; instead, it is called on-demand
dc209d5a 66;;; by the daemon, unless it was started with '--no-offload' or a client
49e6291a
LC
67;;; inhibited build hooks.
68;;;
69;;; Code:
70
49e6291a
LC
71(define-record-type* <build-machine>
72 build-machine make-build-machine
73 build-machine?
74 (name build-machine-name) ; string
cecd72d5
LC
75 (port build-machine-port ; number
76 (default 22))
4b5a6fbc
MC
77 (systems %build-machine-systems ; list of strings
78 (default #f)) ; drop default after system is removed
79 (system %build-machine-system ; deprecated
80 (default #f))
49e6291a
LC
81 (user build-machine-user) ; string
82 (private-key build-machine-private-key ; file name
21531add
LC
83 (default (user-openssh-private-key)))
84 (host-key build-machine-host-key) ; string
1d48cf94
LC
85 (compression build-machine-compression ; string
86 (default "zlib@openssh.com,zlib"))
87 (compression-level build-machine-compression-level ;integer
88 (default 3))
cf283dd9
LC
89 (daemon-socket build-machine-daemon-socket ; string
90 (default "/var/guix/daemon-socket/socket"))
49e6291a
LC
91 (parallel-builds build-machine-parallel-builds ; number
92 (default 1))
93 (speed build-machine-speed ; inexact real
94 (default 1.0))
95 (features build-machine-features ; list of strings
96 (default '())))
97
4b5a6fbc
MC
98;;; Deprecated.
99(define (build-machine-system machine)
100 (warning (G_ "The 'system' field is deprecated, \
101please use 'systems' instead.~%"))
102 (%build-machine-system machine))
103
104;;; TODO: Remove after the deprecated 'system' field is removed.
105(define (build-machine-systems machine)
106 (or (%build-machine-systems machine)
107 (list (build-machine-system machine))
108 (leave (G_ "The build-machine object lacks a value for its 'systems'
109field."))))
110
49e6291a
LC
111(define-record-type* <build-requirements>
112 build-requirements make-build-requirements
113 build-requirements?
114 (system build-requirements-system) ; string
115 (features build-requirements-features ; list of strings
116 (default '())))
117
118(define %machine-file
119 ;; File that lists machines available as build slaves.
120 (string-append %config-directory "/machines.scm"))
121
21531add
LC
122(define (user-openssh-private-key)
123 "Return the user's default SSH private key, or #f if it could not be
49e6291a
LC
124determined."
125 (and=> (getenv "HOME")
21531add 126 (cut string-append <> "/.ssh/id_rsa")))
49e6291a
LC
127
128(define %user-module
129 ;; Module in which the machine description file is loaded.
130 (let ((module (make-fresh-user-module)))
131 (module-use! module (resolve-interface '(guix scripts offload)))
132 module))
133
134(define* (build-machines #:optional (file %machine-file))
135 "Read the list of build machines from FILE and return it."
136 (catch #t
137 (lambda ()
138 ;; Avoid ABI incompatibility with the <build-machine> record.
e2721a05 139 ;; (set! %fresh-auto-compile #t)
49e6291a
LC
140
141 (save-module-excursion
142 (lambda ()
143 (set-current-module %user-module)
a9a685cc
LC
144 (match (primitive-load file)
145 (((? build-machine? machines) ...)
146 machines)
147 (_
148 ;; Instead of crashing, assume the empty list.
149 (warning (G_ "'~a' did not return a list of build machines; \
150ignoring it~%")
151 file)
152 '())))))
49e6291a
LC
153 (lambda args
154 (match args
e465d9e1 155 (('system-error . rest)
49e6291a
LC
156 (let ((err (system-error-errno args)))
157 ;; Silently ignore missing file since this is a common case.
158 (if (= ENOENT err)
159 '()
69daee23 160 (leave (G_ "failed to open machine file '~a': ~a~%")
92cb2e28 161 file (strerror err)))))
c1202fb1
LC
162 (('syntax-error proc message properties form . rest)
163 (let ((loc (source-properties->location properties)))
69daee23 164 (leave (G_ "~a: ~a~%")
c1202fb1 165 (location->string loc) message)))
e465d9e1 166 (x
69daee23 167 (leave (G_ "failed to load machine file '~a': ~s~%")
92cb2e28 168 file args))))))
49e6291a 169
21531add
LC
170(define (private-key-from-file* file)
171 "Like 'private-key-from-file', but raise an error that 'with-error-handling'
172can interpret meaningfully."
173 (catch 'guile-ssh-error
174 (lambda ()
175 (private-key-from-file file))
176 (lambda (key proc str . rest)
d51bfe24 177 (raise (formatted-message (G_ "failed to load SSH \
21531add 178private key from '~a': ~a")
d51bfe24 179 file str)))))
21531add 180
00d73219 181(define* (open-ssh-session machine #:optional (max-silent-time -1))
21531add
LC
182 "Open an SSH session for MACHINE and return it. Throw an error on failure."
183 (let ((private (private-key-from-file* (build-machine-private-key machine)))
184 (public (public-key-from-file
185 (string-append (build-machine-private-key machine)
186 ".pub")))
187 (session (make-session #:user (build-machine-user machine)
188 #:host (build-machine-name machine)
189 #:port (build-machine-port machine)
00d73219 190 #:timeout 10 ;initial timeout (seconds)
21531add
LC
191 ;; #:log-verbosity 'protocol
192 #:identity (build-machine-private-key machine)
193
bd834577
LC
194 ;; By default libssh reads ~/.ssh/known_hosts
195 ;; and uses that to adjust its choice of cipher
196 ;; suites, which changes the type of host key
197 ;; that the server sends (RSA vs. Ed25519,
198 ;; etc.). Opt for something reproducible and
199 ;; stateless instead.
200 #:knownhosts "/dev/null"
201
21531add
LC
202 ;; We need lightweight compression when
203 ;; exchanging full archives.
1d48cf94
LC
204 #:compression
205 (build-machine-compression machine)
206 #:compression-level
207 (build-machine-compression-level machine))))
74afca5d
LC
208 (match (connect! session)
209 ('ok
114dcb42
LC
210 ;; Make sure the server's key is what we expect.
211 (authenticate-server* session (build-machine-host-key machine))
74afca5d
LC
212
213 (let ((auth (userauth-public-key! session private)))
214 (unless (eq? 'success auth)
215 (disconnect! session)
69daee23 216 (leave (G_ "SSH public key authentication failed for '~a': ~a~%")
74afca5d
LC
217 (build-machine-name machine) (get-error session))))
218
00d73219
LC
219 ;; From then on use MAX-SILENT-TIME as the absolute timeout when
220 ;; reading from or write to a channel for this session.
221 (session-set! session 'timeout max-silent-time)
222
74afca5d
LC
223 session)
224 (x
225 ;; Connection failed or timeout expired.
69daee23 226 (leave (G_ "failed to connect to '~a': ~a~%")
74afca5d 227 (build-machine-name machine) (get-error session))))))
21531add 228
59f704df
LC
229\f
230;;;
231;;; Synchronization.
232;;;
233
59f704df
LC
234(define (machine-slot-file machine slot)
235 "Return the file name of MACHINE's file for SLOT."
236 ;; For each machine we have a bunch of files representing each build slot.
237 ;; When choosing a build machine, we attempt to get an exclusive lock on one
238 ;; of these; if we fail, that means all the build slots are already taken.
239 ;; Inspired by Nix's build-remote.pl.
240 (string-append (string-append %state-directory "/offload/"
a3af06ad
LC
241 (build-machine-name machine) ":"
242 (number->string (build-machine-port machine))
59f704df
LC
243 "/" (number->string slot))))
244
245(define (acquire-build-slot machine)
246 "Attempt to acquire a build slot on MACHINE. Return the port representing
247the slot, or #f if none is available.
248
249This mechanism allows us to set a hard limit on the number of simultaneous
250connections allowed to MACHINE."
251 (mkdir-p (dirname (machine-slot-file machine 0)))
0ef595b9
LC
252
253 ;; When several 'guix offload' processes run in parallel, there's a race
254 ;; among them, but since they try the slots in the same order, we're fine.
255 (any (lambda (slot)
256 (let ((port (open-file (machine-slot-file machine slot)
257 "w0")))
258 (catch 'flock-error
259 (lambda ()
260 (fcntl-flock port 'write-lock #:wait? #f)
261 ;; Got it!
262 (format (current-error-port)
263 "process ~a acquired build slot '~a'~%"
264 (getpid) (port-filename port))
265 port)
266 (lambda args
267 ;; PORT is already locked by another process.
268 (close-port port)
269 #f))))
270 (iota (build-machine-parallel-builds machine))))
59f704df
LC
271
272(define (release-build-slot slot)
273 "Release SLOT, a build slot as returned as by 'acquire-build-slot'."
274 (close-port slot))
275
276\f
277;;;
278;;; Offloading.
279;;;
280
d81195bf
LC
281(define (build-log-port)
282 "Return the default port where build logs should be sent. The default is
283file descriptor 4, which is open by the daemon before running the offload
284hook."
285 (let ((port (fdopen 4 "w0")))
286 ;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
287 (set-port-revealed! port 1)
288 port))
289
ed7b4437
LC
290(define (node-guile-version node)
291 (inferior-eval '(version) node))
292
63b0c3ea
LC
293(define (node-free-disk-space node)
294 "Return the free disk space, in bytes, in NODE's store."
ed7b4437
LC
295 (inferior-eval `(begin
296 (use-modules (guix build syscalls))
297 (free-disk-space ,(%store-prefix)))
298 node))
63b0c3ea 299
88da0b68
LC
300(define* (transfer-and-offload drv machine
301 #:key
302 (inputs '())
303 (outputs '())
304 (max-silent-time 3600)
714084e6 305 build-timeout
88da0b68
LC
306 print-build-trace?)
307 "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of
308INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
309MACHINE."
9e76eed3 310 (define session
00d73219 311 (open-ssh-session machine max-silent-time))
9e76eed3 312
cf283dd9
LC
313 (define store
314 (connect-to-remote-daemon session
315 (build-machine-daemon-socket machine)))
316
317 (set-build-options store
318 #:print-build-trace print-build-trace?
319 #:max-silent-time max-silent-time
320 #:timeout build-timeout)
321
322 ;; Protect DRV from garbage collection.
323 (add-temp-root store (derivation-file-name drv))
324
987a29ba
LC
325 (with-store local
326 (send-files local (cons (derivation-file-name drv) inputs) store
327 #:log-port (current-output-port)))
cf283dd9
LC
328 (format (current-error-port) "offloading '~a' to '~a'...~%"
329 (derivation-file-name drv) (build-machine-name machine))
330 (format (current-error-port) "@ build-remote ~a ~a~%"
331 (derivation-file-name drv) (build-machine-name machine))
332
f9e8a123 333 (guard (c ((store-protocol-error? c)
cf283dd9 334 (format (current-error-port)
69daee23 335 (G_ "derivation '~a' offloaded to '~a' failed: ~a~%")
cf283dd9
LC
336 (derivation-file-name drv)
337 (build-machine-name machine)
f9e8a123 338 (store-protocol-error-message c))
ed7b4437
LC
339 (let* ((inferior (false-if-exception (remote-inferior session)))
340 (space (false-if-exception
341 (node-free-disk-space inferior))))
342
343 (when inferior
344 (close-inferior inferior))
b96e05ae
LC
345
346 ;; Use exit code 100 for a permanent build failure. The daemon
347 ;; interprets other non-zero codes as transient build failures.
348 (if (and space (< space (* 10 (expt 2 20))))
349 (begin
350 (format (current-error-port)
351 (G_ "build failure may have been caused by lack \
352of free disk space on '~a'~%")
353 (build-machine-name machine))
354 (primitive-exit 1))
355 (primitive-exit 100)))))
0237d797
LC
356 (parameterize ((current-build-output-port (build-log-port)))
357 (build-derivations store (list drv))))
cf283dd9 358
d06d54e3
LC
359 (retrieve-files* outputs store
360
361 ;; We cannot use the 'import-paths' RPC here because we
362 ;; already hold the locks for FILES.
363 #:import
364 (lambda (port)
365 (restore-file-set port
366 #:log-port (current-error-port)
367 #:lock? #f)))
368
61fe9ced
LC
369 (close-connection store)
370 (disconnect! session)
cf283dd9
LC
371 (format (current-error-port) "done with offloaded '~a'~%"
372 (derivation-file-name drv)))
373
59f704df
LC
374\f
375;;;
376;;; Scheduling.
377;;;
378
49e6291a
LC
379(define (machine-matches? machine requirements)
380 "Return #t if MACHINE matches REQUIREMENTS."
4b5a6fbc
MC
381 (and (member (build-requirements-system requirements)
382 (build-machine-systems machine))
49e6291a
LC
383 (lset<= string=?
384 (build-requirements-features requirements)
385 (build-machine-features machine))))
386
63b0c3ea
LC
387(define %minimum-disk-space
388 ;; Minimum disk space required on the build machine for a build to be
389 ;; offloaded. This keeps us from offloading to machines that are bound to
390 ;; run out of disk space.
391 (* 100 (expt 2 20))) ;100 MiB
392
bbe66a53
LC
393(define (node-load node)
394 "Return the load on NODE. Return +∞ if NODE is misbehaving."
ed7b4437
LC
395 (let ((line (inferior-eval '(begin
396 (use-modules (ice-9 rdelim))
397 (call-with-input-file "/proc/loadavg"
398 read-string))
399 node)))
bbe66a53
LC
400 (if (eof-object? line)
401 +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
402 (match (string-tokenize line)
403 ((one five fifteen . x)
404 (string->number one))
405 (x
406 +inf.0)))))
407
408(define (normalized-load machine load)
409 "Divide LOAD by the number of parallel builds of MACHINE."
410 (if (rational? load)
411 (let* ((jobs (build-machine-parallel-builds machine))
412 (normalized (/ load jobs)))
413 (format (current-error-port) "load on machine '~a' is ~s\
165f4b2a 414 (normalized: ~s)~%"
bbe66a53
LC
415 (build-machine-name machine) load normalized)
416 normalized)
417 load))
165f4b2a 418
d8e89b1c
LC
419(define (random-seed)
420 (logxor (getpid) (car (gettimeofday))))
421
422(define shuffle
423 (let ((state (seed->random-state (random-seed))))
424 (lambda (lst)
425 "Return LST shuffled (using the Fisher-Yates algorithm.)"
426 (define vec (list->vector lst))
427 (let loop ((result '())
428 (i (vector-length vec)))
429 (if (zero? i)
430 result
431 (let* ((j (random i state))
432 (val (vector-ref vec j)))
433 (vector-set! vec j (vector-ref vec (- i 1)))
434 (loop (cons val result) (- i 1))))))))
435
88da0b68 436(define (choose-build-machine machines)
84620dd0
LC
437 "Return two values: the best machine among MACHINES and its build
438slot (which must later be released with 'release-build-slot'), or #f and #f."
d652b851
LC
439
440 ;; Proceed like this:
7f4d102c 441 ;; 1. For all MACHINES, attempt to acquire a build slot, and filter out
d652b851 442 ;; those machines for which we failed.
7f4d102c
LC
443 ;; 2. Choose the best machine among those that are left.
444 ;; 3. Release the previously-acquired build slots of the other machines.
445
446 (define machines+slots
447 (filter-map (lambda (machine)
448 (let ((slot (acquire-build-slot machine)))
449 (and slot (list machine slot))))
450 (shuffle machines)))
451
452 (define (undecorate pred)
453 (lambda (a b)
454 (match a
455 ((machine1 slot1)
456 (match b
457 ((machine2 slot2)
458 (pred machine1 machine2)))))))
459
460 (define (machine-faster? m1 m2)
461 ;; Return #t if M1 is faster than M2.
462 (> (build-machine-speed m1)
463 (build-machine-speed m2)))
464
465 (let loop ((machines+slots
466 (sort machines+slots (undecorate machine-faster?))))
467 (match machines+slots
468 (((best slot) others ...)
469 ;; Return the best machine unless it's already overloaded.
470 ;; Note: We call 'node-load' only as a last resort because it is
471 ;; too costly to call it once for every machine.
00d73219
LC
472 (let* ((session (false-if-exception (open-ssh-session best
473 %short-timeout)))
7f4d102c
LC
474 (node (and session (remote-inferior session)))
475 (load (and node (normalized-load best (node-load node))))
476 (space (and node (node-free-disk-space node))))
477 (when node (close-inferior node))
478 (when session (disconnect! session))
479 (if (and node (< load 2.) (>= space %minimum-disk-space))
480 (match others
481 (((machines slots) ...)
482 ;; Release slots from the uninteresting machines.
483 (for-each release-build-slot slots)
484
485 ;; The caller must keep SLOT to protect it from GC and to
486 ;; eventually release it.
487 (values best slot)))
488 (begin
489 ;; BEST is unsuitable, so try the next one.
490 (when (and space (< space %minimum-disk-space))
491 (format (current-error-port)
492 "skipping machine '~a' because it is low \
63b0c3ea 493on disk space (~,2f MiB free)~%"
7f4d102c
LC
494 (build-machine-name best)
495 (/ space (expt 2 20) 1.)))
496 (release-build-slot slot)
497 (loop others)))))
498 (()
499 (values #f #f)))))
49e6291a 500
a708de15
LC
501(define (call-with-timeout timeout drv thunk)
502 "Call THUNK and leave after TIMEOUT seconds. If TIMEOUT is #f, simply call
503THUNK. Use DRV as an indication of what we were building when the timeout
504expired."
505 (if (number? timeout)
506 (dynamic-wind
507 (lambda ()
508 (sigaction SIGALRM
509 (lambda _
510 ;; The exit code here will be 1, which guix-daemon will
511 ;; interpret as a transient failure.
512 (leave (G_ "timeout expired while offloading '~a'~%")
513 (derivation-file-name drv))))
514 (alarm timeout))
515 thunk
516 (lambda ()
517 (alarm 0)))
518 (thunk)))
519
520(define-syntax-rule (with-timeout timeout drv exp ...)
521 "Evaluate EXP... and leave after TIMEOUT seconds if EXP hasn't completed.
522If TIMEOUT is #f, simply evaluate EXP..."
523 (call-with-timeout timeout drv (lambda () exp ...)))
524
49e6291a
LC
525(define* (process-request wants-local? system drv features
526 #:key
527 print-build-trace? (max-silent-time 3600)
714084e6 528 build-timeout)
49e6291a 529 "Process a request to build DRV."
88da0b68
LC
530 (let* ((local? (and wants-local? (string=? system (%current-system))))
531 (reqs (build-requirements
532 (system system)
533 (features features)))
534 (candidates (filter (cut machine-matches? <> reqs)
535 (build-machines))))
536 (match candidates
537 (()
538 ;; We'll never be able to match REQS.
539 (display "# decline\n"))
ba97e454 540 ((x ...)
84620dd0
LC
541 (let-values (((machine slot)
542 (choose-build-machine candidates)))
88da0b68 543 (if machine
84620dd0
LC
544 (dynamic-wind
545 (const #f)
546 (lambda ()
547 ;; Offload DRV to MACHINE.
548 (display "# accept\n")
549 (let ((inputs (string-tokenize (read-line)))
550 (outputs (string-tokenize (read-line))))
a708de15
LC
551 ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can
552 ;; be issues with the connection or deadlocks that could
553 ;; lead the 'guix offload' process to remain stuck forever.
554 ;; To avoid that, install a timeout here as well.
555 (with-timeout build-timeout drv
556 (transfer-and-offload drv machine
557 #:inputs inputs
558 #:outputs outputs
559 #:max-silent-time max-silent-time
560 #:build-timeout build-timeout
561 #:print-build-trace?
562 print-build-trace?))))
84620dd0
LC
563 (lambda ()
564 (release-build-slot slot)))
88da0b68
LC
565
566 ;; Not now, all the machines are busy.
567 (display "# postpone\n")))))))
49e6291a 568
49e6291a 569\f
aebaee95
LC
570;;;
571;;; Installation tests.
572;;;
573
00d73219
LC
574(define %short-timeout
575 ;; Timeout in seconds used on SSH connections where reads and writes
576 ;; shouldn't take long.
577 15)
578
aebaee95
LC
579(define (assert-node-repl node name)
580 "Bail out if NODE is not running Guile."
581 (match (node-guile-version node)
582 (#f
4eb0f9ae 583 (report-guile-error name))
aebaee95 584 ((? string? version)
ed7b4437 585 (info (G_ "'~a' is running GNU Guile ~a~%")
aebaee95
LC
586 name (node-guile-version node)))))
587
588(define (assert-node-has-guix node name)
10b2834f
LC
589 "Bail out if NODE if #f or if we fail to use the (guix) module, or if its
590daemon is not running."
591 (unless (inferior? node)
592 (leave (G_ "failed to run 'guix repl' on '~a'~%") name))
593
594 (match (inferior-eval '(begin
595 (use-modules (guix))
596 (and add-text-to-store 'alright))
597 node)
598 ('alright #t)
599 (_ (report-module-error name)))
600
601 (match (inferior-eval '(begin
602 (use-modules (guix))
603 (with-store store
604 (add-text-to-store store "test"
605 "Hello, build machine!")))
606 node)
607 ((? string? str)
608 (info (G_ "Guix is usable on '~a' (test returned ~s)~%")
609 name str))
610 (x
611 (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%")
612 name x))))
aebaee95
LC
613
614(define %random-state
615 (delay
616 (seed->random-state (logxor (getpid) (car (gettimeofday))))))
617
987a29ba
LC
618(define* (nonce #:optional (name (gethostname)))
619 (string-append name "-"
aebaee95
LC
620 (number->string (random 1000000 (force %random-state)))))
621
ed7b4437 622(define (assert-node-can-import session node name daemon-socket)
aebaee95 623 "Bail out if NODE refuses to import our archives."
ed7b4437
LC
624 (with-store store
625 (let* ((item (add-text-to-store store "export-test" (nonce)))
626 (remote (connect-to-remote-daemon session daemon-socket)))
627 (with-store local
628 (send-files local (list item) remote))
629
630 (if (valid-path? remote item)
631 (info (G_ "'~a' successfully imported '~a'~%")
632 name item)
633 (leave (G_ "'~a' was not properly imported on '~a'~%")
634 item name)))))
635
636(define (assert-node-can-export session node name daemon-socket)
aebaee95 637 "Bail out if we cannot import signed archives from NODE."
ed7b4437 638 (let* ((remote (connect-to-remote-daemon session daemon-socket))
987a29ba 639 (item (add-text-to-store remote "import-test" (nonce name))))
aebaee95 640 (with-store store
987a29ba 641 (if (and (retrieve-files store (list item) remote)
aebaee95 642 (valid-path? store item))
69daee23 643 (info (G_ "successfully imported '~a' from '~a'~%")
aebaee95 644 item name)
69daee23 645 (leave (G_ "failed to import '~a' from '~a'~%")
aebaee95
LC
646 item name)))))
647
27991c97
LC
648(define (check-machine-availability machine-file pred)
649 "Check that each machine matching PRED in MACHINE-FILE is usable as a build
650machine."
2b513387
LC
651 (define (build-machine=? m1 m2)
652 (and (string=? (build-machine-name m1) (build-machine-name m2))
653 (= (build-machine-port m1) (build-machine-port m2))))
654
655 ;; A given build machine may appear several times (e.g., once for
656 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
27991c97
LC
657 (let ((machines (filter pred
658 (delete-duplicates (build-machines machine-file)
659 build-machine=?))))
69daee23 660 (info (G_ "testing ~a build machines defined in '~a'...~%")
aebaee95
LC
661 (length machines) machine-file)
662 (let* ((names (map build-machine-name machines))
663 (sockets (map build-machine-daemon-socket machines))
00d73219 664 (sessions (map (cut open-ssh-session <> %short-timeout) machines))
ed7b4437 665 (nodes (map remote-inferior sessions)))
aebaee95 666 (for-each assert-node-has-guix nodes names)
10b2834f 667 (for-each assert-node-repl nodes names)
ed7b4437
LC
668 (for-each assert-node-can-import sessions nodes names sockets)
669 (for-each assert-node-can-export sessions nodes names sockets)
670 (for-each close-inferior nodes)
671 (for-each disconnect! sessions))))
aebaee95 672
dafc3daf
RW
673(define (check-machine-status machine-file pred)
674 "Print the load of each machine matching PRED in MACHINE-FILE."
675 (define (build-machine=? m1 m2)
676 (and (string=? (build-machine-name m1) (build-machine-name m2))
677 (= (build-machine-port m1) (build-machine-port m2))))
678
679 ;; A given build machine may appear several times (e.g., once for
680 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
681 (let ((machines (filter pred
682 (delete-duplicates (build-machines machine-file)
683 build-machine=?))))
684 (info (G_ "getting status of ~a build machines defined in '~a'...~%")
685 (length machines) machine-file)
686 (for-each (lambda (machine)
10b2834f 687 (define session
00d73219 688 (open-ssh-session machine %short-timeout))
10b2834f
LC
689
690 (match (remote-inferior session)
691 (#f
692 (warning (G_ "failed to run 'guix repl' on machine '~a'~%")
693 (build-machine-name machine)))
694 ((? inferior? inferior)
02ec889e
LC
695 (let ((now (car (gettimeofday))))
696 (match (inferior-eval '(list (uname)
697 (car (gettimeofday)))
698 inferior)
699 ((uts time)
700 (when (< time now)
701 ;; Build machine clocks must not be behind as this
702 ;; could cause timestamp issues.
703 (warning (G_ "machine '~a' is ~a seconds behind~%")
704 (build-machine-name machine)
705 (- now time)))
706
707 (let ((load (node-load inferior))
708 (free (node-free-disk-space inferior)))
709 (close-inferior inferior)
710 (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\
711 host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%\
712 time difference: ~a s~%"
713 (build-machine-name machine)
714 (utsname:sysname uts) (utsname:release uts)
715 (utsname:machine uts)
716 (utsname:nodename uts)
717 (normalized-load machine load)
718 (/ free (expt 2 20) 1.)
719 (- time now))))))))
10b2834f
LC
720
721 (disconnect! session))
dafc3daf
RW
722 machines)))
723
aebaee95 724\f
49e6291a
LC
725;;;
726;;; Entry point.
727;;;
728
3794ce93
LC
729(define-command (guix-offload . args)
730 (category plumbing)
731 (synopsis "set up and operate build offloading")
732
49e6291a
LC
733 (define request-line-rx
734 ;; The request format. See 'tryBuildHook' method in build.cc.
735 (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))
736
737 (define not-coma
738 (char-set-complement (char-set #\,)))
739
740 ;; Make sure $HOME really corresponds to the current user. This is
741 ;; necessary since lsh uses that to determine the location of the yarrow
742 ;; seed file, and fails if it's owned by someone else.
743 (and=> (passwd:dir (getpw (getuid)))
744 (cut setenv "HOME" <>))
745
0b724753
LC
746 ;; We rely on protocol-level compression from libssh to optimize large data
747 ;; transfers. Warn if it's missing.
748 (unless (zlib-support?)
69daee23
LC
749 (warning (G_ "Guile-SSH lacks zlib support"))
750 (warning (G_ "data transfers will *not* be compressed!")))
0b724753 751
49e6291a
LC
752 (match args
753 ((system max-silent-time print-build-trace? build-timeout)
754 (let ((max-silent-time (string->number max-silent-time))
755 (build-timeout (string->number build-timeout))
756 (print-build-trace? (string=? print-build-trace? "1")))
8902d0f2 757 (set-thread-name "guix offload")
49e6291a
LC
758 (parameterize ((%current-system system))
759 (let loop ((line (read-line)))
760 (unless (eof-object? line)
761 (cond ((regexp-exec request-line-rx line)
762 =>
763 (lambda (match)
e8a5db80 764 (with-error-handling
49e6291a
LC
765 (process-request (equal? (match:substring match 1) "1")
766 (match:substring match 2) ; system
015f17e8
LC
767 (read-derivation-from-file
768 (match:substring match 3))
49e6291a
LC
769 (string-tokenize
770 (match:substring match 4) not-coma)
771 #:print-build-trace? print-build-trace?
772 #:max-silent-time max-silent-time
773 #:build-timeout build-timeout))))
774 (else
69daee23 775 (leave (G_ "invalid request line: ~s~%") line)))
49e6291a 776 (loop (read-line)))))))
aebaee95
LC
777 (("test" rest ...)
778 (with-error-handling
27991c97
LC
779 (let-values (((file pred)
780 (match rest
781 ((file regexp)
782 (values file
783 (compose (cut string-match regexp <>)
784 build-machine-name)))
785 ((file) (values file (const #t)))
786 (() (values %machine-file (const #t)))
69daee23 787 (x (leave (G_ "wrong number of arguments~%"))))))
27991c97 788 (check-machine-availability (or file %machine-file) pred))))
dafc3daf
RW
789 (("status" rest ...)
790 (with-error-handling
791 (let-values (((file pred)
792 (match rest
793 ((file regexp)
794 (values file
795 (compose (cut string-match regexp <>)
796 build-machine-name)))
797 ((file) (values file (const #t)))
798 (() (values %machine-file (const #t)))
799 (x (leave (G_ "wrong number of arguments~%"))))))
800 (check-machine-status (or file %machine-file) pred))))
49e6291a
LC
801 (("--version")
802 (show-version-and-exit "guix offload"))
803 (("--help")
54a87b2a
MC
804 (format #t (G_ "Usage: guix offload SYSTEM MAX-SILENT-TIME \
805PRINT-BUILD-TRACE? BUILD-TIMEOUT
49e6291a
LC
806Process build offload requests written on the standard input, possibly
807offloading builds to the machines listed in '~a'.~%")
808 %machine-file)
69daee23 809 (display (G_ "
49e6291a
LC
810This tool is meant to be used internally by 'guix-daemon'.\n"))
811 (show-bug-report-information))
812 (x
69daee23 813 (leave (G_ "invalid arguments: ~{~s ~}~%") x))))
49e6291a 814
f326fef8 815;;; Local Variables:
d81195bf 816;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
a708de15 817;;; eval: (put 'with-timeout 'scheme-indent-function 2)
f326fef8
LC
818;;; End:
819
49e6291a 820;;; offload.scm ends here