pack: 'squashfs' backend records the profile's search paths.
[jackhill/guix/guix.git] / guix / scripts / offload.scm
CommitLineData
49e6291a 1;;; GNU Guix --- Functional package management for GNU
f9e8a123 2;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
dafc3daf 3;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
49e6291a
LC
4;;;
5;;; This file is part of GNU Guix.
6;;;
7;;; GNU Guix is free software; you can redistribute it and/or modify it
8;;; under the terms of the GNU General Public License as published by
9;;; the Free Software Foundation; either version 3 of the License, or (at
10;;; your option) any later version.
11;;;
12;;; GNU Guix is distributed in the hope that it will be useful, but
13;;; WITHOUT ANY WARRANTY; without even the implied warranty of
14;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15;;; GNU General Public License for more details.
16;;;
17;;; You should have received a copy of the GNU General Public License
18;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19
20(define-module (guix scripts offload)
21531add
LC
21 #:use-module (ssh key)
22 #:use-module (ssh auth)
23 #:use-module (ssh session)
24 #:use-module (ssh channel)
cf283dd9 25 #:use-module (ssh popen)
0b724753 26 #:use-module (ssh version)
49e6291a
LC
27 #:use-module (guix config)
28 #:use-module (guix records)
987a29ba 29 #:use-module (guix ssh)
49e6291a 30 #:use-module (guix store)
ed7b4437 31 #:use-module (guix inferior)
49e6291a 32 #:use-module (guix derivations)
2535635f
LC
33 #:use-module ((guix serialization)
34 #:select (nar-error? nar-error-file))
49e6291a
LC
35 #:use-module (guix nar)
36 #:use-module (guix utils)
8902d0f2
LC
37 #:use-module ((guix build syscalls)
38 #:select (fcntl-flock set-thread-name))
f326fef8 39 #:use-module ((guix build utils) #:select (which mkdir-p))
49e6291a
LC
40 #:use-module (guix ui)
41 #:use-module (srfi srfi-1)
6c41cce0 42 #:use-module (srfi srfi-11)
49e6291a
LC
43 #:use-module (srfi srfi-26)
44 #:use-module (srfi srfi-34)
45 #:use-module (srfi srfi-35)
46 #:use-module (ice-9 popen)
47 #:use-module (ice-9 rdelim)
48 #:use-module (ice-9 match)
49 #:use-module (ice-9 regex)
50 #:use-module (ice-9 format)
2535635f 51 #:use-module (ice-9 binary-ports)
49e6291a
LC
52 #:export (build-machine
53 build-requirements
54 guix-offload))
55
56;;; Commentary:
57;;;
58;;; Attempt to offload builds to the machines listed in
59;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
60;;; retrieving the build output(s) over SSH upon success.
61;;;
62;;; This command should not be used directly; instead, it is called on-demand
63;;; by the daemon, unless it was started with '--no-build-hook' or a client
64;;; inhibited build hooks.
65;;;
66;;; Code:
67
68
69(define-record-type* <build-machine>
70 build-machine make-build-machine
71 build-machine?
72 (name build-machine-name) ; string
cecd72d5
LC
73 (port build-machine-port ; number
74 (default 22))
49e6291a
LC
75 (system build-machine-system) ; string
76 (user build-machine-user) ; string
77 (private-key build-machine-private-key ; file name
21531add
LC
78 (default (user-openssh-private-key)))
79 (host-key build-machine-host-key) ; string
1d48cf94
LC
80 (compression build-machine-compression ; string
81 (default "zlib@openssh.com,zlib"))
82 (compression-level build-machine-compression-level ;integer
83 (default 3))
cf283dd9
LC
84 (daemon-socket build-machine-daemon-socket ; string
85 (default "/var/guix/daemon-socket/socket"))
49e6291a
LC
86 (parallel-builds build-machine-parallel-builds ; number
87 (default 1))
88 (speed build-machine-speed ; inexact real
89 (default 1.0))
90 (features build-machine-features ; list of strings
91 (default '())))
92
93(define-record-type* <build-requirements>
94 build-requirements make-build-requirements
95 build-requirements?
96 (system build-requirements-system) ; string
97 (features build-requirements-features ; list of strings
98 (default '())))
99
100(define %machine-file
101 ;; File that lists machines available as build slaves.
102 (string-append %config-directory "/machines.scm"))
103
21531add
LC
104(define (user-openssh-private-key)
105 "Return the user's default SSH private key, or #f if it could not be
49e6291a
LC
106determined."
107 (and=> (getenv "HOME")
21531add 108 (cut string-append <> "/.ssh/id_rsa")))
49e6291a
LC
109
110(define %user-module
111 ;; Module in which the machine description file is loaded.
112 (let ((module (make-fresh-user-module)))
113 (module-use! module (resolve-interface '(guix scripts offload)))
114 module))
115
116(define* (build-machines #:optional (file %machine-file))
117 "Read the list of build machines from FILE and return it."
118 (catch #t
119 (lambda ()
120 ;; Avoid ABI incompatibility with the <build-machine> record.
e2721a05 121 ;; (set! %fresh-auto-compile #t)
49e6291a
LC
122
123 (save-module-excursion
124 (lambda ()
125 (set-current-module %user-module)
a9a685cc
LC
126 (match (primitive-load file)
127 (((? build-machine? machines) ...)
128 machines)
129 (_
130 ;; Instead of crashing, assume the empty list.
131 (warning (G_ "'~a' did not return a list of build machines; \
132ignoring it~%")
133 file)
134 '())))))
49e6291a
LC
135 (lambda args
136 (match args
e465d9e1 137 (('system-error . rest)
49e6291a
LC
138 (let ((err (system-error-errno args)))
139 ;; Silently ignore missing file since this is a common case.
140 (if (= ENOENT err)
141 '()
69daee23 142 (leave (G_ "failed to open machine file '~a': ~a~%")
92cb2e28 143 file (strerror err)))))
c1202fb1
LC
144 (('syntax-error proc message properties form . rest)
145 (let ((loc (source-properties->location properties)))
69daee23 146 (leave (G_ "~a: ~a~%")
c1202fb1 147 (location->string loc) message)))
e465d9e1 148 (x
69daee23 149 (leave (G_ "failed to load machine file '~a': ~s~%")
92cb2e28 150 file args))))))
49e6291a 151
21531add
LC
152(define (host-key->type+key host-key)
153 "Destructure HOST-KEY, an OpenSSH host key string, and return two values:
154its key type as a symbol, and the actual base64-encoded string."
155 (define (type->symbol type)
156 (and (string-prefix? "ssh-" type)
157 (string->symbol (string-drop type 4))))
158
159 (match (string-tokenize host-key)
ba97e454 160 ((type key x)
21531add
LC
161 (values (type->symbol type) key))
162 ((type key)
163 (values (type->symbol type) key))))
164
165(define (private-key-from-file* file)
166 "Like 'private-key-from-file', but raise an error that 'with-error-handling'
167can interpret meaningfully."
168 (catch 'guile-ssh-error
169 (lambda ()
170 (private-key-from-file file))
171 (lambda (key proc str . rest)
172 (raise (condition
69daee23 173 (&message (message (format #f (G_ "failed to load SSH \
21531add
LC
174private key from '~a': ~a")
175 file str))))))))
176
177(define (open-ssh-session machine)
178 "Open an SSH session for MACHINE and return it. Throw an error on failure."
179 (let ((private (private-key-from-file* (build-machine-private-key machine)))
180 (public (public-key-from-file
181 (string-append (build-machine-private-key machine)
182 ".pub")))
183 (session (make-session #:user (build-machine-user machine)
184 #:host (build-machine-name machine)
185 #:port (build-machine-port machine)
8d125cfc 186 #:timeout 10 ;seconds
21531add
LC
187 ;; #:log-verbosity 'protocol
188 #:identity (build-machine-private-key machine)
189
bd834577
LC
190 ;; By default libssh reads ~/.ssh/known_hosts
191 ;; and uses that to adjust its choice of cipher
192 ;; suites, which changes the type of host key
193 ;; that the server sends (RSA vs. Ed25519,
194 ;; etc.). Opt for something reproducible and
195 ;; stateless instead.
196 #:knownhosts "/dev/null"
197
21531add
LC
198 ;; We need lightweight compression when
199 ;; exchanging full archives.
1d48cf94
LC
200 #:compression
201 (build-machine-compression machine)
202 #:compression-level
203 (build-machine-compression-level machine))))
74afca5d
LC
204 (match (connect! session)
205 ('ok
206 ;; Authenticate the server. XXX: Guile-SSH 0.10.1 doesn't know about
207 ;; ed25519 keys and 'get-key-type' returns #f in that case.
208 (let-values (((server) (get-server-public-key session))
209 ((type key) (host-key->type+key
210 (build-machine-host-key machine))))
211 (unless (and (or (not (get-key-type server))
212 (eq? (get-key-type server) type))
213 (string=? (public-key->string server) key))
214 ;; Key mismatch: something's wrong. XXX: It could be that the server
215 ;; provided its Ed25519 key when we where expecting its RSA key.
69daee23 216 (leave (G_ "server at '~a' returned host key '~a' of type '~a' \
21531add 217instead of '~a' of type '~a'~%")
74afca5d
LC
218 (build-machine-name machine)
219 (public-key->string server) (get-key-type server)
220 key type)))
221
222 (let ((auth (userauth-public-key! session private)))
223 (unless (eq? 'success auth)
224 (disconnect! session)
69daee23 225 (leave (G_ "SSH public key authentication failed for '~a': ~a~%")
74afca5d
LC
226 (build-machine-name machine) (get-error session))))
227
228 session)
229 (x
230 ;; Connection failed or timeout expired.
69daee23 231 (leave (G_ "failed to connect to '~a': ~a~%")
74afca5d 232 (build-machine-name machine) (get-error session))))))
21531add 233
59f704df
LC
234\f
235;;;
236;;; Synchronization.
237;;;
238
59f704df
LC
239(define (machine-slot-file machine slot)
240 "Return the file name of MACHINE's file for SLOT."
241 ;; For each machine we have a bunch of files representing each build slot.
242 ;; When choosing a build machine, we attempt to get an exclusive lock on one
243 ;; of these; if we fail, that means all the build slots are already taken.
244 ;; Inspired by Nix's build-remote.pl.
245 (string-append (string-append %state-directory "/offload/"
246 (build-machine-name machine)
247 "/" (number->string slot))))
248
249(define (acquire-build-slot machine)
250 "Attempt to acquire a build slot on MACHINE. Return the port representing
251the slot, or #f if none is available.
252
253This mechanism allows us to set a hard limit on the number of simultaneous
254connections allowed to MACHINE."
255 (mkdir-p (dirname (machine-slot-file machine 0)))
0ef595b9
LC
256
257 ;; When several 'guix offload' processes run in parallel, there's a race
258 ;; among them, but since they try the slots in the same order, we're fine.
259 (any (lambda (slot)
260 (let ((port (open-file (machine-slot-file machine slot)
261 "w0")))
262 (catch 'flock-error
263 (lambda ()
264 (fcntl-flock port 'write-lock #:wait? #f)
265 ;; Got it!
266 (format (current-error-port)
267 "process ~a acquired build slot '~a'~%"
268 (getpid) (port-filename port))
269 port)
270 (lambda args
271 ;; PORT is already locked by another process.
272 (close-port port)
273 #f))))
274 (iota (build-machine-parallel-builds machine))))
59f704df
LC
275
276(define (release-build-slot slot)
277 "Release SLOT, a build slot as returned as by 'acquire-build-slot'."
278 (close-port slot))
279
280\f
281;;;
282;;; Offloading.
283;;;
284
d81195bf
LC
285(define (build-log-port)
286 "Return the default port where build logs should be sent. The default is
287file descriptor 4, which is open by the daemon before running the offload
288hook."
289 (let ((port (fdopen 4 "w0")))
290 ;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
291 (set-port-revealed! port 1)
292 port))
293
ed7b4437
LC
294(define (node-guile-version node)
295 (inferior-eval '(version) node))
296
63b0c3ea
LC
297(define (node-free-disk-space node)
298 "Return the free disk space, in bytes, in NODE's store."
ed7b4437
LC
299 (inferior-eval `(begin
300 (use-modules (guix build syscalls))
301 (free-disk-space ,(%store-prefix)))
302 node))
63b0c3ea 303
88da0b68
LC
304(define* (transfer-and-offload drv machine
305 #:key
306 (inputs '())
307 (outputs '())
308 (max-silent-time 3600)
714084e6 309 build-timeout
88da0b68
LC
310 print-build-trace?)
311 "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of
312INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
313MACHINE."
9e76eed3
LC
314 (define session
315 (open-ssh-session machine))
316
cf283dd9
LC
317 (define store
318 (connect-to-remote-daemon session
319 (build-machine-daemon-socket machine)))
320
321 (set-build-options store
322 #:print-build-trace print-build-trace?
323 #:max-silent-time max-silent-time
324 #:timeout build-timeout)
325
326 ;; Protect DRV from garbage collection.
327 (add-temp-root store (derivation-file-name drv))
328
987a29ba
LC
329 (with-store local
330 (send-files local (cons (derivation-file-name drv) inputs) store
331 #:log-port (current-output-port)))
cf283dd9
LC
332 (format (current-error-port) "offloading '~a' to '~a'...~%"
333 (derivation-file-name drv) (build-machine-name machine))
334 (format (current-error-port) "@ build-remote ~a ~a~%"
335 (derivation-file-name drv) (build-machine-name machine))
336
f9e8a123 337 (guard (c ((store-protocol-error? c)
cf283dd9 338 (format (current-error-port)
69daee23 339 (G_ "derivation '~a' offloaded to '~a' failed: ~a~%")
cf283dd9
LC
340 (derivation-file-name drv)
341 (build-machine-name machine)
f9e8a123 342 (store-protocol-error-message c))
ed7b4437
LC
343 (let* ((inferior (false-if-exception (remote-inferior session)))
344 (space (false-if-exception
345 (node-free-disk-space inferior))))
346
347 (when inferior
348 (close-inferior inferior))
b96e05ae
LC
349
350 ;; Use exit code 100 for a permanent build failure. The daemon
351 ;; interprets other non-zero codes as transient build failures.
352 (if (and space (< space (* 10 (expt 2 20))))
353 (begin
354 (format (current-error-port)
355 (G_ "build failure may have been caused by lack \
356of free disk space on '~a'~%")
357 (build-machine-name machine))
358 (primitive-exit 1))
359 (primitive-exit 100)))))
0237d797
LC
360 (parameterize ((current-build-output-port (build-log-port)))
361 (build-derivations store (list drv))))
cf283dd9 362
d06d54e3
LC
363 (retrieve-files* outputs store
364
365 ;; We cannot use the 'import-paths' RPC here because we
366 ;; already hold the locks for FILES.
367 #:import
368 (lambda (port)
369 (restore-file-set port
370 #:log-port (current-error-port)
371 #:lock? #f)))
372
cf283dd9
LC
373 (format (current-error-port) "done with offloaded '~a'~%"
374 (derivation-file-name drv)))
375
59f704df
LC
376\f
377;;;
378;;; Scheduling.
379;;;
380
49e6291a
LC
381(define (machine-matches? machine requirements)
382 "Return #t if MACHINE matches REQUIREMENTS."
383 (and (string=? (build-requirements-system requirements)
384 (build-machine-system machine))
385 (lset<= string=?
386 (build-requirements-features requirements)
387 (build-machine-features machine))))
388
63b0c3ea
LC
389(define %minimum-disk-space
390 ;; Minimum disk space required on the build machine for a build to be
391 ;; offloaded. This keeps us from offloading to machines that are bound to
392 ;; run out of disk space.
393 (* 100 (expt 2 20))) ;100 MiB
394
bbe66a53
LC
395(define (node-load node)
396 "Return the load on NODE. Return +∞ if NODE is misbehaving."
ed7b4437
LC
397 (let ((line (inferior-eval '(begin
398 (use-modules (ice-9 rdelim))
399 (call-with-input-file "/proc/loadavg"
400 read-string))
401 node)))
bbe66a53
LC
402 (if (eof-object? line)
403 +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
404 (match (string-tokenize line)
405 ((one five fifteen . x)
406 (string->number one))
407 (x
408 +inf.0)))))
409
410(define (normalized-load machine load)
411 "Divide LOAD by the number of parallel builds of MACHINE."
412 (if (rational? load)
413 (let* ((jobs (build-machine-parallel-builds machine))
414 (normalized (/ load jobs)))
415 (format (current-error-port) "load on machine '~a' is ~s\
165f4b2a 416 (normalized: ~s)~%"
bbe66a53
LC
417 (build-machine-name machine) load normalized)
418 normalized)
419 load))
165f4b2a 420
d8e89b1c
LC
421(define (random-seed)
422 (logxor (getpid) (car (gettimeofday))))
423
424(define shuffle
425 (let ((state (seed->random-state (random-seed))))
426 (lambda (lst)
427 "Return LST shuffled (using the Fisher-Yates algorithm.)"
428 (define vec (list->vector lst))
429 (let loop ((result '())
430 (i (vector-length vec)))
431 (if (zero? i)
432 result
433 (let* ((j (random i state))
434 (val (vector-ref vec j)))
435 (vector-set! vec j (vector-ref vec (- i 1)))
436 (loop (cons val result) (- i 1))))))))
437
88da0b68 438(define (choose-build-machine machines)
84620dd0
LC
439 "Return two values: the best machine among MACHINES and its build
440slot (which must later be released with 'release-build-slot'), or #f and #f."
d652b851
LC
441
442 ;; Proceed like this:
7f4d102c 443 ;; 1. For all MACHINES, attempt to acquire a build slot, and filter out
d652b851 444 ;; those machines for which we failed.
7f4d102c
LC
445 ;; 2. Choose the best machine among those that are left.
446 ;; 3. Release the previously-acquired build slots of the other machines.
447
448 (define machines+slots
449 (filter-map (lambda (machine)
450 (let ((slot (acquire-build-slot machine)))
451 (and slot (list machine slot))))
452 (shuffle machines)))
453
454 (define (undecorate pred)
455 (lambda (a b)
456 (match a
457 ((machine1 slot1)
458 (match b
459 ((machine2 slot2)
460 (pred machine1 machine2)))))))
461
462 (define (machine-faster? m1 m2)
463 ;; Return #t if M1 is faster than M2.
464 (> (build-machine-speed m1)
465 (build-machine-speed m2)))
466
467 (let loop ((machines+slots
468 (sort machines+slots (undecorate machine-faster?))))
469 (match machines+slots
470 (((best slot) others ...)
471 ;; Return the best machine unless it's already overloaded.
472 ;; Note: We call 'node-load' only as a last resort because it is
473 ;; too costly to call it once for every machine.
474 (let* ((session (false-if-exception (open-ssh-session best)))
475 (node (and session (remote-inferior session)))
476 (load (and node (normalized-load best (node-load node))))
477 (space (and node (node-free-disk-space node))))
478 (when node (close-inferior node))
479 (when session (disconnect! session))
480 (if (and node (< load 2.) (>= space %minimum-disk-space))
481 (match others
482 (((machines slots) ...)
483 ;; Release slots from the uninteresting machines.
484 (for-each release-build-slot slots)
485
486 ;; The caller must keep SLOT to protect it from GC and to
487 ;; eventually release it.
488 (values best slot)))
489 (begin
490 ;; BEST is unsuitable, so try the next one.
491 (when (and space (< space %minimum-disk-space))
492 (format (current-error-port)
493 "skipping machine '~a' because it is low \
63b0c3ea 494on disk space (~,2f MiB free)~%"
7f4d102c
LC
495 (build-machine-name best)
496 (/ space (expt 2 20) 1.)))
497 (release-build-slot slot)
498 (loop others)))))
499 (()
500 (values #f #f)))))
49e6291a 501
a708de15
LC
502(define (call-with-timeout timeout drv thunk)
503 "Call THUNK and leave after TIMEOUT seconds. If TIMEOUT is #f, simply call
504THUNK. Use DRV as an indication of what we were building when the timeout
505expired."
506 (if (number? timeout)
507 (dynamic-wind
508 (lambda ()
509 (sigaction SIGALRM
510 (lambda _
511 ;; The exit code here will be 1, which guix-daemon will
512 ;; interpret as a transient failure.
513 (leave (G_ "timeout expired while offloading '~a'~%")
514 (derivation-file-name drv))))
515 (alarm timeout))
516 thunk
517 (lambda ()
518 (alarm 0)))
519 (thunk)))
520
521(define-syntax-rule (with-timeout timeout drv exp ...)
522 "Evaluate EXP... and leave after TIMEOUT seconds if EXP hasn't completed.
523If TIMEOUT is #f, simply evaluate EXP..."
524 (call-with-timeout timeout drv (lambda () exp ...)))
525
49e6291a
LC
526(define* (process-request wants-local? system drv features
527 #:key
528 print-build-trace? (max-silent-time 3600)
714084e6 529 build-timeout)
49e6291a 530 "Process a request to build DRV."
88da0b68
LC
531 (let* ((local? (and wants-local? (string=? system (%current-system))))
532 (reqs (build-requirements
533 (system system)
534 (features features)))
535 (candidates (filter (cut machine-matches? <> reqs)
536 (build-machines))))
537 (match candidates
538 (()
539 ;; We'll never be able to match REQS.
540 (display "# decline\n"))
ba97e454 541 ((x ...)
84620dd0
LC
542 (let-values (((machine slot)
543 (choose-build-machine candidates)))
88da0b68 544 (if machine
84620dd0
LC
545 (dynamic-wind
546 (const #f)
547 (lambda ()
548 ;; Offload DRV to MACHINE.
549 (display "# accept\n")
550 (let ((inputs (string-tokenize (read-line)))
551 (outputs (string-tokenize (read-line))))
a708de15
LC
552 ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can
553 ;; be issues with the connection or deadlocks that could
554 ;; lead the 'guix offload' process to remain stuck forever.
555 ;; To avoid that, install a timeout here as well.
556 (with-timeout build-timeout drv
557 (transfer-and-offload drv machine
558 #:inputs inputs
559 #:outputs outputs
560 #:max-silent-time max-silent-time
561 #:build-timeout build-timeout
562 #:print-build-trace?
563 print-build-trace?))))
84620dd0
LC
564 (lambda ()
565 (release-build-slot slot)))
88da0b68
LC
566
567 ;; Not now, all the machines are busy.
568 (display "# postpone\n")))))))
49e6291a 569
49e6291a 570\f
aebaee95
LC
571;;;
572;;; Installation tests.
573;;;
574
575(define (assert-node-repl node name)
576 "Bail out if NODE is not running Guile."
577 (match (node-guile-version node)
578 (#f
4eb0f9ae 579 (report-guile-error name))
aebaee95 580 ((? string? version)
ed7b4437 581 (info (G_ "'~a' is running GNU Guile ~a~%")
aebaee95
LC
582 name (node-guile-version node)))))
583
584(define (assert-node-has-guix node name)
10b2834f
LC
585 "Bail out if NODE if #f or if we fail to use the (guix) module, or if its
586daemon is not running."
587 (unless (inferior? node)
588 (leave (G_ "failed to run 'guix repl' on '~a'~%") name))
589
590 (match (inferior-eval '(begin
591 (use-modules (guix))
592 (and add-text-to-store 'alright))
593 node)
594 ('alright #t)
595 (_ (report-module-error name)))
596
597 (match (inferior-eval '(begin
598 (use-modules (guix))
599 (with-store store
600 (add-text-to-store store "test"
601 "Hello, build machine!")))
602 node)
603 ((? string? str)
604 (info (G_ "Guix is usable on '~a' (test returned ~s)~%")
605 name str))
606 (x
607 (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%")
608 name x))))
aebaee95
LC
609
610(define %random-state
611 (delay
612 (seed->random-state (logxor (getpid) (car (gettimeofday))))))
613
987a29ba
LC
614(define* (nonce #:optional (name (gethostname)))
615 (string-append name "-"
aebaee95
LC
616 (number->string (random 1000000 (force %random-state)))))
617
ed7b4437 618(define (assert-node-can-import session node name daemon-socket)
aebaee95 619 "Bail out if NODE refuses to import our archives."
ed7b4437
LC
620 (with-store store
621 (let* ((item (add-text-to-store store "export-test" (nonce)))
622 (remote (connect-to-remote-daemon session daemon-socket)))
623 (with-store local
624 (send-files local (list item) remote))
625
626 (if (valid-path? remote item)
627 (info (G_ "'~a' successfully imported '~a'~%")
628 name item)
629 (leave (G_ "'~a' was not properly imported on '~a'~%")
630 item name)))))
631
632(define (assert-node-can-export session node name daemon-socket)
aebaee95 633 "Bail out if we cannot import signed archives from NODE."
ed7b4437 634 (let* ((remote (connect-to-remote-daemon session daemon-socket))
987a29ba 635 (item (add-text-to-store remote "import-test" (nonce name))))
aebaee95 636 (with-store store
987a29ba 637 (if (and (retrieve-files store (list item) remote)
aebaee95 638 (valid-path? store item))
69daee23 639 (info (G_ "successfully imported '~a' from '~a'~%")
aebaee95 640 item name)
69daee23 641 (leave (G_ "failed to import '~a' from '~a'~%")
aebaee95
LC
642 item name)))))
643
27991c97
LC
644(define (check-machine-availability machine-file pred)
645 "Check that each machine matching PRED in MACHINE-FILE is usable as a build
646machine."
2b513387
LC
647 (define (build-machine=? m1 m2)
648 (and (string=? (build-machine-name m1) (build-machine-name m2))
649 (= (build-machine-port m1) (build-machine-port m2))))
650
651 ;; A given build machine may appear several times (e.g., once for
652 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
27991c97
LC
653 (let ((machines (filter pred
654 (delete-duplicates (build-machines machine-file)
655 build-machine=?))))
69daee23 656 (info (G_ "testing ~a build machines defined in '~a'...~%")
aebaee95
LC
657 (length machines) machine-file)
658 (let* ((names (map build-machine-name machines))
659 (sockets (map build-machine-daemon-socket machines))
660 (sessions (map open-ssh-session machines))
ed7b4437 661 (nodes (map remote-inferior sessions)))
aebaee95 662 (for-each assert-node-has-guix nodes names)
10b2834f 663 (for-each assert-node-repl nodes names)
ed7b4437
LC
664 (for-each assert-node-can-import sessions nodes names sockets)
665 (for-each assert-node-can-export sessions nodes names sockets)
666 (for-each close-inferior nodes)
667 (for-each disconnect! sessions))))
aebaee95 668
dafc3daf
RW
669(define (check-machine-status machine-file pred)
670 "Print the load of each machine matching PRED in MACHINE-FILE."
671 (define (build-machine=? m1 m2)
672 (and (string=? (build-machine-name m1) (build-machine-name m2))
673 (= (build-machine-port m1) (build-machine-port m2))))
674
675 ;; A given build machine may appear several times (e.g., once for
676 ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
677 (let ((machines (filter pred
678 (delete-duplicates (build-machines machine-file)
679 build-machine=?))))
680 (info (G_ "getting status of ~a build machines defined in '~a'...~%")
681 (length machines) machine-file)
682 (for-each (lambda (machine)
10b2834f
LC
683 (define session
684 (open-ssh-session machine))
685
686 (match (remote-inferior session)
687 (#f
688 (warning (G_ "failed to run 'guix repl' on machine '~a'~%")
689 (build-machine-name machine)))
690 ((? inferior? inferior)
02ec889e
LC
691 (let ((now (car (gettimeofday))))
692 (match (inferior-eval '(list (uname)
693 (car (gettimeofday)))
694 inferior)
695 ((uts time)
696 (when (< time now)
697 ;; Build machine clocks must not be behind as this
698 ;; could cause timestamp issues.
699 (warning (G_ "machine '~a' is ~a seconds behind~%")
700 (build-machine-name machine)
701 (- now time)))
702
703 (let ((load (node-load inferior))
704 (free (node-free-disk-space inferior)))
705 (close-inferior inferior)
706 (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\
707 host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%\
708 time difference: ~a s~%"
709 (build-machine-name machine)
710 (utsname:sysname uts) (utsname:release uts)
711 (utsname:machine uts)
712 (utsname:nodename uts)
713 (normalized-load machine load)
714 (/ free (expt 2 20) 1.)
715 (- time now))))))))
10b2834f
LC
716
717 (disconnect! session))
dafc3daf
RW
718 machines)))
719
aebaee95 720\f
49e6291a
LC
721;;;
722;;; Entry point.
723;;;
724
725(define (guix-offload . args)
726 (define request-line-rx
727 ;; The request format. See 'tryBuildHook' method in build.cc.
728 (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))
729
730 (define not-coma
731 (char-set-complement (char-set #\,)))
732
733 ;; Make sure $HOME really corresponds to the current user. This is
734 ;; necessary since lsh uses that to determine the location of the yarrow
735 ;; seed file, and fails if it's owned by someone else.
736 (and=> (passwd:dir (getpw (getuid)))
737 (cut setenv "HOME" <>))
738
0b724753
LC
739 ;; We rely on protocol-level compression from libssh to optimize large data
740 ;; transfers. Warn if it's missing.
741 (unless (zlib-support?)
69daee23
LC
742 (warning (G_ "Guile-SSH lacks zlib support"))
743 (warning (G_ "data transfers will *not* be compressed!")))
0b724753 744
49e6291a
LC
745 (match args
746 ((system max-silent-time print-build-trace? build-timeout)
747 (let ((max-silent-time (string->number max-silent-time))
748 (build-timeout (string->number build-timeout))
749 (print-build-trace? (string=? print-build-trace? "1")))
8902d0f2 750 (set-thread-name "guix offload")
49e6291a
LC
751 (parameterize ((%current-system system))
752 (let loop ((line (read-line)))
753 (unless (eof-object? line)
754 (cond ((regexp-exec request-line-rx line)
755 =>
756 (lambda (match)
e8a5db80 757 (with-error-handling
49e6291a
LC
758 (process-request (equal? (match:substring match 1) "1")
759 (match:substring match 2) ; system
015f17e8
LC
760 (read-derivation-from-file
761 (match:substring match 3))
49e6291a
LC
762 (string-tokenize
763 (match:substring match 4) not-coma)
764 #:print-build-trace? print-build-trace?
765 #:max-silent-time max-silent-time
766 #:build-timeout build-timeout))))
767 (else
69daee23 768 (leave (G_ "invalid request line: ~s~%") line)))
49e6291a 769 (loop (read-line)))))))
aebaee95
LC
770 (("test" rest ...)
771 (with-error-handling
27991c97
LC
772 (let-values (((file pred)
773 (match rest
774 ((file regexp)
775 (values file
776 (compose (cut string-match regexp <>)
777 build-machine-name)))
778 ((file) (values file (const #t)))
779 (() (values %machine-file (const #t)))
69daee23 780 (x (leave (G_ "wrong number of arguments~%"))))))
27991c97 781 (check-machine-availability (or file %machine-file) pred))))
dafc3daf
RW
782 (("status" rest ...)
783 (with-error-handling
784 (let-values (((file pred)
785 (match rest
786 ((file regexp)
787 (values file
788 (compose (cut string-match regexp <>)
789 build-machine-name)))
790 ((file) (values file (const #t)))
791 (() (values %machine-file (const #t)))
792 (x (leave (G_ "wrong number of arguments~%"))))))
793 (check-machine-status (or file %machine-file) pred))))
49e6291a
LC
794 (("--version")
795 (show-version-and-exit "guix offload"))
796 (("--help")
69daee23 797 (format #t (G_ "Usage: guix offload SYSTEM PRINT-BUILD-TRACE
49e6291a
LC
798Process build offload requests written on the standard input, possibly
799offloading builds to the machines listed in '~a'.~%")
800 %machine-file)
69daee23 801 (display (G_ "
49e6291a
LC
802This tool is meant to be used internally by 'guix-daemon'.\n"))
803 (show-bug-report-information))
804 (x
69daee23 805 (leave (G_ "invalid arguments: ~{~s ~}~%") x))))
49e6291a 806
f326fef8 807;;; Local Variables:
d81195bf 808;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
a708de15 809;;; eval: (put 'with-timeout 'scheme-indent-function 2)
f326fef8
LC
810;;; End:
811
49e6291a 812;;; offload.scm ends here