Commit | Line | Data |
---|---|---|
49e6291a | 1 | ;;; GNU Guix --- Functional package management for GNU |
d51bfe24 | 2 | ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org> |
dafc3daf | 3 | ;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net> |
4b5a6fbc | 4 | ;;; Copyright © 2020 Maxim Cournoyer <maxim.cournoyer@gmail.com> |
49e6291a LC |
5 | ;;; |
6 | ;;; This file is part of GNU Guix. | |
7 | ;;; | |
8 | ;;; GNU Guix is free software; you can redistribute it and/or modify it | |
9 | ;;; under the terms of the GNU General Public License as published by | |
10 | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
11 | ;;; your option) any later version. | |
12 | ;;; | |
13 | ;;; GNU Guix is distributed in the hope that it will be useful, but | |
14 | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | ;;; GNU General Public License for more details. | |
17 | ;;; | |
18 | ;;; You should have received a copy of the GNU General Public License | |
19 | ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. | |
20 | ||
21 | (define-module (guix scripts offload) | |
21531add LC |
22 | #:use-module (ssh key) |
23 | #:use-module (ssh auth) | |
24 | #:use-module (ssh session) | |
25 | #:use-module (ssh channel) | |
cf283dd9 | 26 | #:use-module (ssh popen) |
0b724753 | 27 | #:use-module (ssh version) |
49e6291a LC |
28 | #:use-module (guix config) |
29 | #:use-module (guix records) | |
987a29ba | 30 | #:use-module (guix ssh) |
49e6291a | 31 | #:use-module (guix store) |
ed7b4437 | 32 | #:use-module (guix inferior) |
49e6291a | 33 | #:use-module (guix derivations) |
2535635f LC |
34 | #:use-module ((guix serialization) |
35 | #:select (nar-error? nar-error-file)) | |
49e6291a | 36 | #:use-module (guix nar) |
d51bfe24 | 37 | #:use-module ((guix utils) #:select (%current-system)) |
8902d0f2 LC |
38 | #:use-module ((guix build syscalls) |
39 | #:select (fcntl-flock set-thread-name)) | |
f326fef8 | 40 | #:use-module ((guix build utils) #:select (which mkdir-p)) |
49e6291a | 41 | #:use-module (guix ui) |
3794ce93 | 42 | #:use-module (guix scripts) |
d51bfe24 | 43 | #:use-module (guix diagnostics) |
49e6291a | 44 | #:use-module (srfi srfi-1) |
6c41cce0 | 45 | #:use-module (srfi srfi-11) |
49e6291a LC |
46 | #:use-module (srfi srfi-26) |
47 | #:use-module (srfi srfi-34) | |
48 | #:use-module (srfi srfi-35) | |
49 | #:use-module (ice-9 popen) | |
50 | #:use-module (ice-9 rdelim) | |
51 | #:use-module (ice-9 match) | |
52 | #:use-module (ice-9 regex) | |
53 | #:use-module (ice-9 format) | |
2535635f | 54 | #:use-module (ice-9 binary-ports) |
49e6291a LC |
55 | #:export (build-machine |
56 | build-requirements | |
57 | guix-offload)) | |
58 | ||
59 | ;;; Commentary: | |
60 | ;;; | |
61 | ;;; Attempt to offload builds to the machines listed in | |
62 | ;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and | |
63 | ;;; retrieving the build output(s) over SSH upon success. | |
64 | ;;; | |
65 | ;;; This command should not be used directly; instead, it is called on-demand | |
dc209d5a | 66 | ;;; by the daemon, unless it was started with '--no-offload' or a client |
49e6291a LC |
67 | ;;; inhibited build hooks. |
68 | ;;; | |
69 | ;;; Code: | |
70 | ||
49e6291a LC |
71 | (define-record-type* <build-machine> |
72 | build-machine make-build-machine | |
73 | build-machine? | |
74 | (name build-machine-name) ; string | |
cecd72d5 LC |
75 | (port build-machine-port ; number |
76 | (default 22)) | |
4b5a6fbc MC |
77 | (systems %build-machine-systems ; list of strings |
78 | (default #f)) ; drop default after system is removed | |
79 | (system %build-machine-system ; deprecated | |
80 | (default #f)) | |
49e6291a LC |
81 | (user build-machine-user) ; string |
82 | (private-key build-machine-private-key ; file name | |
21531add LC |
83 | (default (user-openssh-private-key))) |
84 | (host-key build-machine-host-key) ; string | |
1d48cf94 LC |
85 | (compression build-machine-compression ; string |
86 | (default "zlib@openssh.com,zlib")) | |
87 | (compression-level build-machine-compression-level ;integer | |
88 | (default 3)) | |
cf283dd9 LC |
89 | (daemon-socket build-machine-daemon-socket ; string |
90 | (default "/var/guix/daemon-socket/socket")) | |
49e6291a LC |
91 | (parallel-builds build-machine-parallel-builds ; number |
92 | (default 1)) | |
93 | (speed build-machine-speed ; inexact real | |
94 | (default 1.0)) | |
95 | (features build-machine-features ; list of strings | |
96 | (default '()))) | |
97 | ||
4b5a6fbc MC |
98 | ;;; Deprecated. |
99 | (define (build-machine-system machine) | |
100 | (warning (G_ "The 'system' field is deprecated, \ | |
101 | please use 'systems' instead.~%")) | |
102 | (%build-machine-system machine)) | |
103 | ||
104 | ;;; TODO: Remove after the deprecated 'system' field is removed. | |
105 | (define (build-machine-systems machine) | |
106 | (or (%build-machine-systems machine) | |
107 | (list (build-machine-system machine)) | |
108 | (leave (G_ "The build-machine object lacks a value for its 'systems' | |
109 | field.")))) | |
110 | ||
49e6291a LC |
111 | (define-record-type* <build-requirements> |
112 | build-requirements make-build-requirements | |
113 | build-requirements? | |
114 | (system build-requirements-system) ; string | |
115 | (features build-requirements-features ; list of strings | |
116 | (default '()))) | |
117 | ||
118 | (define %machine-file | |
119 | ;; File that lists machines available as build slaves. | |
120 | (string-append %config-directory "/machines.scm")) | |
121 | ||
21531add LC |
122 | (define (user-openssh-private-key) |
123 | "Return the user's default SSH private key, or #f if it could not be | |
49e6291a LC |
124 | determined." |
125 | (and=> (getenv "HOME") | |
21531add | 126 | (cut string-append <> "/.ssh/id_rsa"))) |
49e6291a LC |
127 | |
128 | (define %user-module | |
129 | ;; Module in which the machine description file is loaded. | |
130 | (let ((module (make-fresh-user-module))) | |
131 | (module-use! module (resolve-interface '(guix scripts offload))) | |
132 | module)) | |
133 | ||
134 | (define* (build-machines #:optional (file %machine-file)) | |
135 | "Read the list of build machines from FILE and return it." | |
136 | (catch #t | |
137 | (lambda () | |
138 | ;; Avoid ABI incompatibility with the <build-machine> record. | |
e2721a05 | 139 | ;; (set! %fresh-auto-compile #t) |
49e6291a LC |
140 | |
141 | (save-module-excursion | |
142 | (lambda () | |
143 | (set-current-module %user-module) | |
a9a685cc LC |
144 | (match (primitive-load file) |
145 | (((? build-machine? machines) ...) | |
146 | machines) | |
147 | (_ | |
148 | ;; Instead of crashing, assume the empty list. | |
149 | (warning (G_ "'~a' did not return a list of build machines; \ | |
150 | ignoring it~%") | |
151 | file) | |
152 | '()))))) | |
49e6291a LC |
153 | (lambda args |
154 | (match args | |
e465d9e1 | 155 | (('system-error . rest) |
49e6291a LC |
156 | (let ((err (system-error-errno args))) |
157 | ;; Silently ignore missing file since this is a common case. | |
158 | (if (= ENOENT err) | |
159 | '() | |
69daee23 | 160 | (leave (G_ "failed to open machine file '~a': ~a~%") |
92cb2e28 | 161 | file (strerror err))))) |
c1202fb1 LC |
162 | (('syntax-error proc message properties form . rest) |
163 | (let ((loc (source-properties->location properties))) | |
69daee23 | 164 | (leave (G_ "~a: ~a~%") |
c1202fb1 | 165 | (location->string loc) message))) |
e465d9e1 | 166 | (x |
69daee23 | 167 | (leave (G_ "failed to load machine file '~a': ~s~%") |
92cb2e28 | 168 | file args)))))) |
49e6291a | 169 | |
21531add LC |
170 | (define (private-key-from-file* file) |
171 | "Like 'private-key-from-file', but raise an error that 'with-error-handling' | |
172 | can interpret meaningfully." | |
173 | (catch 'guile-ssh-error | |
174 | (lambda () | |
175 | (private-key-from-file file)) | |
176 | (lambda (key proc str . rest) | |
d51bfe24 | 177 | (raise (formatted-message (G_ "failed to load SSH \ |
21531add | 178 | private key from '~a': ~a") |
d51bfe24 | 179 | file str))))) |
21531add | 180 | |
00d73219 | 181 | (define* (open-ssh-session machine #:optional (max-silent-time -1)) |
21531add LC |
182 | "Open an SSH session for MACHINE and return it. Throw an error on failure." |
183 | (let ((private (private-key-from-file* (build-machine-private-key machine))) | |
184 | (public (public-key-from-file | |
185 | (string-append (build-machine-private-key machine) | |
186 | ".pub"))) | |
187 | (session (make-session #:user (build-machine-user machine) | |
188 | #:host (build-machine-name machine) | |
189 | #:port (build-machine-port machine) | |
00d73219 | 190 | #:timeout 10 ;initial timeout (seconds) |
21531add LC |
191 | ;; #:log-verbosity 'protocol |
192 | #:identity (build-machine-private-key machine) | |
193 | ||
bd834577 LC |
194 | ;; By default libssh reads ~/.ssh/known_hosts |
195 | ;; and uses that to adjust its choice of cipher | |
196 | ;; suites, which changes the type of host key | |
197 | ;; that the server sends (RSA vs. Ed25519, | |
198 | ;; etc.). Opt for something reproducible and | |
199 | ;; stateless instead. | |
200 | #:knownhosts "/dev/null" | |
201 | ||
21531add LC |
202 | ;; We need lightweight compression when |
203 | ;; exchanging full archives. | |
1d48cf94 LC |
204 | #:compression |
205 | (build-machine-compression machine) | |
206 | #:compression-level | |
207 | (build-machine-compression-level machine)))) | |
74afca5d LC |
208 | (match (connect! session) |
209 | ('ok | |
114dcb42 LC |
210 | ;; Make sure the server's key is what we expect. |
211 | (authenticate-server* session (build-machine-host-key machine)) | |
74afca5d LC |
212 | |
213 | (let ((auth (userauth-public-key! session private))) | |
214 | (unless (eq? 'success auth) | |
215 | (disconnect! session) | |
69daee23 | 216 | (leave (G_ "SSH public key authentication failed for '~a': ~a~%") |
74afca5d LC |
217 | (build-machine-name machine) (get-error session)))) |
218 | ||
00d73219 LC |
219 | ;; From then on use MAX-SILENT-TIME as the absolute timeout when |
220 | ;; reading from or write to a channel for this session. | |
221 | (session-set! session 'timeout max-silent-time) | |
222 | ||
74afca5d LC |
223 | session) |
224 | (x | |
225 | ;; Connection failed or timeout expired. | |
69daee23 | 226 | (leave (G_ "failed to connect to '~a': ~a~%") |
74afca5d | 227 | (build-machine-name machine) (get-error session)))))) |
21531add | 228 | |
59f704df LC |
229 | \f |
230 | ;;; | |
231 | ;;; Synchronization. | |
232 | ;;; | |
233 | ||
59f704df LC |
234 | (define (machine-slot-file machine slot) |
235 | "Return the file name of MACHINE's file for SLOT." | |
236 | ;; For each machine we have a bunch of files representing each build slot. | |
237 | ;; When choosing a build machine, we attempt to get an exclusive lock on one | |
238 | ;; of these; if we fail, that means all the build slots are already taken. | |
239 | ;; Inspired by Nix's build-remote.pl. | |
240 | (string-append (string-append %state-directory "/offload/" | |
a3af06ad LC |
241 | (build-machine-name machine) ":" |
242 | (number->string (build-machine-port machine)) | |
59f704df LC |
243 | "/" (number->string slot)))) |
244 | ||
245 | (define (acquire-build-slot machine) | |
246 | "Attempt to acquire a build slot on MACHINE. Return the port representing | |
247 | the slot, or #f if none is available. | |
248 | ||
249 | This mechanism allows us to set a hard limit on the number of simultaneous | |
250 | connections allowed to MACHINE." | |
251 | (mkdir-p (dirname (machine-slot-file machine 0))) | |
0ef595b9 LC |
252 | |
253 | ;; When several 'guix offload' processes run in parallel, there's a race | |
254 | ;; among them, but since they try the slots in the same order, we're fine. | |
255 | (any (lambda (slot) | |
256 | (let ((port (open-file (machine-slot-file machine slot) | |
257 | "w0"))) | |
258 | (catch 'flock-error | |
259 | (lambda () | |
260 | (fcntl-flock port 'write-lock #:wait? #f) | |
261 | ;; Got it! | |
262 | (format (current-error-port) | |
263 | "process ~a acquired build slot '~a'~%" | |
264 | (getpid) (port-filename port)) | |
265 | port) | |
266 | (lambda args | |
267 | ;; PORT is already locked by another process. | |
268 | (close-port port) | |
269 | #f)))) | |
270 | (iota (build-machine-parallel-builds machine)))) | |
59f704df LC |
271 | |
272 | (define (release-build-slot slot) | |
273 | "Release SLOT, a build slot as returned as by 'acquire-build-slot'." | |
274 | (close-port slot)) | |
275 | ||
276 | \f | |
277 | ;;; | |
278 | ;;; Offloading. | |
279 | ;;; | |
280 | ||
d81195bf LC |
281 | (define (build-log-port) |
282 | "Return the default port where build logs should be sent. The default is | |
283 | file descriptor 4, which is open by the daemon before running the offload | |
284 | hook." | |
285 | (let ((port (fdopen 4 "w0"))) | |
286 | ;; Make sure file descriptor 4 isn't closed when PORT is GC'd. | |
287 | (set-port-revealed! port 1) | |
288 | port)) | |
289 | ||
ed7b4437 LC |
290 | (define (node-guile-version node) |
291 | (inferior-eval '(version) node)) | |
292 | ||
63b0c3ea LC |
293 | (define (node-free-disk-space node) |
294 | "Return the free disk space, in bytes, in NODE's store." | |
ed7b4437 LC |
295 | (inferior-eval `(begin |
296 | (use-modules (guix build syscalls)) | |
297 | (free-disk-space ,(%store-prefix))) | |
298 | node)) | |
63b0c3ea | 299 | |
88da0b68 LC |
300 | (define* (transfer-and-offload drv machine |
301 | #:key | |
302 | (inputs '()) | |
303 | (outputs '()) | |
304 | (max-silent-time 3600) | |
714084e6 | 305 | build-timeout |
88da0b68 LC |
306 | print-build-trace?) |
307 | "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of | |
308 | INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from | |
309 | MACHINE." | |
9e76eed3 | 310 | (define session |
00d73219 | 311 | (open-ssh-session machine max-silent-time)) |
9e76eed3 | 312 | |
cf283dd9 LC |
313 | (define store |
314 | (connect-to-remote-daemon session | |
315 | (build-machine-daemon-socket machine))) | |
316 | ||
317 | (set-build-options store | |
318 | #:print-build-trace print-build-trace? | |
319 | #:max-silent-time max-silent-time | |
320 | #:timeout build-timeout) | |
321 | ||
322 | ;; Protect DRV from garbage collection. | |
323 | (add-temp-root store (derivation-file-name drv)) | |
324 | ||
987a29ba LC |
325 | (with-store local |
326 | (send-files local (cons (derivation-file-name drv) inputs) store | |
327 | #:log-port (current-output-port))) | |
cf283dd9 LC |
328 | (format (current-error-port) "offloading '~a' to '~a'...~%" |
329 | (derivation-file-name drv) (build-machine-name machine)) | |
330 | (format (current-error-port) "@ build-remote ~a ~a~%" | |
331 | (derivation-file-name drv) (build-machine-name machine)) | |
332 | ||
f9e8a123 | 333 | (guard (c ((store-protocol-error? c) |
cf283dd9 | 334 | (format (current-error-port) |
69daee23 | 335 | (G_ "derivation '~a' offloaded to '~a' failed: ~a~%") |
cf283dd9 LC |
336 | (derivation-file-name drv) |
337 | (build-machine-name machine) | |
f9e8a123 | 338 | (store-protocol-error-message c)) |
ed7b4437 LC |
339 | (let* ((inferior (false-if-exception (remote-inferior session))) |
340 | (space (false-if-exception | |
341 | (node-free-disk-space inferior)))) | |
342 | ||
343 | (when inferior | |
344 | (close-inferior inferior)) | |
b96e05ae LC |
345 | |
346 | ;; Use exit code 100 for a permanent build failure. The daemon | |
347 | ;; interprets other non-zero codes as transient build failures. | |
348 | (if (and space (< space (* 10 (expt 2 20)))) | |
349 | (begin | |
350 | (format (current-error-port) | |
351 | (G_ "build failure may have been caused by lack \ | |
352 | of free disk space on '~a'~%") | |
353 | (build-machine-name machine)) | |
354 | (primitive-exit 1)) | |
355 | (primitive-exit 100))))) | |
0237d797 LC |
356 | (parameterize ((current-build-output-port (build-log-port))) |
357 | (build-derivations store (list drv)))) | |
cf283dd9 | 358 | |
d06d54e3 LC |
359 | (retrieve-files* outputs store |
360 | ||
361 | ;; We cannot use the 'import-paths' RPC here because we | |
362 | ;; already hold the locks for FILES. | |
363 | #:import | |
364 | (lambda (port) | |
365 | (restore-file-set port | |
366 | #:log-port (current-error-port) | |
367 | #:lock? #f))) | |
368 | ||
61fe9ced LC |
369 | (close-connection store) |
370 | (disconnect! session) | |
cf283dd9 LC |
371 | (format (current-error-port) "done with offloaded '~a'~%" |
372 | (derivation-file-name drv))) | |
373 | ||
59f704df LC |
374 | \f |
375 | ;;; | |
376 | ;;; Scheduling. | |
377 | ;;; | |
378 | ||
49e6291a LC |
379 | (define (machine-matches? machine requirements) |
380 | "Return #t if MACHINE matches REQUIREMENTS." | |
4b5a6fbc MC |
381 | (and (member (build-requirements-system requirements) |
382 | (build-machine-systems machine)) | |
49e6291a LC |
383 | (lset<= string=? |
384 | (build-requirements-features requirements) | |
385 | (build-machine-features machine)))) | |
386 | ||
63b0c3ea LC |
387 | (define %minimum-disk-space |
388 | ;; Minimum disk space required on the build machine for a build to be | |
389 | ;; offloaded. This keeps us from offloading to machines that are bound to | |
390 | ;; run out of disk space. | |
391 | (* 100 (expt 2 20))) ;100 MiB | |
392 | ||
bbe66a53 LC |
393 | (define (node-load node) |
394 | "Return the load on NODE. Return +∞ if NODE is misbehaving." | |
ed7b4437 LC |
395 | (let ((line (inferior-eval '(begin |
396 | (use-modules (ice-9 rdelim)) | |
397 | (call-with-input-file "/proc/loadavg" | |
398 | read-string)) | |
399 | node))) | |
bbe66a53 LC |
400 | (if (eof-object? line) |
401 | +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded | |
402 | (match (string-tokenize line) | |
403 | ((one five fifteen . x) | |
404 | (string->number one)) | |
405 | (x | |
406 | +inf.0))))) | |
407 | ||
408 | (define (normalized-load machine load) | |
409 | "Divide LOAD by the number of parallel builds of MACHINE." | |
410 | (if (rational? load) | |
411 | (let* ((jobs (build-machine-parallel-builds machine)) | |
412 | (normalized (/ load jobs))) | |
413 | (format (current-error-port) "load on machine '~a' is ~s\ | |
165f4b2a | 414 | (normalized: ~s)~%" |
bbe66a53 LC |
415 | (build-machine-name machine) load normalized) |
416 | normalized) | |
417 | load)) | |
165f4b2a | 418 | |
d8e89b1c LC |
419 | (define (random-seed) |
420 | (logxor (getpid) (car (gettimeofday)))) | |
421 | ||
422 | (define shuffle | |
423 | (let ((state (seed->random-state (random-seed)))) | |
424 | (lambda (lst) | |
425 | "Return LST shuffled (using the Fisher-Yates algorithm.)" | |
426 | (define vec (list->vector lst)) | |
427 | (let loop ((result '()) | |
428 | (i (vector-length vec))) | |
429 | (if (zero? i) | |
430 | result | |
431 | (let* ((j (random i state)) | |
432 | (val (vector-ref vec j))) | |
433 | (vector-set! vec j (vector-ref vec (- i 1))) | |
434 | (loop (cons val result) (- i 1)))))))) | |
435 | ||
88da0b68 | 436 | (define (choose-build-machine machines) |
84620dd0 LC |
437 | "Return two values: the best machine among MACHINES and its build |
438 | slot (which must later be released with 'release-build-slot'), or #f and #f." | |
d652b851 LC |
439 | |
440 | ;; Proceed like this: | |
7f4d102c | 441 | ;; 1. For all MACHINES, attempt to acquire a build slot, and filter out |
d652b851 | 442 | ;; those machines for which we failed. |
7f4d102c LC |
443 | ;; 2. Choose the best machine among those that are left. |
444 | ;; 3. Release the previously-acquired build slots of the other machines. | |
445 | ||
446 | (define machines+slots | |
447 | (filter-map (lambda (machine) | |
448 | (let ((slot (acquire-build-slot machine))) | |
449 | (and slot (list machine slot)))) | |
450 | (shuffle machines))) | |
451 | ||
452 | (define (undecorate pred) | |
453 | (lambda (a b) | |
454 | (match a | |
455 | ((machine1 slot1) | |
456 | (match b | |
457 | ((machine2 slot2) | |
458 | (pred machine1 machine2))))))) | |
459 | ||
460 | (define (machine-faster? m1 m2) | |
461 | ;; Return #t if M1 is faster than M2. | |
462 | (> (build-machine-speed m1) | |
463 | (build-machine-speed m2))) | |
464 | ||
465 | (let loop ((machines+slots | |
466 | (sort machines+slots (undecorate machine-faster?)))) | |
467 | (match machines+slots | |
468 | (((best slot) others ...) | |
469 | ;; Return the best machine unless it's already overloaded. | |
470 | ;; Note: We call 'node-load' only as a last resort because it is | |
471 | ;; too costly to call it once for every machine. | |
00d73219 LC |
472 | (let* ((session (false-if-exception (open-ssh-session best |
473 | %short-timeout))) | |
7f4d102c LC |
474 | (node (and session (remote-inferior session))) |
475 | (load (and node (normalized-load best (node-load node)))) | |
476 | (space (and node (node-free-disk-space node)))) | |
477 | (when node (close-inferior node)) | |
478 | (when session (disconnect! session)) | |
479 | (if (and node (< load 2.) (>= space %minimum-disk-space)) | |
480 | (match others | |
481 | (((machines slots) ...) | |
482 | ;; Release slots from the uninteresting machines. | |
483 | (for-each release-build-slot slots) | |
484 | ||
485 | ;; The caller must keep SLOT to protect it from GC and to | |
486 | ;; eventually release it. | |
487 | (values best slot))) | |
488 | (begin | |
489 | ;; BEST is unsuitable, so try the next one. | |
490 | (when (and space (< space %minimum-disk-space)) | |
491 | (format (current-error-port) | |
492 | "skipping machine '~a' because it is low \ | |
63b0c3ea | 493 | on disk space (~,2f MiB free)~%" |
7f4d102c LC |
494 | (build-machine-name best) |
495 | (/ space (expt 2 20) 1.))) | |
496 | (release-build-slot slot) | |
497 | (loop others))))) | |
498 | (() | |
499 | (values #f #f))))) | |
49e6291a | 500 | |
a708de15 LC |
501 | (define (call-with-timeout timeout drv thunk) |
502 | "Call THUNK and leave after TIMEOUT seconds. If TIMEOUT is #f, simply call | |
503 | THUNK. Use DRV as an indication of what we were building when the timeout | |
504 | expired." | |
505 | (if (number? timeout) | |
506 | (dynamic-wind | |
507 | (lambda () | |
508 | (sigaction SIGALRM | |
509 | (lambda _ | |
510 | ;; The exit code here will be 1, which guix-daemon will | |
511 | ;; interpret as a transient failure. | |
512 | (leave (G_ "timeout expired while offloading '~a'~%") | |
513 | (derivation-file-name drv)))) | |
514 | (alarm timeout)) | |
515 | thunk | |
516 | (lambda () | |
517 | (alarm 0))) | |
518 | (thunk))) | |
519 | ||
520 | (define-syntax-rule (with-timeout timeout drv exp ...) | |
521 | "Evaluate EXP... and leave after TIMEOUT seconds if EXP hasn't completed. | |
522 | If TIMEOUT is #f, simply evaluate EXP..." | |
523 | (call-with-timeout timeout drv (lambda () exp ...))) | |
524 | ||
49e6291a LC |
525 | (define* (process-request wants-local? system drv features |
526 | #:key | |
527 | print-build-trace? (max-silent-time 3600) | |
714084e6 | 528 | build-timeout) |
49e6291a | 529 | "Process a request to build DRV." |
88da0b68 LC |
530 | (let* ((local? (and wants-local? (string=? system (%current-system)))) |
531 | (reqs (build-requirements | |
532 | (system system) | |
533 | (features features))) | |
534 | (candidates (filter (cut machine-matches? <> reqs) | |
535 | (build-machines)))) | |
536 | (match candidates | |
537 | (() | |
538 | ;; We'll never be able to match REQS. | |
539 | (display "# decline\n")) | |
ba97e454 | 540 | ((x ...) |
84620dd0 LC |
541 | (let-values (((machine slot) |
542 | (choose-build-machine candidates))) | |
88da0b68 | 543 | (if machine |
84620dd0 LC |
544 | (dynamic-wind |
545 | (const #f) | |
546 | (lambda () | |
547 | ;; Offload DRV to MACHINE. | |
548 | (display "# accept\n") | |
549 | (let ((inputs (string-tokenize (read-line))) | |
550 | (outputs (string-tokenize (read-line)))) | |
a708de15 LC |
551 | ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can |
552 | ;; be issues with the connection or deadlocks that could | |
553 | ;; lead the 'guix offload' process to remain stuck forever. | |
554 | ;; To avoid that, install a timeout here as well. | |
555 | (with-timeout build-timeout drv | |
556 | (transfer-and-offload drv machine | |
557 | #:inputs inputs | |
558 | #:outputs outputs | |
559 | #:max-silent-time max-silent-time | |
560 | #:build-timeout build-timeout | |
561 | #:print-build-trace? | |
562 | print-build-trace?)))) | |
84620dd0 LC |
563 | (lambda () |
564 | (release-build-slot slot))) | |
88da0b68 LC |
565 | |
566 | ;; Not now, all the machines are busy. | |
567 | (display "# postpone\n"))))))) | |
49e6291a | 568 | |
49e6291a | 569 | \f |
aebaee95 LC |
570 | ;;; |
571 | ;;; Installation tests. | |
572 | ;;; | |
573 | ||
00d73219 LC |
574 | (define %short-timeout |
575 | ;; Timeout in seconds used on SSH connections where reads and writes | |
576 | ;; shouldn't take long. | |
577 | 15) | |
578 | ||
aebaee95 LC |
579 | (define (assert-node-repl node name) |
580 | "Bail out if NODE is not running Guile." | |
581 | (match (node-guile-version node) | |
582 | (#f | |
4eb0f9ae | 583 | (report-guile-error name)) |
aebaee95 | 584 | ((? string? version) |
ed7b4437 | 585 | (info (G_ "'~a' is running GNU Guile ~a~%") |
aebaee95 LC |
586 | name (node-guile-version node))))) |
587 | ||
588 | (define (assert-node-has-guix node name) | |
10b2834f LC |
589 | "Bail out if NODE if #f or if we fail to use the (guix) module, or if its |
590 | daemon is not running." | |
591 | (unless (inferior? node) | |
592 | (leave (G_ "failed to run 'guix repl' on '~a'~%") name)) | |
593 | ||
594 | (match (inferior-eval '(begin | |
595 | (use-modules (guix)) | |
596 | (and add-text-to-store 'alright)) | |
597 | node) | |
598 | ('alright #t) | |
599 | (_ (report-module-error name))) | |
600 | ||
601 | (match (inferior-eval '(begin | |
602 | (use-modules (guix)) | |
603 | (with-store store | |
604 | (add-text-to-store store "test" | |
605 | "Hello, build machine!"))) | |
606 | node) | |
607 | ((? string? str) | |
608 | (info (G_ "Guix is usable on '~a' (test returned ~s)~%") | |
609 | name str)) | |
610 | (x | |
611 | (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%") | |
612 | name x)))) | |
aebaee95 LC |
613 | |
614 | (define %random-state | |
615 | (delay | |
616 | (seed->random-state (logxor (getpid) (car (gettimeofday)))))) | |
617 | ||
987a29ba LC |
618 | (define* (nonce #:optional (name (gethostname))) |
619 | (string-append name "-" | |
aebaee95 LC |
620 | (number->string (random 1000000 (force %random-state))))) |
621 | ||
ed7b4437 | 622 | (define (assert-node-can-import session node name daemon-socket) |
aebaee95 | 623 | "Bail out if NODE refuses to import our archives." |
ed7b4437 LC |
624 | (with-store store |
625 | (let* ((item (add-text-to-store store "export-test" (nonce))) | |
626 | (remote (connect-to-remote-daemon session daemon-socket))) | |
627 | (with-store local | |
628 | (send-files local (list item) remote)) | |
629 | ||
630 | (if (valid-path? remote item) | |
631 | (info (G_ "'~a' successfully imported '~a'~%") | |
632 | name item) | |
633 | (leave (G_ "'~a' was not properly imported on '~a'~%") | |
634 | item name))))) | |
635 | ||
636 | (define (assert-node-can-export session node name daemon-socket) | |
aebaee95 | 637 | "Bail out if we cannot import signed archives from NODE." |
ed7b4437 | 638 | (let* ((remote (connect-to-remote-daemon session daemon-socket)) |
987a29ba | 639 | (item (add-text-to-store remote "import-test" (nonce name)))) |
aebaee95 | 640 | (with-store store |
987a29ba | 641 | (if (and (retrieve-files store (list item) remote) |
aebaee95 | 642 | (valid-path? store item)) |
69daee23 | 643 | (info (G_ "successfully imported '~a' from '~a'~%") |
aebaee95 | 644 | item name) |
69daee23 | 645 | (leave (G_ "failed to import '~a' from '~a'~%") |
aebaee95 LC |
646 | item name))))) |
647 | ||
27991c97 LC |
648 | (define (check-machine-availability machine-file pred) |
649 | "Check that each machine matching PRED in MACHINE-FILE is usable as a build | |
650 | machine." | |
2b513387 LC |
651 | (define (build-machine=? m1 m2) |
652 | (and (string=? (build-machine-name m1) (build-machine-name m2)) | |
653 | (= (build-machine-port m1) (build-machine-port m2)))) | |
654 | ||
655 | ;; A given build machine may appear several times (e.g., once for | |
656 | ;; "x86_64-linux" and a second time for "i686-linux"); test them only once. | |
27991c97 LC |
657 | (let ((machines (filter pred |
658 | (delete-duplicates (build-machines machine-file) | |
659 | build-machine=?)))) | |
69daee23 | 660 | (info (G_ "testing ~a build machines defined in '~a'...~%") |
aebaee95 LC |
661 | (length machines) machine-file) |
662 | (let* ((names (map build-machine-name machines)) | |
663 | (sockets (map build-machine-daemon-socket machines)) | |
00d73219 | 664 | (sessions (map (cut open-ssh-session <> %short-timeout) machines)) |
ed7b4437 | 665 | (nodes (map remote-inferior sessions))) |
aebaee95 | 666 | (for-each assert-node-has-guix nodes names) |
10b2834f | 667 | (for-each assert-node-repl nodes names) |
ed7b4437 LC |
668 | (for-each assert-node-can-import sessions nodes names sockets) |
669 | (for-each assert-node-can-export sessions nodes names sockets) | |
670 | (for-each close-inferior nodes) | |
671 | (for-each disconnect! sessions)))) | |
aebaee95 | 672 | |
dafc3daf RW |
673 | (define (check-machine-status machine-file pred) |
674 | "Print the load of each machine matching PRED in MACHINE-FILE." | |
675 | (define (build-machine=? m1 m2) | |
676 | (and (string=? (build-machine-name m1) (build-machine-name m2)) | |
677 | (= (build-machine-port m1) (build-machine-port m2)))) | |
678 | ||
679 | ;; A given build machine may appear several times (e.g., once for | |
680 | ;; "x86_64-linux" and a second time for "i686-linux"); test them only once. | |
681 | (let ((machines (filter pred | |
682 | (delete-duplicates (build-machines machine-file) | |
683 | build-machine=?)))) | |
684 | (info (G_ "getting status of ~a build machines defined in '~a'...~%") | |
685 | (length machines) machine-file) | |
686 | (for-each (lambda (machine) | |
10b2834f | 687 | (define session |
00d73219 | 688 | (open-ssh-session machine %short-timeout)) |
10b2834f LC |
689 | |
690 | (match (remote-inferior session) | |
691 | (#f | |
692 | (warning (G_ "failed to run 'guix repl' on machine '~a'~%") | |
693 | (build-machine-name machine))) | |
694 | ((? inferior? inferior) | |
02ec889e LC |
695 | (let ((now (car (gettimeofday)))) |
696 | (match (inferior-eval '(list (uname) | |
697 | (car (gettimeofday))) | |
698 | inferior) | |
699 | ((uts time) | |
700 | (when (< time now) | |
701 | ;; Build machine clocks must not be behind as this | |
702 | ;; could cause timestamp issues. | |
703 | (warning (G_ "machine '~a' is ~a seconds behind~%") | |
704 | (build-machine-name machine) | |
705 | (- now time))) | |
706 | ||
707 | (let ((load (node-load inferior)) | |
708 | (free (node-free-disk-space inferior))) | |
709 | (close-inferior inferior) | |
710 | (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\ | |
711 | host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%\ | |
712 | time difference: ~a s~%" | |
713 | (build-machine-name machine) | |
714 | (utsname:sysname uts) (utsname:release uts) | |
715 | (utsname:machine uts) | |
716 | (utsname:nodename uts) | |
717 | (normalized-load machine load) | |
718 | (/ free (expt 2 20) 1.) | |
719 | (- time now)))))))) | |
10b2834f LC |
720 | |
721 | (disconnect! session)) | |
dafc3daf RW |
722 | machines))) |
723 | ||
aebaee95 | 724 | \f |
49e6291a LC |
725 | ;;; |
726 | ;;; Entry point. | |
727 | ;;; | |
728 | ||
3794ce93 LC |
729 | (define-command (guix-offload . args) |
730 | (category plumbing) | |
731 | (synopsis "set up and operate build offloading") | |
732 | ||
49e6291a LC |
733 | (define request-line-rx |
734 | ;; The request format. See 'tryBuildHook' method in build.cc. | |
735 | (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)")) | |
736 | ||
737 | (define not-coma | |
738 | (char-set-complement (char-set #\,))) | |
739 | ||
740 | ;; Make sure $HOME really corresponds to the current user. This is | |
741 | ;; necessary since lsh uses that to determine the location of the yarrow | |
742 | ;; seed file, and fails if it's owned by someone else. | |
743 | (and=> (passwd:dir (getpw (getuid))) | |
744 | (cut setenv "HOME" <>)) | |
745 | ||
0b724753 LC |
746 | ;; We rely on protocol-level compression from libssh to optimize large data |
747 | ;; transfers. Warn if it's missing. | |
748 | (unless (zlib-support?) | |
69daee23 LC |
749 | (warning (G_ "Guile-SSH lacks zlib support")) |
750 | (warning (G_ "data transfers will *not* be compressed!"))) | |
0b724753 | 751 | |
49e6291a LC |
752 | (match args |
753 | ((system max-silent-time print-build-trace? build-timeout) | |
754 | (let ((max-silent-time (string->number max-silent-time)) | |
755 | (build-timeout (string->number build-timeout)) | |
756 | (print-build-trace? (string=? print-build-trace? "1"))) | |
8902d0f2 | 757 | (set-thread-name "guix offload") |
49e6291a LC |
758 | (parameterize ((%current-system system)) |
759 | (let loop ((line (read-line))) | |
760 | (unless (eof-object? line) | |
761 | (cond ((regexp-exec request-line-rx line) | |
762 | => | |
763 | (lambda (match) | |
e8a5db80 | 764 | (with-error-handling |
49e6291a LC |
765 | (process-request (equal? (match:substring match 1) "1") |
766 | (match:substring match 2) ; system | |
015f17e8 LC |
767 | (read-derivation-from-file |
768 | (match:substring match 3)) | |
49e6291a LC |
769 | (string-tokenize |
770 | (match:substring match 4) not-coma) | |
771 | #:print-build-trace? print-build-trace? | |
772 | #:max-silent-time max-silent-time | |
773 | #:build-timeout build-timeout)))) | |
774 | (else | |
69daee23 | 775 | (leave (G_ "invalid request line: ~s~%") line))) |
49e6291a | 776 | (loop (read-line))))))) |
aebaee95 LC |
777 | (("test" rest ...) |
778 | (with-error-handling | |
27991c97 LC |
779 | (let-values (((file pred) |
780 | (match rest | |
781 | ((file regexp) | |
782 | (values file | |
783 | (compose (cut string-match regexp <>) | |
784 | build-machine-name))) | |
785 | ((file) (values file (const #t))) | |
786 | (() (values %machine-file (const #t))) | |
69daee23 | 787 | (x (leave (G_ "wrong number of arguments~%")))))) |
27991c97 | 788 | (check-machine-availability (or file %machine-file) pred)))) |
dafc3daf RW |
789 | (("status" rest ...) |
790 | (with-error-handling | |
791 | (let-values (((file pred) | |
792 | (match rest | |
793 | ((file regexp) | |
794 | (values file | |
795 | (compose (cut string-match regexp <>) | |
796 | build-machine-name))) | |
797 | ((file) (values file (const #t))) | |
798 | (() (values %machine-file (const #t))) | |
799 | (x (leave (G_ "wrong number of arguments~%")))))) | |
800 | (check-machine-status (or file %machine-file) pred)))) | |
49e6291a LC |
801 | (("--version") |
802 | (show-version-and-exit "guix offload")) | |
803 | (("--help") | |
54a87b2a MC |
804 | (format #t (G_ "Usage: guix offload SYSTEM MAX-SILENT-TIME \ |
805 | PRINT-BUILD-TRACE? BUILD-TIMEOUT | |
49e6291a LC |
806 | Process build offload requests written on the standard input, possibly |
807 | offloading builds to the machines listed in '~a'.~%") | |
808 | %machine-file) | |
69daee23 | 809 | (display (G_ " |
49e6291a LC |
810 | This tool is meant to be used internally by 'guix-daemon'.\n")) |
811 | (show-bug-report-information)) | |
812 | (x | |
69daee23 | 813 | (leave (G_ "invalid arguments: ~{~s ~}~%") x)))) |
49e6291a | 814 | |
f326fef8 | 815 | ;;; Local Variables: |
d81195bf | 816 | ;;; eval: (put 'with-error-to-port 'scheme-indent-function 1) |
a708de15 | 817 | ;;; eval: (put 'with-timeout 'scheme-indent-function 2) |
f326fef8 LC |
818 | ;;; End: |
819 | ||
49e6291a | 820 | ;;; offload.scm ends here |