Commit | Line | Data |
---|---|---|
49e6291a | 1 | ;;; GNU Guix --- Functional package management for GNU |
d51bfe24 | 2 | ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org> |
dafc3daf | 3 | ;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net> |
4b5a6fbc | 4 | ;;; Copyright © 2020 Maxim Cournoyer <maxim.cournoyer@gmail.com> |
49e6291a LC |
5 | ;;; |
6 | ;;; This file is part of GNU Guix. | |
7 | ;;; | |
8 | ;;; GNU Guix is free software; you can redistribute it and/or modify it | |
9 | ;;; under the terms of the GNU General Public License as published by | |
10 | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
11 | ;;; your option) any later version. | |
12 | ;;; | |
13 | ;;; GNU Guix is distributed in the hope that it will be useful, but | |
14 | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | ;;; GNU General Public License for more details. | |
17 | ;;; | |
18 | ;;; You should have received a copy of the GNU General Public License | |
19 | ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. | |
20 | ||
21 | (define-module (guix scripts offload) | |
21531add LC |
22 | #:use-module (ssh key) |
23 | #:use-module (ssh auth) | |
24 | #:use-module (ssh session) | |
25 | #:use-module (ssh channel) | |
cf283dd9 | 26 | #:use-module (ssh popen) |
0b724753 | 27 | #:use-module (ssh version) |
49e6291a LC |
28 | #:use-module (guix config) |
29 | #:use-module (guix records) | |
987a29ba | 30 | #:use-module (guix ssh) |
49e6291a | 31 | #:use-module (guix store) |
ed7b4437 | 32 | #:use-module (guix inferior) |
49e6291a | 33 | #:use-module (guix derivations) |
2535635f LC |
34 | #:use-module ((guix serialization) |
35 | #:select (nar-error? nar-error-file)) | |
49e6291a | 36 | #:use-module (guix nar) |
d51bfe24 | 37 | #:use-module ((guix utils) #:select (%current-system)) |
8902d0f2 LC |
38 | #:use-module ((guix build syscalls) |
39 | #:select (fcntl-flock set-thread-name)) | |
f326fef8 | 40 | #:use-module ((guix build utils) #:select (which mkdir-p)) |
49e6291a | 41 | #:use-module (guix ui) |
d51bfe24 | 42 | #:use-module (guix diagnostics) |
49e6291a | 43 | #:use-module (srfi srfi-1) |
6c41cce0 | 44 | #:use-module (srfi srfi-11) |
49e6291a LC |
45 | #:use-module (srfi srfi-26) |
46 | #:use-module (srfi srfi-34) | |
47 | #:use-module (srfi srfi-35) | |
48 | #:use-module (ice-9 popen) | |
49 | #:use-module (ice-9 rdelim) | |
50 | #:use-module (ice-9 match) | |
51 | #:use-module (ice-9 regex) | |
52 | #:use-module (ice-9 format) | |
2535635f | 53 | #:use-module (ice-9 binary-ports) |
49e6291a LC |
54 | #:export (build-machine |
55 | build-requirements | |
56 | guix-offload)) | |
57 | ||
58 | ;;; Commentary: | |
59 | ;;; | |
60 | ;;; Attempt to offload builds to the machines listed in | |
61 | ;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and | |
62 | ;;; retrieving the build output(s) over SSH upon success. | |
63 | ;;; | |
64 | ;;; This command should not be used directly; instead, it is called on-demand | |
dc209d5a | 65 | ;;; by the daemon, unless it was started with '--no-offload' or a client |
49e6291a LC |
66 | ;;; inhibited build hooks. |
67 | ;;; | |
68 | ;;; Code: | |
69 | ||
49e6291a LC |
70 | (define-record-type* <build-machine> |
71 | build-machine make-build-machine | |
72 | build-machine? | |
73 | (name build-machine-name) ; string | |
cecd72d5 LC |
74 | (port build-machine-port ; number |
75 | (default 22)) | |
4b5a6fbc MC |
76 | (systems %build-machine-systems ; list of strings |
77 | (default #f)) ; drop default after system is removed | |
78 | (system %build-machine-system ; deprecated | |
79 | (default #f)) | |
49e6291a LC |
80 | (user build-machine-user) ; string |
81 | (private-key build-machine-private-key ; file name | |
21531add LC |
82 | (default (user-openssh-private-key))) |
83 | (host-key build-machine-host-key) ; string | |
1d48cf94 LC |
84 | (compression build-machine-compression ; string |
85 | (default "zlib@openssh.com,zlib")) | |
86 | (compression-level build-machine-compression-level ;integer | |
87 | (default 3)) | |
cf283dd9 LC |
88 | (daemon-socket build-machine-daemon-socket ; string |
89 | (default "/var/guix/daemon-socket/socket")) | |
49e6291a LC |
90 | (parallel-builds build-machine-parallel-builds ; number |
91 | (default 1)) | |
92 | (speed build-machine-speed ; inexact real | |
93 | (default 1.0)) | |
94 | (features build-machine-features ; list of strings | |
95 | (default '()))) | |
96 | ||
4b5a6fbc MC |
97 | ;;; Deprecated. |
98 | (define (build-machine-system machine) | |
99 | (warning (G_ "The 'system' field is deprecated, \ | |
100 | please use 'systems' instead.~%")) | |
101 | (%build-machine-system machine)) | |
102 | ||
103 | ;;; TODO: Remove after the deprecated 'system' field is removed. | |
104 | (define (build-machine-systems machine) | |
105 | (or (%build-machine-systems machine) | |
106 | (list (build-machine-system machine)) | |
107 | (leave (G_ "The build-machine object lacks a value for its 'systems' | |
108 | field.")))) | |
109 | ||
49e6291a LC |
110 | (define-record-type* <build-requirements> |
111 | build-requirements make-build-requirements | |
112 | build-requirements? | |
113 | (system build-requirements-system) ; string | |
114 | (features build-requirements-features ; list of strings | |
115 | (default '()))) | |
116 | ||
117 | (define %machine-file | |
118 | ;; File that lists machines available as build slaves. | |
119 | (string-append %config-directory "/machines.scm")) | |
120 | ||
21531add LC |
121 | (define (user-openssh-private-key) |
122 | "Return the user's default SSH private key, or #f if it could not be | |
49e6291a LC |
123 | determined." |
124 | (and=> (getenv "HOME") | |
21531add | 125 | (cut string-append <> "/.ssh/id_rsa"))) |
49e6291a LC |
126 | |
127 | (define %user-module | |
128 | ;; Module in which the machine description file is loaded. | |
129 | (let ((module (make-fresh-user-module))) | |
130 | (module-use! module (resolve-interface '(guix scripts offload))) | |
131 | module)) | |
132 | ||
133 | (define* (build-machines #:optional (file %machine-file)) | |
134 | "Read the list of build machines from FILE and return it." | |
135 | (catch #t | |
136 | (lambda () | |
137 | ;; Avoid ABI incompatibility with the <build-machine> record. | |
e2721a05 | 138 | ;; (set! %fresh-auto-compile #t) |
49e6291a LC |
139 | |
140 | (save-module-excursion | |
141 | (lambda () | |
142 | (set-current-module %user-module) | |
a9a685cc LC |
143 | (match (primitive-load file) |
144 | (((? build-machine? machines) ...) | |
145 | machines) | |
146 | (_ | |
147 | ;; Instead of crashing, assume the empty list. | |
148 | (warning (G_ "'~a' did not return a list of build machines; \ | |
149 | ignoring it~%") | |
150 | file) | |
151 | '()))))) | |
49e6291a LC |
152 | (lambda args |
153 | (match args | |
e465d9e1 | 154 | (('system-error . rest) |
49e6291a LC |
155 | (let ((err (system-error-errno args))) |
156 | ;; Silently ignore missing file since this is a common case. | |
157 | (if (= ENOENT err) | |
158 | '() | |
69daee23 | 159 | (leave (G_ "failed to open machine file '~a': ~a~%") |
92cb2e28 | 160 | file (strerror err))))) |
c1202fb1 LC |
161 | (('syntax-error proc message properties form . rest) |
162 | (let ((loc (source-properties->location properties))) | |
69daee23 | 163 | (leave (G_ "~a: ~a~%") |
c1202fb1 | 164 | (location->string loc) message))) |
e465d9e1 | 165 | (x |
69daee23 | 166 | (leave (G_ "failed to load machine file '~a': ~s~%") |
92cb2e28 | 167 | file args)))))) |
49e6291a | 168 | |
21531add LC |
169 | (define (private-key-from-file* file) |
170 | "Like 'private-key-from-file', but raise an error that 'with-error-handling' | |
171 | can interpret meaningfully." | |
172 | (catch 'guile-ssh-error | |
173 | (lambda () | |
174 | (private-key-from-file file)) | |
175 | (lambda (key proc str . rest) | |
d51bfe24 | 176 | (raise (formatted-message (G_ "failed to load SSH \ |
21531add | 177 | private key from '~a': ~a") |
d51bfe24 | 178 | file str))))) |
21531add | 179 | |
00d73219 | 180 | (define* (open-ssh-session machine #:optional (max-silent-time -1)) |
21531add LC |
181 | "Open an SSH session for MACHINE and return it. Throw an error on failure." |
182 | (let ((private (private-key-from-file* (build-machine-private-key machine))) | |
183 | (public (public-key-from-file | |
184 | (string-append (build-machine-private-key machine) | |
185 | ".pub"))) | |
186 | (session (make-session #:user (build-machine-user machine) | |
187 | #:host (build-machine-name machine) | |
188 | #:port (build-machine-port machine) | |
00d73219 | 189 | #:timeout 10 ;initial timeout (seconds) |
21531add LC |
190 | ;; #:log-verbosity 'protocol |
191 | #:identity (build-machine-private-key machine) | |
192 | ||
bd834577 LC |
193 | ;; By default libssh reads ~/.ssh/known_hosts |
194 | ;; and uses that to adjust its choice of cipher | |
195 | ;; suites, which changes the type of host key | |
196 | ;; that the server sends (RSA vs. Ed25519, | |
197 | ;; etc.). Opt for something reproducible and | |
198 | ;; stateless instead. | |
199 | #:knownhosts "/dev/null" | |
200 | ||
21531add LC |
201 | ;; We need lightweight compression when |
202 | ;; exchanging full archives. | |
1d48cf94 LC |
203 | #:compression |
204 | (build-machine-compression machine) | |
205 | #:compression-level | |
206 | (build-machine-compression-level machine)))) | |
74afca5d LC |
207 | (match (connect! session) |
208 | ('ok | |
114dcb42 LC |
209 | ;; Make sure the server's key is what we expect. |
210 | (authenticate-server* session (build-machine-host-key machine)) | |
74afca5d LC |
211 | |
212 | (let ((auth (userauth-public-key! session private))) | |
213 | (unless (eq? 'success auth) | |
214 | (disconnect! session) | |
69daee23 | 215 | (leave (G_ "SSH public key authentication failed for '~a': ~a~%") |
74afca5d LC |
216 | (build-machine-name machine) (get-error session)))) |
217 | ||
00d73219 LC |
218 | ;; From then on use MAX-SILENT-TIME as the absolute timeout when |
219 | ;; reading from or write to a channel for this session. | |
220 | (session-set! session 'timeout max-silent-time) | |
221 | ||
74afca5d LC |
222 | session) |
223 | (x | |
224 | ;; Connection failed or timeout expired. | |
69daee23 | 225 | (leave (G_ "failed to connect to '~a': ~a~%") |
74afca5d | 226 | (build-machine-name machine) (get-error session)))))) |
21531add | 227 | |
59f704df LC |
228 | \f |
229 | ;;; | |
230 | ;;; Synchronization. | |
231 | ;;; | |
232 | ||
59f704df LC |
233 | (define (machine-slot-file machine slot) |
234 | "Return the file name of MACHINE's file for SLOT." | |
235 | ;; For each machine we have a bunch of files representing each build slot. | |
236 | ;; When choosing a build machine, we attempt to get an exclusive lock on one | |
237 | ;; of these; if we fail, that means all the build slots are already taken. | |
238 | ;; Inspired by Nix's build-remote.pl. | |
239 | (string-append (string-append %state-directory "/offload/" | |
a3af06ad LC |
240 | (build-machine-name machine) ":" |
241 | (number->string (build-machine-port machine)) | |
59f704df LC |
242 | "/" (number->string slot)))) |
243 | ||
244 | (define (acquire-build-slot machine) | |
245 | "Attempt to acquire a build slot on MACHINE. Return the port representing | |
246 | the slot, or #f if none is available. | |
247 | ||
248 | This mechanism allows us to set a hard limit on the number of simultaneous | |
249 | connections allowed to MACHINE." | |
250 | (mkdir-p (dirname (machine-slot-file machine 0))) | |
0ef595b9 LC |
251 | |
252 | ;; When several 'guix offload' processes run in parallel, there's a race | |
253 | ;; among them, but since they try the slots in the same order, we're fine. | |
254 | (any (lambda (slot) | |
255 | (let ((port (open-file (machine-slot-file machine slot) | |
256 | "w0"))) | |
257 | (catch 'flock-error | |
258 | (lambda () | |
259 | (fcntl-flock port 'write-lock #:wait? #f) | |
260 | ;; Got it! | |
261 | (format (current-error-port) | |
262 | "process ~a acquired build slot '~a'~%" | |
263 | (getpid) (port-filename port)) | |
264 | port) | |
265 | (lambda args | |
266 | ;; PORT is already locked by another process. | |
267 | (close-port port) | |
268 | #f)))) | |
269 | (iota (build-machine-parallel-builds machine)))) | |
59f704df LC |
270 | |
271 | (define (release-build-slot slot) | |
272 | "Release SLOT, a build slot as returned as by 'acquire-build-slot'." | |
273 | (close-port slot)) | |
274 | ||
275 | \f | |
276 | ;;; | |
277 | ;;; Offloading. | |
278 | ;;; | |
279 | ||
d81195bf LC |
280 | (define (build-log-port) |
281 | "Return the default port where build logs should be sent. The default is | |
282 | file descriptor 4, which is open by the daemon before running the offload | |
283 | hook." | |
284 | (let ((port (fdopen 4 "w0"))) | |
285 | ;; Make sure file descriptor 4 isn't closed when PORT is GC'd. | |
286 | (set-port-revealed! port 1) | |
287 | port)) | |
288 | ||
ed7b4437 LC |
289 | (define (node-guile-version node) |
290 | (inferior-eval '(version) node)) | |
291 | ||
63b0c3ea LC |
292 | (define (node-free-disk-space node) |
293 | "Return the free disk space, in bytes, in NODE's store." | |
ed7b4437 LC |
294 | (inferior-eval `(begin |
295 | (use-modules (guix build syscalls)) | |
296 | (free-disk-space ,(%store-prefix))) | |
297 | node)) | |
63b0c3ea | 298 | |
88da0b68 LC |
299 | (define* (transfer-and-offload drv machine |
300 | #:key | |
301 | (inputs '()) | |
302 | (outputs '()) | |
303 | (max-silent-time 3600) | |
714084e6 | 304 | build-timeout |
88da0b68 LC |
305 | print-build-trace?) |
306 | "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of | |
307 | INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from | |
308 | MACHINE." | |
9e76eed3 | 309 | (define session |
00d73219 | 310 | (open-ssh-session machine max-silent-time)) |
9e76eed3 | 311 | |
cf283dd9 LC |
312 | (define store |
313 | (connect-to-remote-daemon session | |
314 | (build-machine-daemon-socket machine))) | |
315 | ||
316 | (set-build-options store | |
317 | #:print-build-trace print-build-trace? | |
318 | #:max-silent-time max-silent-time | |
319 | #:timeout build-timeout) | |
320 | ||
321 | ;; Protect DRV from garbage collection. | |
322 | (add-temp-root store (derivation-file-name drv)) | |
323 | ||
987a29ba LC |
324 | (with-store local |
325 | (send-files local (cons (derivation-file-name drv) inputs) store | |
326 | #:log-port (current-output-port))) | |
cf283dd9 LC |
327 | (format (current-error-port) "offloading '~a' to '~a'...~%" |
328 | (derivation-file-name drv) (build-machine-name machine)) | |
329 | (format (current-error-port) "@ build-remote ~a ~a~%" | |
330 | (derivation-file-name drv) (build-machine-name machine)) | |
331 | ||
f9e8a123 | 332 | (guard (c ((store-protocol-error? c) |
cf283dd9 | 333 | (format (current-error-port) |
69daee23 | 334 | (G_ "derivation '~a' offloaded to '~a' failed: ~a~%") |
cf283dd9 LC |
335 | (derivation-file-name drv) |
336 | (build-machine-name machine) | |
f9e8a123 | 337 | (store-protocol-error-message c)) |
ed7b4437 LC |
338 | (let* ((inferior (false-if-exception (remote-inferior session))) |
339 | (space (false-if-exception | |
340 | (node-free-disk-space inferior)))) | |
341 | ||
342 | (when inferior | |
343 | (close-inferior inferior)) | |
b96e05ae LC |
344 | |
345 | ;; Use exit code 100 for a permanent build failure. The daemon | |
346 | ;; interprets other non-zero codes as transient build failures. | |
347 | (if (and space (< space (* 10 (expt 2 20)))) | |
348 | (begin | |
349 | (format (current-error-port) | |
350 | (G_ "build failure may have been caused by lack \ | |
351 | of free disk space on '~a'~%") | |
352 | (build-machine-name machine)) | |
353 | (primitive-exit 1)) | |
354 | (primitive-exit 100))))) | |
0237d797 LC |
355 | (parameterize ((current-build-output-port (build-log-port))) |
356 | (build-derivations store (list drv)))) | |
cf283dd9 | 357 | |
d06d54e3 LC |
358 | (retrieve-files* outputs store |
359 | ||
360 | ;; We cannot use the 'import-paths' RPC here because we | |
361 | ;; already hold the locks for FILES. | |
362 | #:import | |
363 | (lambda (port) | |
364 | (restore-file-set port | |
365 | #:log-port (current-error-port) | |
366 | #:lock? #f))) | |
367 | ||
61fe9ced LC |
368 | (close-connection store) |
369 | (disconnect! session) | |
cf283dd9 LC |
370 | (format (current-error-port) "done with offloaded '~a'~%" |
371 | (derivation-file-name drv))) | |
372 | ||
59f704df LC |
373 | \f |
374 | ;;; | |
375 | ;;; Scheduling. | |
376 | ;;; | |
377 | ||
49e6291a LC |
378 | (define (machine-matches? machine requirements) |
379 | "Return #t if MACHINE matches REQUIREMENTS." | |
4b5a6fbc MC |
380 | (and (member (build-requirements-system requirements) |
381 | (build-machine-systems machine)) | |
49e6291a LC |
382 | (lset<= string=? |
383 | (build-requirements-features requirements) | |
384 | (build-machine-features machine)))) | |
385 | ||
63b0c3ea LC |
386 | (define %minimum-disk-space |
387 | ;; Minimum disk space required on the build machine for a build to be | |
388 | ;; offloaded. This keeps us from offloading to machines that are bound to | |
389 | ;; run out of disk space. | |
390 | (* 100 (expt 2 20))) ;100 MiB | |
391 | ||
bbe66a53 LC |
392 | (define (node-load node) |
393 | "Return the load on NODE. Return +∞ if NODE is misbehaving." | |
ed7b4437 LC |
394 | (let ((line (inferior-eval '(begin |
395 | (use-modules (ice-9 rdelim)) | |
396 | (call-with-input-file "/proc/loadavg" | |
397 | read-string)) | |
398 | node))) | |
bbe66a53 LC |
399 | (if (eof-object? line) |
400 | +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded | |
401 | (match (string-tokenize line) | |
402 | ((one five fifteen . x) | |
403 | (string->number one)) | |
404 | (x | |
405 | +inf.0))))) | |
406 | ||
407 | (define (normalized-load machine load) | |
408 | "Divide LOAD by the number of parallel builds of MACHINE." | |
409 | (if (rational? load) | |
410 | (let* ((jobs (build-machine-parallel-builds machine)) | |
411 | (normalized (/ load jobs))) | |
412 | (format (current-error-port) "load on machine '~a' is ~s\ | |
165f4b2a | 413 | (normalized: ~s)~%" |
bbe66a53 LC |
414 | (build-machine-name machine) load normalized) |
415 | normalized) | |
416 | load)) | |
165f4b2a | 417 | |
d8e89b1c LC |
418 | (define (random-seed) |
419 | (logxor (getpid) (car (gettimeofday)))) | |
420 | ||
421 | (define shuffle | |
422 | (let ((state (seed->random-state (random-seed)))) | |
423 | (lambda (lst) | |
424 | "Return LST shuffled (using the Fisher-Yates algorithm.)" | |
425 | (define vec (list->vector lst)) | |
426 | (let loop ((result '()) | |
427 | (i (vector-length vec))) | |
428 | (if (zero? i) | |
429 | result | |
430 | (let* ((j (random i state)) | |
431 | (val (vector-ref vec j))) | |
432 | (vector-set! vec j (vector-ref vec (- i 1))) | |
433 | (loop (cons val result) (- i 1)))))))) | |
434 | ||
88da0b68 | 435 | (define (choose-build-machine machines) |
84620dd0 LC |
436 | "Return two values: the best machine among MACHINES and its build |
437 | slot (which must later be released with 'release-build-slot'), or #f and #f." | |
d652b851 LC |
438 | |
439 | ;; Proceed like this: | |
7f4d102c | 440 | ;; 1. For all MACHINES, attempt to acquire a build slot, and filter out |
d652b851 | 441 | ;; those machines for which we failed. |
7f4d102c LC |
442 | ;; 2. Choose the best machine among those that are left. |
443 | ;; 3. Release the previously-acquired build slots of the other machines. | |
444 | ||
445 | (define machines+slots | |
446 | (filter-map (lambda (machine) | |
447 | (let ((slot (acquire-build-slot machine))) | |
448 | (and slot (list machine slot)))) | |
449 | (shuffle machines))) | |
450 | ||
451 | (define (undecorate pred) | |
452 | (lambda (a b) | |
453 | (match a | |
454 | ((machine1 slot1) | |
455 | (match b | |
456 | ((machine2 slot2) | |
457 | (pred machine1 machine2))))))) | |
458 | ||
459 | (define (machine-faster? m1 m2) | |
460 | ;; Return #t if M1 is faster than M2. | |
461 | (> (build-machine-speed m1) | |
462 | (build-machine-speed m2))) | |
463 | ||
464 | (let loop ((machines+slots | |
465 | (sort machines+slots (undecorate machine-faster?)))) | |
466 | (match machines+slots | |
467 | (((best slot) others ...) | |
468 | ;; Return the best machine unless it's already overloaded. | |
469 | ;; Note: We call 'node-load' only as a last resort because it is | |
470 | ;; too costly to call it once for every machine. | |
00d73219 LC |
471 | (let* ((session (false-if-exception (open-ssh-session best |
472 | %short-timeout))) | |
7f4d102c LC |
473 | (node (and session (remote-inferior session))) |
474 | (load (and node (normalized-load best (node-load node)))) | |
475 | (space (and node (node-free-disk-space node)))) | |
476 | (when node (close-inferior node)) | |
477 | (when session (disconnect! session)) | |
478 | (if (and node (< load 2.) (>= space %minimum-disk-space)) | |
479 | (match others | |
480 | (((machines slots) ...) | |
481 | ;; Release slots from the uninteresting machines. | |
482 | (for-each release-build-slot slots) | |
483 | ||
484 | ;; The caller must keep SLOT to protect it from GC and to | |
485 | ;; eventually release it. | |
486 | (values best slot))) | |
487 | (begin | |
488 | ;; BEST is unsuitable, so try the next one. | |
489 | (when (and space (< space %minimum-disk-space)) | |
490 | (format (current-error-port) | |
491 | "skipping machine '~a' because it is low \ | |
63b0c3ea | 492 | on disk space (~,2f MiB free)~%" |
7f4d102c LC |
493 | (build-machine-name best) |
494 | (/ space (expt 2 20) 1.))) | |
495 | (release-build-slot slot) | |
496 | (loop others))))) | |
497 | (() | |
498 | (values #f #f))))) | |
49e6291a | 499 | |
a708de15 LC |
500 | (define (call-with-timeout timeout drv thunk) |
501 | "Call THUNK and leave after TIMEOUT seconds. If TIMEOUT is #f, simply call | |
502 | THUNK. Use DRV as an indication of what we were building when the timeout | |
503 | expired." | |
504 | (if (number? timeout) | |
505 | (dynamic-wind | |
506 | (lambda () | |
507 | (sigaction SIGALRM | |
508 | (lambda _ | |
509 | ;; The exit code here will be 1, which guix-daemon will | |
510 | ;; interpret as a transient failure. | |
511 | (leave (G_ "timeout expired while offloading '~a'~%") | |
512 | (derivation-file-name drv)))) | |
513 | (alarm timeout)) | |
514 | thunk | |
515 | (lambda () | |
516 | (alarm 0))) | |
517 | (thunk))) | |
518 | ||
519 | (define-syntax-rule (with-timeout timeout drv exp ...) | |
520 | "Evaluate EXP... and leave after TIMEOUT seconds if EXP hasn't completed. | |
521 | If TIMEOUT is #f, simply evaluate EXP..." | |
522 | (call-with-timeout timeout drv (lambda () exp ...))) | |
523 | ||
49e6291a LC |
524 | (define* (process-request wants-local? system drv features |
525 | #:key | |
526 | print-build-trace? (max-silent-time 3600) | |
714084e6 | 527 | build-timeout) |
49e6291a | 528 | "Process a request to build DRV." |
88da0b68 LC |
529 | (let* ((local? (and wants-local? (string=? system (%current-system)))) |
530 | (reqs (build-requirements | |
531 | (system system) | |
532 | (features features))) | |
533 | (candidates (filter (cut machine-matches? <> reqs) | |
534 | (build-machines)))) | |
535 | (match candidates | |
536 | (() | |
537 | ;; We'll never be able to match REQS. | |
538 | (display "# decline\n")) | |
ba97e454 | 539 | ((x ...) |
84620dd0 LC |
540 | (let-values (((machine slot) |
541 | (choose-build-machine candidates))) | |
88da0b68 | 542 | (if machine |
84620dd0 LC |
543 | (dynamic-wind |
544 | (const #f) | |
545 | (lambda () | |
546 | ;; Offload DRV to MACHINE. | |
547 | (display "# accept\n") | |
548 | (let ((inputs (string-tokenize (read-line))) | |
549 | (outputs (string-tokenize (read-line)))) | |
a708de15 LC |
550 | ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can |
551 | ;; be issues with the connection or deadlocks that could | |
552 | ;; lead the 'guix offload' process to remain stuck forever. | |
553 | ;; To avoid that, install a timeout here as well. | |
554 | (with-timeout build-timeout drv | |
555 | (transfer-and-offload drv machine | |
556 | #:inputs inputs | |
557 | #:outputs outputs | |
558 | #:max-silent-time max-silent-time | |
559 | #:build-timeout build-timeout | |
560 | #:print-build-trace? | |
561 | print-build-trace?)))) | |
84620dd0 LC |
562 | (lambda () |
563 | (release-build-slot slot))) | |
88da0b68 LC |
564 | |
565 | ;; Not now, all the machines are busy. | |
566 | (display "# postpone\n"))))))) | |
49e6291a | 567 | |
49e6291a | 568 | \f |
aebaee95 LC |
569 | ;;; |
570 | ;;; Installation tests. | |
571 | ;;; | |
572 | ||
00d73219 LC |
573 | (define %short-timeout |
574 | ;; Timeout in seconds used on SSH connections where reads and writes | |
575 | ;; shouldn't take long. | |
576 | 15) | |
577 | ||
aebaee95 LC |
578 | (define (assert-node-repl node name) |
579 | "Bail out if NODE is not running Guile." | |
580 | (match (node-guile-version node) | |
581 | (#f | |
4eb0f9ae | 582 | (report-guile-error name)) |
aebaee95 | 583 | ((? string? version) |
ed7b4437 | 584 | (info (G_ "'~a' is running GNU Guile ~a~%") |
aebaee95 LC |
585 | name (node-guile-version node))))) |
586 | ||
587 | (define (assert-node-has-guix node name) | |
10b2834f LC |
588 | "Bail out if NODE if #f or if we fail to use the (guix) module, or if its |
589 | daemon is not running." | |
590 | (unless (inferior? node) | |
591 | (leave (G_ "failed to run 'guix repl' on '~a'~%") name)) | |
592 | ||
593 | (match (inferior-eval '(begin | |
594 | (use-modules (guix)) | |
595 | (and add-text-to-store 'alright)) | |
596 | node) | |
597 | ('alright #t) | |
598 | (_ (report-module-error name))) | |
599 | ||
600 | (match (inferior-eval '(begin | |
601 | (use-modules (guix)) | |
602 | (with-store store | |
603 | (add-text-to-store store "test" | |
604 | "Hello, build machine!"))) | |
605 | node) | |
606 | ((? string? str) | |
607 | (info (G_ "Guix is usable on '~a' (test returned ~s)~%") | |
608 | name str)) | |
609 | (x | |
610 | (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%") | |
611 | name x)))) | |
aebaee95 LC |
612 | |
613 | (define %random-state | |
614 | (delay | |
615 | (seed->random-state (logxor (getpid) (car (gettimeofday)))))) | |
616 | ||
987a29ba LC |
617 | (define* (nonce #:optional (name (gethostname))) |
618 | (string-append name "-" | |
aebaee95 LC |
619 | (number->string (random 1000000 (force %random-state))))) |
620 | ||
ed7b4437 | 621 | (define (assert-node-can-import session node name daemon-socket) |
aebaee95 | 622 | "Bail out if NODE refuses to import our archives." |
ed7b4437 LC |
623 | (with-store store |
624 | (let* ((item (add-text-to-store store "export-test" (nonce))) | |
625 | (remote (connect-to-remote-daemon session daemon-socket))) | |
626 | (with-store local | |
627 | (send-files local (list item) remote)) | |
628 | ||
629 | (if (valid-path? remote item) | |
630 | (info (G_ "'~a' successfully imported '~a'~%") | |
631 | name item) | |
632 | (leave (G_ "'~a' was not properly imported on '~a'~%") | |
633 | item name))))) | |
634 | ||
635 | (define (assert-node-can-export session node name daemon-socket) | |
aebaee95 | 636 | "Bail out if we cannot import signed archives from NODE." |
ed7b4437 | 637 | (let* ((remote (connect-to-remote-daemon session daemon-socket)) |
987a29ba | 638 | (item (add-text-to-store remote "import-test" (nonce name)))) |
aebaee95 | 639 | (with-store store |
987a29ba | 640 | (if (and (retrieve-files store (list item) remote) |
aebaee95 | 641 | (valid-path? store item)) |
69daee23 | 642 | (info (G_ "successfully imported '~a' from '~a'~%") |
aebaee95 | 643 | item name) |
69daee23 | 644 | (leave (G_ "failed to import '~a' from '~a'~%") |
aebaee95 LC |
645 | item name))))) |
646 | ||
27991c97 LC |
647 | (define (check-machine-availability machine-file pred) |
648 | "Check that each machine matching PRED in MACHINE-FILE is usable as a build | |
649 | machine." | |
2b513387 LC |
650 | (define (build-machine=? m1 m2) |
651 | (and (string=? (build-machine-name m1) (build-machine-name m2)) | |
652 | (= (build-machine-port m1) (build-machine-port m2)))) | |
653 | ||
654 | ;; A given build machine may appear several times (e.g., once for | |
655 | ;; "x86_64-linux" and a second time for "i686-linux"); test them only once. | |
27991c97 LC |
656 | (let ((machines (filter pred |
657 | (delete-duplicates (build-machines machine-file) | |
658 | build-machine=?)))) | |
69daee23 | 659 | (info (G_ "testing ~a build machines defined in '~a'...~%") |
aebaee95 LC |
660 | (length machines) machine-file) |
661 | (let* ((names (map build-machine-name machines)) | |
662 | (sockets (map build-machine-daemon-socket machines)) | |
00d73219 | 663 | (sessions (map (cut open-ssh-session <> %short-timeout) machines)) |
ed7b4437 | 664 | (nodes (map remote-inferior sessions))) |
aebaee95 | 665 | (for-each assert-node-has-guix nodes names) |
10b2834f | 666 | (for-each assert-node-repl nodes names) |
ed7b4437 LC |
667 | (for-each assert-node-can-import sessions nodes names sockets) |
668 | (for-each assert-node-can-export sessions nodes names sockets) | |
669 | (for-each close-inferior nodes) | |
670 | (for-each disconnect! sessions)))) | |
aebaee95 | 671 | |
dafc3daf RW |
672 | (define (check-machine-status machine-file pred) |
673 | "Print the load of each machine matching PRED in MACHINE-FILE." | |
674 | (define (build-machine=? m1 m2) | |
675 | (and (string=? (build-machine-name m1) (build-machine-name m2)) | |
676 | (= (build-machine-port m1) (build-machine-port m2)))) | |
677 | ||
678 | ;; A given build machine may appear several times (e.g., once for | |
679 | ;; "x86_64-linux" and a second time for "i686-linux"); test them only once. | |
680 | (let ((machines (filter pred | |
681 | (delete-duplicates (build-machines machine-file) | |
682 | build-machine=?)))) | |
683 | (info (G_ "getting status of ~a build machines defined in '~a'...~%") | |
684 | (length machines) machine-file) | |
685 | (for-each (lambda (machine) | |
10b2834f | 686 | (define session |
00d73219 | 687 | (open-ssh-session machine %short-timeout)) |
10b2834f LC |
688 | |
689 | (match (remote-inferior session) | |
690 | (#f | |
691 | (warning (G_ "failed to run 'guix repl' on machine '~a'~%") | |
692 | (build-machine-name machine))) | |
693 | ((? inferior? inferior) | |
02ec889e LC |
694 | (let ((now (car (gettimeofday)))) |
695 | (match (inferior-eval '(list (uname) | |
696 | (car (gettimeofday))) | |
697 | inferior) | |
698 | ((uts time) | |
699 | (when (< time now) | |
700 | ;; Build machine clocks must not be behind as this | |
701 | ;; could cause timestamp issues. | |
702 | (warning (G_ "machine '~a' is ~a seconds behind~%") | |
703 | (build-machine-name machine) | |
704 | (- now time))) | |
705 | ||
706 | (let ((load (node-load inferior)) | |
707 | (free (node-free-disk-space inferior))) | |
708 | (close-inferior inferior) | |
709 | (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\ | |
710 | host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%\ | |
711 | time difference: ~a s~%" | |
712 | (build-machine-name machine) | |
713 | (utsname:sysname uts) (utsname:release uts) | |
714 | (utsname:machine uts) | |
715 | (utsname:nodename uts) | |
716 | (normalized-load machine load) | |
717 | (/ free (expt 2 20) 1.) | |
718 | (- time now)))))))) | |
10b2834f LC |
719 | |
720 | (disconnect! session)) | |
dafc3daf RW |
721 | machines))) |
722 | ||
aebaee95 | 723 | \f |
49e6291a LC |
724 | ;;; |
725 | ;;; Entry point. | |
726 | ;;; | |
727 | ||
728 | (define (guix-offload . args) | |
729 | (define request-line-rx | |
730 | ;; The request format. See 'tryBuildHook' method in build.cc. | |
731 | (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)")) | |
732 | ||
733 | (define not-coma | |
734 | (char-set-complement (char-set #\,))) | |
735 | ||
736 | ;; Make sure $HOME really corresponds to the current user. This is | |
737 | ;; necessary since lsh uses that to determine the location of the yarrow | |
738 | ;; seed file, and fails if it's owned by someone else. | |
739 | (and=> (passwd:dir (getpw (getuid))) | |
740 | (cut setenv "HOME" <>)) | |
741 | ||
0b724753 LC |
742 | ;; We rely on protocol-level compression from libssh to optimize large data |
743 | ;; transfers. Warn if it's missing. | |
744 | (unless (zlib-support?) | |
69daee23 LC |
745 | (warning (G_ "Guile-SSH lacks zlib support")) |
746 | (warning (G_ "data transfers will *not* be compressed!"))) | |
0b724753 | 747 | |
49e6291a LC |
748 | (match args |
749 | ((system max-silent-time print-build-trace? build-timeout) | |
750 | (let ((max-silent-time (string->number max-silent-time)) | |
751 | (build-timeout (string->number build-timeout)) | |
752 | (print-build-trace? (string=? print-build-trace? "1"))) | |
8902d0f2 | 753 | (set-thread-name "guix offload") |
49e6291a LC |
754 | (parameterize ((%current-system system)) |
755 | (let loop ((line (read-line))) | |
756 | (unless (eof-object? line) | |
757 | (cond ((regexp-exec request-line-rx line) | |
758 | => | |
759 | (lambda (match) | |
e8a5db80 | 760 | (with-error-handling |
49e6291a LC |
761 | (process-request (equal? (match:substring match 1) "1") |
762 | (match:substring match 2) ; system | |
015f17e8 LC |
763 | (read-derivation-from-file |
764 | (match:substring match 3)) | |
49e6291a LC |
765 | (string-tokenize |
766 | (match:substring match 4) not-coma) | |
767 | #:print-build-trace? print-build-trace? | |
768 | #:max-silent-time max-silent-time | |
769 | #:build-timeout build-timeout)))) | |
770 | (else | |
69daee23 | 771 | (leave (G_ "invalid request line: ~s~%") line))) |
49e6291a | 772 | (loop (read-line))))))) |
aebaee95 LC |
773 | (("test" rest ...) |
774 | (with-error-handling | |
27991c97 LC |
775 | (let-values (((file pred) |
776 | (match rest | |
777 | ((file regexp) | |
778 | (values file | |
779 | (compose (cut string-match regexp <>) | |
780 | build-machine-name))) | |
781 | ((file) (values file (const #t))) | |
782 | (() (values %machine-file (const #t))) | |
69daee23 | 783 | (x (leave (G_ "wrong number of arguments~%")))))) |
27991c97 | 784 | (check-machine-availability (or file %machine-file) pred)))) |
dafc3daf RW |
785 | (("status" rest ...) |
786 | (with-error-handling | |
787 | (let-values (((file pred) | |
788 | (match rest | |
789 | ((file regexp) | |
790 | (values file | |
791 | (compose (cut string-match regexp <>) | |
792 | build-machine-name))) | |
793 | ((file) (values file (const #t))) | |
794 | (() (values %machine-file (const #t))) | |
795 | (x (leave (G_ "wrong number of arguments~%")))))) | |
796 | (check-machine-status (or file %machine-file) pred)))) | |
49e6291a LC |
797 | (("--version") |
798 | (show-version-and-exit "guix offload")) | |
799 | (("--help") | |
54a87b2a MC |
800 | (format #t (G_ "Usage: guix offload SYSTEM MAX-SILENT-TIME \ |
801 | PRINT-BUILD-TRACE? BUILD-TIMEOUT | |
49e6291a LC |
802 | Process build offload requests written on the standard input, possibly |
803 | offloading builds to the machines listed in '~a'.~%") | |
804 | %machine-file) | |
69daee23 | 805 | (display (G_ " |
49e6291a LC |
806 | This tool is meant to be used internally by 'guix-daemon'.\n")) |
807 | (show-bug-report-information)) | |
808 | (x | |
69daee23 | 809 | (leave (G_ "invalid arguments: ~{~s ~}~%") x)))) |
49e6291a | 810 | |
f326fef8 | 811 | ;;; Local Variables: |
d81195bf | 812 | ;;; eval: (put 'with-error-to-port 'scheme-indent-function 1) |
a708de15 | 813 | ;;; eval: (put 'with-timeout 'scheme-indent-function 2) |
f326fef8 LC |
814 | ;;; End: |
815 | ||
49e6291a | 816 | ;;; offload.scm ends here |