Commit | Line | Data |
---|---|---|
49e6291a | 1 | ;;; GNU Guix --- Functional package management for GNU |
f9e8a123 | 2 | ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org> |
dafc3daf | 3 | ;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net> |
49e6291a LC |
4 | ;;; |
5 | ;;; This file is part of GNU Guix. | |
6 | ;;; | |
7 | ;;; GNU Guix is free software; you can redistribute it and/or modify it | |
8 | ;;; under the terms of the GNU General Public License as published by | |
9 | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
10 | ;;; your option) any later version. | |
11 | ;;; | |
12 | ;;; GNU Guix is distributed in the hope that it will be useful, but | |
13 | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | ;;; GNU General Public License for more details. | |
16 | ;;; | |
17 | ;;; You should have received a copy of the GNU General Public License | |
18 | ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. | |
19 | ||
20 | (define-module (guix scripts offload) | |
21531add LC |
21 | #:use-module (ssh key) |
22 | #:use-module (ssh auth) | |
23 | #:use-module (ssh session) | |
24 | #:use-module (ssh channel) | |
cf283dd9 | 25 | #:use-module (ssh popen) |
0b724753 | 26 | #:use-module (ssh version) |
49e6291a LC |
27 | #:use-module (guix config) |
28 | #:use-module (guix records) | |
987a29ba | 29 | #:use-module (guix ssh) |
49e6291a | 30 | #:use-module (guix store) |
ed7b4437 | 31 | #:use-module (guix inferior) |
49e6291a | 32 | #:use-module (guix derivations) |
2535635f LC |
33 | #:use-module ((guix serialization) |
34 | #:select (nar-error? nar-error-file)) | |
49e6291a LC |
35 | #:use-module (guix nar) |
36 | #:use-module (guix utils) | |
8902d0f2 LC |
37 | #:use-module ((guix build syscalls) |
38 | #:select (fcntl-flock set-thread-name)) | |
f326fef8 | 39 | #:use-module ((guix build utils) #:select (which mkdir-p)) |
49e6291a LC |
40 | #:use-module (guix ui) |
41 | #:use-module (srfi srfi-1) | |
6c41cce0 | 42 | #:use-module (srfi srfi-11) |
49e6291a LC |
43 | #:use-module (srfi srfi-26) |
44 | #:use-module (srfi srfi-34) | |
45 | #:use-module (srfi srfi-35) | |
46 | #:use-module (ice-9 popen) | |
47 | #:use-module (ice-9 rdelim) | |
48 | #:use-module (ice-9 match) | |
49 | #:use-module (ice-9 regex) | |
50 | #:use-module (ice-9 format) | |
2535635f | 51 | #:use-module (ice-9 binary-ports) |
49e6291a LC |
52 | #:export (build-machine |
53 | build-requirements | |
54 | guix-offload)) | |
55 | ||
56 | ;;; Commentary: | |
57 | ;;; | |
58 | ;;; Attempt to offload builds to the machines listed in | |
59 | ;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and | |
60 | ;;; retrieving the build output(s) over SSH upon success. | |
61 | ;;; | |
62 | ;;; This command should not be used directly; instead, it is called on-demand | |
63 | ;;; by the daemon, unless it was started with '--no-build-hook' or a client | |
64 | ;;; inhibited build hooks. | |
65 | ;;; | |
66 | ;;; Code: | |
67 | ||
68 | ||
69 | (define-record-type* <build-machine> | |
70 | build-machine make-build-machine | |
71 | build-machine? | |
72 | (name build-machine-name) ; string | |
cecd72d5 LC |
73 | (port build-machine-port ; number |
74 | (default 22)) | |
49e6291a LC |
75 | (system build-machine-system) ; string |
76 | (user build-machine-user) ; string | |
77 | (private-key build-machine-private-key ; file name | |
21531add LC |
78 | (default (user-openssh-private-key))) |
79 | (host-key build-machine-host-key) ; string | |
1d48cf94 LC |
80 | (compression build-machine-compression ; string |
81 | (default "zlib@openssh.com,zlib")) | |
82 | (compression-level build-machine-compression-level ;integer | |
83 | (default 3)) | |
cf283dd9 LC |
84 | (daemon-socket build-machine-daemon-socket ; string |
85 | (default "/var/guix/daemon-socket/socket")) | |
49e6291a LC |
86 | (parallel-builds build-machine-parallel-builds ; number |
87 | (default 1)) | |
88 | (speed build-machine-speed ; inexact real | |
89 | (default 1.0)) | |
90 | (features build-machine-features ; list of strings | |
91 | (default '()))) | |
92 | ||
93 | (define-record-type* <build-requirements> | |
94 | build-requirements make-build-requirements | |
95 | build-requirements? | |
96 | (system build-requirements-system) ; string | |
97 | (features build-requirements-features ; list of strings | |
98 | (default '()))) | |
99 | ||
100 | (define %machine-file | |
101 | ;; File that lists machines available as build slaves. | |
102 | (string-append %config-directory "/machines.scm")) | |
103 | ||
21531add LC |
104 | (define (user-openssh-private-key) |
105 | "Return the user's default SSH private key, or #f if it could not be | |
49e6291a LC |
106 | determined." |
107 | (and=> (getenv "HOME") | |
21531add | 108 | (cut string-append <> "/.ssh/id_rsa"))) |
49e6291a LC |
109 | |
110 | (define %user-module | |
111 | ;; Module in which the machine description file is loaded. | |
112 | (let ((module (make-fresh-user-module))) | |
113 | (module-use! module (resolve-interface '(guix scripts offload))) | |
114 | module)) | |
115 | ||
116 | (define* (build-machines #:optional (file %machine-file)) | |
117 | "Read the list of build machines from FILE and return it." | |
118 | (catch #t | |
119 | (lambda () | |
120 | ;; Avoid ABI incompatibility with the <build-machine> record. | |
e2721a05 | 121 | ;; (set! %fresh-auto-compile #t) |
49e6291a LC |
122 | |
123 | (save-module-excursion | |
124 | (lambda () | |
125 | (set-current-module %user-module) | |
a9a685cc LC |
126 | (match (primitive-load file) |
127 | (((? build-machine? machines) ...) | |
128 | machines) | |
129 | (_ | |
130 | ;; Instead of crashing, assume the empty list. | |
131 | (warning (G_ "'~a' did not return a list of build machines; \ | |
132 | ignoring it~%") | |
133 | file) | |
134 | '()))))) | |
49e6291a LC |
135 | (lambda args |
136 | (match args | |
e465d9e1 | 137 | (('system-error . rest) |
49e6291a LC |
138 | (let ((err (system-error-errno args))) |
139 | ;; Silently ignore missing file since this is a common case. | |
140 | (if (= ENOENT err) | |
141 | '() | |
69daee23 | 142 | (leave (G_ "failed to open machine file '~a': ~a~%") |
92cb2e28 | 143 | file (strerror err))))) |
c1202fb1 LC |
144 | (('syntax-error proc message properties form . rest) |
145 | (let ((loc (source-properties->location properties))) | |
69daee23 | 146 | (leave (G_ "~a: ~a~%") |
c1202fb1 | 147 | (location->string loc) message))) |
e465d9e1 | 148 | (x |
69daee23 | 149 | (leave (G_ "failed to load machine file '~a': ~s~%") |
92cb2e28 | 150 | file args)))))) |
49e6291a | 151 | |
21531add LC |
152 | (define (host-key->type+key host-key) |
153 | "Destructure HOST-KEY, an OpenSSH host key string, and return two values: | |
154 | its key type as a symbol, and the actual base64-encoded string." | |
155 | (define (type->symbol type) | |
156 | (and (string-prefix? "ssh-" type) | |
157 | (string->symbol (string-drop type 4)))) | |
158 | ||
159 | (match (string-tokenize host-key) | |
ba97e454 | 160 | ((type key x) |
21531add LC |
161 | (values (type->symbol type) key)) |
162 | ((type key) | |
163 | (values (type->symbol type) key)))) | |
164 | ||
165 | (define (private-key-from-file* file) | |
166 | "Like 'private-key-from-file', but raise an error that 'with-error-handling' | |
167 | can interpret meaningfully." | |
168 | (catch 'guile-ssh-error | |
169 | (lambda () | |
170 | (private-key-from-file file)) | |
171 | (lambda (key proc str . rest) | |
172 | (raise (condition | |
69daee23 | 173 | (&message (message (format #f (G_ "failed to load SSH \ |
21531add LC |
174 | private key from '~a': ~a") |
175 | file str)))))))) | |
176 | ||
177 | (define (open-ssh-session machine) | |
178 | "Open an SSH session for MACHINE and return it. Throw an error on failure." | |
179 | (let ((private (private-key-from-file* (build-machine-private-key machine))) | |
180 | (public (public-key-from-file | |
181 | (string-append (build-machine-private-key machine) | |
182 | ".pub"))) | |
183 | (session (make-session #:user (build-machine-user machine) | |
184 | #:host (build-machine-name machine) | |
185 | #:port (build-machine-port machine) | |
8d125cfc | 186 | #:timeout 10 ;seconds |
21531add LC |
187 | ;; #:log-verbosity 'protocol |
188 | #:identity (build-machine-private-key machine) | |
189 | ||
bd834577 LC |
190 | ;; By default libssh reads ~/.ssh/known_hosts |
191 | ;; and uses that to adjust its choice of cipher | |
192 | ;; suites, which changes the type of host key | |
193 | ;; that the server sends (RSA vs. Ed25519, | |
194 | ;; etc.). Opt for something reproducible and | |
195 | ;; stateless instead. | |
196 | #:knownhosts "/dev/null" | |
197 | ||
21531add LC |
198 | ;; We need lightweight compression when |
199 | ;; exchanging full archives. | |
1d48cf94 LC |
200 | #:compression |
201 | (build-machine-compression machine) | |
202 | #:compression-level | |
203 | (build-machine-compression-level machine)))) | |
74afca5d LC |
204 | (match (connect! session) |
205 | ('ok | |
206 | ;; Authenticate the server. XXX: Guile-SSH 0.10.1 doesn't know about | |
207 | ;; ed25519 keys and 'get-key-type' returns #f in that case. | |
208 | (let-values (((server) (get-server-public-key session)) | |
209 | ((type key) (host-key->type+key | |
210 | (build-machine-host-key machine)))) | |
211 | (unless (and (or (not (get-key-type server)) | |
212 | (eq? (get-key-type server) type)) | |
213 | (string=? (public-key->string server) key)) | |
214 | ;; Key mismatch: something's wrong. XXX: It could be that the server | |
215 | ;; provided its Ed25519 key when we where expecting its RSA key. | |
69daee23 | 216 | (leave (G_ "server at '~a' returned host key '~a' of type '~a' \ |
21531add | 217 | instead of '~a' of type '~a'~%") |
74afca5d LC |
218 | (build-machine-name machine) |
219 | (public-key->string server) (get-key-type server) | |
220 | key type))) | |
221 | ||
222 | (let ((auth (userauth-public-key! session private))) | |
223 | (unless (eq? 'success auth) | |
224 | (disconnect! session) | |
69daee23 | 225 | (leave (G_ "SSH public key authentication failed for '~a': ~a~%") |
74afca5d LC |
226 | (build-machine-name machine) (get-error session)))) |
227 | ||
228 | session) | |
229 | (x | |
230 | ;; Connection failed or timeout expired. | |
69daee23 | 231 | (leave (G_ "failed to connect to '~a': ~a~%") |
74afca5d | 232 | (build-machine-name machine) (get-error session)))))) |
21531add | 233 | |
59f704df LC |
234 | \f |
235 | ;;; | |
236 | ;;; Synchronization. | |
237 | ;;; | |
238 | ||
59f704df LC |
239 | (define (machine-slot-file machine slot) |
240 | "Return the file name of MACHINE's file for SLOT." | |
241 | ;; For each machine we have a bunch of files representing each build slot. | |
242 | ;; When choosing a build machine, we attempt to get an exclusive lock on one | |
243 | ;; of these; if we fail, that means all the build slots are already taken. | |
244 | ;; Inspired by Nix's build-remote.pl. | |
245 | (string-append (string-append %state-directory "/offload/" | |
246 | (build-machine-name machine) | |
247 | "/" (number->string slot)))) | |
248 | ||
249 | (define (acquire-build-slot machine) | |
250 | "Attempt to acquire a build slot on MACHINE. Return the port representing | |
251 | the slot, or #f if none is available. | |
252 | ||
253 | This mechanism allows us to set a hard limit on the number of simultaneous | |
254 | connections allowed to MACHINE." | |
255 | (mkdir-p (dirname (machine-slot-file machine 0))) | |
0ef595b9 LC |
256 | |
257 | ;; When several 'guix offload' processes run in parallel, there's a race | |
258 | ;; among them, but since they try the slots in the same order, we're fine. | |
259 | (any (lambda (slot) | |
260 | (let ((port (open-file (machine-slot-file machine slot) | |
261 | "w0"))) | |
262 | (catch 'flock-error | |
263 | (lambda () | |
264 | (fcntl-flock port 'write-lock #:wait? #f) | |
265 | ;; Got it! | |
266 | (format (current-error-port) | |
267 | "process ~a acquired build slot '~a'~%" | |
268 | (getpid) (port-filename port)) | |
269 | port) | |
270 | (lambda args | |
271 | ;; PORT is already locked by another process. | |
272 | (close-port port) | |
273 | #f)))) | |
274 | (iota (build-machine-parallel-builds machine)))) | |
59f704df LC |
275 | |
276 | (define (release-build-slot slot) | |
277 | "Release SLOT, a build slot as returned as by 'acquire-build-slot'." | |
278 | (close-port slot)) | |
279 | ||
280 | \f | |
281 | ;;; | |
282 | ;;; Offloading. | |
283 | ;;; | |
284 | ||
d81195bf LC |
285 | (define (build-log-port) |
286 | "Return the default port where build logs should be sent. The default is | |
287 | file descriptor 4, which is open by the daemon before running the offload | |
288 | hook." | |
289 | (let ((port (fdopen 4 "w0"))) | |
290 | ;; Make sure file descriptor 4 isn't closed when PORT is GC'd. | |
291 | (set-port-revealed! port 1) | |
292 | port)) | |
293 | ||
ed7b4437 LC |
294 | (define (node-guile-version node) |
295 | (inferior-eval '(version) node)) | |
296 | ||
63b0c3ea LC |
297 | (define (node-free-disk-space node) |
298 | "Return the free disk space, in bytes, in NODE's store." | |
ed7b4437 LC |
299 | (inferior-eval `(begin |
300 | (use-modules (guix build syscalls)) | |
301 | (free-disk-space ,(%store-prefix))) | |
302 | node)) | |
63b0c3ea | 303 | |
88da0b68 LC |
304 | (define* (transfer-and-offload drv machine |
305 | #:key | |
306 | (inputs '()) | |
307 | (outputs '()) | |
308 | (max-silent-time 3600) | |
714084e6 | 309 | build-timeout |
88da0b68 LC |
310 | print-build-trace?) |
311 | "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of | |
312 | INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from | |
313 | MACHINE." | |
9e76eed3 LC |
314 | (define session |
315 | (open-ssh-session machine)) | |
316 | ||
cf283dd9 LC |
317 | (define store |
318 | (connect-to-remote-daemon session | |
319 | (build-machine-daemon-socket machine))) | |
320 | ||
321 | (set-build-options store | |
322 | #:print-build-trace print-build-trace? | |
323 | #:max-silent-time max-silent-time | |
324 | #:timeout build-timeout) | |
325 | ||
326 | ;; Protect DRV from garbage collection. | |
327 | (add-temp-root store (derivation-file-name drv)) | |
328 | ||
987a29ba LC |
329 | (with-store local |
330 | (send-files local (cons (derivation-file-name drv) inputs) store | |
331 | #:log-port (current-output-port))) | |
cf283dd9 LC |
332 | (format (current-error-port) "offloading '~a' to '~a'...~%" |
333 | (derivation-file-name drv) (build-machine-name machine)) | |
334 | (format (current-error-port) "@ build-remote ~a ~a~%" | |
335 | (derivation-file-name drv) (build-machine-name machine)) | |
336 | ||
f9e8a123 | 337 | (guard (c ((store-protocol-error? c) |
cf283dd9 | 338 | (format (current-error-port) |
69daee23 | 339 | (G_ "derivation '~a' offloaded to '~a' failed: ~a~%") |
cf283dd9 LC |
340 | (derivation-file-name drv) |
341 | (build-machine-name machine) | |
f9e8a123 | 342 | (store-protocol-error-message c)) |
ed7b4437 LC |
343 | (let* ((inferior (false-if-exception (remote-inferior session))) |
344 | (space (false-if-exception | |
345 | (node-free-disk-space inferior)))) | |
346 | ||
347 | (when inferior | |
348 | (close-inferior inferior)) | |
b96e05ae LC |
349 | |
350 | ;; Use exit code 100 for a permanent build failure. The daemon | |
351 | ;; interprets other non-zero codes as transient build failures. | |
352 | (if (and space (< space (* 10 (expt 2 20)))) | |
353 | (begin | |
354 | (format (current-error-port) | |
355 | (G_ "build failure may have been caused by lack \ | |
356 | of free disk space on '~a'~%") | |
357 | (build-machine-name machine)) | |
358 | (primitive-exit 1)) | |
359 | (primitive-exit 100))))) | |
0237d797 LC |
360 | (parameterize ((current-build-output-port (build-log-port))) |
361 | (build-derivations store (list drv)))) | |
cf283dd9 | 362 | |
d06d54e3 LC |
363 | (retrieve-files* outputs store |
364 | ||
365 | ;; We cannot use the 'import-paths' RPC here because we | |
366 | ;; already hold the locks for FILES. | |
367 | #:import | |
368 | (lambda (port) | |
369 | (restore-file-set port | |
370 | #:log-port (current-error-port) | |
371 | #:lock? #f))) | |
372 | ||
cf283dd9 LC |
373 | (format (current-error-port) "done with offloaded '~a'~%" |
374 | (derivation-file-name drv))) | |
375 | ||
59f704df LC |
376 | \f |
377 | ;;; | |
378 | ;;; Scheduling. | |
379 | ;;; | |
380 | ||
49e6291a LC |
381 | (define (machine-matches? machine requirements) |
382 | "Return #t if MACHINE matches REQUIREMENTS." | |
383 | (and (string=? (build-requirements-system requirements) | |
384 | (build-machine-system machine)) | |
385 | (lset<= string=? | |
386 | (build-requirements-features requirements) | |
387 | (build-machine-features machine)))) | |
388 | ||
63b0c3ea LC |
389 | (define %minimum-disk-space |
390 | ;; Minimum disk space required on the build machine for a build to be | |
391 | ;; offloaded. This keeps us from offloading to machines that are bound to | |
392 | ;; run out of disk space. | |
393 | (* 100 (expt 2 20))) ;100 MiB | |
394 | ||
bbe66a53 LC |
395 | (define (node-load node) |
396 | "Return the load on NODE. Return +∞ if NODE is misbehaving." | |
ed7b4437 LC |
397 | (let ((line (inferior-eval '(begin |
398 | (use-modules (ice-9 rdelim)) | |
399 | (call-with-input-file "/proc/loadavg" | |
400 | read-string)) | |
401 | node))) | |
bbe66a53 LC |
402 | (if (eof-object? line) |
403 | +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded | |
404 | (match (string-tokenize line) | |
405 | ((one five fifteen . x) | |
406 | (string->number one)) | |
407 | (x | |
408 | +inf.0))))) | |
409 | ||
410 | (define (normalized-load machine load) | |
411 | "Divide LOAD by the number of parallel builds of MACHINE." | |
412 | (if (rational? load) | |
413 | (let* ((jobs (build-machine-parallel-builds machine)) | |
414 | (normalized (/ load jobs))) | |
415 | (format (current-error-port) "load on machine '~a' is ~s\ | |
165f4b2a | 416 | (normalized: ~s)~%" |
bbe66a53 LC |
417 | (build-machine-name machine) load normalized) |
418 | normalized) | |
419 | load)) | |
165f4b2a | 420 | |
d8e89b1c LC |
421 | (define (random-seed) |
422 | (logxor (getpid) (car (gettimeofday)))) | |
423 | ||
424 | (define shuffle | |
425 | (let ((state (seed->random-state (random-seed)))) | |
426 | (lambda (lst) | |
427 | "Return LST shuffled (using the Fisher-Yates algorithm.)" | |
428 | (define vec (list->vector lst)) | |
429 | (let loop ((result '()) | |
430 | (i (vector-length vec))) | |
431 | (if (zero? i) | |
432 | result | |
433 | (let* ((j (random i state)) | |
434 | (val (vector-ref vec j))) | |
435 | (vector-set! vec j (vector-ref vec (- i 1))) | |
436 | (loop (cons val result) (- i 1)))))))) | |
437 | ||
88da0b68 | 438 | (define (choose-build-machine machines) |
84620dd0 LC |
439 | "Return two values: the best machine among MACHINES and its build |
440 | slot (which must later be released with 'release-build-slot'), or #f and #f." | |
d652b851 LC |
441 | |
442 | ;; Proceed like this: | |
7f4d102c | 443 | ;; 1. For all MACHINES, attempt to acquire a build slot, and filter out |
d652b851 | 444 | ;; those machines for which we failed. |
7f4d102c LC |
445 | ;; 2. Choose the best machine among those that are left. |
446 | ;; 3. Release the previously-acquired build slots of the other machines. | |
447 | ||
448 | (define machines+slots | |
449 | (filter-map (lambda (machine) | |
450 | (let ((slot (acquire-build-slot machine))) | |
451 | (and slot (list machine slot)))) | |
452 | (shuffle machines))) | |
453 | ||
454 | (define (undecorate pred) | |
455 | (lambda (a b) | |
456 | (match a | |
457 | ((machine1 slot1) | |
458 | (match b | |
459 | ((machine2 slot2) | |
460 | (pred machine1 machine2))))))) | |
461 | ||
462 | (define (machine-faster? m1 m2) | |
463 | ;; Return #t if M1 is faster than M2. | |
464 | (> (build-machine-speed m1) | |
465 | (build-machine-speed m2))) | |
466 | ||
467 | (let loop ((machines+slots | |
468 | (sort machines+slots (undecorate machine-faster?)))) | |
469 | (match machines+slots | |
470 | (((best slot) others ...) | |
471 | ;; Return the best machine unless it's already overloaded. | |
472 | ;; Note: We call 'node-load' only as a last resort because it is | |
473 | ;; too costly to call it once for every machine. | |
474 | (let* ((session (false-if-exception (open-ssh-session best))) | |
475 | (node (and session (remote-inferior session))) | |
476 | (load (and node (normalized-load best (node-load node)))) | |
477 | (space (and node (node-free-disk-space node)))) | |
478 | (when node (close-inferior node)) | |
479 | (when session (disconnect! session)) | |
480 | (if (and node (< load 2.) (>= space %minimum-disk-space)) | |
481 | (match others | |
482 | (((machines slots) ...) | |
483 | ;; Release slots from the uninteresting machines. | |
484 | (for-each release-build-slot slots) | |
485 | ||
486 | ;; The caller must keep SLOT to protect it from GC and to | |
487 | ;; eventually release it. | |
488 | (values best slot))) | |
489 | (begin | |
490 | ;; BEST is unsuitable, so try the next one. | |
491 | (when (and space (< space %minimum-disk-space)) | |
492 | (format (current-error-port) | |
493 | "skipping machine '~a' because it is low \ | |
63b0c3ea | 494 | on disk space (~,2f MiB free)~%" |
7f4d102c LC |
495 | (build-machine-name best) |
496 | (/ space (expt 2 20) 1.))) | |
497 | (release-build-slot slot) | |
498 | (loop others))))) | |
499 | (() | |
500 | (values #f #f))))) | |
49e6291a | 501 | |
a708de15 LC |
502 | (define (call-with-timeout timeout drv thunk) |
503 | "Call THUNK and leave after TIMEOUT seconds. If TIMEOUT is #f, simply call | |
504 | THUNK. Use DRV as an indication of what we were building when the timeout | |
505 | expired." | |
506 | (if (number? timeout) | |
507 | (dynamic-wind | |
508 | (lambda () | |
509 | (sigaction SIGALRM | |
510 | (lambda _ | |
511 | ;; The exit code here will be 1, which guix-daemon will | |
512 | ;; interpret as a transient failure. | |
513 | (leave (G_ "timeout expired while offloading '~a'~%") | |
514 | (derivation-file-name drv)))) | |
515 | (alarm timeout)) | |
516 | thunk | |
517 | (lambda () | |
518 | (alarm 0))) | |
519 | (thunk))) | |
520 | ||
521 | (define-syntax-rule (with-timeout timeout drv exp ...) | |
522 | "Evaluate EXP... and leave after TIMEOUT seconds if EXP hasn't completed. | |
523 | If TIMEOUT is #f, simply evaluate EXP..." | |
524 | (call-with-timeout timeout drv (lambda () exp ...))) | |
525 | ||
49e6291a LC |
526 | (define* (process-request wants-local? system drv features |
527 | #:key | |
528 | print-build-trace? (max-silent-time 3600) | |
714084e6 | 529 | build-timeout) |
49e6291a | 530 | "Process a request to build DRV." |
88da0b68 LC |
531 | (let* ((local? (and wants-local? (string=? system (%current-system)))) |
532 | (reqs (build-requirements | |
533 | (system system) | |
534 | (features features))) | |
535 | (candidates (filter (cut machine-matches? <> reqs) | |
536 | (build-machines)))) | |
537 | (match candidates | |
538 | (() | |
539 | ;; We'll never be able to match REQS. | |
540 | (display "# decline\n")) | |
ba97e454 | 541 | ((x ...) |
84620dd0 LC |
542 | (let-values (((machine slot) |
543 | (choose-build-machine candidates))) | |
88da0b68 | 544 | (if machine |
84620dd0 LC |
545 | (dynamic-wind |
546 | (const #f) | |
547 | (lambda () | |
548 | ;; Offload DRV to MACHINE. | |
549 | (display "# accept\n") | |
550 | (let ((inputs (string-tokenize (read-line))) | |
551 | (outputs (string-tokenize (read-line)))) | |
a708de15 LC |
552 | ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can |
553 | ;; be issues with the connection or deadlocks that could | |
554 | ;; lead the 'guix offload' process to remain stuck forever. | |
555 | ;; To avoid that, install a timeout here as well. | |
556 | (with-timeout build-timeout drv | |
557 | (transfer-and-offload drv machine | |
558 | #:inputs inputs | |
559 | #:outputs outputs | |
560 | #:max-silent-time max-silent-time | |
561 | #:build-timeout build-timeout | |
562 | #:print-build-trace? | |
563 | print-build-trace?)))) | |
84620dd0 LC |
564 | (lambda () |
565 | (release-build-slot slot))) | |
88da0b68 LC |
566 | |
567 | ;; Not now, all the machines are busy. | |
568 | (display "# postpone\n"))))))) | |
49e6291a | 569 | |
49e6291a | 570 | \f |
aebaee95 LC |
571 | ;;; |
572 | ;;; Installation tests. | |
573 | ;;; | |
574 | ||
575 | (define (assert-node-repl node name) | |
576 | "Bail out if NODE is not running Guile." | |
577 | (match (node-guile-version node) | |
578 | (#f | |
4eb0f9ae | 579 | (report-guile-error name)) |
aebaee95 | 580 | ((? string? version) |
ed7b4437 | 581 | (info (G_ "'~a' is running GNU Guile ~a~%") |
aebaee95 LC |
582 | name (node-guile-version node))))) |
583 | ||
584 | (define (assert-node-has-guix node name) | |
10b2834f LC |
585 | "Bail out if NODE if #f or if we fail to use the (guix) module, or if its |
586 | daemon is not running." | |
587 | (unless (inferior? node) | |
588 | (leave (G_ "failed to run 'guix repl' on '~a'~%") name)) | |
589 | ||
590 | (match (inferior-eval '(begin | |
591 | (use-modules (guix)) | |
592 | (and add-text-to-store 'alright)) | |
593 | node) | |
594 | ('alright #t) | |
595 | (_ (report-module-error name))) | |
596 | ||
597 | (match (inferior-eval '(begin | |
598 | (use-modules (guix)) | |
599 | (with-store store | |
600 | (add-text-to-store store "test" | |
601 | "Hello, build machine!"))) | |
602 | node) | |
603 | ((? string? str) | |
604 | (info (G_ "Guix is usable on '~a' (test returned ~s)~%") | |
605 | name str)) | |
606 | (x | |
607 | (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%") | |
608 | name x)))) | |
aebaee95 LC |
609 | |
610 | (define %random-state | |
611 | (delay | |
612 | (seed->random-state (logxor (getpid) (car (gettimeofday)))))) | |
613 | ||
987a29ba LC |
614 | (define* (nonce #:optional (name (gethostname))) |
615 | (string-append name "-" | |
aebaee95 LC |
616 | (number->string (random 1000000 (force %random-state))))) |
617 | ||
ed7b4437 | 618 | (define (assert-node-can-import session node name daemon-socket) |
aebaee95 | 619 | "Bail out if NODE refuses to import our archives." |
ed7b4437 LC |
620 | (with-store store |
621 | (let* ((item (add-text-to-store store "export-test" (nonce))) | |
622 | (remote (connect-to-remote-daemon session daemon-socket))) | |
623 | (with-store local | |
624 | (send-files local (list item) remote)) | |
625 | ||
626 | (if (valid-path? remote item) | |
627 | (info (G_ "'~a' successfully imported '~a'~%") | |
628 | name item) | |
629 | (leave (G_ "'~a' was not properly imported on '~a'~%") | |
630 | item name))))) | |
631 | ||
632 | (define (assert-node-can-export session node name daemon-socket) | |
aebaee95 | 633 | "Bail out if we cannot import signed archives from NODE." |
ed7b4437 | 634 | (let* ((remote (connect-to-remote-daemon session daemon-socket)) |
987a29ba | 635 | (item (add-text-to-store remote "import-test" (nonce name)))) |
aebaee95 | 636 | (with-store store |
987a29ba | 637 | (if (and (retrieve-files store (list item) remote) |
aebaee95 | 638 | (valid-path? store item)) |
69daee23 | 639 | (info (G_ "successfully imported '~a' from '~a'~%") |
aebaee95 | 640 | item name) |
69daee23 | 641 | (leave (G_ "failed to import '~a' from '~a'~%") |
aebaee95 LC |
642 | item name))))) |
643 | ||
27991c97 LC |
644 | (define (check-machine-availability machine-file pred) |
645 | "Check that each machine matching PRED in MACHINE-FILE is usable as a build | |
646 | machine." | |
2b513387 LC |
647 | (define (build-machine=? m1 m2) |
648 | (and (string=? (build-machine-name m1) (build-machine-name m2)) | |
649 | (= (build-machine-port m1) (build-machine-port m2)))) | |
650 | ||
651 | ;; A given build machine may appear several times (e.g., once for | |
652 | ;; "x86_64-linux" and a second time for "i686-linux"); test them only once. | |
27991c97 LC |
653 | (let ((machines (filter pred |
654 | (delete-duplicates (build-machines machine-file) | |
655 | build-machine=?)))) | |
69daee23 | 656 | (info (G_ "testing ~a build machines defined in '~a'...~%") |
aebaee95 LC |
657 | (length machines) machine-file) |
658 | (let* ((names (map build-machine-name machines)) | |
659 | (sockets (map build-machine-daemon-socket machines)) | |
660 | (sessions (map open-ssh-session machines)) | |
ed7b4437 | 661 | (nodes (map remote-inferior sessions))) |
aebaee95 | 662 | (for-each assert-node-has-guix nodes names) |
10b2834f | 663 | (for-each assert-node-repl nodes names) |
ed7b4437 LC |
664 | (for-each assert-node-can-import sessions nodes names sockets) |
665 | (for-each assert-node-can-export sessions nodes names sockets) | |
666 | (for-each close-inferior nodes) | |
667 | (for-each disconnect! sessions)))) | |
aebaee95 | 668 | |
dafc3daf RW |
669 | (define (check-machine-status machine-file pred) |
670 | "Print the load of each machine matching PRED in MACHINE-FILE." | |
671 | (define (build-machine=? m1 m2) | |
672 | (and (string=? (build-machine-name m1) (build-machine-name m2)) | |
673 | (= (build-machine-port m1) (build-machine-port m2)))) | |
674 | ||
675 | ;; A given build machine may appear several times (e.g., once for | |
676 | ;; "x86_64-linux" and a second time for "i686-linux"); test them only once. | |
677 | (let ((machines (filter pred | |
678 | (delete-duplicates (build-machines machine-file) | |
679 | build-machine=?)))) | |
680 | (info (G_ "getting status of ~a build machines defined in '~a'...~%") | |
681 | (length machines) machine-file) | |
682 | (for-each (lambda (machine) | |
10b2834f LC |
683 | (define session |
684 | (open-ssh-session machine)) | |
685 | ||
686 | (match (remote-inferior session) | |
687 | (#f | |
688 | (warning (G_ "failed to run 'guix repl' on machine '~a'~%") | |
689 | (build-machine-name machine))) | |
690 | ((? inferior? inferior) | |
02ec889e LC |
691 | (let ((now (car (gettimeofday)))) |
692 | (match (inferior-eval '(list (uname) | |
693 | (car (gettimeofday))) | |
694 | inferior) | |
695 | ((uts time) | |
696 | (when (< time now) | |
697 | ;; Build machine clocks must not be behind as this | |
698 | ;; could cause timestamp issues. | |
699 | (warning (G_ "machine '~a' is ~a seconds behind~%") | |
700 | (build-machine-name machine) | |
701 | (- now time))) | |
702 | ||
703 | (let ((load (node-load inferior)) | |
704 | (free (node-free-disk-space inferior))) | |
705 | (close-inferior inferior) | |
706 | (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\ | |
707 | host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%\ | |
708 | time difference: ~a s~%" | |
709 | (build-machine-name machine) | |
710 | (utsname:sysname uts) (utsname:release uts) | |
711 | (utsname:machine uts) | |
712 | (utsname:nodename uts) | |
713 | (normalized-load machine load) | |
714 | (/ free (expt 2 20) 1.) | |
715 | (- time now)))))))) | |
10b2834f LC |
716 | |
717 | (disconnect! session)) | |
dafc3daf RW |
718 | machines))) |
719 | ||
aebaee95 | 720 | \f |
49e6291a LC |
721 | ;;; |
722 | ;;; Entry point. | |
723 | ;;; | |
724 | ||
725 | (define (guix-offload . args) | |
726 | (define request-line-rx | |
727 | ;; The request format. See 'tryBuildHook' method in build.cc. | |
728 | (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)")) | |
729 | ||
730 | (define not-coma | |
731 | (char-set-complement (char-set #\,))) | |
732 | ||
733 | ;; Make sure $HOME really corresponds to the current user. This is | |
734 | ;; necessary since lsh uses that to determine the location of the yarrow | |
735 | ;; seed file, and fails if it's owned by someone else. | |
736 | (and=> (passwd:dir (getpw (getuid))) | |
737 | (cut setenv "HOME" <>)) | |
738 | ||
0b724753 LC |
739 | ;; We rely on protocol-level compression from libssh to optimize large data |
740 | ;; transfers. Warn if it's missing. | |
741 | (unless (zlib-support?) | |
69daee23 LC |
742 | (warning (G_ "Guile-SSH lacks zlib support")) |
743 | (warning (G_ "data transfers will *not* be compressed!"))) | |
0b724753 | 744 | |
49e6291a LC |
745 | (match args |
746 | ((system max-silent-time print-build-trace? build-timeout) | |
747 | (let ((max-silent-time (string->number max-silent-time)) | |
748 | (build-timeout (string->number build-timeout)) | |
749 | (print-build-trace? (string=? print-build-trace? "1"))) | |
8902d0f2 | 750 | (set-thread-name "guix offload") |
49e6291a LC |
751 | (parameterize ((%current-system system)) |
752 | (let loop ((line (read-line))) | |
753 | (unless (eof-object? line) | |
754 | (cond ((regexp-exec request-line-rx line) | |
755 | => | |
756 | (lambda (match) | |
e8a5db80 | 757 | (with-error-handling |
49e6291a LC |
758 | (process-request (equal? (match:substring match 1) "1") |
759 | (match:substring match 2) ; system | |
015f17e8 LC |
760 | (read-derivation-from-file |
761 | (match:substring match 3)) | |
49e6291a LC |
762 | (string-tokenize |
763 | (match:substring match 4) not-coma) | |
764 | #:print-build-trace? print-build-trace? | |
765 | #:max-silent-time max-silent-time | |
766 | #:build-timeout build-timeout)))) | |
767 | (else | |
69daee23 | 768 | (leave (G_ "invalid request line: ~s~%") line))) |
49e6291a | 769 | (loop (read-line))))))) |
aebaee95 LC |
770 | (("test" rest ...) |
771 | (with-error-handling | |
27991c97 LC |
772 | (let-values (((file pred) |
773 | (match rest | |
774 | ((file regexp) | |
775 | (values file | |
776 | (compose (cut string-match regexp <>) | |
777 | build-machine-name))) | |
778 | ((file) (values file (const #t))) | |
779 | (() (values %machine-file (const #t))) | |
69daee23 | 780 | (x (leave (G_ "wrong number of arguments~%")))))) |
27991c97 | 781 | (check-machine-availability (or file %machine-file) pred)))) |
dafc3daf RW |
782 | (("status" rest ...) |
783 | (with-error-handling | |
784 | (let-values (((file pred) | |
785 | (match rest | |
786 | ((file regexp) | |
787 | (values file | |
788 | (compose (cut string-match regexp <>) | |
789 | build-machine-name))) | |
790 | ((file) (values file (const #t))) | |
791 | (() (values %machine-file (const #t))) | |
792 | (x (leave (G_ "wrong number of arguments~%")))))) | |
793 | (check-machine-status (or file %machine-file) pred)))) | |
49e6291a LC |
794 | (("--version") |
795 | (show-version-and-exit "guix offload")) | |
796 | (("--help") | |
69daee23 | 797 | (format #t (G_ "Usage: guix offload SYSTEM PRINT-BUILD-TRACE |
49e6291a LC |
798 | Process build offload requests written on the standard input, possibly |
799 | offloading builds to the machines listed in '~a'.~%") | |
800 | %machine-file) | |
69daee23 | 801 | (display (G_ " |
49e6291a LC |
802 | This tool is meant to be used internally by 'guix-daemon'.\n")) |
803 | (show-bug-report-information)) | |
804 | (x | |
69daee23 | 805 | (leave (G_ "invalid arguments: ~{~s ~}~%") x)))) |
49e6291a | 806 | |
f326fef8 | 807 | ;;; Local Variables: |
d81195bf | 808 | ;;; eval: (put 'with-error-to-port 'scheme-indent-function 1) |
a708de15 | 809 | ;;; eval: (put 'with-timeout 'scheme-indent-function 2) |
f326fef8 LC |
810 | ;;; End: |
811 | ||
49e6291a | 812 | ;;; offload.scm ends here |