;;; GNU Guix --- Functional package management for GNU
-;;; Copyright © 2014, 2015, 2016 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
;;;
;;; This file is part of GNU Guix.
;;;
#:use-module (ssh session)
#:use-module (ssh channel)
#:use-module (ssh popen)
- #:use-module (ssh dist)
- #:use-module (ssh dist node)
#:use-module (ssh version)
#:use-module (guix config)
#:use-module (guix records)
+ #:use-module (guix ssh)
#:use-module (guix store)
+ #:use-module (guix inferior)
#:use-module (guix derivations)
#:use-module ((guix serialization)
#:select (nar-error? nar-error-file))
#:use-module (guix nar)
#:use-module (guix utils)
- #:use-module ((guix build syscalls) #:select (fcntl-flock))
+ #:use-module ((guix build syscalls)
+ #:select (fcntl-flock set-thread-name))
#:use-module ((guix build utils) #:select (which mkdir-p))
#:use-module (guix ui)
#:use-module (srfi srfi-1)
(catch #t
(lambda ()
;; Avoid ABI incompatibility with the <build-machine> record.
- (set! %fresh-auto-compile #t)
+ ;; (set! %fresh-auto-compile #t)
(save-module-excursion
(lambda ()
(set-current-module %user-module)
- (primitive-load file))))
+ (match (primitive-load file)
+ (((? build-machine? machines) ...)
+ machines)
+ (_
+ ;; Instead of crashing, assume the empty list.
+ (warning (G_ "'~a' did not return a list of build machines; \
+ignoring it~%")
+ file)
+ '())))))
(lambda args
(match args
(('system-error . rest)
;; Silently ignore missing file since this is a common case.
(if (= ENOENT err)
'()
- (leave (_ "failed to open machine file '~a': ~a~%")
+ (leave (G_ "failed to open machine file '~a': ~a~%")
file (strerror err)))))
(('syntax-error proc message properties form . rest)
(let ((loc (source-properties->location properties)))
- (leave (_ "~a: ~a~%")
+ (leave (G_ "~a: ~a~%")
(location->string loc) message)))
(x
- (leave (_ "failed to load machine file '~a': ~s~%")
+ (leave (G_ "failed to load machine file '~a': ~s~%")
file args))))))
(define (host-key->type+key host-key)
(string->symbol (string-drop type 4))))
(match (string-tokenize host-key)
- ((type key _)
+ ((type key x)
(values (type->symbol type) key))
((type key)
(values (type->symbol type) key))))
(private-key-from-file file))
(lambda (key proc str . rest)
(raise (condition
- (&message (message (format #f (_ "failed to load SSH \
+ (&message (message (format #f (G_ "failed to load SSH \
private key from '~a': ~a")
file str))))))))
;; #:log-verbosity 'protocol
#:identity (build-machine-private-key machine)
+ ;; By default libssh reads ~/.ssh/known_hosts
+ ;; and uses that to adjust its choice of cipher
+ ;; suites, which changes the type of host key
+ ;; that the server sends (RSA vs. Ed25519,
+ ;; etc.). Opt for something reproducible and
+ ;; stateless instead.
+ #:knownhosts "/dev/null"
+
;; We need lightweight compression when
;; exchanging full archives.
#:compression
(string=? (public-key->string server) key))
;; Key mismatch: something's wrong. XXX: It could be that the server
;; provided its Ed25519 key when we where expecting its RSA key.
- (leave (_ "server at '~a' returned host key '~a' of type '~a' \
+ (leave (G_ "server at '~a' returned host key '~a' of type '~a' \
instead of '~a' of type '~a'~%")
(build-machine-name machine)
(public-key->string server) (get-key-type server)
(let ((auth (userauth-public-key! session private)))
(unless (eq? 'success auth)
(disconnect! session)
- (leave (_ "SSH public key authentication failed for '~a': ~a~%")
+ (leave (G_ "SSH public key authentication failed for '~a': ~a~%")
(build-machine-name machine) (get-error session))))
session)
(x
;; Connection failed or timeout expired.
- (leave (_ "failed to connect to '~a': ~a~%")
+ (leave (G_ "failed to connect to '~a': ~a~%")
(build-machine-name machine) (get-error session))))))
-(define* (connect-to-remote-daemon session
- #:optional
- (socket-name "/var/guix/daemon-socket/socket"))
- "Connect to the remote build daemon listening on SOCKET-NAME over SESSION,
-an SSH session. Return a <nix-server> object."
- (define redirect
- ;; Code run in SESSION to redirect the remote process' stdin/stdout to the
- ;; daemon's socket, à la socat. The SSH protocol supports forwarding to
- ;; Unix-domain sockets but libssh doesn't have an API for that, hence this
- ;; hack.
- `(begin
- (use-modules (ice-9 match) (rnrs io ports))
-
- (let ((sock (socket AF_UNIX SOCK_STREAM 0))
- (stdin (current-input-port))
- (stdout (current-output-port)))
- (setvbuf stdin _IONBF)
- (setvbuf stdout _IONBF)
- (connect sock AF_UNIX ,socket-name)
-
- (let loop ()
- (match (select (list stdin sock) '() (list stdin stdout sock))
- ((reads writes ())
- (when (memq stdin reads)
- (match (get-bytevector-some stdin)
- ((? eof-object?)
- (primitive-exit 0))
- (bv
- (put-bytevector sock bv))))
- (when (memq sock reads)
- (match (get-bytevector-some sock)
- ((? eof-object?)
- (primitive-exit 0))
- (bv
- (put-bytevector stdout bv))))
- (loop))
- (_
- (primitive-exit 1)))))))
-
- (let ((channel
- (open-remote-pipe* session OPEN_BOTH
- ;; Sort-of shell-quote REDIRECT.
- "guile" "-c"
- (object->string
- (object->string redirect)))))
- (open-connection #:port channel)))
-
\f
;;;
;;; Synchronization.
(lambda ()
(unlock-file port)))))
-(define-syntax-rule (with-machine-lock machine hint exp ...)
- "Wait to acquire MACHINE's exclusive lock for HINT, and evaluate EXP in that
-context."
- (with-file-lock (machine-lock-file machine hint)
- exp ...))
-
-
(define (machine-slot-file machine slot)
"Return the file name of MACHINE's file for SLOT."
;; For each machine we have a bunch of files representing each build slot.
This mechanism allows us to set a hard limit on the number of simultaneous
connections allowed to MACHINE."
(mkdir-p (dirname (machine-slot-file machine 0)))
- (with-machine-lock machine 'slots
- (any (lambda (slot)
- (let ((port (open-file (machine-slot-file machine slot)
- "w0")))
- (catch 'flock-error
- (lambda ()
- (fcntl-flock port 'write-lock #:wait? #f)
- ;; Got it!
- (format (current-error-port)
- "process ~a acquired build slot '~a'~%"
- (getpid) (port-filename port))
- port)
- (lambda args
- ;; PORT is already locked by another process.
- (close-port port)
- #f))))
- (iota (build-machine-parallel-builds machine)))))
+
+ ;; When several 'guix offload' processes run in parallel, there's a race
+ ;; among them, but since they try the slots in the same order, we're fine.
+ (any (lambda (slot)
+ (let ((port (open-file (machine-slot-file machine slot)
+ "w0")))
+ (catch 'flock-error
+ (lambda ()
+ (fcntl-flock port 'write-lock #:wait? #f)
+ ;; Got it!
+ (format (current-error-port)
+ "process ~a acquired build slot '~a'~%"
+ (getpid) (port-filename port))
+ port)
+ (lambda args
+ ;; PORT is already locked by another process.
+ (close-port port)
+ #f))))
+ (iota (build-machine-parallel-builds machine))))
(define (release-build-slot slot)
"Release SLOT, a build slot as returned as by 'acquire-build-slot'."
(set-port-revealed! port 1)
port))
+(define (node-guile-version node)
+ (inferior-eval '(version) node))
+
+(define (node-free-disk-space node)
+ "Return the free disk space, in bytes, in NODE's store."
+ (inferior-eval `(begin
+ (use-modules (guix build syscalls))
+ (free-disk-space ,(%store-prefix)))
+ node))
+
(define* (transfer-and-offload drv machine
#:key
(inputs '())
;; Protect DRV from garbage collection.
(add-temp-root store (derivation-file-name drv))
- (send-files (cons (derivation-file-name drv) inputs)
- store)
+ (with-store local
+ (send-files local (cons (derivation-file-name drv) inputs) store
+ #:log-port (current-output-port)))
(format (current-error-port) "offloading '~a' to '~a'...~%"
(derivation-file-name drv) (build-machine-name machine))
(format (current-error-port) "@ build-remote ~a ~a~%"
(derivation-file-name drv) (build-machine-name machine))
- (guard (c ((nix-protocol-error? c)
+ (guard (c ((store-protocol-error? c)
(format (current-error-port)
- (_ "derivation '~a' offloaded to '~a' failed: ~a~%")
+ (G_ "derivation '~a' offloaded to '~a' failed: ~a~%")
(derivation-file-name drv)
(build-machine-name machine)
- (nix-protocol-error-message c))
- ;; Use exit code 100 for a permanent build failure. The daemon
- ;; interprets other non-zero codes as transient build failures.
- (primitive-exit 100)))
+ (store-protocol-error-message c))
+ (let* ((inferior (false-if-exception (remote-inferior session)))
+ (space (false-if-exception
+ (node-free-disk-space inferior))))
+
+ (when inferior
+ (close-inferior inferior))
+
+ ;; Use exit code 100 for a permanent build failure. The daemon
+ ;; interprets other non-zero codes as transient build failures.
+ (if (and space (< space (* 10 (expt 2 20))))
+ (begin
+ (format (current-error-port)
+ (G_ "build failure may have been caused by lack \
+of free disk space on '~a'~%")
+ (build-machine-name machine))
+ (primitive-exit 1))
+ (primitive-exit 100)))))
(parameterize ((current-build-output-port (build-log-port)))
(build-derivations store (list drv))))
- (retrieve-files outputs store)
+ (retrieve-files* outputs store
+
+ ;; We cannot use the 'import-paths' RPC here because we
+ ;; already hold the locks for FILES.
+ #:import
+ (lambda (port)
+ (restore-file-set port
+ #:log-port (current-error-port)
+ #:lock? #f)))
+
(format (current-error-port) "done with offloaded '~a'~%"
(derivation-file-name drv)))
-(define (store-import-channel session)
- "Return an output port to which archives to be exported to SESSION's store
-can be written."
- ;; Using the 'import-paths' RPC on a remote store would be slow because it
- ;; makes a round trip every time 32 KiB have been transferred. This
- ;; procedure instead opens a separate channel to use the remote
- ;; 'import-paths' procedure, which consumes all the data in a single round
- ;; trip.
- (define import
- `(begin
- (use-modules (guix))
-
- (with-store store
- (setvbuf (current-input-port) _IONBF)
- (import-paths store (current-input-port)))))
-
- (open-remote-output-pipe session
- (string-join
- `("guile" "-c"
- ,(object->string
- (object->string import))))))
-
-(define (store-export-channel session files)
- "Return an input port from which an export of FILES from SESSION's store can
-be read."
- ;; Same as above: this is more efficient than calling 'export-paths' on a
- ;; remote store.
- (define export
- `(begin
- (use-modules (guix))
-
- (with-store store
- (setvbuf (current-output-port) _IONBF)
- (export-paths store ',files (current-output-port)))))
-
- (open-remote-input-pipe session
- (string-join
- `("guile" "-c"
- ,(object->string
- (object->string export))))))
-
-(define (send-files files remote)
- "Send the subset of FILES that's missing to REMOTE, a remote store."
- (with-store store
- ;; Compute the subset of FILES missing on SESSION and send them.
- (let* ((session (channel-get-session (nix-server-socket remote)))
- (node (make-node session))
- (missing (node-eval node
- `(begin
- (use-modules (guix)
- (srfi srfi-1) (srfi srfi-26))
-
- (with-store store
- (remove (cut valid-path? store <>)
- ',files)))))
- (count (length missing))
- (port (store-import-channel session)))
- (format #t (N_ "sending ~a store item to '~a'...~%"
- "sending ~a store items to '~a'...~%" count)
- count (session-get session 'host))
-
- ;; Send MISSING in topological order.
- (export-paths store missing port)
-
- ;; Tell the remote process that we're done. (In theory the
- ;; end-of-archive mark of 'export-paths' would be enough, but in
- ;; practice it's not.)
- (channel-send-eof port)
-
- ;; Wait for completion of the remote process.
- (let ((result (zero? (channel-get-exit-status port))))
- (close-port port)
- result))))
-
-(define (retrieve-files files remote)
- "Retrieve FILES from SESSION's store, and import them."
- (let* ((session (channel-get-session (nix-server-socket remote)))
- (host (session-get session 'host))
- (port (store-export-channel session files))
- (count (length files)))
- (format #t (N_ "retrieving ~a store item from '~a'...~%"
- "retrieving ~a store items from '~a'...~%" count)
- count host)
-
- ;; We cannot use the 'import-paths' RPC here because we already
- ;; hold the locks for FILES.
- (let ((result (restore-file-set port
- #:log-port (current-error-port)
- #:lock? #f)))
- (close-port port)
- result)))
-
\f
;;;
;;; Scheduling.
(build-requirements-features requirements)
(build-machine-features machine))))
-(define (machine-load machine)
- "Return the load of MACHINE, divided by the number of parallel builds
-allowed on MACHINE. Return +∞ if MACHINE is unreachable."
- ;; Note: This procedure is costly since it creates a new SSH session.
- (match (false-if-exception (open-ssh-session machine))
- ((? session? session)
- (let* ((pipe (open-remote-pipe* session OPEN_READ
- "cat" "/proc/loadavg"))
- (line (read-line pipe)))
- (close-port pipe)
-
- (if (eof-object? line)
- +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
- (match (string-tokenize line)
- ((one five fifteen . _)
- (let* ((raw (string->number five))
- (jobs (build-machine-parallel-builds machine))
- (normalized (/ raw jobs)))
- (format (current-error-port) "load on machine '~a' is ~s\
+(define %minimum-disk-space
+ ;; Minimum disk space required on the build machine for a build to be
+ ;; offloaded. This keeps us from offloading to machines that are bound to
+ ;; run out of disk space.
+ (* 100 (expt 2 20))) ;100 MiB
+
+(define (node-load node)
+ "Return the load on NODE. Return +∞ if NODE is misbehaving."
+ (let ((line (inferior-eval '(begin
+ (use-modules (ice-9 rdelim))
+ (call-with-input-file "/proc/loadavg"
+ read-string))
+ node)))
+ (if (eof-object? line)
+ +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
+ (match (string-tokenize line)
+ ((one five fifteen . x)
+ (string->number one))
+ (x
+ +inf.0)))))
+
+(define (normalized-load machine load)
+ "Divide LOAD by the number of parallel builds of MACHINE."
+ (if (rational? load)
+ (let* ((jobs (build-machine-parallel-builds machine))
+ (normalized (/ load jobs)))
+ (format (current-error-port) "load on machine '~a' is ~s\
(normalized: ~s)~%"
- (build-machine-name machine) raw normalized)
- normalized))
- (_
- +inf.0))))) ;something's fishy about MACHINE, so avoid it
- (_
- +inf.0))) ;failed to connect to MACHINE, so avoid it
-
-(define (machine-lock-file machine hint)
- "Return the name of MACHINE's lock file for HINT."
- (string-append %state-directory "/offload/"
- (build-machine-name machine)
- "." (symbol->string hint) ".lock"))
-
-(define (machine-choice-lock-file)
- "Return the name of the file used as a lock when choosing a build machine."
- (string-append %state-directory "/offload/machine-choice.lock"))
-
-
-(define %slots
- ;; List of acquired build slots (open ports).
- '())
+ (build-machine-name machine) load normalized)
+ normalized)
+ load))
+
+(define (random-seed)
+ (logxor (getpid) (car (gettimeofday))))
+
+(define shuffle
+ (let ((state (seed->random-state (random-seed))))
+ (lambda (lst)
+ "Return LST shuffled (using the Fisher-Yates algorithm.)"
+ (define vec (list->vector lst))
+ (let loop ((result '())
+ (i (vector-length vec)))
+ (if (zero? i)
+ result
+ (let* ((j (random i state))
+ (val (vector-ref vec j)))
+ (vector-set! vec j (vector-ref vec (- i 1)))
+ (loop (cons val result) (- i 1))))))))
(define (choose-build-machine machines)
- "Return the best machine among MACHINES, or #f."
+ "Return two values: the best machine among MACHINES and its build
+slot (which must later be released with 'release-build-slot'), or #f and #f."
;; Proceed like this:
- ;; 1. Acquire the global machine-choice lock.
- ;; 2. For all MACHINES, attempt to acquire a build slot, and filter out
+ ;; 1. For all MACHINES, attempt to acquire a build slot, and filter out
;; those machines for which we failed.
- ;; 3. Choose the best machine among those that are left.
- ;; 4. Release the previously-acquired build slots of the other machines.
- ;; 5. Release the global machine-choice lock.
-
- (with-file-lock (machine-choice-lock-file)
- (define machines+slots+loads
- (filter-map (lambda (machine)
- ;; Call 'machine-load' from here to make sure it is called
- ;; only once per machine (it is expensive).
- (let ((slot (acquire-build-slot machine)))
- (and slot
- (list machine slot (machine-load machine)))))
- machines))
-
- (define (undecorate pred)
- (lambda (a b)
- (match a
- ((machine1 slot1 load1)
- (match b
- ((machine2 slot2 load2)
- (pred machine1 load1 machine2 load2)))))))
-
- (define (machine-less-loaded-or-faster? m1 l1 m2 l2)
- ;; Return #t if M1 is either less loaded or faster than M2, with L1
- ;; being the load of M1 and L2 the load of M2. (This relation defines a
- ;; total order on machines.)
- (> (/ (build-machine-speed m1) (+ 1 l1))
- (/ (build-machine-speed m2) (+ 1 l2))))
-
- (let loop ((machines+slots+loads
- (sort machines+slots+loads
- (undecorate machine-less-loaded-or-faster?))))
- (match machines+slots+loads
- (((best slot load) others ...)
- ;; Return the best machine unless it's already overloaded.
- (if (< load 2.)
+ ;; 2. Choose the best machine among those that are left.
+ ;; 3. Release the previously-acquired build slots of the other machines.
+
+ (define machines+slots
+ (filter-map (lambda (machine)
+ (let ((slot (acquire-build-slot machine)))
+ (and slot (list machine slot))))
+ (shuffle machines)))
+
+ (define (undecorate pred)
+ (lambda (a b)
+ (match a
+ ((machine1 slot1)
+ (match b
+ ((machine2 slot2)
+ (pred machine1 machine2)))))))
+
+ (define (machine-faster? m1 m2)
+ ;; Return #t if M1 is faster than M2.
+ (> (build-machine-speed m1)
+ (build-machine-speed m2)))
+
+ (let loop ((machines+slots
+ (sort machines+slots (undecorate machine-faster?))))
+ (match machines+slots
+ (((best slot) others ...)
+ ;; Return the best machine unless it's already overloaded.
+ ;; Note: We call 'node-load' only as a last resort because it is
+ ;; too costly to call it once for every machine.
+ (let* ((session (false-if-exception (open-ssh-session best)))
+ (node (and session (remote-inferior session)))
+ (load (and node (normalized-load best (node-load node))))
+ (space (and node (node-free-disk-space node))))
+ (when node (close-inferior node))
+ (when session (disconnect! session))
+ (if (and node (< load 2.) (>= space %minimum-disk-space))
(match others
- (((machines slots loads) ...)
+ (((machines slots) ...)
;; Release slots from the uninteresting machines.
(for-each release-build-slot slots)
- ;; Prevent SLOT from being GC'd.
- (set! %slots (cons slot %slots))
- best))
+ ;; The caller must keep SLOT to protect it from GC and to
+ ;; eventually release it.
+ (values best slot)))
(begin
- ;; BEST is overloaded, so try the next one.
+ ;; BEST is unsuitable, so try the next one.
+ (when (and space (< space %minimum-disk-space))
+ (format (current-error-port)
+ "skipping machine '~a' because it is low \
+on disk space (~,2f MiB free)~%"
+ (build-machine-name best)
+ (/ space (expt 2 20) 1.)))
(release-build-slot slot)
- (loop others))))
- (() #f)))))
+ (loop others)))))
+ (()
+ (values #f #f)))))
+
+(define (call-with-timeout timeout drv thunk)
+ "Call THUNK and leave after TIMEOUT seconds. If TIMEOUT is #f, simply call
+THUNK. Use DRV as an indication of what we were building when the timeout
+expired."
+ (if (number? timeout)
+ (dynamic-wind
+ (lambda ()
+ (sigaction SIGALRM
+ (lambda _
+ ;; The exit code here will be 1, which guix-daemon will
+ ;; interpret as a transient failure.
+ (leave (G_ "timeout expired while offloading '~a'~%")
+ (derivation-file-name drv))))
+ (alarm timeout))
+ thunk
+ (lambda ()
+ (alarm 0)))
+ (thunk)))
+
+(define-syntax-rule (with-timeout timeout drv exp ...)
+ "Evaluate EXP... and leave after TIMEOUT seconds if EXP hasn't completed.
+If TIMEOUT is #f, simply evaluate EXP..."
+ (call-with-timeout timeout drv (lambda () exp ...)))
(define* (process-request wants-local? system drv features
#:key
(()
;; We'll never be able to match REQS.
(display "# decline\n"))
- ((_ ...)
- (let ((machine (choose-build-machine candidates)))
+ ((x ...)
+ (let-values (((machine slot)
+ (choose-build-machine candidates)))
(if machine
- (begin
- ;; Offload DRV to MACHINE.
- (display "# accept\n")
- (let ((inputs (string-tokenize (read-line)))
- (outputs (string-tokenize (read-line))))
- (transfer-and-offload drv machine
- #:inputs inputs
- #:outputs outputs
- #:max-silent-time max-silent-time
- #:build-timeout build-timeout
- #:print-build-trace? print-build-trace?)))
+ (dynamic-wind
+ (const #f)
+ (lambda ()
+ ;; Offload DRV to MACHINE.
+ (display "# accept\n")
+ (let ((inputs (string-tokenize (read-line)))
+ (outputs (string-tokenize (read-line))))
+ ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can
+ ;; be issues with the connection or deadlocks that could
+ ;; lead the 'guix offload' process to remain stuck forever.
+ ;; To avoid that, install a timeout here as well.
+ (with-timeout build-timeout drv
+ (transfer-and-offload drv machine
+ #:inputs inputs
+ #:outputs outputs
+ #:max-silent-time max-silent-time
+ #:build-timeout build-timeout
+ #:print-build-trace?
+ print-build-trace?))))
+ (lambda ()
+ (release-build-slot slot)))
;; Not now, all the machines are busy.
(display "# postpone\n")))))))
"Bail out if NODE is not running Guile."
(match (node-guile-version node)
(#f
- (leave (_ "Guile could not be started on '~a'~%")
- name))
+ (report-guile-error name))
((? string? version)
- ;; Note: The version string already contains the word "Guile".
- (info (_ "'~a' is running ~a~%")
+ (info (G_ "'~a' is running GNU Guile ~a~%")
name (node-guile-version node)))))
(define (assert-node-has-guix node name)
- "Bail out if NODE lacks the (guix) module, or if its daemon is not running."
- (match (node-eval node
- '(begin
- (use-modules (guix))
- (with-store store
- (add-text-to-store store "test"
- "Hello, build machine!"))))
+ "Bail out if NODE if #f or if we fail to use the (guix) module, or if its
+daemon is not running."
+ (unless (inferior? node)
+ (leave (G_ "failed to run 'guix repl' on '~a'~%") name))
+
+ (match (inferior-eval '(begin
+ (use-modules (guix))
+ (and add-text-to-store 'alright))
+ node)
+ ('alright #t)
+ (_ (report-module-error name)))
+
+ (match (inferior-eval '(begin
+ (use-modules (guix))
+ (with-store store
+ (add-text-to-store store "test"
+ "Hello, build machine!")))
+ node)
((? string? str)
- (info (_ "Guix is usable on '~a' (test returned ~s)~%")
+ (info (G_ "Guix is usable on '~a' (test returned ~s)~%")
name str))
(x
- (leave (_ "failed to use Guix module on '~a' (test returned ~s)~%")
+ (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%")
name x))))
(define %random-state
(delay
(seed->random-state (logxor (getpid) (car (gettimeofday))))))
-(define (nonce)
- (string-append (gethostname) "-"
+(define* (nonce #:optional (name (gethostname)))
+ (string-append name "-"
(number->string (random 1000000 (force %random-state)))))
-(define (assert-node-can-import node name daemon-socket)
+(define (assert-node-can-import session node name daemon-socket)
"Bail out if NODE refuses to import our archives."
- (let ((session (node-session node)))
- (with-store store
- (let* ((item (add-text-to-store store "export-test" (nonce)))
- (remote (connect-to-remote-daemon session daemon-socket)))
- (send-files (list item) remote)
- (if (valid-path? remote item)
- (info (_ "'~a' successfully imported '~a'~%")
- name item)
- (leave (_ "'~a' was not properly imported on '~a'~%")
- item name))))))
-
-(define (assert-node-can-export node name daemon-socket)
+ (with-store store
+ (let* ((item (add-text-to-store store "export-test" (nonce)))
+ (remote (connect-to-remote-daemon session daemon-socket)))
+ (with-store local
+ (send-files local (list item) remote))
+
+ (if (valid-path? remote item)
+ (info (G_ "'~a' successfully imported '~a'~%")
+ name item)
+ (leave (G_ "'~a' was not properly imported on '~a'~%")
+ item name)))))
+
+(define (assert-node-can-export session node name daemon-socket)
"Bail out if we cannot import signed archives from NODE."
- (let* ((session (node-session node))
- (remote (connect-to-remote-daemon session daemon-socket))
- (item (add-text-to-store remote "import-test" (nonce)))
- (port (store-export-channel session (list item))))
+ (let* ((remote (connect-to-remote-daemon session daemon-socket))
+ (item (add-text-to-store remote "import-test" (nonce name))))
(with-store store
- (if (and (import-paths store port)
+ (if (and (retrieve-files store (list item) remote)
(valid-path? store item))
- (info (_ "successfully imported '~a' from '~a'~%")
+ (info (G_ "successfully imported '~a' from '~a'~%")
item name)
- (leave (_ "failed to import '~a' from '~a'~%")
+ (leave (G_ "failed to import '~a' from '~a'~%")
item name)))))
-(define (check-machine-availability machine-file)
- "Check that each machine in MACHINE-FILE is usable as a build machine."
- (let ((machines (build-machines machine-file)))
- (info (_ "testing ~a build machines defined in '~a'...~%")
+(define (check-machine-availability machine-file pred)
+ "Check that each machine matching PRED in MACHINE-FILE is usable as a build
+machine."
+ (define (build-machine=? m1 m2)
+ (and (string=? (build-machine-name m1) (build-machine-name m2))
+ (= (build-machine-port m1) (build-machine-port m2))))
+
+ ;; A given build machine may appear several times (e.g., once for
+ ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
+ (let ((machines (filter pred
+ (delete-duplicates (build-machines machine-file)
+ build-machine=?))))
+ (info (G_ "testing ~a build machines defined in '~a'...~%")
(length machines) machine-file)
(let* ((names (map build-machine-name machines))
(sockets (map build-machine-daemon-socket machines))
(sessions (map open-ssh-session machines))
- (nodes (map make-node sessions)))
- (for-each assert-node-repl nodes names)
+ (nodes (map remote-inferior sessions)))
(for-each assert-node-has-guix nodes names)
- (for-each assert-node-can-import nodes names sockets)
- (for-each assert-node-can-export nodes names sockets))))
+ (for-each assert-node-repl nodes names)
+ (for-each assert-node-can-import sessions nodes names sockets)
+ (for-each assert-node-can-export sessions nodes names sockets)
+ (for-each close-inferior nodes)
+ (for-each disconnect! sessions))))
+
+(define (check-machine-status machine-file pred)
+ "Print the load of each machine matching PRED in MACHINE-FILE."
+ (define (build-machine=? m1 m2)
+ (and (string=? (build-machine-name m1) (build-machine-name m2))
+ (= (build-machine-port m1) (build-machine-port m2))))
+
+ ;; A given build machine may appear several times (e.g., once for
+ ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
+ (let ((machines (filter pred
+ (delete-duplicates (build-machines machine-file)
+ build-machine=?))))
+ (info (G_ "getting status of ~a build machines defined in '~a'...~%")
+ (length machines) machine-file)
+ (for-each (lambda (machine)
+ (define session
+ (open-ssh-session machine))
+
+ (match (remote-inferior session)
+ (#f
+ (warning (G_ "failed to run 'guix repl' on machine '~a'~%")
+ (build-machine-name machine)))
+ ((? inferior? inferior)
+ (let ((now (car (gettimeofday))))
+ (match (inferior-eval '(list (uname)
+ (car (gettimeofday)))
+ inferior)
+ ((uts time)
+ (when (< time now)
+ ;; Build machine clocks must not be behind as this
+ ;; could cause timestamp issues.
+ (warning (G_ "machine '~a' is ~a seconds behind~%")
+ (build-machine-name machine)
+ (- now time)))
+
+ (let ((load (node-load inferior))
+ (free (node-free-disk-space inferior)))
+ (close-inferior inferior)
+ (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\
+ host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%\
+ time difference: ~a s~%"
+ (build-machine-name machine)
+ (utsname:sysname uts) (utsname:release uts)
+ (utsname:machine uts)
+ (utsname:nodename uts)
+ (normalized-load machine load)
+ (/ free (expt 2 20) 1.)
+ (- time now))))))))
+
+ (disconnect! session))
+ machines)))
\f
;;;
;; We rely on protocol-level compression from libssh to optimize large data
;; transfers. Warn if it's missing.
(unless (zlib-support?)
- (warning (_ "Guile-SSH lacks zlib support"))
- (warning (_ "data transfers will *not* be compressed!")))
+ (warning (G_ "Guile-SSH lacks zlib support"))
+ (warning (G_ "data transfers will *not* be compressed!")))
(match args
((system max-silent-time print-build-trace? build-timeout)
(let ((max-silent-time (string->number max-silent-time))
(build-timeout (string->number build-timeout))
(print-build-trace? (string=? print-build-trace? "1")))
+ (set-thread-name "guix offload")
(parameterize ((%current-system system))
(let loop ((line (read-line)))
(unless (eof-object? line)
(with-error-handling
(process-request (equal? (match:substring match 1) "1")
(match:substring match 2) ; system
- (call-with-input-file
- (match:substring match 3)
- read-derivation)
+ (read-derivation-from-file
+ (match:substring match 3))
(string-tokenize
(match:substring match 4) not-coma)
#:print-build-trace? print-build-trace?
#:max-silent-time max-silent-time
#:build-timeout build-timeout))))
(else
- (leave (_ "invalid request line: ~s~%") line)))
+ (leave (G_ "invalid request line: ~s~%") line)))
(loop (read-line)))))))
(("test" rest ...)
(with-error-handling
- (let ((file (match rest
- ((file) file)
- (() %machine-file)
- (_ (leave (_ "wrong number of arguments~%"))))))
- (check-machine-availability (or file %machine-file)))))
+ (let-values (((file pred)
+ (match rest
+ ((file regexp)
+ (values file
+ (compose (cut string-match regexp <>)
+ build-machine-name)))
+ ((file) (values file (const #t)))
+ (() (values %machine-file (const #t)))
+ (x (leave (G_ "wrong number of arguments~%"))))))
+ (check-machine-availability (or file %machine-file) pred))))
+ (("status" rest ...)
+ (with-error-handling
+ (let-values (((file pred)
+ (match rest
+ ((file regexp)
+ (values file
+ (compose (cut string-match regexp <>)
+ build-machine-name)))
+ ((file) (values file (const #t)))
+ (() (values %machine-file (const #t)))
+ (x (leave (G_ "wrong number of arguments~%"))))))
+ (check-machine-status (or file %machine-file) pred))))
(("--version")
(show-version-and-exit "guix offload"))
(("--help")
- (format #t (_ "Usage: guix offload SYSTEM PRINT-BUILD-TRACE
+ (format #t (G_ "Usage: guix offload SYSTEM PRINT-BUILD-TRACE
Process build offload requests written on the standard input, possibly
offloading builds to the machines listed in '~a'.~%")
%machine-file)
- (display (_ "
+ (display (G_ "
This tool is meant to be used internally by 'guix-daemon'.\n"))
(show-bug-report-information))
(x
- (leave (_ "invalid arguments: ~{~s ~}~%") x))))
+ (leave (G_ "invalid arguments: ~{~s ~}~%") x))))
;;; Local Variables:
-;;; eval: (put 'with-machine-lock 'scheme-indent-function 2)
;;; eval: (put 'with-file-lock 'scheme-indent-function 1)
;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
+;;; eval: (put 'with-timeout 'scheme-indent-function 2)
;;; End:
;;; offload.scm ends here