-;;; GNU Guix --- Functional package management for GNU
-;;; Copyright © 2014, 2015, 2016 Ludovic Courtès <ludo@gnu.org>
-;;;
-;;; This file is part of GNU Guix.
-;;;
-;;; GNU Guix is free software; you can redistribute it and/or modify it
-;;; under the terms of the GNU General Public License as published by
-;;; the Free Software Foundation; either version 3 of the License, or (at
-;;; your option) any later version.
-;;;
-;;; GNU Guix is distributed in the hope that it will be useful, but
-;;; WITHOUT ANY WARRANTY; without even the implied warranty of
-;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;;; GNU General Public License for more details.
-;;;
-;;; You should have received a copy of the GNU General Public License
-;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
-
-(define-module (guix scripts offload)
- #:use-module (guix config)
- #:use-module (guix records)
- #:use-module (guix store)
- #:use-module (guix derivations)
- #:use-module (guix serialization)
- #:use-module (guix nar)
- #:use-module (guix utils)
- #:use-module ((guix build syscalls) #:select (fcntl-flock))
- #:use-module ((guix build utils) #:select (which mkdir-p))
- #:use-module (guix ui)
- #:use-module (srfi srfi-1)
- #:use-module (srfi srfi-11)
- #:use-module (srfi srfi-26)
- #:use-module (srfi srfi-34)
- #:use-module (srfi srfi-35)
- #:use-module (ice-9 popen)
- #:use-module (ice-9 rdelim)
- #:use-module (ice-9 match)
- #:use-module (ice-9 regex)
- #:use-module (ice-9 format)
- #:use-module (rnrs io ports)
- #:export (build-machine
- build-requirements
- guix-offload))
-
-;;; Commentary:
-;;;
-;;; Attempt to offload builds to the machines listed in
-;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
-;;; retrieving the build output(s) over SSH upon success.
-;;;
-;;; This command should not be used directly; instead, it is called on-demand
-;;; by the daemon, unless it was started with '--no-build-hook' or a client
-;;; inhibited build hooks.
-;;;
-;;; Code:
-
-
-(define-record-type* <build-machine>
- build-machine make-build-machine
- build-machine?
- (name build-machine-name) ; string
- (port build-machine-port ; number
- (default 22))
- (system build-machine-system) ; string
- (user build-machine-user) ; string
- (private-key build-machine-private-key ; file name
- (default (user-lsh-private-key)))
- (parallel-builds build-machine-parallel-builds ; number
- (default 1))
- (speed build-machine-speed ; inexact real
- (default 1.0))
- (features build-machine-features ; list of strings
- (default '()))
- (ssh-options build-machine-ssh-options ; list of strings
- (default '())))
-
-(define-record-type* <build-requirements>
- build-requirements make-build-requirements
- build-requirements?
- (system build-requirements-system) ; string
- (features build-requirements-features ; list of strings
- (default '())))
-
-(define %machine-file
- ;; File that lists machines available as build slaves.
- (string-append %config-directory "/machines.scm"))
-
-(define %lsh-command
- "lsh")
-
-(define %lshg-command
- ;; FIXME: 'lshg' fails to pass large amounts of data, see
- ;; <http://lists.lysator.liu.se/pipermail/lsh-bugs/2014q1/000639.html>.
- "lsh")
-
-(define (user-lsh-private-key)
- "Return the user's default lsh private key, or #f if it could not be
-determined."
- (and=> (getenv "HOME")
- (cut string-append <> "/.lsh/identity")))
-
-(define %user-module
- ;; Module in which the machine description file is loaded.
- (let ((module (make-fresh-user-module)))
- (module-use! module (resolve-interface '(guix scripts offload)))
- module))
-
-(define* (build-machines #:optional (file %machine-file))
- "Read the list of build machines from FILE and return it."
- (catch #t
- (lambda ()
- ;; Avoid ABI incompatibility with the <build-machine> record.
- (set! %fresh-auto-compile #t)
-
- (save-module-excursion
- (lambda ()
- (set-current-module %user-module)
- (primitive-load file))))
- (lambda args
- (match args
- (('system-error . rest)
- (let ((err (system-error-errno args)))
- ;; Silently ignore missing file since this is a common case.
- (if (= ENOENT err)
- '()
- (leave (_ "failed to open machine file '~a': ~a~%")
- file (strerror err)))))
- (('syntax-error proc message properties form . rest)
- (let ((loc (source-properties->location properties)))
- (leave (_ "~a: ~a~%")
- (location->string loc) message)))
- (x
- (leave (_ "failed to load machine file '~a': ~s~%")
- file args))))))
-
-;;; FIXME: The idea was to open the connection to MACHINE once for all, but
-;;; lshg is currently non-functional.
-;; (define (open-ssh-gateway machine)
-;; "Initiate an SSH connection gateway to MACHINE, and return the PID of the
-;; running lsh gateway upon success, or #f on failure."
-;; (catch 'system-error
-;; (lambda ()
-;; (let* ((port (open-pipe* OPEN_READ %lsh-command
-;; "-l" (build-machine-user machine)
-;; "-i" (build-machine-private-key machine)
-;; ;; XXX: With lsh 2.1, passing '--write-pid'
-;; ;; last causes the PID not to be printed.
-;; "--write-pid" "--gateway" "--background"
-;; (build-machine-name machine)))
-;; (line (read-line port))
-;; (status (close-pipe port)))
-;; (if (zero? status)
-;; (let ((pid (string->number line)))
-;; (if (integer? pid)
-;; pid
-;; (begin
-;; (warning (_ "'~a' did not write its PID on stdout: ~s~%")
-;; %lsh-command line)
-;; #f)))
-;; (begin
-;; (warning (_ "failed to initiate SSH connection to '~a':\
-;; '~a' exited with ~a~%")
-;; (build-machine-name machine)
-;; %lsh-command
-;; (status:exit-val status))
-;; #f))))
-;; (lambda args
-;; (leave (_ "failed to execute '~a': ~a~%")
-;; %lsh-command (strerror (system-error-errno args))))))
-
-(define-syntax with-error-to-port
- (syntax-rules ()
- ((_ port exp0 exp ...)
- (let ((new port)
- (old (current-error-port)))
- (dynamic-wind
- (lambda ()
- (set-current-error-port new))
- (lambda ()
- exp0 exp ...)
- (lambda ()
- (set-current-error-port old)))))))
-
-(define* (remote-pipe machine mode command
- #:key (error-port (current-error-port)) (quote? #t))
- "Run COMMAND (a string list) on MACHINE, assuming an lsh gateway has been
-set up. When QUOTE? is true, perform shell-quotation of all the elements of
-COMMAND. Return either a pipe opened with MODE, or #f if the lsh client could
-not be started."
- (define (shell-quote str)
- ;; Sort-of shell-quote STR so it can be passed as an argument to the
- ;; shell.
- (with-output-to-string
- (lambda ()
- (write str))))
-
- ;; Let the child inherit ERROR-PORT.
- (with-error-to-port error-port
- (apply open-pipe* mode %lshg-command
- "-l" (build-machine-user machine)
- "-p" (number->string (build-machine-port machine))
-
- ;; XXX: Remove '-i' when %LSHG-COMMAND really is lshg.
- "-i" (build-machine-private-key machine)
-
- (append (build-machine-ssh-options machine)
- (list (build-machine-name machine))
- (if quote?
- (map shell-quote command)
- command)))))
-
-\f
-;;;
-;;; Synchronization.
-;;;
-
-(define (lock-file file)
- "Wait and acquire an exclusive lock on FILE. Return an open port."
- (mkdir-p (dirname file))
- (let ((port (open-file file "w0")))
- (fcntl-flock port 'write-lock)
- port))
-
-(define (unlock-file lock)
- "Unlock LOCK."
- (fcntl-flock lock 'unlock)
- (close-port lock)
- #t)
-
-(define-syntax-rule (with-file-lock file exp ...)
- "Wait to acquire a lock on FILE and evaluate EXP in that context."
- (let ((port (lock-file file)))
- (dynamic-wind
- (lambda ()
- #t)
- (lambda ()
- exp ...)
- (lambda ()
- (unlock-file port)))))
-
-(define-syntax-rule (with-machine-lock machine hint exp ...)
- "Wait to acquire MACHINE's exclusive lock for HINT, and evaluate EXP in that
-context."
- (with-file-lock (machine-lock-file machine hint)
- exp ...))
-
-
-(define (machine-slot-file machine slot)
- "Return the file name of MACHINE's file for SLOT."
- ;; For each machine we have a bunch of files representing each build slot.
- ;; When choosing a build machine, we attempt to get an exclusive lock on one
- ;; of these; if we fail, that means all the build slots are already taken.
- ;; Inspired by Nix's build-remote.pl.
- (string-append (string-append %state-directory "/offload/"
- (build-machine-name machine)
- "/" (number->string slot))))
-
-(define (acquire-build-slot machine)
- "Attempt to acquire a build slot on MACHINE. Return the port representing
-the slot, or #f if none is available.
-
-This mechanism allows us to set a hard limit on the number of simultaneous
-connections allowed to MACHINE."
- (mkdir-p (dirname (machine-slot-file machine 0)))
- (with-machine-lock machine 'slots
- (any (lambda (slot)
- (let ((port (open-file (machine-slot-file machine slot)
- "w0")))
- (catch 'flock-error
- (lambda ()
- (fcntl-flock port 'write-lock #:wait? #f)
- ;; Got it!
- (format (current-error-port)
- "process ~a acquired build slot '~a'~%"
- (getpid) (port-filename port))
- port)
- (lambda args
- ;; PORT is already locked by another process.
- (close-port port)
- #f))))
- (iota (build-machine-parallel-builds machine)))))
-
-(define (release-build-slot slot)
- "Release SLOT, a build slot as returned as by 'acquire-build-slot'."
- (close-port slot))
-
-\f
-;;;
-;;; Offloading.
-;;;
-
-(define (build-log-port)
- "Return the default port where build logs should be sent. The default is
-file descriptor 4, which is open by the daemon before running the offload
-hook."
- (let ((port (fdopen 4 "w0")))
- ;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
- (set-port-revealed! port 1)
- port))
-
-(define %gc-root-file
- ;; File name of the temporary GC root we install.
- (format #f "offload-~a-~a" (gethostname) (getpid)))
-
-(define (register-gc-root file machine)
- "Mark FILE, a store item, as a garbage collector root on MACHINE."
- (define script
- `(begin
- (use-modules (guix config))
-
- ;; Note: we can't use 'add-indirect-root' because dangling links under
- ;; gcroots/auto are automatically deleted by the GC. This strategy
- ;; doesn't have this problem, but it requires write access to that
- ;; directory.
- (let ((root-directory (string-append %state-directory
- "/gcroots/tmp")))
- (catch 'system-error
- (lambda ()
- (mkdir root-directory))
- (lambda args
- (unless (= EEXIST (system-error-errno args))
- (error "failed to create remote GC root directory"
- root-directory (system-error-errno args)))))
-
- (catch 'system-error
- (lambda ()
- (symlink ,file
- (string-append root-directory "/" ,%gc-root-file)))
- (lambda args
- ;; If FILE already exists, we can assume that either it's a stale
- ;; reference (which is fine), or another process is already
- ;; building the derivation represented by FILE (which is fine
- ;; too.) Thus, do nothing in that case.
- (unless (= EEXIST (system-error-errno args))
- (apply throw args)))))))
-
- (let ((pipe (remote-pipe machine OPEN_READ
- `("guile" "-c" ,(object->string script)))))
- (get-string-all pipe)
- (let ((status (close-pipe pipe)))
- (unless (zero? status)
- ;; Better be safe than sorry: if we ignore the error here, then FILE
- ;; may be GC'd just before we start using it.
- (leave (_ "failed to register GC root for '~a' on '~a' (status: ~a)~%")
- file (build-machine-name machine) status)))))
-
-(define (remove-gc-roots machine)
- "Remove from MACHINE the GC roots previously installed with
-'register-gc-root'."
- (define script
- `(begin
- (use-modules (guix config) (ice-9 ftw)
- (srfi srfi-1) (srfi srfi-26))
-
- (let ((root-directory (string-append %state-directory
- "/gcroots/tmp")))
- (false-if-exception
- (delete-file
- (string-append root-directory "/" ,%gc-root-file)))
-
- ;; These ones were created with 'guix build -r' (there can be more
- ;; than one in case of multiple-output derivations.)
- (let ((roots (filter (cut string-prefix? ,%gc-root-file <>)
- (scandir "."))))
- (for-each (lambda (file)
- (false-if-exception (delete-file file)))
- roots)))))
-
- (let ((pipe (remote-pipe machine OPEN_READ
- `("guile" "-c" ,(object->string script)))))
- (get-string-all pipe)
- (close-pipe pipe)))
-
-(define* (offload drv machine
- #:key print-build-trace? (max-silent-time 3600)
- build-timeout (log-port (build-log-port)))
- "Perform DRV on MACHINE, assuming DRV and its prerequisites are available
-there, and write the build log to LOG-PORT. Return the exit status."
- (format (current-error-port) "offloading '~a' to '~a'...~%"
- (derivation-file-name drv) (build-machine-name machine))
- (format (current-error-port) "@ build-remote ~a ~a~%"
- (derivation-file-name drv) (build-machine-name machine))
-
- ;; Normally DRV has already been protected from GC when it was transferred.
- ;; The '-r' flag below prevents the build result from being GC'd.
- (let ((pipe (remote-pipe machine OPEN_READ
- `("guix" "build"
- "-r" ,%gc-root-file
- ,(format #f "--max-silent-time=~a"
- max-silent-time)
- ,@(if build-timeout
- (list (format #f "--timeout=~a"
- build-timeout))
- '())
- ,(derivation-file-name drv))
-
- ;; Since 'guix build' writes the build log to its
- ;; stderr, everything will go directly to LOG-PORT.
- #:error-port log-port)))
- (let loop ((line (read-line pipe)))
- (unless (eof-object? line)
- (display line log-port)
- (newline log-port)
- (loop (read-line pipe))))
-
- (close-pipe pipe)))
-
-(define* (transfer-and-offload drv machine
- #:key
- (inputs '())
- (outputs '())
- (max-silent-time 3600)
- build-timeout
- print-build-trace?)
- "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of
-INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
-MACHINE."
- (when (begin
- (register-gc-root (derivation-file-name drv) machine)
- (send-files (cons (derivation-file-name drv) inputs)
- machine))
- (let ((status (offload drv machine
- #:print-build-trace? print-build-trace?
- #:max-silent-time max-silent-time
- #:build-timeout build-timeout)))
- (if (zero? status)
- (begin
- (retrieve-files outputs machine)
- (remove-gc-roots machine)
- (format (current-error-port)
- "done with offloaded '~a'~%"
- (derivation-file-name drv)))
- (begin
- (remove-gc-roots machine)
- (format (current-error-port)
- "derivation '~a' offloaded to '~a' failed \
-with exit code ~a~%"
- (derivation-file-name drv)
- (build-machine-name machine)
- (status:exit-val status))
-
- ;; Use exit code 100 for a permanent build failure. The daemon
- ;; interprets other non-zero codes as transient build failures.
- (primitive-exit 100))))))
-
-(define (send-files files machine)
- "Send the subset of FILES that's missing to MACHINE's store. Return #t on
-success, #f otherwise."
- (define (missing-files files)
- ;; Return the subset of FILES not already on MACHINE.
- (let*-values (((files)
- (format #f "~{~a~%~}" files))
- ((missing pids)
- (filtered-port
- (append (list (which %lshg-command)
- "-l" (build-machine-user machine)
- "-p" (number->string
- (build-machine-port machine))
- "-i" (build-machine-private-key machine))
- (build-machine-ssh-options machine)
- (cons (build-machine-name machine)
- '("guix" "archive" "--missing")))
- (open-input-string files)))
- ((result)
- (get-string-all missing)))
- (for-each waitpid pids)
- (string-tokenize result)))
-
- (with-store store
- (guard (c ((nix-protocol-error? c)
- (warning (_ "failed to export files for '~a': ~s~%")
- (build-machine-name machine)
- c)
- #f))
-
- ;; Compute the subset of FILES missing on MACHINE, and send them in
- ;; topologically sorted order so that they can actually be imported.
- ;;
- ;; To reduce load on the machine that's offloading (since it's typically
- ;; already quite busy, see hydra.gnu.org), compress with gzip rather
- ;; than xz: For a compression ratio 2 times larger, it is 20 times
- ;; faster.
- (let* ((files (missing-files (topologically-sorted store files)))
- (pipe (remote-pipe machine OPEN_WRITE
- '("gzip" "-dc" "|"
- "guix" "archive" "--import")
- #:quote? #f)))
- (format #t (_ "sending ~a store files to '~a'...~%")
- (length files) (build-machine-name machine))
- (call-with-compressed-output-port 'gzip pipe
- (lambda (compressed)
- (catch 'system-error
- (lambda ()
- (export-paths store files compressed))
- (lambda args
- (warning (_ "failed while exporting files to '~a': ~a~%")
- (build-machine-name machine)
- (strerror (system-error-errno args))))))
- #:options '("--fast"))
-
- ;; Wait for the 'lsh' process to complete.
- (zero? (close-pipe pipe))))))
-
-(define (retrieve-files files machine)
- "Retrieve FILES from MACHINE's store, and import them."
- (define host
- (build-machine-name machine))
-
- (let ((pipe (remote-pipe machine OPEN_READ
- `("guix" "archive" "--export" ,@files
- "|" "xz" "-c")
- #:quote? #f)))
- (and pipe
- (with-store store
- (guard (c ((nix-protocol-error? c)
- (warning (_ "failed to import files from '~a': ~s~%")
- host c)
- #f))
- (format (current-error-port) "retrieving ~a files from '~a'...~%"
- (length files) host)
-
- ;; We cannot use the 'import-paths' RPC here because we already
- ;; hold the locks for FILES.
- (call-with-decompressed-port 'xz pipe
- (lambda (decompressed)
- (restore-file-set decompressed
- #:log-port (current-error-port)
- #:lock? #f)))
-
- ;; Wait for the 'lsh' process to complete.
- (zero? (close-pipe pipe)))))))
-
-\f
-;;;
-;;; Scheduling.
-;;;
-
-(define (machine-matches? machine requirements)
- "Return #t if MACHINE matches REQUIREMENTS."
- (and (string=? (build-requirements-system requirements)
- (build-machine-system machine))
- (lset<= string=?
- (build-requirements-features requirements)
- (build-machine-features machine))))
-
-(define (machine-load machine)
- "Return the load of MACHINE, divided by the number of parallel builds
-allowed on MACHINE."
- (let* ((pipe (remote-pipe machine OPEN_READ `("cat" "/proc/loadavg")))
- (line (read-line pipe))
- (status (close-pipe pipe)))
- (unless (eqv? 0 (status:exit-val status))
- (warning (_ "failed to obtain load of '~a': SSH client exited with ~a~%")
- (build-machine-name machine)
- (status:exit-val status)))
-
- (if (eof-object? line)
- +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
- (match (string-tokenize line)
- ((one five fifteen . _)
- (let* ((raw (string->number five))
- (jobs (build-machine-parallel-builds machine))
- (normalized (/ raw jobs)))
- (format (current-error-port) "load on machine '~a' is ~s\
- (normalized: ~s)~%"
- (build-machine-name machine) raw normalized)
- normalized))
- (_
- +inf.0))))) ;something's fishy about MACHINE, so avoid it
-
-(define (machine-power-factor m)
- "Return a factor that aggregates the speed and load of M. The higher the
-better."
- (/ (build-machine-speed m)
- (+ 1 (machine-load m))))
-
-(define (machine-less-loaded-or-faster? m1 m2)
- "Return #t if M1 is either less loaded or faster than M2. (This relation
-defines a total order on machines.)"
- (> (machine-power-factor m1) (machine-power-factor m2)))
-
-(define (machine-lock-file machine hint)
- "Return the name of MACHINE's lock file for HINT."
- (string-append %state-directory "/offload/"
- (build-machine-name machine)
- "." (symbol->string hint) ".lock"))
-
-(define (machine-choice-lock-file)
- "Return the name of the file used as a lock when choosing a build machine."
- (string-append %state-directory "/offload/machine-choice.lock"))
-
-
-(define %slots
- ;; List of acquired build slots (open ports).
- '())
-
-(define (choose-build-machine machines)
- "Return the best machine among MACHINES, or #f."
-
- ;; Proceed like this:
- ;; 1. Acquire the global machine-choice lock.
- ;; 2. For all MACHINES, attempt to acquire a build slot, and filter out
- ;; those machines for which we failed.
- ;; 3. Choose the best machine among those that are left.
- ;; 4. Release the previously-acquired build slots of the other machines.
- ;; 5. Release the global machine-choice lock.
-
- (with-file-lock (machine-choice-lock-file)
- (define machines+slots
- (filter-map (lambda (machine)
- (let ((slot (acquire-build-slot machine)))
- (and slot (list machine slot))))
- machines))
-
- (define (undecorate pred)
- (lambda (a b)
- (match a
- ((machine1 slot1)
- (match b
- ((machine2 slot2)
- (pred machine1 machine2)))))))
-
- (let loop ((machines+slots
- (sort machines+slots
- (undecorate machine-less-loaded-or-faster?))))
- (match machines+slots
- (((best slot) others ...)
- ;; Return the best machine unless it's already overloaded.
- (if (< (machine-load best) 2.)
- (match others
- (((machines slots) ...)
- ;; Release slots from the uninteresting machines.
- (for-each release-build-slot slots)
-
- ;; Prevent SLOT from being GC'd.
- (set! %slots (cons slot %slots))
- best))
- (begin
- ;; BEST is overloaded, so try the next one.
- (release-build-slot slot)
- (loop others))))
- (() #f)))))
-
-(define* (process-request wants-local? system drv features
- #:key
- print-build-trace? (max-silent-time 3600)
- build-timeout)
- "Process a request to build DRV."
- (let* ((local? (and wants-local? (string=? system (%current-system))))
- (reqs (build-requirements
- (system system)
- (features features)))
- (candidates (filter (cut machine-matches? <> reqs)
- (build-machines))))
- (match candidates
- (()
- ;; We'll never be able to match REQS.
- (display "# decline\n"))
- ((_ ...)
- (let ((machine (choose-build-machine candidates)))
- (if machine
- (begin
- ;; Offload DRV to MACHINE.
- (display "# accept\n")
- (let ((inputs (string-tokenize (read-line)))
- (outputs (string-tokenize (read-line))))
- (transfer-and-offload drv machine
- #:inputs inputs
- #:outputs outputs
- #:max-silent-time max-silent-time
- #:build-timeout build-timeout
- #:print-build-trace? print-build-trace?)))
-
- ;; Not now, all the machines are busy.
- (display "# postpone\n")))))))
-
-(define-syntax-rule (with-nar-error-handling body ...)
- "Execute BODY with any &nar-error suitably reported to the user."
- (guard (c ((nar-error? c)
- (let ((file (nar-error-file c)))
- (if (condition-has-type? c &message)
- (leave (_ "while importing file '~a': ~a~%")
- file (gettext (condition-message c)))
- (leave (_ "failed to import file '~a'~%")
- file)))))
- body ...))
-
-\f
-;;;
-;;; Entry point.
-;;;
-
-(define (guix-offload . args)
- (define request-line-rx
- ;; The request format. See 'tryBuildHook' method in build.cc.
- (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))
-
- (define not-coma
- (char-set-complement (char-set #\,)))
-
- ;; Make sure $HOME really corresponds to the current user. This is
- ;; necessary since lsh uses that to determine the location of the yarrow
- ;; seed file, and fails if it's owned by someone else.
- (and=> (passwd:dir (getpw (getuid)))
- (cut setenv "HOME" <>))
-
- (match args
- ((system max-silent-time print-build-trace? build-timeout)
- (let ((max-silent-time (string->number max-silent-time))
- (build-timeout (string->number build-timeout))
- (print-build-trace? (string=? print-build-trace? "1")))
- (parameterize ((%current-system system))
- (let loop ((line (read-line)))
- (unless (eof-object? line)
- (cond ((regexp-exec request-line-rx line)
- =>
- (lambda (match)
- (with-nar-error-handling
- (process-request (equal? (match:substring match 1) "1")
- (match:substring match 2) ; system
- (call-with-input-file
- (match:substring match 3)
- read-derivation)
- (string-tokenize
- (match:substring match 4) not-coma)
- #:print-build-trace? print-build-trace?
- #:max-silent-time max-silent-time
- #:build-timeout build-timeout))))
- (else
- (leave (_ "invalid request line: ~s~%") line)))
- (loop (read-line)))))))
- (("--version")
- (show-version-and-exit "guix offload"))
- (("--help")
- (format #t (_ "Usage: guix offload SYSTEM PRINT-BUILD-TRACE
-Process build offload requests written on the standard input, possibly
-offloading builds to the machines listed in '~a'.~%")
- %machine-file)
- (display (_ "
-This tool is meant to be used internally by 'guix-daemon'.\n"))
- (show-bug-report-information))
- (x
- (leave (_ "invalid arguments: ~{~s ~}~%") x))))
-
-;;; Local Variables:
-;;; eval: (put 'with-machine-lock 'scheme-indent-function 2)
-;;; eval: (put 'with-file-lock 'scheme-indent-function 1)
-;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
-;;; End:
-
-;;; offload.scm ends here
+;;; GNU Guix --- Functional package management for GNU
+;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
+;;;
+;;; This file is part of GNU Guix.
+;;;
+;;; GNU Guix is free software; you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 3 of the License, or (at
+;;; your option) any later version.
+;;;
+;;; GNU Guix is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
+
+(define-module (guix scripts offload)
+ #:use-module (ssh key)
+ #:use-module (ssh auth)
+ #:use-module (ssh session)
+ #:use-module (ssh channel)
+ #:use-module (ssh popen)
+ #:use-module (ssh version)
+ #:use-module (guix config)
+ #:use-module (guix records)
+ #:use-module (guix ssh)
+ #:use-module (guix store)
+ #:use-module (guix inferior)
+ #:use-module (guix derivations)
+ #:use-module ((guix serialization)
+ #:select (nar-error? nar-error-file))
+ #:use-module (guix nar)
+ #:use-module (guix utils)
+ #:use-module ((guix build syscalls)
+ #:select (fcntl-flock set-thread-name))
+ #:use-module ((guix build utils) #:select (which mkdir-p))
+ #:use-module (guix ui)
+ #:use-module (srfi srfi-1)
+ #:use-module (srfi srfi-11)
+ #:use-module (srfi srfi-26)
+ #:use-module (srfi srfi-34)
+ #:use-module (srfi srfi-35)
+ #:use-module (ice-9 popen)
+ #:use-module (ice-9 rdelim)
+ #:use-module (ice-9 match)
+ #:use-module (ice-9 regex)
+ #:use-module (ice-9 format)
+ #:use-module (ice-9 binary-ports)
+ #:export (build-machine
+ build-requirements
+ guix-offload))
+
+;;; Commentary:
+;;;
+;;; Attempt to offload builds to the machines listed in
+;;; /etc/guix/machines.scm, transferring missing dependencies over SSH, and
+;;; retrieving the build output(s) over SSH upon success.
+;;;
+;;; This command should not be used directly; instead, it is called on-demand
+;;; by the daemon, unless it was started with '--no-build-hook' or a client
+;;; inhibited build hooks.
+;;;
+;;; Code:
+
+
+(define-record-type* <build-machine>
+ build-machine make-build-machine
+ build-machine?
+ (name build-machine-name) ; string
+ (port build-machine-port ; number
+ (default 22))
+ (system build-machine-system) ; string
+ (user build-machine-user) ; string
+ (private-key build-machine-private-key ; file name
+ (default (user-openssh-private-key)))
+ (host-key build-machine-host-key) ; string
+ (compression build-machine-compression ; string
+ (default "zlib@openssh.com,zlib"))
+ (compression-level build-machine-compression-level ;integer
+ (default 3))
+ (daemon-socket build-machine-daemon-socket ; string
+ (default "/var/guix/daemon-socket/socket"))
+ (parallel-builds build-machine-parallel-builds ; number
+ (default 1))
+ (speed build-machine-speed ; inexact real
+ (default 1.0))
+ (features build-machine-features ; list of strings
+ (default '())))
+
+(define-record-type* <build-requirements>
+ build-requirements make-build-requirements
+ build-requirements?
+ (system build-requirements-system) ; string
+ (features build-requirements-features ; list of strings
+ (default '())))
+
+(define %machine-file
+ ;; File that lists machines available as build slaves.
+ (string-append %config-directory "/machines.scm"))
+
+(define (user-openssh-private-key)
+ "Return the user's default SSH private key, or #f if it could not be
+determined."
+ (and=> (getenv "HOME")
+ (cut string-append <> "/.ssh/id_rsa")))
+
+(define %user-module
+ ;; Module in which the machine description file is loaded.
+ (let ((module (make-fresh-user-module)))
+ (module-use! module (resolve-interface '(guix scripts offload)))
+ module))
+
+(define* (build-machines #:optional (file %machine-file))
+ "Read the list of build machines from FILE and return it."
+ (catch #t
+ (lambda ()
+ ;; Avoid ABI incompatibility with the <build-machine> record.
+ ;; (set! %fresh-auto-compile #t)
+
+ (save-module-excursion
+ (lambda ()
+ (set-current-module %user-module)
+ (match (primitive-load file)
+ (((? build-machine? machines) ...)
+ machines)
+ (_
+ ;; Instead of crashing, assume the empty list.
+ (warning (G_ "'~a' did not return a list of build machines; \
+ignoring it~%")
+ file)
+ '())))))
+ (lambda args
+ (match args
+ (('system-error . rest)
+ (let ((err (system-error-errno args)))
+ ;; Silently ignore missing file since this is a common case.
+ (if (= ENOENT err)
+ '()
+ (leave (G_ "failed to open machine file '~a': ~a~%")
+ file (strerror err)))))
+ (('syntax-error proc message properties form . rest)
+ (let ((loc (source-properties->location properties)))
+ (leave (G_ "~a: ~a~%")
+ (location->string loc) message)))
+ (x
+ (leave (G_ "failed to load machine file '~a': ~s~%")
+ file args))))))
+
+(define (host-key->type+key host-key)
+ "Destructure HOST-KEY, an OpenSSH host key string, and return two values:
+its key type as a symbol, and the actual base64-encoded string."
+ (define (type->symbol type)
+ (and (string-prefix? "ssh-" type)
+ (string->symbol (string-drop type 4))))
+
+ (match (string-tokenize host-key)
+ ((type key x)
+ (values (type->symbol type) key))
+ ((type key)
+ (values (type->symbol type) key))))
+
+(define (private-key-from-file* file)
+ "Like 'private-key-from-file', but raise an error that 'with-error-handling'
+can interpret meaningfully."
+ (catch 'guile-ssh-error
+ (lambda ()
+ (private-key-from-file file))
+ (lambda (key proc str . rest)
+ (raise (condition
+ (&message (message (format #f (G_ "failed to load SSH \
+private key from '~a': ~a")
+ file str))))))))
+
+(define (open-ssh-session machine)
+ "Open an SSH session for MACHINE and return it. Throw an error on failure."
+ (let ((private (private-key-from-file* (build-machine-private-key machine)))
+ (public (public-key-from-file
+ (string-append (build-machine-private-key machine)
+ ".pub")))
+ (session (make-session #:user (build-machine-user machine)
+ #:host (build-machine-name machine)
+ #:port (build-machine-port machine)
+ #:timeout 10 ;seconds
+ ;; #:log-verbosity 'protocol
+ #:identity (build-machine-private-key machine)
+
+ ;; By default libssh reads ~/.ssh/known_hosts
+ ;; and uses that to adjust its choice of cipher
+ ;; suites, which changes the type of host key
+ ;; that the server sends (RSA vs. Ed25519,
+ ;; etc.). Opt for something reproducible and
+ ;; stateless instead.
+ #:knownhosts "/dev/null"
+
+ ;; We need lightweight compression when
+ ;; exchanging full archives.
+ #:compression
+ (build-machine-compression machine)
+ #:compression-level
+ (build-machine-compression-level machine))))
+ (match (connect! session)
+ ('ok
+ ;; Authenticate the server. XXX: Guile-SSH 0.10.1 doesn't know about
+ ;; ed25519 keys and 'get-key-type' returns #f in that case.
+ (let-values (((server) (get-server-public-key session))
+ ((type key) (host-key->type+key
+ (build-machine-host-key machine))))
+ (unless (and (or (not (get-key-type server))
+ (eq? (get-key-type server) type))
+ (string=? (public-key->string server) key))
+ ;; Key mismatch: something's wrong. XXX: It could be that the server
+ ;; provided its Ed25519 key when we where expecting its RSA key.
+ (leave (G_ "server at '~a' returned host key '~a' of type '~a' \
+instead of '~a' of type '~a'~%")
+ (build-machine-name machine)
+ (public-key->string server) (get-key-type server)
+ key type)))
+
+ (let ((auth (userauth-public-key! session private)))
+ (unless (eq? 'success auth)
+ (disconnect! session)
+ (leave (G_ "SSH public key authentication failed for '~a': ~a~%")
+ (build-machine-name machine) (get-error session))))
+
+ session)
+ (x
+ ;; Connection failed or timeout expired.
+ (leave (G_ "failed to connect to '~a': ~a~%")
+ (build-machine-name machine) (get-error session))))))
+
+\f
+;;;
+;;; Synchronization.
+;;;
+
+(define (lock-file file)
+ "Wait and acquire an exclusive lock on FILE. Return an open port."
+ (mkdir-p (dirname file))
+ (let ((port (open-file file "w0")))
+ (fcntl-flock port 'write-lock)
+ port))
+
+(define (unlock-file lock)
+ "Unlock LOCK."
+ (fcntl-flock lock 'unlock)
+ (close-port lock)
+ #t)
+
+(define-syntax-rule (with-file-lock file exp ...)
+ "Wait to acquire a lock on FILE and evaluate EXP in that context."
+ (let ((port (lock-file file)))
+ (dynamic-wind
+ (lambda ()
+ #t)
+ (lambda ()
+ exp ...)
+ (lambda ()
+ (unlock-file port)))))
+
+(define (machine-slot-file machine slot)
+ "Return the file name of MACHINE's file for SLOT."
+ ;; For each machine we have a bunch of files representing each build slot.
+ ;; When choosing a build machine, we attempt to get an exclusive lock on one
+ ;; of these; if we fail, that means all the build slots are already taken.
+ ;; Inspired by Nix's build-remote.pl.
+ (string-append (string-append %state-directory "/offload/"
+ (build-machine-name machine)
+ "/" (number->string slot))))
+
+(define (acquire-build-slot machine)
+ "Attempt to acquire a build slot on MACHINE. Return the port representing
+the slot, or #f if none is available.
+
+This mechanism allows us to set a hard limit on the number of simultaneous
+connections allowed to MACHINE."
+ (mkdir-p (dirname (machine-slot-file machine 0)))
+
+ ;; When several 'guix offload' processes run in parallel, there's a race
+ ;; among them, but since they try the slots in the same order, we're fine.
+ (any (lambda (slot)
+ (let ((port (open-file (machine-slot-file machine slot)
+ "w0")))
+ (catch 'flock-error
+ (lambda ()
+ (fcntl-flock port 'write-lock #:wait? #f)
+ ;; Got it!
+ (format (current-error-port)
+ "process ~a acquired build slot '~a'~%"
+ (getpid) (port-filename port))
+ port)
+ (lambda args
+ ;; PORT is already locked by another process.
+ (close-port port)
+ #f))))
+ (iota (build-machine-parallel-builds machine))))
+
+(define (release-build-slot slot)
+ "Release SLOT, a build slot as returned as by 'acquire-build-slot'."
+ (close-port slot))
+
+\f
+;;;
+;;; Offloading.
+;;;
+
+(define (build-log-port)
+ "Return the default port where build logs should be sent. The default is
+file descriptor 4, which is open by the daemon before running the offload
+hook."
+ (let ((port (fdopen 4 "w0")))
+ ;; Make sure file descriptor 4 isn't closed when PORT is GC'd.
+ (set-port-revealed! port 1)
+ port))
+
+(define (node-guile-version node)
+ (inferior-eval '(version) node))
+
+(define (node-free-disk-space node)
+ "Return the free disk space, in bytes, in NODE's store."
+ (inferior-eval `(begin
+ (use-modules (guix build syscalls))
+ (free-disk-space ,(%store-prefix)))
+ node))
+
+(define* (transfer-and-offload drv machine
+ #:key
+ (inputs '())
+ (outputs '())
+ (max-silent-time 3600)
+ build-timeout
+ print-build-trace?)
+ "Offload DRV to MACHINE. Prior to the actual offloading, transfer all of
+INPUTS to MACHINE; if building DRV succeeds, retrieve all of OUTPUTS from
+MACHINE."
+ (define session
+ (open-ssh-session machine))
+
+ (define store
+ (connect-to-remote-daemon session
+ (build-machine-daemon-socket machine)))
+
+ (set-build-options store
+ #:print-build-trace print-build-trace?
+ #:max-silent-time max-silent-time
+ #:timeout build-timeout)
+
+ ;; Protect DRV from garbage collection.
+ (add-temp-root store (derivation-file-name drv))
+
+ (with-store local
+ (send-files local (cons (derivation-file-name drv) inputs) store
+ #:log-port (current-output-port)))
+ (format (current-error-port) "offloading '~a' to '~a'...~%"
+ (derivation-file-name drv) (build-machine-name machine))
+ (format (current-error-port) "@ build-remote ~a ~a~%"
+ (derivation-file-name drv) (build-machine-name machine))
+
+ (guard (c ((store-protocol-error? c)
+ (format (current-error-port)
+ (G_ "derivation '~a' offloaded to '~a' failed: ~a~%")
+ (derivation-file-name drv)
+ (build-machine-name machine)
+ (store-protocol-error-message c))
+ (let* ((inferior (false-if-exception (remote-inferior session)))
+ (space (false-if-exception
+ (node-free-disk-space inferior))))
+
+ (when inferior
+ (close-inferior inferior))
+
+ ;; Use exit code 100 for a permanent build failure. The daemon
+ ;; interprets other non-zero codes as transient build failures.
+ (if (and space (< space (* 10 (expt 2 20))))
+ (begin
+ (format (current-error-port)
+ (G_ "build failure may have been caused by lack \
+of free disk space on '~a'~%")
+ (build-machine-name machine))
+ (primitive-exit 1))
+ (primitive-exit 100)))))
+ (parameterize ((current-build-output-port (build-log-port)))
+ (build-derivations store (list drv))))
+
+ (retrieve-files* outputs store
+
+ ;; We cannot use the 'import-paths' RPC here because we
+ ;; already hold the locks for FILES.
+ #:import
+ (lambda (port)
+ (restore-file-set port
+ #:log-port (current-error-port)
+ #:lock? #f)))
+
+ (format (current-error-port) "done with offloaded '~a'~%"
+ (derivation-file-name drv)))
+
+\f
+;;;
+;;; Scheduling.
+;;;
+
+(define (machine-matches? machine requirements)
+ "Return #t if MACHINE matches REQUIREMENTS."
+ (and (string=? (build-requirements-system requirements)
+ (build-machine-system machine))
+ (lset<= string=?
+ (build-requirements-features requirements)
+ (build-machine-features machine))))
+
+(define %minimum-disk-space
+ ;; Minimum disk space required on the build machine for a build to be
+ ;; offloaded. This keeps us from offloading to machines that are bound to
+ ;; run out of disk space.
+ (* 100 (expt 2 20))) ;100 MiB
+
+(define (node-load node)
+ "Return the load on NODE. Return +∞ if NODE is misbehaving."
+ (let ((line (inferior-eval '(begin
+ (use-modules (ice-9 rdelim))
+ (call-with-input-file "/proc/loadavg"
+ read-string))
+ node)))
+ (if (eof-object? line)
+ +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
+ (match (string-tokenize line)
+ ((one five fifteen . x)
+ (string->number one))
+ (x
+ +inf.0)))))
+
+(define (normalized-load machine load)
+ "Divide LOAD by the number of parallel builds of MACHINE."
+ (if (rational? load)
+ (let* ((jobs (build-machine-parallel-builds machine))
+ (normalized (/ load jobs)))
+ (format (current-error-port) "load on machine '~a' is ~s\
+ (normalized: ~s)~%"
+ (build-machine-name machine) load normalized)
+ normalized)
+ load))
+
+(define (random-seed)
+ (logxor (getpid) (car (gettimeofday))))
+
+(define shuffle
+ (let ((state (seed->random-state (random-seed))))
+ (lambda (lst)
+ "Return LST shuffled (using the Fisher-Yates algorithm.)"
+ (define vec (list->vector lst))
+ (let loop ((result '())
+ (i (vector-length vec)))
+ (if (zero? i)
+ result
+ (let* ((j (random i state))
+ (val (vector-ref vec j)))
+ (vector-set! vec j (vector-ref vec (- i 1)))
+ (loop (cons val result) (- i 1))))))))
+
+(define (choose-build-machine machines)
+ "Return two values: the best machine among MACHINES and its build
+slot (which must later be released with 'release-build-slot'), or #f and #f."
+
+ ;; Proceed like this:
+ ;; 1. For all MACHINES, attempt to acquire a build slot, and filter out
+ ;; those machines for which we failed.
+ ;; 2. Choose the best machine among those that are left.
+ ;; 3. Release the previously-acquired build slots of the other machines.
+
+ (define machines+slots
+ (filter-map (lambda (machine)
+ (let ((slot (acquire-build-slot machine)))
+ (and slot (list machine slot))))
+ (shuffle machines)))
+
+ (define (undecorate pred)
+ (lambda (a b)
+ (match a
+ ((machine1 slot1)
+ (match b
+ ((machine2 slot2)
+ (pred machine1 machine2)))))))
+
+ (define (machine-faster? m1 m2)
+ ;; Return #t if M1 is faster than M2.
+ (> (build-machine-speed m1)
+ (build-machine-speed m2)))
+
+ (let loop ((machines+slots
+ (sort machines+slots (undecorate machine-faster?))))
+ (match machines+slots
+ (((best slot) others ...)
+ ;; Return the best machine unless it's already overloaded.
+ ;; Note: We call 'node-load' only as a last resort because it is
+ ;; too costly to call it once for every machine.
+ (let* ((session (false-if-exception (open-ssh-session best)))
+ (node (and session (remote-inferior session)))
+ (load (and node (normalized-load best (node-load node))))
+ (space (and node (node-free-disk-space node))))
+ (when node (close-inferior node))
+ (when session (disconnect! session))
+ (if (and node (< load 2.) (>= space %minimum-disk-space))
+ (match others
+ (((machines slots) ...)
+ ;; Release slots from the uninteresting machines.
+ (for-each release-build-slot slots)
+
+ ;; The caller must keep SLOT to protect it from GC and to
+ ;; eventually release it.
+ (values best slot)))
+ (begin
+ ;; BEST is unsuitable, so try the next one.
+ (when (and space (< space %minimum-disk-space))
+ (format (current-error-port)
+ "skipping machine '~a' because it is low \
+on disk space (~,2f MiB free)~%"
+ (build-machine-name best)
+ (/ space (expt 2 20) 1.)))
+ (release-build-slot slot)
+ (loop others)))))
+ (()
+ (values #f #f)))))
+
+(define (call-with-timeout timeout drv thunk)
+ "Call THUNK and leave after TIMEOUT seconds. If TIMEOUT is #f, simply call
+THUNK. Use DRV as an indication of what we were building when the timeout
+expired."
+ (if (number? timeout)
+ (dynamic-wind
+ (lambda ()
+ (sigaction SIGALRM
+ (lambda _
+ ;; The exit code here will be 1, which guix-daemon will
+ ;; interpret as a transient failure.
+ (leave (G_ "timeout expired while offloading '~a'~%")
+ (derivation-file-name drv))))
+ (alarm timeout))
+ thunk
+ (lambda ()
+ (alarm 0)))
+ (thunk)))
+
+(define-syntax-rule (with-timeout timeout drv exp ...)
+ "Evaluate EXP... and leave after TIMEOUT seconds if EXP hasn't completed.
+If TIMEOUT is #f, simply evaluate EXP..."
+ (call-with-timeout timeout drv (lambda () exp ...)))
+
+(define* (process-request wants-local? system drv features
+ #:key
+ print-build-trace? (max-silent-time 3600)
+ build-timeout)
+ "Process a request to build DRV."
+ (let* ((local? (and wants-local? (string=? system (%current-system))))
+ (reqs (build-requirements
+ (system system)
+ (features features)))
+ (candidates (filter (cut machine-matches? <> reqs)
+ (build-machines))))
+ (match candidates
+ (()
+ ;; We'll never be able to match REQS.
+ (display "# decline\n"))
+ ((x ...)
+ (let-values (((machine slot)
+ (choose-build-machine candidates)))
+ (if machine
+ (dynamic-wind
+ (const #f)
+ (lambda ()
+ ;; Offload DRV to MACHINE.
+ (display "# accept\n")
+ (let ((inputs (string-tokenize (read-line)))
+ (outputs (string-tokenize (read-line))))
+ ;; Even if BUILD-TIMEOUT is honored by MACHINE, there can
+ ;; be issues with the connection or deadlocks that could
+ ;; lead the 'guix offload' process to remain stuck forever.
+ ;; To avoid that, install a timeout here as well.
+ (with-timeout build-timeout drv
+ (transfer-and-offload drv machine
+ #:inputs inputs
+ #:outputs outputs
+ #:max-silent-time max-silent-time
+ #:build-timeout build-timeout
+ #:print-build-trace?
+ print-build-trace?))))
+ (lambda ()
+ (release-build-slot slot)))
+
+ ;; Not now, all the machines are busy.
+ (display "# postpone\n")))))))
+
+\f
+;;;
+;;; Installation tests.
+;;;
+
+(define (assert-node-repl node name)
+ "Bail out if NODE is not running Guile."
+ (match (node-guile-version node)
+ (#f
+ (report-guile-error name))
+ ((? string? version)
+ (info (G_ "'~a' is running GNU Guile ~a~%")
+ name (node-guile-version node)))))
+
+(define (assert-node-has-guix node name)
+ "Bail out if NODE if #f or if we fail to use the (guix) module, or if its
+daemon is not running."
+ (unless (inferior? node)
+ (leave (G_ "failed to run 'guix repl' on '~a'~%") name))
+
+ (match (inferior-eval '(begin
+ (use-modules (guix))
+ (and add-text-to-store 'alright))
+ node)
+ ('alright #t)
+ (_ (report-module-error name)))
+
+ (match (inferior-eval '(begin
+ (use-modules (guix))
+ (with-store store
+ (add-text-to-store store "test"
+ "Hello, build machine!")))
+ node)
+ ((? string? str)
+ (info (G_ "Guix is usable on '~a' (test returned ~s)~%")
+ name str))
+ (x
+ (leave (G_ "failed to talk to guix-daemon on '~a' (test returned ~s)~%")
+ name x))))
+
+(define %random-state
+ (delay
+ (seed->random-state (logxor (getpid) (car (gettimeofday))))))
+
+(define* (nonce #:optional (name (gethostname)))
+ (string-append name "-"
+ (number->string (random 1000000 (force %random-state)))))
+
+(define (assert-node-can-import session node name daemon-socket)
+ "Bail out if NODE refuses to import our archives."
+ (with-store store
+ (let* ((item (add-text-to-store store "export-test" (nonce)))
+ (remote (connect-to-remote-daemon session daemon-socket)))
+ (with-store local
+ (send-files local (list item) remote))
+
+ (if (valid-path? remote item)
+ (info (G_ "'~a' successfully imported '~a'~%")
+ name item)
+ (leave (G_ "'~a' was not properly imported on '~a'~%")
+ item name)))))
+
+(define (assert-node-can-export session node name daemon-socket)
+ "Bail out if we cannot import signed archives from NODE."
+ (let* ((remote (connect-to-remote-daemon session daemon-socket))
+ (item (add-text-to-store remote "import-test" (nonce name))))
+ (with-store store
+ (if (and (retrieve-files store (list item) remote)
+ (valid-path? store item))
+ (info (G_ "successfully imported '~a' from '~a'~%")
+ item name)
+ (leave (G_ "failed to import '~a' from '~a'~%")
+ item name)))))
+
+(define (check-machine-availability machine-file pred)
+ "Check that each machine matching PRED in MACHINE-FILE is usable as a build
+machine."
+ (define (build-machine=? m1 m2)
+ (and (string=? (build-machine-name m1) (build-machine-name m2))
+ (= (build-machine-port m1) (build-machine-port m2))))
+
+ ;; A given build machine may appear several times (e.g., once for
+ ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
+ (let ((machines (filter pred
+ (delete-duplicates (build-machines machine-file)
+ build-machine=?))))
+ (info (G_ "testing ~a build machines defined in '~a'...~%")
+ (length machines) machine-file)
+ (let* ((names (map build-machine-name machines))
+ (sockets (map build-machine-daemon-socket machines))
+ (sessions (map open-ssh-session machines))
+ (nodes (map remote-inferior sessions)))
+ (for-each assert-node-has-guix nodes names)
+ (for-each assert-node-repl nodes names)
+ (for-each assert-node-can-import sessions nodes names sockets)
+ (for-each assert-node-can-export sessions nodes names sockets)
+ (for-each close-inferior nodes)
+ (for-each disconnect! sessions))))
+
+(define (check-machine-status machine-file pred)
+ "Print the load of each machine matching PRED in MACHINE-FILE."
+ (define (build-machine=? m1 m2)
+ (and (string=? (build-machine-name m1) (build-machine-name m2))
+ (= (build-machine-port m1) (build-machine-port m2))))
+
+ ;; A given build machine may appear several times (e.g., once for
+ ;; "x86_64-linux" and a second time for "i686-linux"); test them only once.
+ (let ((machines (filter pred
+ (delete-duplicates (build-machines machine-file)
+ build-machine=?))))
+ (info (G_ "getting status of ~a build machines defined in '~a'...~%")
+ (length machines) machine-file)
+ (for-each (lambda (machine)
+ (define session
+ (open-ssh-session machine))
+
+ (match (remote-inferior session)
+ (#f
+ (warning (G_ "failed to run 'guix repl' on machine '~a'~%")
+ (build-machine-name machine)))
+ ((? inferior? inferior)
+ (let ((now (car (gettimeofday))))
+ (match (inferior-eval '(list (uname)
+ (car (gettimeofday)))
+ inferior)
+ ((uts time)
+ (when (< time now)
+ ;; Build machine clocks must not be behind as this
+ ;; could cause timestamp issues.
+ (warning (G_ "machine '~a' is ~a seconds behind~%")
+ (build-machine-name machine)
+ (- now time)))
+
+ (let ((load (node-load inferior))
+ (free (node-free-disk-space inferior)))
+ (close-inferior inferior)
+ (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\
+ host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%\
+ time difference: ~a s~%"
+ (build-machine-name machine)
+ (utsname:sysname uts) (utsname:release uts)
+ (utsname:machine uts)
+ (utsname:nodename uts)
+ (normalized-load machine load)
+ (/ free (expt 2 20) 1.)
+ (- time now))))))))
+
+ (disconnect! session))
+ machines)))
+
+\f
+;;;
+;;; Entry point.
+;;;
+
+(define (guix-offload . args)
+ (define request-line-rx
+ ;; The request format. See 'tryBuildHook' method in build.cc.
+ (make-regexp "([01]) ([a-z0-9_-]+) (/[[:graph:]]+.drv) ([[:graph:]]*)"))
+
+ (define not-coma
+ (char-set-complement (char-set #\,)))
+
+ ;; Make sure $HOME really corresponds to the current user. This is
+ ;; necessary since lsh uses that to determine the location of the yarrow
+ ;; seed file, and fails if it's owned by someone else.
+ (and=> (passwd:dir (getpw (getuid)))
+ (cut setenv "HOME" <>))
+
+ ;; We rely on protocol-level compression from libssh to optimize large data
+ ;; transfers. Warn if it's missing.
+ (unless (zlib-support?)
+ (warning (G_ "Guile-SSH lacks zlib support"))
+ (warning (G_ "data transfers will *not* be compressed!")))
+
+ (match args
+ ((system max-silent-time print-build-trace? build-timeout)
+ (let ((max-silent-time (string->number max-silent-time))
+ (build-timeout (string->number build-timeout))
+ (print-build-trace? (string=? print-build-trace? "1")))
+ (set-thread-name "guix offload")
+ (parameterize ((%current-system system))
+ (let loop ((line (read-line)))
+ (unless (eof-object? line)
+ (cond ((regexp-exec request-line-rx line)
+ =>
+ (lambda (match)
+ (with-error-handling
+ (process-request (equal? (match:substring match 1) "1")
+ (match:substring match 2) ; system
+ (read-derivation-from-file
+ (match:substring match 3))
+ (string-tokenize
+ (match:substring match 4) not-coma)
+ #:print-build-trace? print-build-trace?
+ #:max-silent-time max-silent-time
+ #:build-timeout build-timeout))))
+ (else
+ (leave (G_ "invalid request line: ~s~%") line)))
+ (loop (read-line)))))))
+ (("test" rest ...)
+ (with-error-handling
+ (let-values (((file pred)
+ (match rest
+ ((file regexp)
+ (values file
+ (compose (cut string-match regexp <>)
+ build-machine-name)))
+ ((file) (values file (const #t)))
+ (() (values %machine-file (const #t)))
+ (x (leave (G_ "wrong number of arguments~%"))))))
+ (check-machine-availability (or file %machine-file) pred))))
+ (("status" rest ...)
+ (with-error-handling
+ (let-values (((file pred)
+ (match rest
+ ((file regexp)
+ (values file
+ (compose (cut string-match regexp <>)
+ build-machine-name)))
+ ((file) (values file (const #t)))
+ (() (values %machine-file (const #t)))
+ (x (leave (G_ "wrong number of arguments~%"))))))
+ (check-machine-status (or file %machine-file) pred))))
+ (("--version")
+ (show-version-and-exit "guix offload"))
+ (("--help")
+ (format #t (G_ "Usage: guix offload SYSTEM PRINT-BUILD-TRACE
+Process build offload requests written on the standard input, possibly
+offloading builds to the machines listed in '~a'.~%")
+ %machine-file)
+ (display (G_ "
+This tool is meant to be used internally by 'guix-daemon'.\n"))
+ (show-bug-report-information))
+ (x
+ (leave (G_ "invalid arguments: ~{~s ~}~%") x))))
+
+;;; Local Variables:
+;;; eval: (put 'with-file-lock 'scheme-indent-function 1)
+;;; eval: (put 'with-error-to-port 'scheme-indent-function 1)
+;;; eval: (put 'with-timeout 'scheme-indent-function 2)
+;;; End:
+
+;;; offload.scm ends here