1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2018, 2019, 2020, 2021 Ludovic Courtès <ludo@gnu.org>
3 ;;; Copyright © 2020 Jakub Kądziołka <kuba@kadziolka.net>
4 ;;; Copyright © 2021 Xinglu Chen <public@yoctocell.xyz>
5 ;;; Copyright © 2021 Simon Tournier <zimon.toutoune@gmail.com>
7 ;;; This file is part of GNU Guix.
9 ;;; GNU Guix is free software; you can redistribute it and/or modify it
10 ;;; under the terms of the GNU General Public License as published by
11 ;;; the Free Software Foundation; either version 3 of the License, or (at
12 ;;; your option) any later version.
14 ;;; GNU Guix is distributed in the hope that it will be useful, but
15 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;;; GNU General Public License for more details.
19 ;;; You should have received a copy of the GNU General Public License
20 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
22 (define-module (guix swh)
23 #:use-module (guix base16)
24 #:use-module (guix build utils)
25 #:use-module ((guix build syscalls) #:select (mkdtemp!))
26 #:use-module (web uri)
27 #:use-module (web client)
28 #:use-module (web response)
30 #:use-module (srfi srfi-1)
31 #:use-module (srfi srfi-9)
32 #:use-module (srfi srfi-11)
33 #:use-module (srfi srfi-19)
34 #:use-module (ice-9 match)
35 #:use-module (ice-9 regex)
36 #:use-module (ice-9 popen)
37 #:use-module ((ice-9 ftw) #:select (scandir))
38 #:export (%swh-base-url
39 %verify-swh-certificate?
42 request-rate-limit-reached?
62 lookup-snapshot-branch
79 lookup-origin-revision
90 directory-entry-checksums
91 directory-entry-length
92 directory-entry-permissions
94 directory-entry-target
98 save-reply-origin-type
99 save-reply-request-date
100 save-reply-request-status
101 save-reply-task-status
107 vault-reply-fetch-url
108 vault-reply-progress-message
117 swh-download-directory
122 ;;; This module provides bindings to the HTTP interface of Software Heritage.
123 ;;; It allows you to browse the archive, look up revisions (such as SHA1
124 ;;; commit IDs), "origins" (code hosting URLs), content (files), etc. See
125 ;;; <https://archive.softwareheritage.org/api/> for more information.
127 ;;; The high-level 'swh-download' procedure allows you to download a Git
128 ;;; revision from Software Heritage, provided it is available.
132 (define %swh-base-url
133 ;; Presumably we won't need to change it.
134 (make-parameter "https://archive.softwareheritage.org"))
136 (define %verify-swh-certificate?
137 ;; Whether to verify the X.509 HTTPS certificate for %SWH-BASE-URL.
140 ;; Token from an account to the Software Heritage Authentication service
141 ;; <https://archive.softwareheritage.org/api/>
143 (make-parameter (and=> (getenv "GUIX_SWH_TOKEN")
146 (define (swh-url path . rest)
147 ;; URLs returned by the API may be relative or absolute. This has changed
148 ;; without notice before. Handle both cases by detecting whether the path
149 ;; starts with a domain.
151 (if (string-prefix? "/" path)
152 (string-append (%swh-base-url) path)
156 (string-append root (string-join rest "/" 'prefix)))
158 ;; Ensure there's a trailing slash or we get a redirect.
159 (if (string-suffix? "/" url)
161 (string-append url "/")))
163 ;; XXX: Work around a bug in Guile 3.0.2 where #:verify-certificate? would
164 ;; be ignored (<https://bugs.gnu.org/40486>).
165 (define* (http-get* uri #:rest rest)
166 (apply http-request uri #:method 'GET rest))
167 (define* (http-post* uri #:rest rest)
168 (apply http-request uri #:method 'POST rest))
171 ;; Match strings like "2014-11-17T22:09:38+01:00" or
172 ;; "2018-09-30T23:20:07.815449+00:00"".
173 (make-regexp "^([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}):([0-9]{2}):([0-9]{2})((\\.[0-9]+)?)([+-][0-9]{2}):([0-9]{2})$"))
175 (define (string->date* str)
176 "Return a SRFI-19 date parsed from STR, a date string as returned by
178 ;; We can't use 'string->date' because of the timezone format: SWH returns
179 ;; "+01:00" when the '~z' template expects "+0100". So we roll our own!
180 (or (and=> (regexp-exec %date-regexp str)
183 (string->number (match:substring match n)))
185 (make-date (let ((ns (match:substring match 8)))
187 (string->number (string-drop ns 1))
189 (ref 6) (ref 5) (ref 4)
190 (ref 3) (ref 2) (ref 1)
191 (+ (* 3600 (ref 9)) ;time zone
197 (define (maybe-null proc)
204 ;; Converts "string or #nil" coming from JSON to "string or #f".
206 ((? string? str) str)
207 ((? null?) #f) ;Guile-JSON 3.x
208 ('null #f))) ;Guile-JSON 4.x
210 (define %allow-request?
211 ;; Takes a URL and method (e.g., the 'http-get' procedure) and returns true
212 ;; to keep going. This can be used to disallow requests when
213 ;; 'request-rate-limit-reached?' returns true, for instance.
214 (make-parameter (const #t)))
216 ;; The time when the rate limit for "/origin/save" POST requests and that of
217 ;; other requests will be reset.
218 ;; See <https://archive.softwareheritage.org/api/#rate-limiting>.
219 (define %save-rate-limit-reset-time 0)
220 (define %general-rate-limit-reset-time 0)
222 (define (request-rate-limit-reached? url method)
223 "Return true if the rate limit has been reached for URI."
228 (if (and (eq? method http-post*)
229 (string-prefix? "/api/1/origin/save/" (uri-path uri)))
230 %save-rate-limit-reset-time
231 %general-rate-limit-reset-time))
233 (< (car (gettimeofday)) reset-time))
235 (define (update-rate-limit-reset-time! url method response)
236 "Update the rate limit reset time for URL and METHOD based on the headers in
238 (let ((uri (string->uri url)))
239 (match (assq-ref (response-headers response) 'x-ratelimit-reset)
240 ((= string->number (? number? reset))
241 (if (and (eq? method http-post*)
242 (string-prefix? "/api/1/origin/save/" (uri-path uri)))
243 (set! %save-rate-limit-reset-time reset)
244 (set! %general-rate-limit-reset-time reset)))
248 (define* (call url decode #:optional (method http-get*)
249 #:key (false-if-404? #t))
250 "Invoke the endpoint at URL using METHOD. Decode the resulting JSON body
251 using DECODE, a one-argument procedure that takes an input port. When
252 FALSE-IF-404? is true, return #f upon 404 responses."
253 (and ((%allow-request?) url method)
254 (let*-values (((response port)
255 (method url #:streaming? #t
258 `((authorization . (Bearer ,(%swh-token))))
260 #:verify-certificate?
261 (%verify-swh-certificate?))))
262 ;; See <https://archive.softwareheritage.org/api/#rate-limiting>.
263 (match (assq-ref (response-headers response) 'x-ratelimit-remaining)
265 ((? (compose zero? string->number))
266 (update-rate-limit-reset-time! url method response)
267 (throw 'swh-error url method response))
270 (cond ((= 200 (response-code response))
271 (let ((result (decode port)))
275 (= 404 (response-code response)))
280 (throw 'swh-error url method response))))))
282 (define-syntax define-query
284 "Define a procedure that performs a Software Heritage query."
285 ((_ (name args ...) docstring (path components ...)
287 (define (name args ...)
289 (call (swh-url components ...) json->value)))))
291 ;; <https://archive.softwareheritage.org/api/1/origin/https://github.com/guix-mirror/guix/get>
292 (define-json-mapping <origin> make-origin origin?
294 (visits-url origin-visits-url "origin_visits_url")
298 ;; <https://archive.softwareheritage.org/api/1/origin/52181937/visits/>
299 (define-json-mapping <visit> make-visit visit?
301 (date visit-date "date" string->date*)
302 (origin visit-origin)
303 (url visit-url "origin_visit_url")
304 (snapshot-url visit-snapshot-url "snapshot_url" string*) ;string | #f
305 (status visit-status "status" string->symbol) ;'full | 'partial | 'ongoing
306 (number visit-number "visit"))
308 ;; <https://archive.softwareheritage.org/api/1/snapshot/4334c3ed4bb208604ed780d8687fe523837f1bd1/>
309 (define-json-mapping <snapshot> make-snapshot snapshot?
312 (branches snapshot-branches "branches" json->branches))
314 ;; This is used for the "branches" field of snapshots.
315 (define-record-type <branch>
316 (make-branch name target-type target-url)
319 (target-type branch-target-type) ;release | revision
320 (target-url branch-target-url))
322 (define (json->branches branches)
327 (assoc-ref value "target_type"))
328 (assoc-ref value "target_url"))))
331 ;; <https://archive.softwareheritage.org/api/1/release/1f44934fb6e2cefccbecd4fa347025349fa9ff76/>
332 (define-json-mapping <release> make-release release?
336 (message release-message)
337 (target-type release-target-type "target_type" string->symbol)
338 (target-url release-target-url "target_url"))
340 ;; <https://archive.softwareheritage.org/api/1/revision/359fdda40f754bbf1b5dc261e7427b75463b59be/>
341 ;; Note: Some revisions, such as those for "nixguix" origins (e.g.,
342 ;; <https://archive.softwareheritage.org/api/1/revision/b8dbc65475bbedde8e015d4730ade8864c38fad3/>),
343 ;; have their 'date' field set to null.
344 (define-json-mapping <revision> make-revision revision?
347 (date revision-date "date" (maybe-null string->date*))
348 (directory revision-directory)
349 (directory-url revision-directory-url "directory_url"))
351 ;; <https://archive.softwareheritage.org/api/1/content/>
352 (define-json-mapping <content> make-content content?
354 (checksums content-checksums "checksums" json->checksums)
355 (data-url content-data-url "data_url")
356 (file-type-url content-file-type-url "filetype_url")
357 (language-url content-language-url "language_url")
358 (length content-length)
359 (license-url content-license-url "license_url"))
361 (define (json->checksums checksums)
364 (cons key (base16-string->bytevector value))))
367 ;; <https://archive.softwareheritage.org/api/1/directory/27c69c5d298a43096a53affbf881e7b13f17bdcd/>
368 (define-json-mapping <directory-entry> make-directory-entry directory-entry?
369 json->directory-entry
370 (name directory-entry-name)
371 (type directory-entry-type "type"
374 (str (string->symbol str))))
375 (checksums directory-entry-checksums "checksums"
378 ((? unspecified?) #f)
379 (lst (json->checksums lst))))
380 (id directory-entry-id "dir_id")
381 (length directory-entry-length)
382 (permissions directory-entry-permissions "perms")
383 (target-url directory-entry-target-url "target_url"))
385 ;; <https://archive.softwareheritage.org/api/1/origin/save/>
386 (define-json-mapping <save-reply> make-save-reply save-reply?
388 (origin-url save-reply-origin-url "origin_url")
389 (origin-type save-reply-origin-type "origin_type")
390 (request-date save-reply-request-date "save_request_date"
392 (request-status save-reply-request-status "save_request_status"
394 (task-status save-reply-task-status "save_task_status"
396 ("not created" 'not-created)
397 ((? string? str) (string->symbol str)))))
399 ;; <https://docs.softwareheritage.org/devel/swh-vault/api.html#vault-api-ref>
400 (define-json-mapping <vault-reply> make-vault-reply vault-reply?
403 (fetch-url vault-reply-fetch-url "fetch_url")
404 (progress-message vault-reply-progress-message "progress_message")
405 (status vault-reply-status "status" string->symbol)
406 (swhid vault-reply-swhid))
413 (define-query (lookup-origin url)
414 "Return an origin for URL."
415 (path "/api/1/origin" url "get")
418 (define-query (lookup-content hash type)
419 "Return a content for HASH, of the given TYPE--e.g., \"sha256\"."
420 (path "/api/1/content"
421 (string-append type ":"
422 (bytevector->base16-string hash)))
425 (define-query (lookup-revision id)
426 "Return the revision with the given ID, typically a Git commit SHA1."
427 (path "/api/1/revision" id)
430 (define-query (lookup-directory id)
431 "Return the directory with the given ID."
432 (path "/api/1/directory" id)
433 json->directory-entries)
435 (define (json->directory-entries port)
436 (map json->directory-entry
437 (vector->list (json->scm port))))
439 (define (origin-visits origin)
440 "Return the list of visits of ORIGIN, a record as returned by
442 (call (swh-url (origin-visits-url origin))
444 (map json->visit (vector->list (json->scm port))))))
446 (define (visit-snapshot visit)
447 "Return the snapshot corresponding to VISIT or #f if no snapshot is
449 (and (visit-snapshot-url visit)
450 (call (swh-url (visit-snapshot-url visit))
453 (define (snapshot-url snapshot branch-count first-branch)
454 "Return the URL of SNAPSHOT such that it contains information for
455 BRANCH-COUNT branches, starting at FIRST-BRANCH."
456 (string-append (swh-url "/api/1/snapshot" (snapshot-id snapshot))
457 "?branches_count=" (number->string branch-count)
458 "&branches_from=" (uri-encode first-branch)))
460 (define (lookup-snapshot-branch snapshot name)
461 "Look up branch NAME on SNAPSHOT. Return the branch, or return #f if it
463 (or (find (lambda (branch)
464 (string=? (branch-name branch) name))
465 (snapshot-branches snapshot))
467 ;; There's no API entry point to look up a snapshot branch by name.
468 ;; Work around that by using the paginated list of branches provided by
469 ;; the /api/1/snapshot API: ask for one branch, and start pagination at
471 (let ((snapshot (call (snapshot-url snapshot 1 name)
473 (match (snapshot-branches snapshot)
475 (and (string=? (branch-name branch) name)
479 (define (branch-target branch)
480 "Return the target of BRANCH, either a <revision> or a <release>."
481 (match (branch-target-type branch)
483 (call (swh-url (branch-target-url branch))
486 (call (swh-url (branch-target-url branch))
489 (define (lookup-origin-revision url tag)
490 "Return a <revision> corresponding to the given TAG for the repository
491 coming from URL. Example:
493 (lookup-origin-revision \"https://github.com/guix-mirror/guix/\" \"v0.8\")
494 => #<<revision> id: \"44941…\" …>
496 The information is based on the latest visit of URL available. Return #f if
497 URL could not be found."
498 (match (lookup-origin url)
501 (match (filter (lambda (visit)
502 ;; Return #f if (visit-snapshot VISIT) would return #f.
503 (and (visit-snapshot-url visit)
504 (eq? 'full (visit-status visit))))
505 (origin-visits origin))
507 (let ((snapshot (visit-snapshot visit)))
508 (match (and=> (find (lambda (branch)
511 (string=? (string-append "refs/tags/" tag)
512 (branch-name branch))
515 (branch-name branch))))
516 (snapshot-branches snapshot))
518 ((? release? release)
519 (release-target release))
520 ((? revision? revision)
527 (define (release-target release)
528 "Return the revision that is the target of RELEASE."
529 (match (release-target-type release)
531 (call (swh-url (release-target-url release))
534 (define (directory-entry-target entry)
535 "If ENTRY, a directory entry, has type 'directory, return its list of
536 directory entries; if it has type 'file, return its <content> object."
537 (call (swh-url (directory-entry-target-url entry))
538 (match (directory-entry-type entry)
539 ('file json->content)
540 ('directory json->directory-entries))))
542 (define* (save-origin url #:optional (type "git"))
543 "Request URL to be saved."
544 (call (swh-url "/api/1/origin/save" type "url" url) json->save-reply
547 (define-query (save-origin-status url type)
548 "Return the status of a /save request for URL and TYPE (e.g., \"git\")."
549 (path "/api/1/origin/save" type "url" url)
552 (define* (vault-url id kind #:optional (archive-type 'flat))
553 "Return the vault query/cooking URL for ID and KIND. Normally, ID is an
554 SWHID and KIND is #f; the deprecated convention is to set ID to a raw
555 directory or revision ID and KIND to 'revision or 'directory."
556 ;; Note: /api/1/vault/directory/ID was deprecated in favor of
557 ;; /api/1/vault/flat/SWHID; this procedure "converts" automatically.
558 (let ((id (match kind
559 ('directory (string-append "swh:1:dir:" id))
560 ('revision (string-append "swh:1:rev:" id))
562 (swh-url "/api/1/vault" (symbol->string archive-type) id)))
564 (define* (query-vault id #:optional kind #:key (archive-type 'flat))
565 "Ask the availability of object ID (an SWHID) to the vault. Return #f if it
566 could not be found, or a <vault-reply> on success. ARCHIVE-TYPE can be 'flat
567 for a tarball containing a directory, or 'git-bare for a tarball containing a
568 bare Git repository corresponding to a revision.
570 Passing KIND (one of 'directory or 'revision) together with a raw revision or
571 directory identifier is deprecated."
572 (call (vault-url id kind archive-type)
575 (define* (request-cooking id #:optional kind #:key (archive-type 'flat))
576 "Request the cooking of object ID, an SWHID. Return a <vault-reply>.
577 ARCHIVE-TYPE can be 'flat for a tarball containing a directory, or 'git-bare
578 for a tarball containing a bare Git repository corresponding to a revision.
580 Passing KIND (one of 'directory or 'revision) together with a raw revision or
581 directory identifier is deprecated."
582 (call (vault-url id kind archive-type)
586 (define* (vault-fetch id
590 (log-port (current-error-port)))
591 "Return an input port from which a bundle of the object with the given ID,
592 an SWHID, or #f if the object could not be found.
594 ARCHIVE-TYPE can be 'flat for a tarball containing a directory, or 'git-bare
595 for a tarball containing a bare Git repository corresponding to a revision."
596 (let loop ((reply (query-vault id kind
597 #:archive-type archive-type)))
600 (and=> (request-cooking id kind
601 #:archive-type archive-type)
604 (match (vault-reply-status reply)
607 (let-values (((response port)
608 (http-get* (swh-url (vault-reply-fetch-url reply))
610 #:verify-certificate?
611 (%verify-swh-certificate?))))
612 (if (= (response-code response) 200)
614 (begin ;shouldn't happen
618 ;; Upon failure, we're supposed to try again.
619 (format log-port "SWH vault: failure: ~a~%"
620 (vault-reply-progress-message reply))
621 (format log-port "SWH vault: retrying...~%")
622 (loop (request-cooking id kind
623 #:archive-type archive-type)))
624 ((and (or 'new 'pending) status)
625 ;; Wait until the bundle shows up.
626 (let ((message (vault-reply-progress-message reply)))
627 (when (eq? 'new status)
628 (format log-port "SWH vault: \
629 requested bundle cooking, waiting for completion...~%"))
630 (when (string? message)
631 (format log-port "SWH vault: ~a~%" message))
633 ;; Wait long enough so we don't exhaust our maximum number of
634 ;; requests per hour too fast (as of this writing, the limit is 60
635 ;; requests per hour per IP address.)
636 (sleep (if (eq? status 'new) 60 30))
638 (loop (query-vault id kind
639 #:archive-type archive-type)))))))))
643 ;;; High-level interface.
646 (define (call-with-temporary-directory proc) ;FIXME: factorize
647 "Call PROC with a name of a temporary directory; close the directory and
648 delete it when leaving the dynamic extent of this call."
649 (let* ((directory (or (getenv "TMPDIR") "/tmp"))
650 (template (string-append directory "/guix-directory.XXXXXX"))
651 (tmp-dir (mkdtemp! template)))
657 (false-if-exception (delete-file-recursively tmp-dir))))))
659 (define* (swh-download-archive swhid output
662 (log-port (current-error-port)))
663 "Download from Software Heritage the directory or revision with the given
664 SWID, in the ARCHIVE-TYPE format (one of 'flat or 'git-bare), and unpack it to
665 OUTPUT. Return #t on success and #f on failure."
666 (call-with-temporary-directory
668 (match (vault-fetch swhid
669 #:archive-type archive-type
673 "SWH: object ~a could not be fetched from the vault~%"
677 (let ((tar (open-pipe* OPEN_WRITE "tar" "-C" directory
679 ('flat "-xzvf") ;gzipped
680 ('git-bare "-xvf")) ;uncompressed
682 (dump-port input tar)
684 (let ((status (close-pipe tar)))
685 (unless (zero? status)
686 (error "tar extraction failure" status)))
688 (match (scandir directory)
689 (("." ".." sub-directory)
690 (copy-recursively (string-append directory "/" sub-directory)
692 #:log (%make-void-port "w"))
695 (define* (swh-download-directory id output
696 #:key (log-port (current-error-port)))
697 "Download from Software Heritage the directory with the given ID, and
698 unpack it to OUTPUT. Return #t on success and #f on failure."
699 (swh-download-archive (string-append "swh:1:dir:" id) output
701 #:log-port log-port))
703 (define (commit-id? reference)
704 "Return true if REFERENCE is likely a commit ID, false otherwise---e.g., if
705 it is a tag name. This is based on a simple heuristic so use with care!"
706 (and (= (string-length reference) 40)
707 (string-every char-set:hex-digit reference)))
709 (define* (swh-download url reference output
712 (log-port (current-error-port)))
713 "Download from Software Heritage a checkout (if ARCHIVE-TYPE is 'flat) or a
714 full Git repository (if ARCHIVE-TYPE is 'git-bare) of the Git tag or commit
715 REFERENCE originating from URL, and unpack it in OUTPUT. Return #t on success
718 This procedure uses the \"vault\", which contains \"cooked\" directories in
719 the form of tarballs. If the requested directory is not cooked yet, it will
720 wait until it becomes available, which could take several minutes."
721 (match (if (commit-id? reference)
722 (lookup-revision reference)
723 (lookup-origin-revision url reference))
724 ((? revision? revision)
725 (format log-port "SWH: found revision ~a with directory at '~a'~%"
726 (revision-id revision)
727 (swh-url (revision-directory-url revision)))
728 (swh-download-archive (match archive-type
731 "swh:1:dir:" (revision-directory revision)))
734 "swh:1:rev:" (revision-id revision))))
736 #:archive-type archive-type
737 #:log-port log-port))
740 "SWH: revision ~s originating from ~a could not be found~%"