Add "cps2" experiment
[bpt/guile.git] / module / web / uri.scm
1 ;;;; (web uri) --- URI manipulation tools
2 ;;;;
3 ;;;; Copyright (C) 1997,2001,2002,2010,2011,2012,2013,2014 Free Software Foundation, Inc.
4 ;;;;
5 ;;;; This library is free software; you can redistribute it and/or
6 ;;;; modify it under the terms of the GNU Lesser General Public
7 ;;;; License as published by the Free Software Foundation; either
8 ;;;; version 3 of the License, or (at your option) any later version.
9 ;;;;
10 ;;;; This library is distributed in the hope that it will be useful,
11 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ;;;; Lesser General Public License for more details.
14 ;;;;
15 ;;;; You should have received a copy of the GNU Lesser General Public
16 ;;;; License along with this library; if not, write to the Free Software
17 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 ;;;;
19
20 ;;; Commentary:
21
22 ;; A data type for Universal Resource Identifiers, as defined in RFC
23 ;; 3986.
24
25 ;;; Code:
26
27 (define-module (web uri)
28 #:use-module (srfi srfi-9)
29 #:use-module (ice-9 regex)
30 #:use-module (ice-9 rdelim)
31 #:use-module (ice-9 control)
32 #:use-module (rnrs bytevectors)
33 #:use-module (ice-9 binary-ports)
34 #:export (uri?
35 uri-scheme uri-userinfo uri-host uri-port
36 uri-path uri-query uri-fragment
37
38 build-uri
39 build-uri-reference
40 declare-default-port!
41 string->uri string->uri-reference
42 uri->string
43 uri-decode uri-encode
44 split-and-decode-uri-path
45 encode-and-join-uri-path))
46
47 (define-record-type <uri>
48 (make-uri scheme userinfo host port path query fragment)
49 uri?
50 (scheme uri-scheme)
51 (userinfo uri-userinfo)
52 (host uri-host)
53 (port uri-port)
54 (path uri-path)
55 (query uri-query)
56 (fragment uri-fragment))
57
58 (define (absolute-uri? obj)
59 (and (uri? obj) (uri-scheme obj) #t))
60
61 (define (uri-error message . args)
62 (throw 'uri-error message args))
63
64 (define (positive-exact-integer? port)
65 (and (number? port) (exact? port) (integer? port) (positive? port)))
66
67 (define* (validate-uri scheme userinfo host port path query fragment
68 #:key reference?)
69 (cond
70 ((and (not reference?) (not (symbol? scheme)))
71 (uri-error "Expected a symbol for the URI scheme: ~s" scheme))
72 ((and (or userinfo port) (not host))
73 (uri-error "Expected a host, given userinfo or port"))
74 ((and port (not (positive-exact-integer? port)))
75 (uri-error "Expected port to be an integer: ~s" port))
76 ((and host (or (not (string? host)) (not (valid-host? host))))
77 (uri-error "Expected valid host: ~s" host))
78 ((and userinfo (not (string? userinfo)))
79 (uri-error "Expected string for userinfo: ~s" userinfo))
80 ((not (string? path))
81 (uri-error "Expected string for path: ~s" path))
82 ((and host (not (string-null? path))
83 (not (eqv? (string-ref path 0) #\/)))
84 (uri-error "Expected path of absolute URI to start with a /: ~a" path))))
85
86 (define* (build-uri scheme #:key userinfo host port (path "") query fragment
87 (validate? #t))
88 "Construct a URI object. SCHEME should be a symbol, PORT either a
89 positive, exact integer or ‘#f’, and the rest of the fields are either
90 strings or ‘#f’. If VALIDATE? is true, also run some consistency checks
91 to make sure that the constructed object is a valid absolute URI."
92 (if validate?
93 (validate-uri scheme userinfo host port path query fragment))
94 (make-uri scheme userinfo host port path query fragment))
95
96 (define* (build-uri-reference #:key scheme userinfo host port (path "") query
97 fragment (validate? #t))
98 "Construct a URI object. SCHEME should be a symbol or ‘#f’, PORT
99 either a positive, exact integer or ‘#f’, and the rest of the fields
100 are either strings or ‘#f’. If VALIDATE? is true, also run some
101 consistency checks to make sure that the constructed URI is a valid URI
102 reference (either an absolute URI or a relative reference)."
103 (if validate?
104 (validate-uri scheme userinfo host port path query fragment
105 #:reference? #t))
106 (make-uri scheme userinfo host port path query fragment))
107
108 ;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC
109 ;; 3490), and non-ASCII host names.
110 ;;
111 (define ipv4-regexp
112 (make-regexp "^([0-9.]+)$"))
113 (define ipv6-regexp
114 (make-regexp "^([0-9a-fA-F:.]+)$"))
115 (define domain-label-regexp
116 (make-regexp "^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
117 (define top-label-regexp
118 (make-regexp "^[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
119
120 (define (valid-host? host)
121 (cond
122 ((regexp-exec ipv4-regexp host)
123 (false-if-exception (inet-pton AF_INET host)))
124 ((regexp-exec ipv6-regexp host)
125 (false-if-exception (inet-pton AF_INET6 host)))
126 (else
127 (let lp ((start 0))
128 (let ((end (string-index host #\. start)))
129 (if end
130 (and (regexp-exec domain-label-regexp
131 (substring host start end))
132 (lp (1+ end)))
133 (regexp-exec top-label-regexp host start)))))))
134
135 (define userinfo-pat
136 "[a-zA-Z0-9_.!~*'();:&=+$,-]+")
137 (define host-pat
138 "[a-zA-Z0-9.-]+")
139 (define ipv6-host-pat
140 "[0-9a-fA-F:.]+")
141 (define port-pat
142 "[0-9]*")
143 (define authority-regexp
144 (make-regexp
145 (format #f "^//((~a)@)?((~a)|(\\[(~a)\\]))(:(~a))?$"
146 userinfo-pat host-pat ipv6-host-pat port-pat)))
147
148 (define (parse-authority authority fail)
149 (if (equal? authority "//")
150 ;; Allow empty authorities: file:///etc/hosts is a synonym of
151 ;; file:/etc/hosts.
152 (values #f #f #f)
153 (let ((m (regexp-exec authority-regexp authority)))
154 (if (and m (valid-host? (or (match:substring m 4)
155 (match:substring m 6))))
156 (values (match:substring m 2)
157 (or (match:substring m 4)
158 (match:substring m 6))
159 (let ((port (match:substring m 8)))
160 (and port (not (string-null? port))
161 (string->number port))))
162 (fail)))))
163
164
165 ;;; RFC 3986, #3.
166 ;;;
167 ;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
168 ;;;
169 ;;; hier-part = "//" authority path-abempty
170 ;;; / path-absolute
171 ;;; / path-rootless
172 ;;; / path-empty
173 ;;;
174 ;;; A URI-reference is the same as URI, but where the scheme is
175 ;;; optional. If the scheme is not present, its colon isn't present
176 ;;; either.
177
178 (define scheme-pat
179 "[a-zA-Z][a-zA-Z0-9+.-]*")
180 (define authority-pat
181 "[^/?#]*")
182 (define path-pat
183 "[^?#]*")
184 (define query-pat
185 "[^#]*")
186 (define fragment-pat
187 ".*")
188 (define uri-pat
189 (format #f "^((~a):)?(//~a)?(~a)(\\?(~a))?(#(~a))?$"
190 scheme-pat authority-pat path-pat query-pat fragment-pat))
191 (define uri-regexp
192 (make-regexp uri-pat))
193
194 (define (string->uri-reference string)
195 "Parse the URI reference written as STRING into a URI object. Return
196 ‘#f’ if the string could not be parsed."
197 (% (let ((m (regexp-exec uri-regexp string)))
198 (if (not m) (abort))
199 (let ((scheme (let ((str (match:substring m 2)))
200 (and str (string->symbol (string-downcase str)))))
201 (authority (match:substring m 3))
202 (path (match:substring m 4))
203 (query (match:substring m 6))
204 (fragment (match:substring m 8)))
205 (call-with-values
206 (lambda ()
207 (if authority
208 (parse-authority authority abort)
209 (values #f #f #f)))
210 (lambda (userinfo host port)
211 (make-uri scheme userinfo host port path query fragment)))))
212 (lambda (k)
213 #f)))
214
215 (define (string->uri string)
216 "Parse STRING into an absolute URI object. Return ‘#f’ if the string
217 could not be parsed."
218 (let ((uri (string->uri-reference string)))
219 (and uri (uri-scheme uri) uri)))
220
221 (define *default-ports* (make-hash-table))
222
223 (define (declare-default-port! scheme port)
224 "Declare a default port for the given URI scheme."
225 (hashq-set! *default-ports* scheme port))
226
227 (define (default-port? scheme port)
228 (or (not port)
229 (eqv? port (hashq-ref *default-ports* scheme))))
230
231 (declare-default-port! 'http 80)
232 (declare-default-port! 'https 443)
233
234 (define (uri->string uri)
235 "Serialize URI to a string. If the URI has a port that is the
236 default port for its scheme, the port is not included in the
237 serialization."
238 (let* ((scheme (uri-scheme uri))
239 (userinfo (uri-userinfo uri))
240 (host (uri-host uri))
241 (port (uri-port uri))
242 (path (uri-path uri))
243 (query (uri-query uri))
244 (fragment (uri-fragment uri)))
245 (string-append
246 (if scheme
247 (string-append (symbol->string scheme) ":")
248 "")
249 (if host
250 (string-append "//"
251 (if userinfo (string-append userinfo "@")
252 "")
253 (if (string-index host #\:)
254 (string-append "[" host "]")
255 host)
256 (if (default-port? (uri-scheme uri) port)
257 ""
258 (string-append ":" (number->string port))))
259 "")
260 path
261 (if query
262 (string-append "?" query)
263 "")
264 (if fragment
265 (string-append "#" fragment)
266 ""))))
267
268
269 ;; like call-with-output-string, but actually closes the port (doh)
270 (define (call-with-output-string* proc)
271 (let ((port (open-output-string)))
272 (proc port)
273 (let ((str (get-output-string port)))
274 (close-port port)
275 str)))
276
277 (define (call-with-output-bytevector* proc)
278 (call-with-values
279 (lambda ()
280 (open-bytevector-output-port))
281 (lambda (port get-bytevector)
282 (proc port)
283 (let ((bv (get-bytevector)))
284 (close-port port)
285 bv))))
286
287 (define (call-with-encoded-output-string encoding proc)
288 (if (string-ci=? encoding "utf-8")
289 (string->utf8 (call-with-output-string* proc))
290 (call-with-output-bytevector*
291 (lambda (port)
292 (set-port-encoding! port encoding)
293 (proc port)))))
294
295 (define (encode-string str encoding)
296 (if (string-ci=? encoding "utf-8")
297 (string->utf8 str)
298 (call-with-encoded-output-string encoding
299 (lambda (port)
300 (display str port)))))
301
302 (define (decode-string bv encoding)
303 (if (string-ci=? encoding "utf-8")
304 (utf8->string bv)
305 (let ((p (open-bytevector-input-port bv)))
306 (set-port-encoding! p encoding)
307 (let ((res (read-string p)))
308 (close-port p)
309 res))))
310
311
312 ;; A note on characters and bytes: URIs are defined to be sequences of
313 ;; characters in a subset of ASCII. Those characters may encode a
314 ;; sequence of bytes (octets), which in turn may encode sequences of
315 ;; characters in other character sets.
316 ;;
317
318 ;; Return a new string made from uri-decoding STR. Specifically,
319 ;; turn ‘+’ into space, and hex-encoded ‘%XX’ strings into
320 ;; their eight-bit characters.
321 ;;
322 (define hex-chars
323 (string->char-set "0123456789abcdefABCDEF"))
324
325 (define* (uri-decode str #:key (encoding "utf-8"))
326 "Percent-decode the given STR, according to ENCODING,
327 which should be the name of a character encoding.
328
329 Note that this function should not generally be applied to a full URI
330 string. For paths, use ‘split-and-decode-uri-path’ instead. For query
331 strings, split the query on ‘&’ and ‘=’ boundaries, and decode
332 the components separately.
333
334 Note also that percent-encoded strings encode _bytes_, not characters.
335 There is no guarantee that a given byte sequence is a valid string
336 encoding. Therefore this routine may signal an error if the decoded
337 bytes are not valid for the given encoding. Pass ‘#f’ for ENCODING if
338 you want decoded bytes as a bytevector directly. ‘set-port-encoding!’,
339 for more information on character encodings.
340
341 Returns a string of the decoded characters, or a bytevector if
342 ENCODING was ‘#f’."
343 (let* ((len (string-length str))
344 (bv
345 (call-with-output-bytevector*
346 (lambda (port)
347 (let lp ((i 0))
348 (if (< i len)
349 (let ((ch (string-ref str i)))
350 (cond
351 ((eqv? ch #\+)
352 (put-u8 port (char->integer #\space))
353 (lp (1+ i)))
354 ((and (< (+ i 2) len) (eqv? ch #\%)
355 (let ((a (string-ref str (+ i 1)))
356 (b (string-ref str (+ i 2))))
357 (and (char-set-contains? hex-chars a)
358 (char-set-contains? hex-chars b)
359 (string->number (string a b) 16))))
360 => (lambda (u8)
361 (put-u8 port u8)
362 (lp (+ i 3))))
363 ((< (char->integer ch) 128)
364 (put-u8 port (char->integer ch))
365 (lp (1+ i)))
366 (else
367 (uri-error "Invalid character in encoded URI ~a: ~s"
368 str ch))))))))))
369 (if encoding
370 (decode-string bv encoding)
371 ;; Otherwise return raw bytevector
372 bv)))
373
374 (define ascii-alnum-chars
375 (string->char-set
376 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"))
377
378 ;; RFC 3986, #2.2.
379 (define gen-delims
380 (string->char-set ":/?#[]@"))
381 (define sub-delims
382 (string->char-set "!$&'()*+,l="))
383 (define reserved-chars
384 (char-set-union gen-delims sub-delims))
385
386 ;; RFC 3986, #2.3
387 (define unreserved-chars
388 (char-set-union ascii-alnum-chars
389 (string->char-set "-._~")))
390
391 ;; Return a new string made from uri-encoding STR, unconditionally
392 ;; transforming any characters not in UNESCAPED-CHARS.
393 ;;
394 (define* (uri-encode str #:key (encoding "utf-8")
395 (unescaped-chars unreserved-chars))
396 "Percent-encode any character not in the character set,
397 UNESCAPED-CHARS.
398
399 The default character set includes alphanumerics from ASCII, as well as
400 the special characters ‘-’, ‘.’, ‘_’, and ‘~’. Any other character will
401 be percent-encoded, by writing out the character to a bytevector within
402 the given ENCODING, then encoding each byte as ‘%HH’, where HH is the
403 uppercase hexadecimal representation of the byte."
404 (define (needs-escaped? ch)
405 (not (char-set-contains? unescaped-chars ch)))
406 (if (string-index str needs-escaped?)
407 (call-with-output-string*
408 (lambda (port)
409 (string-for-each
410 (lambda (ch)
411 (if (char-set-contains? unescaped-chars ch)
412 (display ch port)
413 (let* ((bv (encode-string (string ch) encoding))
414 (len (bytevector-length bv)))
415 (let lp ((i 0))
416 (if (< i len)
417 (let ((byte (bytevector-u8-ref bv i)))
418 (display #\% port)
419 (when (< byte 16)
420 (display #\0 port))
421 (display (string-upcase (number->string byte 16))
422 port)
423 (lp (1+ i))))))))
424 str)))
425 str))
426
427 (define (split-and-decode-uri-path path)
428 "Split PATH into its components, and decode each component,
429 removing empty components.
430
431 For example, ‘\"/foo/bar%20baz/\"’ decodes to the two-element list,
432 ‘(\"foo\" \"bar baz\")’."
433 (filter (lambda (x) (not (string-null? x)))
434 (map uri-decode (string-split path #\/))))
435
436 (define (encode-and-join-uri-path parts)
437 "URI-encode each element of PARTS, which should be a list of
438 strings, and join the parts together with ‘/’ as a delimiter.
439
440 For example, the list ‘(\"scrambled eggs\" \"biscuits&gravy\")’
441 encodes as ‘\"scrambled%20eggs/biscuits%26gravy\"’."
442 (string-join (map uri-encode parts) "/"))