Adapt visit-prompt-control-flow to use intsets.
[bpt/guile.git] / module / web / uri.scm
1 ;;;; (web uri) --- URI manipulation tools
2 ;;;;
3 ;;;; Copyright (C) 1997,2001,2002,2010,2011,2012,2013 Free Software Foundation, Inc.
4 ;;;;
5 ;;;; This library is free software; you can redistribute it and/or
6 ;;;; modify it under the terms of the GNU Lesser General Public
7 ;;;; License as published by the Free Software Foundation; either
8 ;;;; version 3 of the License, or (at your option) any later version.
9 ;;;;
10 ;;;; This library is distributed in the hope that it will be useful,
11 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ;;;; Lesser General Public License for more details.
14 ;;;;
15 ;;;; You should have received a copy of the GNU Lesser General Public
16 ;;;; License along with this library; if not, write to the Free Software
17 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 ;;;;
19
20 ;;; Commentary:
21
22 ;; A data type for Universal Resource Identifiers, as defined in RFC
23 ;; 3986.
24
25 ;;; Code:
26
27 (define-module (web uri)
28 #:use-module (srfi srfi-9)
29 #:use-module (ice-9 regex)
30 #:use-module (ice-9 rdelim)
31 #:use-module (ice-9 control)
32 #:use-module (rnrs bytevectors)
33 #:use-module (ice-9 binary-ports)
34 #:export (uri?
35 uri-scheme uri-userinfo uri-host uri-port
36 uri-path uri-query uri-fragment
37
38 build-uri
39 declare-default-port!
40 string->uri uri->string
41 uri-decode uri-encode
42 split-and-decode-uri-path
43 encode-and-join-uri-path))
44
45 (define-record-type <uri>
46 (make-uri scheme userinfo host port path query fragment)
47 uri?
48 (scheme uri-scheme)
49 (userinfo uri-userinfo)
50 (host uri-host)
51 (port uri-port)
52 (path uri-path)
53 (query uri-query)
54 (fragment uri-fragment))
55
56 (define (absolute-uri? obj)
57 (and (uri? obj) (uri-scheme obj) #t))
58
59 (define (uri-error message . args)
60 (throw 'uri-error message args))
61
62 (define (positive-exact-integer? port)
63 (and (number? port) (exact? port) (integer? port) (positive? port)))
64
65 (define (validate-uri scheme userinfo host port path query fragment)
66 (cond
67 ((not (symbol? scheme))
68 (uri-error "Expected a symbol for the URI scheme: ~s" scheme))
69 ((and (or userinfo port) (not host))
70 (uri-error "Expected a host, given userinfo or port"))
71 ((and port (not (positive-exact-integer? port)))
72 (uri-error "Expected port to be an integer: ~s" port))
73 ((and host (or (not (string? host)) (not (valid-host? host))))
74 (uri-error "Expected valid host: ~s" host))
75 ((and userinfo (not (string? userinfo)))
76 (uri-error "Expected string for userinfo: ~s" userinfo))
77 ((not (string? path))
78 (uri-error "Expected string for path: ~s" path))
79 ((and host (not (string-null? path))
80 (not (eqv? (string-ref path 0) #\/)))
81 (uri-error "Expected path of absolute URI to start with a /: ~a" path))))
82
83 (define* (build-uri scheme #:key userinfo host port (path "") query fragment
84 (validate? #t))
85 "Construct a URI object. SCHEME should be a symbol, PORT
86 either a positive, exact integer or ‘#f’, and the rest of the
87 fields are either strings or ‘#f’. If VALIDATE? is true,
88 also run some consistency checks to make sure that the constructed URI
89 is valid."
90 (if validate?
91 (validate-uri scheme userinfo host port path query fragment))
92 (make-uri scheme userinfo host port path query fragment))
93
94 ;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC
95 ;; 3490), and non-ASCII host names.
96 ;;
97 (define ipv4-regexp
98 (make-regexp "^([0-9.]+)$"))
99 (define ipv6-regexp
100 (make-regexp "^([0-9a-fA-F:.]+)$"))
101 (define domain-label-regexp
102 (make-regexp "^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
103 (define top-label-regexp
104 (make-regexp "^[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
105
106 (define (valid-host? host)
107 (cond
108 ((regexp-exec ipv4-regexp host)
109 (false-if-exception (inet-pton AF_INET host)))
110 ((regexp-exec ipv6-regexp host)
111 (false-if-exception (inet-pton AF_INET6 host)))
112 (else
113 (let lp ((start 0))
114 (let ((end (string-index host #\. start)))
115 (if end
116 (and (regexp-exec domain-label-regexp
117 (substring host start end))
118 (lp (1+ end)))
119 (regexp-exec top-label-regexp host start)))))))
120
121 (define userinfo-pat
122 "[a-zA-Z0-9_.!~*'();:&=+$,-]+")
123 (define host-pat
124 "[a-zA-Z0-9.-]+")
125 (define ipv6-host-pat
126 "[0-9a-fA-F:.]+")
127 (define port-pat
128 "[0-9]*")
129 (define authority-regexp
130 (make-regexp
131 (format #f "^//((~a)@)?((~a)|(\\[(~a)\\]))(:(~a))?$"
132 userinfo-pat host-pat ipv6-host-pat port-pat)))
133
134 (define (parse-authority authority fail)
135 (if (equal? authority "//")
136 ;; Allow empty authorities: file:///etc/hosts is a synonym of
137 ;; file:/etc/hosts.
138 (values #f #f #f)
139 (let ((m (regexp-exec authority-regexp authority)))
140 (if (and m (valid-host? (or (match:substring m 4)
141 (match:substring m 6))))
142 (values (match:substring m 2)
143 (or (match:substring m 4)
144 (match:substring m 6))
145 (let ((port (match:substring m 8)))
146 (and port (not (string-null? port))
147 (string->number port))))
148 (fail)))))
149
150
151 ;;; RFC 3986, #3.
152 ;;;
153 ;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
154 ;;;
155 ;;; hier-part = "//" authority path-abempty
156 ;;; / path-absolute
157 ;;; / path-rootless
158 ;;; / path-empty
159
160 (define scheme-pat
161 "[a-zA-Z][a-zA-Z0-9+.-]*")
162 (define authority-pat
163 "[^/?#]*")
164 (define path-pat
165 "[^?#]*")
166 (define query-pat
167 "[^#]*")
168 (define fragment-pat
169 ".*")
170 (define uri-pat
171 (format #f "^((~a):)?(//~a)?(~a)(\\?(~a))?(#(~a))?$"
172 scheme-pat authority-pat path-pat query-pat fragment-pat))
173 (define uri-regexp
174 (make-regexp uri-pat))
175
176 (define (string->uri* string)
177 "Parse STRING into a URI object. Return ‘#f’ if the string
178 could not be parsed."
179 (% (let ((m (regexp-exec uri-regexp string)))
180 (if (not m) (abort))
181 (let ((scheme (let ((str (match:substring m 2)))
182 (and str (string->symbol (string-downcase str)))))
183 (authority (match:substring m 3))
184 (path (match:substring m 4))
185 (query (match:substring m 6))
186 (fragment (match:substring m 7)))
187 (call-with-values
188 (lambda ()
189 (if authority
190 (parse-authority authority abort)
191 (values #f #f #f)))
192 (lambda (userinfo host port)
193 (make-uri scheme userinfo host port path query fragment)))))
194 (lambda (k)
195 #f)))
196
197 (define (string->uri string)
198 "Parse STRING into a URI object. Return ‘#f’ if the string
199 could not be parsed."
200 (let ((uri (string->uri* string)))
201 (and uri (uri-scheme uri) uri)))
202
203 (define *default-ports* (make-hash-table))
204
205 (define (declare-default-port! scheme port)
206 "Declare a default port for the given URI scheme."
207 (hashq-set! *default-ports* scheme port))
208
209 (define (default-port? scheme port)
210 (or (not port)
211 (eqv? port (hashq-ref *default-ports* scheme))))
212
213 (declare-default-port! 'http 80)
214 (declare-default-port! 'https 443)
215
216 (define (uri->string uri)
217 "Serialize URI to a string. If the URI has a port that is the
218 default port for its scheme, the port is not included in the
219 serialization."
220 (let* ((scheme (uri-scheme uri))
221 (userinfo (uri-userinfo uri))
222 (host (uri-host uri))
223 (port (uri-port uri))
224 (path (uri-path uri))
225 (query (uri-query uri))
226 (fragment (uri-fragment uri)))
227 (string-append
228 (if scheme
229 (string-append (symbol->string scheme) ":")
230 "")
231 (if host
232 (string-append "//"
233 (if userinfo (string-append userinfo "@")
234 "")
235 (if (string-index host #\:)
236 (string-append "[" host "]")
237 host)
238 (if (default-port? (uri-scheme uri) port)
239 ""
240 (string-append ":" (number->string port))))
241 "")
242 path
243 (if query
244 (string-append "?" query)
245 "")
246 (if fragment
247 (string-append "#" fragment)
248 ""))))
249
250
251 ;; like call-with-output-string, but actually closes the port (doh)
252 (define (call-with-output-string* proc)
253 (let ((port (open-output-string)))
254 (proc port)
255 (let ((str (get-output-string port)))
256 (close-port port)
257 str)))
258
259 (define (call-with-output-bytevector* proc)
260 (call-with-values
261 (lambda ()
262 (open-bytevector-output-port))
263 (lambda (port get-bytevector)
264 (proc port)
265 (let ((bv (get-bytevector)))
266 (close-port port)
267 bv))))
268
269 (define (call-with-encoded-output-string encoding proc)
270 (if (string-ci=? encoding "utf-8")
271 (string->utf8 (call-with-output-string* proc))
272 (call-with-output-bytevector*
273 (lambda (port)
274 (set-port-encoding! port encoding)
275 (proc port)))))
276
277 (define (encode-string str encoding)
278 (if (string-ci=? encoding "utf-8")
279 (string->utf8 str)
280 (call-with-encoded-output-string encoding
281 (lambda (port)
282 (display str port)))))
283
284 (define (decode-string bv encoding)
285 (if (string-ci=? encoding "utf-8")
286 (utf8->string bv)
287 (let ((p (open-bytevector-input-port bv)))
288 (set-port-encoding! p encoding)
289 (let ((res (read-string p)))
290 (close-port p)
291 res))))
292
293
294 ;; A note on characters and bytes: URIs are defined to be sequences of
295 ;; characters in a subset of ASCII. Those characters may encode a
296 ;; sequence of bytes (octets), which in turn may encode sequences of
297 ;; characters in other character sets.
298 ;;
299
300 ;; Return a new string made from uri-decoding STR. Specifically,
301 ;; turn ‘+’ into space, and hex-encoded ‘%XX’ strings into
302 ;; their eight-bit characters.
303 ;;
304 (define hex-chars
305 (string->char-set "0123456789abcdefABCDEF"))
306
307 (define* (uri-decode str #:key (encoding "utf-8"))
308 "Percent-decode the given STR, according to ENCODING,
309 which should be the name of a character encoding.
310
311 Note that this function should not generally be applied to a full URI
312 string. For paths, use ‘split-and-decode-uri-path’ instead. For query
313 strings, split the query on ‘&’ and ‘=’ boundaries, and decode
314 the components separately.
315
316 Note also that percent-encoded strings encode _bytes_, not characters.
317 There is no guarantee that a given byte sequence is a valid string
318 encoding. Therefore this routine may signal an error if the decoded
319 bytes are not valid for the given encoding. Pass ‘#f’ for ENCODING if
320 you want decoded bytes as a bytevector directly. ‘set-port-encoding!’,
321 for more information on character encodings.
322
323 Returns a string of the decoded characters, or a bytevector if
324 ENCODING was ‘#f’."
325 (let* ((len (string-length str))
326 (bv
327 (call-with-output-bytevector*
328 (lambda (port)
329 (let lp ((i 0))
330 (if (< i len)
331 (let ((ch (string-ref str i)))
332 (cond
333 ((eqv? ch #\+)
334 (put-u8 port (char->integer #\space))
335 (lp (1+ i)))
336 ((and (< (+ i 2) len) (eqv? ch #\%)
337 (let ((a (string-ref str (+ i 1)))
338 (b (string-ref str (+ i 2))))
339 (and (char-set-contains? hex-chars a)
340 (char-set-contains? hex-chars b)
341 (string->number (string a b) 16))))
342 => (lambda (u8)
343 (put-u8 port u8)
344 (lp (+ i 3))))
345 ((< (char->integer ch) 128)
346 (put-u8 port (char->integer ch))
347 (lp (1+ i)))
348 (else
349 (uri-error "Invalid character in encoded URI ~a: ~s"
350 str ch))))))))))
351 (if encoding
352 (decode-string bv encoding)
353 ;; Otherwise return raw bytevector
354 bv)))
355
356 (define ascii-alnum-chars
357 (string->char-set
358 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"))
359
360 ;; RFC 3986, #2.2.
361 (define gen-delims
362 (string->char-set ":/?#[]@"))
363 (define sub-delims
364 (string->char-set "!$&'()*+,l="))
365 (define reserved-chars
366 (char-set-union gen-delims sub-delims))
367
368 ;; RFC 3986, #2.3
369 (define unreserved-chars
370 (char-set-union ascii-alnum-chars
371 (string->char-set "-._~")))
372
373 ;; Return a new string made from uri-encoding STR, unconditionally
374 ;; transforming any characters not in UNESCAPED-CHARS.
375 ;;
376 (define* (uri-encode str #:key (encoding "utf-8")
377 (unescaped-chars unreserved-chars))
378 "Percent-encode any character not in the character set,
379 UNESCAPED-CHARS.
380
381 The default character set includes alphanumerics from ASCII, as well as
382 the special characters ‘-’, ‘.’, ‘_’, and ‘~’. Any other character will
383 be percent-encoded, by writing out the character to a bytevector within
384 the given ENCODING, then encoding each byte as ‘%HH’, where HH is the
385 uppercase hexadecimal representation of the byte."
386 (define (needs-escaped? ch)
387 (not (char-set-contains? unescaped-chars ch)))
388 (if (string-index str needs-escaped?)
389 (call-with-output-string*
390 (lambda (port)
391 (string-for-each
392 (lambda (ch)
393 (if (char-set-contains? unescaped-chars ch)
394 (display ch port)
395 (let* ((bv (encode-string (string ch) encoding))
396 (len (bytevector-length bv)))
397 (let lp ((i 0))
398 (if (< i len)
399 (let ((byte (bytevector-u8-ref bv i)))
400 (display #\% port)
401 (when (< byte 16)
402 (display #\0 port))
403 (display (string-upcase (number->string byte 16))
404 port)
405 (lp (1+ i))))))))
406 str)))
407 str))
408
409 (define (split-and-decode-uri-path path)
410 "Split PATH into its components, and decode each component,
411 removing empty components.
412
413 For example, ‘\"/foo/bar%20baz/\"’ decodes to the two-element list,
414 ‘(\"foo\" \"bar baz\")’."
415 (filter (lambda (x) (not (string-null? x)))
416 (map uri-decode (string-split path #\/))))
417
418 (define (encode-and-join-uri-path parts)
419 "URI-encode each element of PARTS, which should be a list of
420 strings, and join the parts together with ‘/’ as a delimiter.
421
422 For example, the list ‘(\"scrambled eggs\" \"biscuits&gravy\")’
423 encodes as ‘\"scrambled%20eggs/biscuits%26gravy\"’."
424 (string-join (map uri-encode parts) "/"))