Merge remote-tracking branch 'origin/stable-2.0'
[bpt/guile.git] / module / web / uri.scm
1 ;;;; (web uri) --- URI manipulation tools
2 ;;;;
3 ;;;; Copyright (C) 1997,2001,2002,2010,2011,2012 Free Software Foundation, Inc.
4 ;;;;
5 ;;;; This library is free software; you can redistribute it and/or
6 ;;;; modify it under the terms of the GNU Lesser General Public
7 ;;;; License as published by the Free Software Foundation; either
8 ;;;; version 3 of the License, or (at your option) any later version.
9 ;;;;
10 ;;;; This library is distributed in the hope that it will be useful,
11 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ;;;; Lesser General Public License for more details.
14 ;;;;
15 ;;;; You should have received a copy of the GNU Lesser General Public
16 ;;;; License along with this library; if not, write to the Free Software
17 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 ;;;;
19
20 ;;; Commentary:
21
22 ;; A data type for Uniform Resource Identifiers, as defined in RFC
23 ;; 3986.
24
25 ;;; Code:
26
27 (define-module (web uri)
28 #:use-module (srfi srfi-9)
29 #:use-module (ice-9 regex)
30 #:use-module (ice-9 rdelim)
31 #:use-module (ice-9 control)
32 #:use-module (rnrs bytevectors)
33 #:use-module (ice-9 binary-ports)
34 #:export (uri?
35 uri-scheme uri-userinfo uri-host uri-port
36 uri-path uri-query uri-fragment
37
38 build-uri
39 declare-default-port!
40 string->uri uri->string
41 uri-decode uri-encode
42 split-and-decode-uri-path
43 encode-and-join-uri-path))
44
;; Record holding the seven components of a URI.  MAKE-URI stores its
;; arguments as given and performs no checking; see `validate-uri' for
;; the consistency checks applied by `build-uri'.
(define-record-type <uri>
  (make-uri scheme userinfo host port path query fragment)
  uri?
  (scheme   uri-scheme)      ; a symbol when validated or parsed
  (userinfo uri-userinfo)    ; string or #f
  (host     uri-host)        ; string or #f
  (port     uri-port)        ; exact integer or #f
  (path     uri-path)        ; string
  (query    uri-query)       ; string or #f when parsed from a string
  (fragment uri-fragment))   ; string or #f when parsed from a string
55
(define (uri-error message . args)
  "Throw to the key @code{uri-error}, passing @var{message} and the list
of remaining arguments @var{args} as the two throw arguments."
  (apply throw 'uri-error message (list args)))
58
(define (positive-exact-integer? port)
  "Return @code{#t} if @var{port} is an exact integer greater than zero,
and @code{#f} for any other object."
  (if (and (integer? port) (exact? port))
      (> port 0)
      #f))
61
;; Check the components of a URI for mutual consistency.  The checks run
;; in order and the first failing one throws `uri-error'; when every
;; check passes the procedure returns an unspecified value.  QUERY and
;; FRAGMENT are accepted but not inspected.
(define (validate-uri scheme userinfo host port path query fragment)
  (unless (symbol? scheme)
    (uri-error "Expected a symbol for the URI scheme: ~s" scheme))
  ;; Userinfo and port only make sense as part of an authority.
  (when (and (not host) (or userinfo port))
    (uri-error "Expected a host, given userinfo or port"))
  (when (and port (not (positive-exact-integer? port)))
    (uri-error "Expected port to be an integer: ~s" port))
  (when (and host (not (and (string? host) (valid-host? host))))
    (uri-error "Expected valid host: ~s" host))
  (when (and userinfo (not (string? userinfo)))
    (uri-error "Expected string for userinfo: ~s" userinfo))
  (unless (string? path)
    (uri-error "Expected string for path: ~s" path))
  ;; With an authority present, a non-empty path must be absolute.
  (when (and host
             (not (string-null? path))
             (not (eqv? (string-ref path 0) #\/)))
    (uri-error "Expected path of absolute URI to start with a /: ~a" path)))
79
(define* (build-uri scheme #:key userinfo host port (path "") query fragment
                    (validate? #t))
  "Construct a URI object from @var{scheme} and the given keyword
components.  When @var{validate?} is true (the default), the components
are first passed through the consistency checks of @code{validate-uri},
which throws @code{uri-error} on failure."
  (when validate?
    (validate-uri scheme userinfo host port path query fragment))
  (make-uri scheme userinfo host port path query fragment))
87
88 ;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC
89 ;; 3490), and non-ASCII host names.
90 ;;
;; Candidate IPv4 literal: digits and dots only.  The final decision is
;; left to `inet-pton'.
(define ipv4-regexp
  (make-regexp "^([0-9.]+)$"))
;; Candidate IPv6 literal: hex digits, colons and dots.  The final
;; decision is left to `inet-pton'.
(define ipv6-regexp
  (make-regexp "^([0-9a-fA-F:.]+)$"))
;; A non-final domain label: alphanumeric at both ends, hyphens allowed
;; in between.
(define domain-label-regexp
  (make-regexp "^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
;; The final (top) label: like a domain label but starting with a letter.
(define top-label-regexp
  (make-regexp "^[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))

(define (valid-host? host)
  "Return a true value if the string @var{host} is an IPv4 literal, an
IPv6 literal (without brackets), or a dot-separated domain name; return
@code{#f} otherwise."
  (cond
   ((regexp-exec ipv4-regexp host)
    (false-if-exception (inet-pton AF_INET host)))
   ;; Domain names made only of hex letters and digits (\"cafe.de\",
   ;; \"abc.de\") also match IPV6-REGEXP.  Every IPv6 literal contains a
   ;; colon and no domain label may, so require one before handing the
   ;; string to inet-pton; otherwise such names would be rejected.
   ((and (string-index host #\:)
         (regexp-exec ipv6-regexp host))
    (false-if-exception (inet-pton AF_INET6 host)))
   (else
    ;; Walk the labels left to right; every label before the last dot
    ;; must be a domain label, and the remainder must be a top label.
    (let lp ((start 0))
      (let ((end (string-index host #\. start)))
        (if end
            (and (regexp-exec domain-label-regexp
                              (substring host start end))
                 (lp (1+ end)))
            (regexp-exec top-label-regexp host start)))))))
114
;; Building blocks for the authority component,
;; \"//[userinfo@]host[:port]\".
(define userinfo-pat
  "[a-zA-Z0-9_.!~*'();:&=+$,-]+")
(define host-pat
  "[a-zA-Z0-9.-]+")
;; Contents of a bracketed IPv6 literal, without the brackets.
(define ipv6-host-pat
  "[0-9a-fA-F:.]+")
;; May be empty, as in \"//host:\".
(define port-pat
  "[0-9]*")

;; Submatch layout: 2 = userinfo, 4 = plain host, 6 = bracketed IPv6
;; host (brackets stripped), 8 = port digits.
(define authority-regexp
  (make-regexp
   (format #f "^//((~a)@)?((~a)|(\\[(~a)\\]))(:(~a))?$"
           userinfo-pat
           host-pat
           ipv6-host-pat
           port-pat)))
127
;; Split AUTHORITY (a string starting with \"//\") into three values:
;; userinfo, host and port.  Absent parts are #f; the port, when present
;; and non-empty, is converted to a number.  If the authority is
;; malformed or names an invalid host, the thunk FAIL is called in tail
;; position.
(define (parse-authority authority fail)
  (cond
   ;; Allow empty authorities: file:///etc/hosts is a synonym of
   ;; file:/etc/hosts.
   ((equal? authority "//")
    (values #f #f #f))
   (else
    (let* ((m (regexp-exec authority-regexp authority))
           ;; Either the plain host or the bracketed IPv6 host matched.
           (host (and m
                      (or (match:substring m 4)
                          (match:substring m 6)))))
      (if (and host (valid-host? host))
          (values (match:substring m 2)
                  host
                  (let ((digits (match:substring m 8)))
                    (and digits
                         (not (string-null? digits))
                         (string->number digits))))
          (fail))))))
143
144
145 ;;; RFC 3986, #3.
146 ;;;
147 ;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
148 ;;;
149 ;;; hier-part = "//" authority path-abempty
150 ;;; / path-absolute
151 ;;; / path-rootless
152 ;;; / path-empty
153
;; Loose per-component patterns; each component simply runs up to the
;; delimiter that starts the next one.
(define scheme-pat
  "[a-zA-Z][a-zA-Z0-9+.-]*")
(define authority-pat
  "[^/?#]*")
(define path-pat
  "[^?#]*")
(define query-pat
  "[^#]*")
(define fragment-pat
  ".*")

;; Submatch layout: 1 = scheme, 2 = authority including its leading
;; \"//\", 3 = path, 5 = query, 7 = fragment.
(define uri-pat
  (format #f "^(~a):(//~a)?(~a)(\\?(~a))?(#(~a))?$"
          scheme-pat
          authority-pat
          path-pat
          query-pat
          fragment-pat))

(define uri-regexp
  (make-regexp uri-pat))
169
(define (string->uri string)
  "Parse @var{string} into a URI object.  Return @code{#f} if the string
could not be parsed."
  ;; Any parse failure aborts to the prompt established below, whose
  ;; handler turns it into a #f result.
  (define (give-up)
    (abort))

  (define (parse)
    (let ((m (regexp-exec uri-regexp string)))
      (unless m (give-up))
      (let* ((scheme (string->symbol
                      (string-downcase (match:substring m 1))))
             (authority (match:substring m 2))
             (path (match:substring m 3))
             (query (match:substring m 5))
             (fragment (match:substring m 7)))
        (call-with-values
            (lambda ()
              ;; No authority at all means no userinfo, host or port.
              (if authority
                  (parse-authority authority give-up)
                  (values #f #f #f)))
          (lambda (userinfo host port)
            (make-uri scheme userinfo host port path query fragment))))))

  (% (parse)
     (lambda (k)
       #f)))
190
;; Maps a scheme symbol to its default port number; compared with `eq?'.
(define *default-ports* (make-hash-table))

(define (declare-default-port! scheme port)
  "Register @var{port} as the default port of the URI scheme
@var{scheme}.

Default ports only affect how URI objects are printed: a port equal to
its scheme's default is left out of the printed form."
  (hashq-set! *default-ports* scheme port))

;; True when PORT need not be printed for SCHEME: either there is no
;; port, or it is the one registered as the scheme's default.
(define (default-port? scheme port)
  (if port
      (eqv? port (hashq-ref *default-ports* scheme))
      #t))

(for-each (lambda (entry)
            (declare-default-port! (car entry) (cdr entry)))
          '((http . 80)
            (https . 443)))
206
(define (uri->string uri)
  "Serialize @var{uri} to a string."
  ;; PREFIX followed by VALUE, or nothing at all when VALUE is #f.
  (define (optional prefix value)
    (if value
        (string-append prefix value)
        ""))

  (let ((scheme (uri-scheme uri))
        (userinfo (uri-userinfo uri))
        (host (uri-host uri))
        (port (uri-port uri))
        (path (uri-path uri))
        (query (uri-query uri))
        (fragment (uri-fragment uri)))
    (string-append
     (symbol->string scheme) ":"
     ;; The authority is only written when there is a host.
     (if host
         (string-append
          "//"
          (if userinfo
              (string-append userinfo "@")
              "")
          ;; A colon in the host marks an IPv6 literal, which must be
          ;; bracketed so it cannot be confused with the port separator.
          (if (string-index host #\:)
              (string-append "[" host "]")
              host)
          (if (default-port? scheme port)
              ""
              (string-append ":" (number->string port))))
         "")
     path
     (optional "?" query)
     (optional "#" fragment))))
237
238
239 ;; like call-with-output-string, but actually closes the port (doh)
(define (call-with-output-string* proc)
  ;; Call PROC with a fresh string port, then return everything written
  ;; to it.  The port is closed once the text has been extracted.
  (define sink (open-output-string))
  (proc sink)
  (let ((text (get-output-string sink)))
    (close-port sink)
    text))
246
(define (call-with-output-bytevector* proc)
  ;; Call PROC with a fresh bytevector output port, then return the
  ;; bytes written to it.  The port is closed once the bytes have been
  ;; extracted.
  (call-with-values open-bytevector-output-port
    (lambda (sink extract)
      (proc sink)
      (let ((bytes (extract)))
        (close-port sink)
        bytes))))
256
(define (call-with-encoded-output-string encoding proc)
  ;; Call PROC with an output port and return, as a bytevector, what it
  ;; wrote, encoded in ENCODING.  UTF-8 (matched case-insensitively) goes
  ;; through a string port and `string->utf8'; any other encoding is set
  ;; on a bytevector port before PROC writes to it.
  (define (write-with-encoding port)
    (set-port-encoding! port encoding)
    (proc port))

  (if (string-ci=? encoding "utf-8")
      (string->utf8 (call-with-output-string* proc))
      (call-with-output-bytevector* write-with-encoding)))
264
(define (encode-string str encoding)
  ;; Return the bytes of STR in ENCODING as a bytevector.  UTF-8 is
  ;; handled directly; other encodings go through an encoded port.
  (cond
   ((string-ci=? encoding "utf-8")
    (string->utf8 str))
   (else
    (call-with-encoded-output-string
     encoding
     (lambda (port)
       (display str port))))))
271
(define (decode-string bv encoding)
  ;; Return the string obtained by decoding the bytevector BV according
  ;; to ENCODING.  UTF-8 (matched case-insensitively) is decoded
  ;; directly; any other encoding is decoded by reading BV through a
  ;; port with that encoding set.
  (if (string-ci=? encoding "utf-8")
      (utf8->string bv)
      (let ((p (open-bytevector-input-port bv)))
        (set-port-encoding! p encoding)
        (let ((res (read-delimited "" p)))
          (close-port p)
          ;; `read-delimited' yields the EOF object, not "", when the
          ;; port has nothing to read; map that back to the empty string
          ;; so an empty BV decodes to "" in every encoding.
          (if (eof-object? res)
              ""
              res)))))
280
281
282 ;; A note on characters and bytes: URIs are defined to be sequences of
283 ;; characters in a subset of ASCII. Those characters may encode a
284 ;; sequence of bytes (octets), which in turn may encode sequences of
285 ;; characters in other character sets.
286 ;;
287
288 ;; Return a new string made from uri-decoding @var{str}. Specifically,
289 ;; turn @code{+} into space, and hex-encoded @code{%XX} strings into
290 ;; their eight-bit characters.
291 ;;
;; The characters allowed after a `%' in a percent-encoding: the digits
;; and the letters a-f in either case.
(define hex-chars
  char-set:hex-digit)
294
(define* (uri-decode str #:key (encoding "utf-8"))
  "Percent-decode the given @var{str}, according to @var{encoding}.

Each @code{+} becomes a space, each @code{%@var{HH}} sequence becomes the
byte with hexadecimal value @var{HH}, and any other ASCII character
stands for itself.  A non-ASCII character in @var{str} signals a
@code{uri-error}.

Note that this function should not generally be applied to a full URI
string.  For paths, use split-and-decode-uri-path instead.  For query
strings, split the query on @code{&} and @code{=} boundaries, and decode
the components separately.

Note that percent-encoded strings encode @emph{bytes}, not characters.
There is no guarantee that a given byte sequence is a valid string
encoding.  Therefore this routine may signal an error if the decoded
bytes are not valid for the given encoding.  Pass @code{#f} for
@var{encoding} if you want decoded bytes as a bytevector directly."
  (define end (string-length str))

  ;; If a complete %HH escape starts at POS, return the byte it denotes;
  ;; otherwise return #f.
  (define (escaped-octet pos)
    (and (< (+ pos 2) end)
         (eqv? (string-ref str pos) #\%)
         (let ((hi (string-ref str (+ pos 1)))
               (lo (string-ref str (+ pos 2))))
           (and (char-set-contains? hex-chars hi)
                (char-set-contains? hex-chars lo)
                (string->number (string hi lo) 16)))))

  ;; Write the decoded bytes of STR to PORT.
  (define (emit-octets port)
    (let next ((pos 0))
      (when (< pos end)
        (let ((c (string-ref str pos)))
          (cond
           ((eqv? c #\+)
            (put-u8 port (char->integer #\space))
            (next (+ pos 1)))
           ((escaped-octet pos)
            => (lambda (octet)
                 (put-u8 port octet)
                 (next (+ pos 3))))
           ;; Any other ASCII character, including a `%' that does not
           ;; begin a well-formed escape, is copied through unchanged.
           ((< (char->integer c) 128)
            (put-u8 port (char->integer c))
            (next (+ pos 1)))
           (else
            (uri-error "Invalid character in encoded URI ~a: ~s"
                       str c)))))))

  (let ((octets (call-with-output-bytevector* emit-octets)))
    (if encoding
        (decode-string octets encoding)
        ;; Otherwise return the raw bytevector.
        octets)))
338
;; The ASCII letters and digits: a-z, A-Z and 0-9.
(define ascii-alnum-chars
  (char-set-intersection char-set:ascii
                         char-set:letter+digit))
342
343 ;; RFC 3986, #2.2.
;; gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
(define gen-delims
  (string->char-set ":/?#[]@"))
;; sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
;;            / "*" / "+" / "," / ";" / "="
;; The semicolon belongs to this set; the letter `l' that used to stand
;; in its place does not.
(define sub-delims
  (string->char-set "!$&'()*+,;="))
;; reserved = gen-delims / sub-delims
(define reserved-chars
  (char-set-union gen-delims sub-delims))
350
351 ;; RFC 3986, #2.3
;; unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
(define unreserved-chars
  (char-set-adjoin ascii-alnum-chars #\- #\. #\_ #\~))
355
356 ;; Return a new string made from uri-encoding @var{str}, unconditionally
357 ;; transforming any characters not in @var{unescaped-chars}.
358 ;;
(define* (uri-encode str #:key (encoding "utf-8")
                     (unescaped-chars unreserved-chars))
  "Percent-encode any character not in the character set, @var{unescaped-chars}.

Percent-encoding first writes out the given character to a bytevector
within the given @var{encoding}, then encodes each byte as
@code{%@var{HH}}, where @var{HH} is the two-digit uppercase hexadecimal
representation of the byte.

If no character of @var{str} needs escaping, @var{str} itself is
returned."
  (define (needs-escaped? ch)
    (not (char-set-contains? unescaped-chars ch)))

  ;; Write CH to PORT as one %HH triplet per byte of its encoding.
  (define (write-escaped ch port)
    (let* ((bv (encode-string (string ch) encoding))
           (len (bytevector-length bv)))
      (let lp ((i 0))
        (when (< i len)
          (let ((byte (bytevector-u8-ref bv i)))
            (display #\% port)
            ;; `number->string' does not pad, so supply the leading zero.
            (when (< byte 16)
              (display #\0 port))
            ;; RFC 3986, #2.1: producers should use uppercase hexadecimal
            ;; digits; `number->string' yields lowercase.
            (display (string-upcase (number->string byte 16)) port)
            (lp (1+ i)))))))

  (if (string-index str needs-escaped?)
      (call-with-output-string*
       (lambda (port)
         (string-for-each
          (lambda (ch)
            (if (needs-escaped? ch)
                (write-escaped ch port)
                (display ch port)))
          str)))
      ;; Nothing to escape: hand back the argument unchanged.
      str))
388
(define (split-and-decode-uri-path path)
  "Split @var{path} at each @code{/}, percent-decode every resulting
component, and return the list of decoded components with the empty ones
dropped.

For example, @code{\"/foo/bar/\"} decodes to the two-element list,
@code{(\"foo\" \"bar\")}."
  (define (non-empty? component)
    (not (string-null? component)))

  (let* ((raw-components (string-split path #\/))
         (decoded-components (map uri-decode raw-components)))
    (filter non-empty? decoded-components)))
397
(define (encode-and-join-uri-path parts)
  "Percent-encode each string in the list @var{parts} and return the
encoded strings joined into a single string with @code{/} between them."
  (let ((encoded-parts (map uri-encode parts)))
    (string-join encoded-parts "/")))