quick fix to uri parser
[bpt/guile.git] / module / web / uri.scm
CommitLineData
73124c6c
AW
1;;;; (web uri) --- URI manipulation tools
2;;;;
a964aa62 3;;;; Copyright (C) 1997,2001,2002,2010,2011 Free Software Foundation, Inc.
73124c6c
AW
4;;;;
5;;;; This library is free software; you can redistribute it and/or
6;;;; modify it under the terms of the GNU Lesser General Public
7;;;; License as published by the Free Software Foundation; either
8;;;; version 3 of the License, or (at your option) any later version.
9;;;;
10;;;; This library is distributed in the hope that it will be useful,
11;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13;;;; Lesser General Public License for more details.
14;;;;
15;;;; You should have received a copy of the GNU Lesser General Public
16;;;; License along with this library; if not, write to the Free Software
17;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18;;;;
15c9af8c
AW
19
20;;; Commentary:
21
277bbe96
AW
22;; A data type for Universal Resource Identifiers, as defined in RFC
23;; 3986.
15c9af8c
AW
24
25;;; Code:
26
73124c6c 27(define-module (web uri)
277bbe96
AW
28 #:use-module (srfi srfi-9)
29 #:use-module (ice-9 regex)
30 #:use-module (ice-9 rdelim)
31 #:use-module (ice-9 control)
32 #:use-module (rnrs bytevectors)
6854c324 33 #:use-module (ice-9 binary-ports)
73124c6c
AW
34 #:export (uri?
35 uri-scheme uri-userinfo uri-host uri-port
36 uri-path uri-query uri-fragment
37
38 build-uri
4eb7c8f0 39 declare-default-port!
8745c33a 40 string->uri uri->string
73124c6c
AW
41 uri-decode uri-encode
42 split-and-decode-uri-path
277bbe96 43 encode-and-join-uri-path))
73124c6c
AW
44
45(define-record-type <uri>
46 (make-uri scheme userinfo host port path query fragment)
47 uri?
48 (scheme uri-scheme)
49 (userinfo uri-userinfo)
50 (host uri-host)
51 (port uri-port)
52 (path uri-path)
53 (query uri-query)
54 (fragment uri-fragment))
55
5a2f7fb3
AW
56(define (uri-error message . args)
57 (throw 'uri-error message args))
58
73124c6c
AW
59(define (positive-exact-integer? port)
60 (and (number? port) (exact? port) (integer? port) (positive? port)))
61
62(define (validate-uri scheme userinfo host port path query fragment)
15c9af8c 63 (cond
73124c6c 64 ((not (symbol? scheme))
5a2f7fb3 65 (uri-error "Expected a symbol for the URI scheme: ~s" scheme))
73124c6c 66 ((and (or userinfo port) (not host))
5a2f7fb3 67 (uri-error "Expected a host, given userinfo or port"))
73124c6c 68 ((and port (not (positive-exact-integer? port)))
5a2f7fb3 69 (uri-error "Expected port to be an integer: ~s" port))
73124c6c 70 ((and host (or (not (string? host)) (not (valid-host? host))))
5a2f7fb3 71 (uri-error "Expected valid host: ~s" host))
73124c6c 72 ((and userinfo (not (string? userinfo)))
5a2f7fb3 73 (uri-error "Expected string for userinfo: ~s" userinfo))
73124c6c 74 ((not (string? path))
5a2f7fb3 75 (uri-error "Expected string for path: ~s" path))
73124c6c
AW
76 ((and host (not (string-null? path))
77 (not (eqv? (string-ref path 0) #\/)))
5a2f7fb3 78 (uri-error "Expected path of absolute URI to start with a /: ~a" path))))
15c9af8c 79
73124c6c
AW
80(define* (build-uri scheme #:key userinfo host port (path "") query fragment
81 (validate? #t))
277bbe96
AW
82 "Construct a URI object. If @var{validate?} is true, also run some
83consistency checks to make sure that the constructed URI is valid."
73124c6c
AW
84 (if validate?
85 (validate-uri scheme userinfo host port path query fragment))
86 (make-uri scheme userinfo host port path query fragment))
15c9af8c 87
73124c6c
AW
88;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC
89;; 3490), and non-ASCII host names.
90;;
91(define ipv4-regexp
1868309a 92 (make-regexp "^([0-9.]+)$"))
73124c6c 93(define ipv6-regexp
1868309a 94 (make-regexp "^\\[([0-9a-fA-F:]+)\\]+$"))
73124c6c
AW
95(define domain-label-regexp
96 (make-regexp "^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
97(define top-label-regexp
1868309a 98 (make-regexp "^[a-zA-Z]?([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
15c9af8c 99
73124c6c
AW
100(define (valid-host? host)
101 (cond
102 ((regexp-exec ipv4-regexp host)
103 => (lambda (m)
104 (false-if-exception (inet-pton AF_INET (match:substring m 1)))))
105 ((regexp-exec ipv6-regexp host)
106 => (lambda (m)
107 (false-if-exception (inet-pton AF_INET6 (match:substring m 1)))))
15c9af8c 108 (else
73124c6c
AW
109 (let ((labels (reverse (string-split host #\.))))
110 (and (pair? labels)
111 (regexp-exec top-label-regexp (car labels))
112 (and-map (lambda (label)
113 (regexp-exec domain-label-regexp label))
114 (cdr labels)))))))
115
116(define userinfo-pat
117 "[a-zA-Z0-9_.!~*'();:&=+$,-]+")
118(define host-pat
119 "[a-zA-Z0-9.-]+")
120(define port-pat
121 "[0-9]*")
122(define authority-regexp
123 (make-regexp
124 (format #f "^//((~a)@)?(~a)(:(~a))?$"
125 userinfo-pat host-pat port-pat)))
126
127(define (parse-authority authority fail)
679eea4f
AW
128 (if (equal? authority "//")
129 ;; Allow empty authorities: file:///etc/hosts is a synonym of
130 ;; file:/etc/hosts.
131 (values #f #f #f)
132 (let ((m (regexp-exec authority-regexp authority)))
133 (if (and m (valid-host? (match:substring m 3)))
134 (values (match:substring m 2)
135 (match:substring m 3)
136 (let ((port (match:substring m 5)))
137 (and port (not (string-null? port))
138 (string->number port))))
139 (fail)))))
73124c6c
AW
140
141
142;;; RFC 3986, #3.
143;;;
144;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
145;;;
146;;; hier-part = "//" authority path-abempty
147;;; / path-absolute
148;;; / path-rootless
149;;; / path-empty
150
151(define scheme-pat
152 "[a-zA-Z][a-zA-Z0-9+.-]*")
153(define authority-pat
154 "[^/?#]*")
155(define path-pat
156 "[^?#]*")
157(define query-pat
158 "[^#]*")
159(define fragment-pat
160 ".*")
161(define uri-pat
162 (format #f "^(~a):(//~a)?(~a)(\\?(~a))?(#(~a))?$"
163 scheme-pat authority-pat path-pat query-pat fragment-pat))
164(define uri-regexp
165 (make-regexp uri-pat))
166
8745c33a 167(define (string->uri string)
277bbe96
AW
168 "Parse @var{string} into a URI object. Returns @code{#f} if the string
169could not be parsed."
73124c6c
AW
170 (% (let ((m (regexp-exec uri-regexp string)))
171 (if (not m) (abort))
172 (let ((scheme (string->symbol
173 (string-downcase (match:substring m 1))))
174 (authority (match:substring m 2))
175 (path (match:substring m 3))
176 (query (match:substring m 5))
177 (fragment (match:substring m 7)))
178 (call-with-values
179 (lambda ()
180 (if authority
181 (parse-authority authority abort)
182 (values #f #f #f)))
183 (lambda (userinfo host port)
184 (make-uri scheme userinfo host port path query fragment)))))
185 (lambda (k)
186 #f)))
187
4eb7c8f0
AW
188(define *default-ports* (make-hash-table))
189
190(define (declare-default-port! scheme port)
277bbe96
AW
191 "Declare a default port for the given URI scheme.
192
193Default ports are for printing URI objects: a default port is not
194printed."
4eb7c8f0
AW
195 (hashq-set! *default-ports* scheme port))
196
197(define (default-port? scheme port)
198 (or (not port)
199 (eqv? port (hashq-ref *default-ports* scheme))))
200
201(declare-default-port! 'http 80)
202(declare-default-port! 'https 443)
203
8745c33a 204(define (uri->string uri)
277bbe96 205 "Serialize @var{uri} to a string."
73124c6c
AW
206 (let* ((scheme-str (string-append
207 (symbol->string (uri-scheme uri)) ":"))
208 (userinfo (uri-userinfo uri))
209 (host (uri-host uri))
210 (port (uri-port uri))
211 (path (uri-path uri))
212 (query (uri-query uri))
213 (fragment (uri-fragment uri)))
214 (string-append
215 scheme-str
216 (if host
217 (string-append "//"
218 (if userinfo (string-append userinfo "@")
219 "")
220 host
4eb7c8f0
AW
221 (if (default-port? (uri-scheme uri) port)
222 ""
223 (string-append ":" (number->string port))))
73124c6c
AW
224 "")
225 path
226 (if query
227 (string-append "?" query)
228 "")
229 (if fragment
230 (string-append "#" fragment)
231 ""))))
232
233
a964aa62
AW
234;; like call-with-output-string, but actually closes the port (doh)
235(define (call-with-output-string* proc)
236 (let ((port (open-output-string)))
237 (proc port)
238 (let ((str (get-output-string port)))
239 (close-port port)
240 str)))
241
242(define (call-with-output-bytevector* proc)
243 (call-with-values
244 (lambda ()
245 (open-bytevector-output-port))
246 (lambda (port get-bytevector)
247 (proc port)
248 (let ((bv (get-bytevector)))
249 (close-port port)
250 bv))))
251
7d6b8b75
AW
252(define (call-with-encoded-output-string encoding proc)
253 (if (string-ci=? encoding "utf-8")
a964aa62
AW
254 (string->utf8 (call-with-output-string* proc))
255 (call-with-output-bytevector*
256 (lambda (port)
257 (set-port-encoding! port encoding)
258 (proc port)))))
c7857da6 259
7d6b8b75
AW
260(define (encode-string str encoding)
261 (if (string-ci=? encoding "utf-8")
c7857da6 262 (string->utf8 str)
7d6b8b75 263 (call-with-encoded-output-string encoding
c7857da6
AW
264 (lambda (port)
265 (display str port)))))
266
7d6b8b75
AW
267(define (decode-string bv encoding)
268 (if (string-ci=? encoding "utf-8")
c7857da6
AW
269 (utf8->string bv)
270 (let ((p (open-bytevector-input-port bv)))
7d6b8b75 271 (set-port-encoding! p encoding)
a964aa62
AW
272 (let ((res (read-delimited "" p)))
273 (close-port p)
274 res))))
c7857da6
AW
275
276
73124c6c
AW
277;; A note on characters and bytes: URIs are defined to be sequences of
278;; characters in a subset of ASCII. Those characters may encode a
279;; sequence of bytes (octets), which in turn may encode sequences of
280;; characters in other character sets.
281;;
282
283;; Return a new string made from uri-decoding @var{str}. Specifically,
284;; turn @code{+} into space, and hex-encoded @code{%XX} strings into
285;; their eight-bit characters.
286;;
287(define hex-chars
288 (string->char-set "0123456789abcdefABCDEF"))
289
7d6b8b75
AW
290(define* (uri-decode str #:key (encoding "utf-8"))
291 "Percent-decode the given @var{str}, according to @var{encoding}.
277bbe96
AW
292
293Note that this function should not generally be applied to a full URI
294string. For paths, use split-and-decode-uri-path instead. For query
295strings, split the query on @code{&} and @code{=} boundaries, and decode
296the components separately.
297
298Note that percent-encoded strings encode @emph{bytes}, not characters.
299There is no guarantee that a given byte sequence is a valid string
300encoding. Therefore this routine may signal an error if the decoded
301bytes are not valid for the given encoding. Pass @code{#f} for
7d6b8b75 302@var{encoding} if you want decoded bytes as a bytevector directly."
a964aa62
AW
303 (let* ((len (string-length str))
304 (bv
305 (call-with-output-bytevector*
306 (lambda (port)
307 (let lp ((i 0))
308 (if (< i len)
309 (let ((ch (string-ref str i)))
310 (cond
311 ((eqv? ch #\+)
312 (put-u8 port (char->integer #\space))
313 (lp (1+ i)))
314 ((and (< (+ i 2) len) (eqv? ch #\%)
315 (let ((a (string-ref str (+ i 1)))
316 (b (string-ref str (+ i 2))))
317 (and (char-set-contains? hex-chars a)
318 (char-set-contains? hex-chars b)
319 (string->number (string a b) 16))))
320 => (lambda (u8)
321 (put-u8 port u8)
322 (lp (+ i 3))))
323 ((< (char->integer ch) 128)
324 (put-u8 port (char->integer ch))
325 (lp (1+ i)))
326 (else
327 (uri-error "Invalid character in encoded URI ~a: ~s"
328 str ch))))))))))
329 (if encoding
330 (decode-string bv encoding)
331 ;; Otherwise return raw bytevector
332 bv)))
333
73124c6c
AW
334(define ascii-alnum-chars
335 (string->char-set
336 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"))
337
338;; RFC 3986, #2.2.
339(define gen-delims
340 (string->char-set ":/?#[]@"))
341(define sub-delims
342 (string->char-set "!$&'()*+,l="))
15c9af8c 343(define reserved-chars
73124c6c 344 (char-set-union gen-delims sub-delims))
15c9af8c 345
73124c6c
AW
346;; RFC 3986, #2.3
347(define unreserved-chars
348 (char-set-union ascii-alnum-chars
349 (string->char-set "-._~")))
15c9af8c 350
73124c6c
AW
351;; Return a new string made from uri-encoding @var{str}, unconditionally
352;; transforming any characters not in @var{unescaped-chars}.
353;;
7d6b8b75 354(define* (uri-encode str #:key (encoding "utf-8")
73124c6c 355 (unescaped-chars unreserved-chars))
91b320fe 356 "Percent-encode any character not in the character set, @var{unescaped-chars}.
277bbe96
AW
357
358Percent-encoding first writes out the given character to a bytevector
7d6b8b75 359within the given @var{encoding}, then encodes each byte as
277bbe96
AW
360@code{%@var{HH}}, where @var{HH} is the hexadecimal representation of
361the byte."
91b320fe 362 (if (string-index str unescaped-chars)
a964aa62 363 (call-with-output-string*
91b320fe
AW
364 (lambda (port)
365 (string-for-each
366 (lambda (ch)
367 (if (char-set-contains? unescaped-chars ch)
368 (display ch port)
369 (let* ((bv (encode-string (string ch) encoding))
370 (len (bytevector-length bv)))
371 (let lp ((i 0))
372 (if (< i len)
373 (let ((byte (bytevector-u8-ref bv i)))
374 (display #\% port)
375 (display (number->string byte 16) port)
376 (lp (1+ i))))))))
377 str)))
378 str))
15c9af8c 379
73124c6c 380(define (split-and-decode-uri-path path)
277bbe96
AW
381 "Split @var{path} into its components, and decode each
382component, removing empty components.
383
384For example, @code{\"/foo/bar/\"} decodes to the two-element list,
385@code{(\"foo\" \"bar\")}."
15c9af8c 386 (filter (lambda (x) (not (string-null? x)))
73124c6c 387 (map uri-decode (string-split path #\/))))
15c9af8c 388
73124c6c 389(define (encode-and-join-uri-path parts)
277bbe96
AW
390 "URI-encode each element of @var{parts}, which should be a list of
391strings, and join the parts together with @code{/} as a delimiter."
73124c6c 392 (string-join (map uri-encode parts) "/"))