add some debugging to (web server)
[bpt/guile.git] / module / web / uri.scm
1 ;;;; (web uri) --- URI manipulation tools
2 ;;;;
3 ;;;; Copyright (C) 1997,2001,2002,2010 Free Software Foundation, Inc.
4 ;;;;
5 ;;;; This library is free software; you can redistribute it and/or
6 ;;;; modify it under the terms of the GNU Lesser General Public
7 ;;;; License as published by the Free Software Foundation; either
8 ;;;; version 3 of the License, or (at your option) any later version.
9 ;;;;
10 ;;;; This library is distributed in the hope that it will be useful,
11 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ;;;; Lesser General Public License for more details.
14 ;;;;
15 ;;;; You should have received a copy of the GNU Lesser General Public
16 ;;;; License along with this library; if not, write to the Free Software
17 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 ;;;;
19
20 ;;; Commentary:
21
22 ;; Based on (www url). To be documented.
23
24 ;;; Code:
25
26 (define-module (web uri)
27 #:export (uri?
28 uri-scheme uri-userinfo uri-host uri-port
29 uri-path uri-query uri-fragment
30
31 build-uri
32 declare-default-port!
33 parse-uri unparse-uri
34 uri-decode uri-encode
35 split-and-decode-uri-path
36 encode-and-join-uri-path)
37 #:use-module (srfi srfi-9)
38 #:use-module (ice-9 regex)
39 #:use-module (ice-9 control)
40 #:use-module (rnrs bytevectors)
41 #:use-module (rnrs io ports))
42
;; Record type holding the seven components of a URI.  Only a
;; constructor, a predicate and field accessors are declared here; no
;; field modifiers are defined.
(define-record-type <uri>
  ;; Positional constructor, taking the fields in declaration order.
  (make-uri scheme userinfo host port path query fragment)
  ;; Type predicate.
  uri?
  ;; One accessor per field.
  (scheme uri-scheme)
  (userinfo uri-userinfo)
  (host uri-host)
  (port uri-port)
  (path uri-path)
  (query uri-query)
  (fragment uri-fragment))
53
;; Signal a URI-related failure by throwing to the `uri-error' key.
;; The handler receives MESSAGE (a format string) followed by a single
;; list holding all remaining arguments.
(define uri-error
  (lambda (message . args)
    (throw 'uri-error message args)))
56
;; Return #t if PORT is an exact integer strictly greater than zero,
;; and #f for anything else (including non-numbers).
(define (positive-exact-integer? port)
  ;; `integer?' is tested first: it answers #f for non-numbers, so the
  ;; numeric predicates that follow never see a non-number.
  (and (integer? port)
       (exact? port)
       (< 0 port)))
59
;; Check the components of a URI before a record is built from them.
;; Throws `uri-error' (via `uri-error') describing the first problem
;; found; returns an unspecified value when every check passes.
;;
;; SCHEME must be a symbol.  USERINFO, HOST, PORT, QUERY and FRAGMENT
;; may each be #f (absent); PATH must always be a string.
(define (validate-uri scheme userinfo host port path query fragment)
  (cond
   ((not (symbol? scheme))
    (uri-error "Expected a symbol for the URI scheme: ~s" scheme))
   ;; Userinfo and port are parts of the authority, which needs a host.
   ((and (or userinfo port) (not host))
    (uri-error "Expected a host, given userinfo or port"))
   ((and port (not (positive-exact-integer? port)))
    (uri-error "Expected port to be an integer: ~s" port))
   ((and host (or (not (string? host)) (not (valid-host? host))))
    (uri-error "Expected valid host: ~s" host))
   ((and userinfo (not (string? userinfo)))
    (uri-error "Expected string for userinfo: ~s" userinfo))
   ((not (string? path))
    (uri-error "Expected string for path: ~s" path))
   ;; With an authority present, a non-empty path must be absolute.
   ((and host (not (string-null? path))
         (not (eqv? (string-ref path 0) #\/)))
    (uri-error "Expected path of absolute URI to start with a /: ~a" path))
   ;; Query and fragment were previously accepted unchecked; a
   ;; non-string value would only fail later, inside `string-append'
   ;; when the URI is unparsed.  These checks come last so that the
   ;; error reported for pre-existing failure cases does not change.
   ((and query (not (string? query)))
    (uri-error "Expected string for query: ~s" query))
   ((and fragment (not (string? fragment)))
    (uri-error "Expected string for fragment: ~s" fragment))))
77
;; Construct a <uri> record from SCHEME and the keyword components.
;; PATH defaults to the empty string; the other components default to
;; #f.  Unless VALIDATE? is false, the components are first passed to
;; `validate-uri'.
(define* (build-uri scheme #:key userinfo host port (path "") query fragment
                    (validate? #t))
  ;; Both the validator and the constructor take the fields in the same
  ;; order, so collect them once.
  (let ((fields (list scheme userinfo host port path query fragment)))
    (if validate?
        (apply validate-uri fields))
    (apply make-uri fields)))
83
84 ;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC
85 ;; 3490), and non-ASCII host names.
86 ;;
;; Candidate IPv4 literal: the *whole* host consists of digits and dots.
;; The pattern is anchored at both ends; without the trailing `$' a
;; domain name that merely starts with a digit (e.g. "3com.com") would
;; match on its prefix and be treated as a malformed address, while
;; "127.0.0.1junk" would be accepted on the strength of its prefix.
;; Group 1 is the address text.
(define ipv4-regexp
  (make-regexp "^([0-9.]+)$"))

;; Candidate IPv6 literal: exactly one pair of square brackets around
;; hex digits, colons and dots (dots allow the embedded-IPv4 form such
;; as "::ffff:1.2.3.4").  Anchored at both ends so that extra closing
;; brackets or trailing text are rejected.  Group 1 is the address text
;; without the brackets.
(define ipv6-regexp
  (make-regexp "^\\[([0-9a-fA-F:.]+)\\]$"))

;; A single domain label: starts and ends with an ASCII letter or digit,
;; with letters, digits and hyphens allowed in between.
(define domain-label-regexp
  (make-regexp "^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))

;; Same as a domain label, except that the first character must be a
;; letter.
(define top-label-regexp
  (make-regexp "^[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
95
;; Return a true value if HOST (a string) is acceptable as a URI host,
;; and #f otherwise.  HOST is tried, in order, as an IPv4 literal, as a
;; bracketed IPv6 literal, and finally as a dotted domain name.
(define (valid-host? host)
  ;; An address literal is accepted when `inet-pton' parses group 1 of
  ;; the match without raising an exception.
  (define (address-literal? family m)
    (false-if-exception (inet-pton family (match:substring m 1))))

  ;; Walk the labels right to left: the last label must satisfy
  ;; `top-label-regexp', every other one `domain-label-regexp'.
  (define (domain-name?)
    (let ((reversed (reverse (string-split host #\.))))
      (and (pair? reversed)
           (regexp-exec top-label-regexp (car reversed))
           (let check ((remaining (cdr reversed))
                       (last-result #t))
             (if (null? remaining)
                 last-result
                 (let ((m (regexp-exec domain-label-regexp (car remaining))))
                   (and m (check (cdr remaining) m))))))))

  (let ((v4 (regexp-exec ipv4-regexp host)))
    (if v4
        (address-literal? AF_INET v4)
        (let ((v6 (regexp-exec ipv6-regexp host)))
          (if v6
              (address-literal? AF_INET6 v6)
              (domain-name?))))))
111
;; Characters permitted in the userinfo part (one or more).
(define userinfo-pat
  "[a-zA-Z0-9_.!~*'();:&=+$,-]+")
;; Characters permitted in the host part (one or more).  Square
;; brackets are not in this class.
(define host-pat
  "[a-zA-Z0-9.-]+")
;; Port digits; zero digits are allowed, so "host:" still matches.
(define port-pat
  "[0-9]*")
;; Matches a whole "//[userinfo@]host[:port]" string.  Capture groups:
;;   1 = userinfo followed by "@"   2 = userinfo
;;   3 = host
;;   4 = ":" followed by port       5 = port
(define authority-regexp
  (make-regexp
   (format #f "^//((~a)@)?(~a)(:(~a))?$"
           userinfo-pat host-pat port-pat)))
122
;; Split AUTHORITY (a string beginning with "//") into its parts.
;; Returns three values: userinfo (string or #f), host (string), and
;; port (number, or #f when absent or empty).  If AUTHORITY does not
;; match `authority-regexp' or its host fails `valid-host?', the thunk
;; FAIL is called in tail position instead.
(define (parse-authority authority fail)
  (let* ((m (regexp-exec authority-regexp authority))
         (host (and m (match:substring m 3))))
    (cond
     ((and host (valid-host? host))
      (let ((port-text (match:substring m 5)))
        (values (match:substring m 2)
                host
                ;; An empty port ("host:") counts as no port at all.
                (and port-text
                     (not (string-null? port-text))
                     (string->number port-text)))))
     (else
      (fail)))))
132
133
134 ;;; RFC 3986, #3.
135 ;;;
136 ;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
137 ;;;
138 ;;; hier-part = "//" authority path-abempty
139 ;;; / path-absolute
140 ;;; / path-rootless
141 ;;; / path-empty
142
;; Scheme: a letter followed by letters, digits, "+", "." or "-".
(define scheme-pat
  "[a-zA-Z][a-zA-Z0-9+.-]*")
;; Authority: everything up to the next "/", "?" or "#".
(define authority-pat
  "[^/?#]*")
;; Path: everything up to the next "?" or "#".
(define path-pat
  "[^?#]*")
;; Query: everything up to the next "#".
(define query-pat
  "[^#]*")
;; Fragment: the remainder of the string.
(define fragment-pat
  ".*")
;; Pattern for a complete URI.  Capture groups:
;;   1 = scheme
;;   2 = "//" followed by the authority (absent if there is none)
;;   3 = path
;;   4 = "?" followed by query      5 = query
;;   6 = "#" followed by fragment   7 = fragment
(define uri-pat
  (format #f "^(~a):(//~a)?(~a)(\\?(~a))?(#(~a))?$"
          scheme-pat authority-pat path-pat query-pat fragment-pat))
;; Compiled form of `uri-pat'.
(define uri-regexp
  (make-regexp uri-pat))
158
;; Parse STRING into a <uri> record.  Returns #f if STRING does not
;; match `uri-regexp' or if its authority part is rejected by
;; `parse-authority'.  The scheme is down-cased and turned into a
;; symbol; the remaining components are kept as matched.
(define (parse-uri string)
  ;; Any failure aborts to the enclosing prompt, whose handler ignores
  ;; the captured continuation and yields #f.
  (% (let ((m (regexp-exec uri-regexp string)))
       (if m
           (let ((authority (match:substring m 2)))
             (call-with-values
                 (lambda ()
                   (if authority
                       (parse-authority authority abort)
                       (values #f #f #f)))
               (lambda (userinfo host port)
                 (make-uri (string->symbol
                            (string-downcase (match:substring m 1)))
                           userinfo
                           host
                           port
                           (match:substring m 3)
                           (match:substring m 5)
                           (match:substring m 7)))))
           (abort)))
     (lambda (k)
       #f)))
177
;; Table mapping a URI scheme (a symbol, compared with `eq?') to the
;; port registered as that scheme's default.
(define *default-ports*
  (make-hash-table))

;; Register PORT as the default port for SCHEME, replacing any earlier
;; registration for the same scheme.
(define declare-default-port!
  (lambda (scheme port)
    (hashq-set! *default-ports* scheme port)))
182
;; Return #t if PORT need not be written out for SCHEME: either there
;; is no port at all (#f), or PORT equals the default registered for
;; SCHEME in `*default-ports*'.
(define (default-port? scheme port)
  (if port
      (eqv? (hashq-ref *default-ports* scheme) port)
      #t))
186
;; Default ports registered when the module is loaded.
(for-each (lambda (entry)
            (declare-default-port! (car entry) (cdr entry)))
          '((http . 80)
            (https . 443)))
189
;; Serialize the <uri> record URI to a string of the form
;; scheme ":" ["//" [userinfo "@"] host [":" port]] path
;; ["?" query] ["#" fragment].  The authority is written only when
;; there is a host, and the port only when `default-port?' says it is
;; not the scheme's default.
(define (unparse-uri uri)
  ;; PREFIX followed by VALUE, or the empty string when VALUE is #f.
  (define (prefixed prefix value)
    (if value
        (string-append prefix value)
        ""))

  (let* ((scheme (uri-scheme uri))
         (userinfo (uri-userinfo uri))
         (host (uri-host uri))
         (port (uri-port uri))
         (authority
          (if host
              (string-append
               "//"
               (if userinfo (string-append userinfo "@") "")
               host
               (if (default-port? scheme port)
                   ""
                   (string-append ":" (number->string port))))
              "")))
    (string-append (symbol->string scheme) ":"
                   authority
                   (uri-path uri)
                   (prefixed "?" (uri-query uri))
                   (prefixed "#" (uri-fragment uri)))))
217
218
219 ;; A note on characters and bytes: URIs are defined to be sequences of
220 ;; characters in a subset of ASCII. Those characters may encode a
221 ;; sequence of bytes (octets), which in turn may encode sequences of
222 ;; characters in other character sets.
223 ;;
224
225 ;; Return a new string made from uri-decoding @var{str}. Specifically,
226 ;; turn @code{+} into space, and hex-encoded @code{%XX} strings into
227 ;; their eight-bit characters.
228 ;;
;; Characters accepted as hexadecimal digits in a %XX escape.
(define hex-chars
  (string->char-set "0123456789abcdefABCDEF"))

;; Decode the percent-encoded string STR.
;;
;; Each "%XX" with two hex digits becomes the octet XX; any other
;; character below code point 128 (including a "%" that is not followed
;; by two hex digits) is passed through as its own octet.  A character
;; at or above code point 128 throws `uri-error'.
;;
;; Keyword arguments:
;;   #:charset  `utf-8' (default) to return the decoded octets as a
;;              string, or #f to return the raw bytevector.  Any other
;;              value throws `uri-error' once decoding has finished.
;;   #:decode-plus-to-space?
;;              when true (the default, matching the previous behaviour)
;;              "+" is decoded as a space, as in form-encoded data.
;;              Pass #f to keep "+" literal, which is what generic URI
;;              components such as path segments require.
(define* (uri-decode str #:key (charset 'utf-8) (decode-plus-to-space? #t))
  (let ((len (string-length str)))
    (call-with-values open-bytevector-output-port
      (lambda (port get-bytevector)
        (let lp ((i 0))
          (if (= i len)
              ((case charset
                 ((utf-8) utf8->string)
                 ((#f) (lambda (x) x)) ; raw bytevector
                 (else (uri-error "Unknown charset: ~s" charset)))
               (get-bytevector))
              (let ((ch (string-ref str i)))
                (cond
                 ((and decode-plus-to-space? (eqv? ch #\+))
                  (put-u8 port (char->integer #\space))
                  (lp (1+ i)))
                 ;; A "%" escape needs two more characters, both hex
                 ;; digits; otherwise fall through and treat "%" as an
                 ;; ordinary character.
                 ((and (< (+ i 2) len) (eqv? ch #\%)
                       (let ((a (string-ref str (+ i 1)))
                             (b (string-ref str (+ i 2))))
                         (and (char-set-contains? hex-chars a)
                              (char-set-contains? hex-chars b)
                              (string->number (string a b) 16))))
                  => (lambda (u8)
                       (put-u8 port u8)
                       (lp (+ i 3))))
                 ((< (char->integer ch) 128)
                  (put-u8 port (char->integer ch))
                  (lp (1+ i)))
                 (else
                  (uri-error "Invalid character in encoded URI ~a: ~s"
                             str ch))))))))))
263
;; Character set of the ASCII letters (both cases) and the decimal
;; digits.
(define ascii-alnum-chars
  (string->char-set
   "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"))
267
268 ;; RFC 3986, #2.2.
;; gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
(define gen-delims
  (string->char-set ":/?#[]@"))
;; sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
;;            / "*" / "+" / "," / ";" / "="
;; The set previously contained the letter "l" where the semicolon
;; belongs, so ";" was missing from the reserved characters and "l" was
;; wrongly included.
(define sub-delims
  (string->char-set "!$&'()*+,;="))
;; reserved = gen-delims / sub-delims
(define reserved-chars
  (char-set-union gen-delims sub-delims))
275
276 ;; RFC 3986, #2.3
;; unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
;; Used as the default value of `uri-encode's #:unescaped-chars.
(define unreserved-chars
  (char-set-union ascii-alnum-chars
                  (string->char-set "-._~")))
280
281 ;; Return a new string made from uri-encoding @var{str}, unconditionally
282 ;; transforming any characters not in @var{unescaped-chars}.
283 ;;
;; Percent-encode STR.
;;
;; Characters that belong to UNESCAPED-CHARS are copied through; every
;; other character is converted to its UTF-8 octets, each written as
;; "%" followed by exactly two hexadecimal digits.
;;
;; Keyword arguments:
;;   #:charset          `utf-8' (default) to return a string, or #f to
;;                      return the encoded octets as a bytevector.  Any
;;                      other value throws `uri-error'.
;;   #:unescaped-chars  char-set of characters left unencoded; defaults
;;                      to `unreserved-chars'.
(define* (uri-encode str #:key (charset 'utf-8)
                     (unescaped-chars unreserved-chars))
  (define (put-utf8 binary-port str)
    (put-bytevector binary-port (string->utf8 str)))

  ;; Write OCTET as "%XX".  Values below 16 need a leading zero:
  ;; `number->string' alone yields a single digit for them, which would
  ;; produce e.g. "%a" for a newline -- not a valid escape, and not
  ;; something `uri-decode' reads back as one.  The digits' case is left
  ;; as `number->string' produces it, so output for octets >= 16 is
  ;; unchanged.
  (define (put-escaped-octet binary-port octet)
    (put-utf8 binary-port "%")
    (if (< octet 16)
        (put-utf8 binary-port "0"))
    (put-utf8 binary-port (number->string octet 16)))

  ((case charset
     ((utf-8) utf8->string)
     ((#f) (lambda (x) x)) ; raw bytevector
     (else (uri-error "Unknown charset: ~s" charset)))
   (call-with-values open-bytevector-output-port
     (lambda (port get-bytevector)
       (string-for-each
        (lambda (ch)
          (if (char-set-contains? unescaped-chars ch)
              (put-utf8 port (string ch))
              ;; Encode each octet of the character's UTF-8 form.
              (for-each (lambda (octet)
                          (put-escaped-octet port octet))
                        (bytevector->u8-list (string->utf8 (string ch))))))
        str)
       (get-bytevector)))))
311
;; Split PATH on "/" and return the list of its `uri-decode'd segments,
;; in order, leaving out any segment that is empty after decoding.
(define (split-and-decode-uri-path path)
  (let collect ((segments (string-split path #\/))
                (kept '()))
    (if (null? segments)
        (reverse kept)
        (let ((decoded (uri-decode (car segments))))
          (collect (cdr segments)
                   (if (string-null? decoded)
                       kept
                       (cons decoded kept)))))))
315
;; `uri-encode' each string in the list PARTS and join the results with
;; "/" between them.  No leading or trailing slash is added.
(define (encode-and-join-uri-path parts)
  (let ((encoded (map (lambda (part) (uri-encode part))
                      parts)))
    (string-join encoded "/")))