Commit | Line | Data |
---|---|---|
73124c6c AW |
1 | ;;;; (web uri) --- URI manipulation tools |
2 | ;;;; | |
a964aa62 | 3 | ;;;; Copyright (C) 1997,2001,2002,2010,2011 Free Software Foundation, Inc. |
73124c6c AW |
4 | ;;;; |
5 | ;;;; This library is free software; you can redistribute it and/or | |
6 | ;;;; modify it under the terms of the GNU Lesser General Public | |
7 | ;;;; License as published by the Free Software Foundation; either | |
8 | ;;;; version 3 of the License, or (at your option) any later version. | |
9 | ;;;; | |
10 | ;;;; This library is distributed in the hope that it will be useful, | |
11 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | ;;;; Lesser General Public License for more details. | |
14 | ;;;; | |
15 | ;;;; You should have received a copy of the GNU Lesser General Public | |
16 | ;;;; License along with this library; if not, write to the Free Software | |
17 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 | ;;;; | |
15c9af8c AW |
19 | |
20 | ;;; Commentary: | |
21 | ||
277bbe96 AW |
22 | ;; A data type for Universal Resource Identifiers, as defined in RFC |
23 | ;; 3986. | |
15c9af8c AW |
24 | |
25 | ;;; Code: | |
26 | ||
73124c6c | 27 | (define-module (web uri) |
277bbe96 AW |
28 | #:use-module (srfi srfi-9) |
29 | #:use-module (ice-9 regex) | |
30 | #:use-module (ice-9 rdelim) | |
31 | #:use-module (ice-9 control) | |
32 | #:use-module (rnrs bytevectors) | |
6854c324 | 33 | #:use-module (ice-9 binary-ports) |
73124c6c AW |
34 | #:export (uri? |
35 | uri-scheme uri-userinfo uri-host uri-port | |
36 | uri-path uri-query uri-fragment | |
37 | ||
38 | build-uri | |
4eb7c8f0 | 39 | declare-default-port! |
8745c33a | 40 | string->uri uri->string |
73124c6c AW |
41 | uri-decode uri-encode |
42 | split-and-decode-uri-path | |
277bbe96 | 43 | encode-and-join-uri-path)) |
73124c6c AW |
44 | |
45 | (define-record-type <uri> | |
46 | (make-uri scheme userinfo host port path query fragment) | |
47 | uri? | |
48 | (scheme uri-scheme) | |
49 | (userinfo uri-userinfo) | |
50 | (host uri-host) | |
51 | (port uri-port) | |
52 | (path uri-path) | |
53 | (query uri-query) | |
54 | (fragment uri-fragment)) | |
55 | ||
5a2f7fb3 AW |
56 | (define (uri-error message . args) |
57 | (throw 'uri-error message args)) | |
58 | ||
73124c6c AW |
59 | (define (positive-exact-integer? port) |
60 | (and (number? port) (exact? port) (integer? port) (positive? port))) | |
61 | ||
62 | (define (validate-uri scheme userinfo host port path query fragment) | |
15c9af8c | 63 | (cond |
73124c6c | 64 | ((not (symbol? scheme)) |
5a2f7fb3 | 65 | (uri-error "Expected a symbol for the URI scheme: ~s" scheme)) |
73124c6c | 66 | ((and (or userinfo port) (not host)) |
5a2f7fb3 | 67 | (uri-error "Expected a host, given userinfo or port")) |
73124c6c | 68 | ((and port (not (positive-exact-integer? port))) |
5a2f7fb3 | 69 | (uri-error "Expected port to be an integer: ~s" port)) |
73124c6c | 70 | ((and host (or (not (string? host)) (not (valid-host? host)))) |
5a2f7fb3 | 71 | (uri-error "Expected valid host: ~s" host)) |
73124c6c | 72 | ((and userinfo (not (string? userinfo))) |
5a2f7fb3 | 73 | (uri-error "Expected string for userinfo: ~s" userinfo)) |
73124c6c | 74 | ((not (string? path)) |
5a2f7fb3 | 75 | (uri-error "Expected string for path: ~s" path)) |
73124c6c AW |
76 | ((and host (not (string-null? path)) |
77 | (not (eqv? (string-ref path 0) #\/))) | |
5a2f7fb3 | 78 | (uri-error "Expected path of absolute URI to start with a /: ~a" path)))) |
15c9af8c | 79 | |
73124c6c AW |
80 | (define* (build-uri scheme #:key userinfo host port (path "") query fragment |
81 | (validate? #t)) | |
277bbe96 AW |
82 | "Construct a URI object. If @var{validate?} is true, also run some |
83 | consistency checks to make sure that the constructed URI is valid." | |
73124c6c AW |
84 | (if validate? |
85 | (validate-uri scheme userinfo host port path query fragment)) | |
86 | (make-uri scheme userinfo host port path query fragment)) | |
15c9af8c | 87 | |
73124c6c AW |
88 | ;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC |
89 | ;; 3490), and non-ASCII host names. | |
90 | ;; | |
91 | (define ipv4-regexp | |
1868309a | 92 | (make-regexp "^([0-9.]+)$")) |
73124c6c | 93 | (define ipv6-regexp |
1868309a | 94 | (make-regexp "^\\[([0-9a-fA-F:]+)\\]+$")) |
73124c6c AW |
95 | (define domain-label-regexp |
96 | (make-regexp "^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$")) | |
97 | (define top-label-regexp | |
1868309a | 98 | (make-regexp "^[a-zA-Z]?([a-zA-Z0-9-]*[a-zA-Z0-9])?$")) |
15c9af8c | 99 | |
73124c6c AW |
100 | (define (valid-host? host) |
101 | (cond | |
102 | ((regexp-exec ipv4-regexp host) | |
103 | => (lambda (m) | |
104 | (false-if-exception (inet-pton AF_INET (match:substring m 1))))) | |
105 | ((regexp-exec ipv6-regexp host) | |
106 | => (lambda (m) | |
107 | (false-if-exception (inet-pton AF_INET6 (match:substring m 1))))) | |
15c9af8c | 108 | (else |
73124c6c AW |
109 | (let ((labels (reverse (string-split host #\.)))) |
110 | (and (pair? labels) | |
111 | (regexp-exec top-label-regexp (car labels)) | |
112 | (and-map (lambda (label) | |
113 | (regexp-exec domain-label-regexp label)) | |
114 | (cdr labels))))))) | |
115 | ||
116 | (define userinfo-pat | |
117 | "[a-zA-Z0-9_.!~*'();:&=+$,-]+") | |
118 | (define host-pat | |
119 | "[a-zA-Z0-9.-]+") | |
120 | (define port-pat | |
121 | "[0-9]*") | |
122 | (define authority-regexp | |
123 | (make-regexp | |
124 | (format #f "^//((~a)@)?(~a)(:(~a))?$" | |
125 | userinfo-pat host-pat port-pat))) | |
126 | ||
127 | (define (parse-authority authority fail) | |
679eea4f AW |
128 | (if (equal? authority "//") |
129 | ;; Allow empty authorities: file:///etc/hosts is a synonym of | |
130 | ;; file:/etc/hosts. | |
131 | (values #f #f #f) | |
132 | (let ((m (regexp-exec authority-regexp authority))) | |
133 | (if (and m (valid-host? (match:substring m 3))) | |
134 | (values (match:substring m 2) | |
135 | (match:substring m 3) | |
136 | (let ((port (match:substring m 5))) | |
137 | (and port (not (string-null? port)) | |
138 | (string->number port)))) | |
139 | (fail))))) | |
73124c6c AW |
140 | |
141 | ||
142 | ;;; RFC 3986, #3. | |
143 | ;;; | |
144 | ;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] | |
145 | ;;; | |
146 | ;;; hier-part = "//" authority path-abempty | |
147 | ;;; / path-absolute | |
148 | ;;; / path-rootless | |
149 | ;;; / path-empty | |
150 | ||
151 | (define scheme-pat | |
152 | "[a-zA-Z][a-zA-Z0-9+.-]*") | |
153 | (define authority-pat | |
154 | "[^/?#]*") | |
155 | (define path-pat | |
156 | "[^?#]*") | |
157 | (define query-pat | |
158 | "[^#]*") | |
159 | (define fragment-pat | |
160 | ".*") | |
161 | (define uri-pat | |
162 | (format #f "^(~a):(//~a)?(~a)(\\?(~a))?(#(~a))?$" | |
163 | scheme-pat authority-pat path-pat query-pat fragment-pat)) | |
164 | (define uri-regexp | |
165 | (make-regexp uri-pat)) | |
166 | ||
8745c33a | 167 | (define (string->uri string) |
277bbe96 AW |
168 | "Parse @var{string} into a URI object. Returns @code{#f} if the string |
169 | could not be parsed." | |
73124c6c AW |
170 | (% (let ((m (regexp-exec uri-regexp string))) |
171 | (if (not m) (abort)) | |
172 | (let ((scheme (string->symbol | |
173 | (string-downcase (match:substring m 1)))) | |
174 | (authority (match:substring m 2)) | |
175 | (path (match:substring m 3)) | |
176 | (query (match:substring m 5)) | |
177 | (fragment (match:substring m 7))) | |
178 | (call-with-values | |
179 | (lambda () | |
180 | (if authority | |
181 | (parse-authority authority abort) | |
182 | (values #f #f #f))) | |
183 | (lambda (userinfo host port) | |
184 | (make-uri scheme userinfo host port path query fragment))))) | |
185 | (lambda (k) | |
186 | #f))) | |
187 | ||
4eb7c8f0 AW |
188 | (define *default-ports* (make-hash-table)) |
189 | ||
190 | (define (declare-default-port! scheme port) | |
277bbe96 AW |
191 | "Declare a default port for the given URI scheme. |
192 | ||
193 | Default ports are for printing URI objects: a default port is not | |
194 | printed." | |
4eb7c8f0 AW |
195 | (hashq-set! *default-ports* scheme port)) |
196 | ||
197 | (define (default-port? scheme port) | |
198 | (or (not port) | |
199 | (eqv? port (hashq-ref *default-ports* scheme)))) | |
200 | ||
201 | (declare-default-port! 'http 80) | |
202 | (declare-default-port! 'https 443) | |
203 | ||
8745c33a | 204 | (define (uri->string uri) |
277bbe96 | 205 | "Serialize @var{uri} to a string." |
73124c6c AW |
206 | (let* ((scheme-str (string-append |
207 | (symbol->string (uri-scheme uri)) ":")) | |
208 | (userinfo (uri-userinfo uri)) | |
209 | (host (uri-host uri)) | |
210 | (port (uri-port uri)) | |
211 | (path (uri-path uri)) | |
212 | (query (uri-query uri)) | |
213 | (fragment (uri-fragment uri))) | |
214 | (string-append | |
215 | scheme-str | |
216 | (if host | |
217 | (string-append "//" | |
218 | (if userinfo (string-append userinfo "@") | |
219 | "") | |
220 | host | |
4eb7c8f0 AW |
221 | (if (default-port? (uri-scheme uri) port) |
222 | "" | |
223 | (string-append ":" (number->string port)))) | |
73124c6c AW |
224 | "") |
225 | path | |
226 | (if query | |
227 | (string-append "?" query) | |
228 | "") | |
229 | (if fragment | |
230 | (string-append "#" fragment) | |
231 | "")))) | |
232 | ||
233 | ||
a964aa62 AW |
234 | ;; like call-with-output-string, but actually closes the port (doh) |
235 | (define (call-with-output-string* proc) | |
236 | (let ((port (open-output-string))) | |
237 | (proc port) | |
238 | (let ((str (get-output-string port))) | |
239 | (close-port port) | |
240 | str))) | |
241 | ||
242 | (define (call-with-output-bytevector* proc) | |
243 | (call-with-values | |
244 | (lambda () | |
245 | (open-bytevector-output-port)) | |
246 | (lambda (port get-bytevector) | |
247 | (proc port) | |
248 | (let ((bv (get-bytevector))) | |
249 | (close-port port) | |
250 | bv)))) | |
251 | ||
7d6b8b75 AW |
252 | (define (call-with-encoded-output-string encoding proc) |
253 | (if (string-ci=? encoding "utf-8") | |
a964aa62 AW |
254 | (string->utf8 (call-with-output-string* proc)) |
255 | (call-with-output-bytevector* | |
256 | (lambda (port) | |
257 | (set-port-encoding! port encoding) | |
258 | (proc port))))) | |
c7857da6 | 259 | |
7d6b8b75 AW |
260 | (define (encode-string str encoding) |
261 | (if (string-ci=? encoding "utf-8") | |
c7857da6 | 262 | (string->utf8 str) |
7d6b8b75 | 263 | (call-with-encoded-output-string encoding |
c7857da6 AW |
264 | (lambda (port) |
265 | (display str port))))) | |
266 | ||
7d6b8b75 AW |
267 | (define (decode-string bv encoding) |
268 | (if (string-ci=? encoding "utf-8") | |
c7857da6 AW |
269 | (utf8->string bv) |
270 | (let ((p (open-bytevector-input-port bv))) | |
7d6b8b75 | 271 | (set-port-encoding! p encoding) |
a964aa62 AW |
272 | (let ((res (read-delimited "" p))) |
273 | (close-port p) | |
274 | res)))) | |
c7857da6 AW |
275 | |
276 | ||
73124c6c AW |
277 | ;; A note on characters and bytes: URIs are defined to be sequences of |
278 | ;; characters in a subset of ASCII. Those characters may encode a | |
279 | ;; sequence of bytes (octets), which in turn may encode sequences of | |
280 | ;; characters in other character sets. | |
281 | ;; | |
282 | ||
283 | ;; Return a new string made from uri-decoding @var{str}. Specifically, | |
284 | ;; turn @code{+} into space, and hex-encoded @code{%XX} strings into | |
285 | ;; their eight-bit characters. | |
286 | ;; | |
287 | (define hex-chars | |
288 | (string->char-set "0123456789abcdefABCDEF")) | |
289 | ||
7d6b8b75 AW |
290 | (define* (uri-decode str #:key (encoding "utf-8")) |
291 | "Percent-decode the given @var{str}, according to @var{encoding}. | |
277bbe96 AW |
292 | |
293 | Note that this function should not generally be applied to a full URI | |
294 | string. For paths, use split-and-decode-uri-path instead. For query | |
295 | strings, split the query on @code{&} and @code{=} boundaries, and decode | |
296 | the components separately. | |
297 | ||
298 | Note that percent-encoded strings encode @emph{bytes}, not characters. | |
299 | There is no guarantee that a given byte sequence is a valid string | |
300 | encoding. Therefore this routine may signal an error if the decoded | |
301 | bytes are not valid for the given encoding. Pass @code{#f} for | |
7d6b8b75 | 302 | @var{encoding} if you want decoded bytes as a bytevector directly." |
a964aa62 AW |
303 | (let* ((len (string-length str)) |
304 | (bv | |
305 | (call-with-output-bytevector* | |
306 | (lambda (port) | |
307 | (let lp ((i 0)) | |
308 | (if (< i len) | |
309 | (let ((ch (string-ref str i))) | |
310 | (cond | |
311 | ((eqv? ch #\+) | |
312 | (put-u8 port (char->integer #\space)) | |
313 | (lp (1+ i))) | |
314 | ((and (< (+ i 2) len) (eqv? ch #\%) | |
315 | (let ((a (string-ref str (+ i 1))) | |
316 | (b (string-ref str (+ i 2)))) | |
317 | (and (char-set-contains? hex-chars a) | |
318 | (char-set-contains? hex-chars b) | |
319 | (string->number (string a b) 16)))) | |
320 | => (lambda (u8) | |
321 | (put-u8 port u8) | |
322 | (lp (+ i 3)))) | |
323 | ((< (char->integer ch) 128) | |
324 | (put-u8 port (char->integer ch)) | |
325 | (lp (1+ i))) | |
326 | (else | |
327 | (uri-error "Invalid character in encoded URI ~a: ~s" | |
328 | str ch)))))))))) | |
329 | (if encoding | |
330 | (decode-string bv encoding) | |
331 | ;; Otherwise return raw bytevector | |
332 | bv))) | |
333 | ||
73124c6c AW |
334 | (define ascii-alnum-chars |
335 | (string->char-set | |
336 | "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")) | |
337 | ||
338 | ;; RFC 3986, #2.2. | |
339 | (define gen-delims | |
340 | (string->char-set ":/?#[]@")) | |
341 | (define sub-delims | |
342 | (string->char-set "!$&'()*+,l=")) | |
15c9af8c | 343 | (define reserved-chars |
73124c6c | 344 | (char-set-union gen-delims sub-delims)) |
15c9af8c | 345 | |
73124c6c AW |
346 | ;; RFC 3986, #2.3 |
347 | (define unreserved-chars | |
348 | (char-set-union ascii-alnum-chars | |
349 | (string->char-set "-._~"))) | |
15c9af8c | 350 | |
73124c6c AW |
351 | ;; Return a new string made from uri-encoding @var{str}, unconditionally |
352 | ;; transforming any characters not in @var{unescaped-chars}. | |
353 | ;; | |
7d6b8b75 | 354 | (define* (uri-encode str #:key (encoding "utf-8") |
73124c6c | 355 | (unescaped-chars unreserved-chars)) |
91b320fe | 356 | "Percent-encode any character not in the character set, @var{unescaped-chars}. |
277bbe96 AW |
357 | |
358 | Percent-encoding first writes out the given character to a bytevector | |
7d6b8b75 | 359 | within the given @var{encoding}, then encodes each byte as |
277bbe96 AW |
360 | @code{%@var{HH}}, where @var{HH} is the hexadecimal representation of |
361 | the byte." | |
91b320fe | 362 | (if (string-index str unescaped-chars) |
a964aa62 | 363 | (call-with-output-string* |
91b320fe AW |
364 | (lambda (port) |
365 | (string-for-each | |
366 | (lambda (ch) | |
367 | (if (char-set-contains? unescaped-chars ch) | |
368 | (display ch port) | |
369 | (let* ((bv (encode-string (string ch) encoding)) | |
370 | (len (bytevector-length bv))) | |
371 | (let lp ((i 0)) | |
372 | (if (< i len) | |
373 | (let ((byte (bytevector-u8-ref bv i))) | |
374 | (display #\% port) | |
375 | (display (number->string byte 16) port) | |
376 | (lp (1+ i)))))))) | |
377 | str))) | |
378 | str)) | |
15c9af8c | 379 | |
73124c6c | 380 | (define (split-and-decode-uri-path path) |
277bbe96 AW |
381 | "Split @var{path} into its components, and decode each |
382 | component, removing empty components. | |
383 | ||
384 | For example, @code{\"/foo/bar/\"} decodes to the two-element list, | |
385 | @code{(\"foo\" \"bar\")}." | |
15c9af8c | 386 | (filter (lambda (x) (not (string-null? x))) |
73124c6c | 387 | (map uri-decode (string-split path #\/)))) |
15c9af8c | 388 | |
73124c6c | 389 | (define (encode-and-join-uri-path parts) |
277bbe96 AW |
390 | "URI-encode each element of @var{parts}, which should be a list of |
391 | strings, and join the parts together with @code{/} as a delimiter." | |
73124c6c | 392 | (string-join (map uri-encode parts) "/")) |