Commit | Line | Data |
---|---|---|
73124c6c AW |
1 | ;;;; (web uri) --- URI manipulation tools |
2 | ;;;; | |
18c44b29 | 3 | ;;;; Copyright (C) 1997,2001,2002,2010,2011,2012,2013,2014 Free Software Foundation, Inc. |
73124c6c AW |
4 | ;;;; |
5 | ;;;; This library is free software; you can redistribute it and/or | |
6 | ;;;; modify it under the terms of the GNU Lesser General Public | |
7 | ;;;; License as published by the Free Software Foundation; either | |
8 | ;;;; version 3 of the License, or (at your option) any later version. | |
6fe2803b | 9 | ;;;; |
73124c6c AW |
10 | ;;;; This library is distributed in the hope that it will be useful, |
11 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | ;;;; Lesser General Public License for more details. | |
6fe2803b | 14 | ;;;; |
73124c6c AW |
15 | ;;;; You should have received a copy of the GNU Lesser General Public |
16 | ;;;; License along with this library; if not, write to the Free Software | |
17 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 | ;;;; | |
15c9af8c AW |
19 | |
20 | ;;; Commentary: | |
21 | ||
277bbe96 | 22 | ;; A data type for Universal Resource Identifiers, as defined in RFC |
6fe2803b | 23 | ;; 3986. |
15c9af8c AW |
24 | |
25 | ;;; Code: | |
26 | ||
73124c6c | 27 | (define-module (web uri) |
277bbe96 AW |
28 | #:use-module (srfi srfi-9) |
29 | #:use-module (ice-9 regex) | |
30 | #:use-module (ice-9 rdelim) | |
31 | #:use-module (ice-9 control) | |
32 | #:use-module (rnrs bytevectors) | |
6854c324 | 33 | #:use-module (ice-9 binary-ports) |
73124c6c AW |
34 | #:export (uri? |
35 | uri-scheme uri-userinfo uri-host uri-port | |
36 | uri-path uri-query uri-fragment | |
37 | ||
38 | build-uri | |
18c44b29 | 39 | build-uri-reference |
4eb7c8f0 | 40 | declare-default-port! |
18c44b29 AW |
41 | string->uri string->uri-reference |
42 | uri->string | |
73124c6c AW |
43 | uri-decode uri-encode |
44 | split-and-decode-uri-path | |
277bbe96 | 45 | encode-and-join-uri-path)) |
73124c6c AW |
46 | |
47 | (define-record-type <uri> | |
48 | (make-uri scheme userinfo host port path query fragment) | |
49 | uri? | |
50 | (scheme uri-scheme) | |
51 | (userinfo uri-userinfo) | |
52 | (host uri-host) | |
53 | (port uri-port) | |
54 | (path uri-path) | |
55 | (query uri-query) | |
56 | (fragment uri-fragment)) | |
57 | ||
dc871261 DH |
58 | (define (absolute-uri? obj) |
59 | (and (uri? obj) (uri-scheme obj) #t)) | |
4e81e9d9 | 60 | |
5a2f7fb3 AW |
61 | (define (uri-error message . args) |
62 | (throw 'uri-error message args)) | |
63 | ||
73124c6c AW |
64 | (define (positive-exact-integer? port) |
65 | (and (number? port) (exact? port) (integer? port) (positive? port))) | |
66 | ||
18c44b29 AW |
67 | (define* (validate-uri scheme userinfo host port path query fragment |
68 | #:key reference?) | |
15c9af8c | 69 | (cond |
18c44b29 | 70 | ((and (not reference?) (not (symbol? scheme))) |
5a2f7fb3 | 71 | (uri-error "Expected a symbol for the URI scheme: ~s" scheme)) |
73124c6c | 72 | ((and (or userinfo port) (not host)) |
5a2f7fb3 | 73 | (uri-error "Expected a host, given userinfo or port")) |
73124c6c | 74 | ((and port (not (positive-exact-integer? port))) |
5a2f7fb3 | 75 | (uri-error "Expected port to be an integer: ~s" port)) |
73124c6c | 76 | ((and host (or (not (string? host)) (not (valid-host? host)))) |
5a2f7fb3 | 77 | (uri-error "Expected valid host: ~s" host)) |
73124c6c | 78 | ((and userinfo (not (string? userinfo))) |
5a2f7fb3 | 79 | (uri-error "Expected string for userinfo: ~s" userinfo)) |
73124c6c | 80 | ((not (string? path)) |
5a2f7fb3 | 81 | (uri-error "Expected string for path: ~s" path)) |
73124c6c AW |
82 | ((and host (not (string-null? path)) |
83 | (not (eqv? (string-ref path 0) #\/))) | |
5a2f7fb3 | 84 | (uri-error "Expected path of absolute URI to start with a /: ~a" path)))) |
15c9af8c | 85 | |
73124c6c AW |
86 | (define* (build-uri scheme #:key userinfo host port (path "") query fragment |
87 | (validate? #t)) | |
18c44b29 AW |
88 | "Construct a URI object. SCHEME should be a symbol, PORT either a |
89 | positive, exact integer or ‘#f’, and the rest of the fields are either | |
90 | strings or ‘#f’. If VALIDATE? is true, also run some consistency checks | |
91 | to make sure that the constructed object is a valid absolute URI." | |
73124c6c AW |
92 | (if validate? |
93 | (validate-uri scheme userinfo host port path query fragment)) | |
94 | (make-uri scheme userinfo host port path query fragment)) | |
15c9af8c | 95 | |
18c44b29 AW |
96 | (define* (build-uri-reference #:key scheme userinfo host port (path "") query |
97 | fragment (validate? #t)) | |
98 | "Construct a URI object. SCHEME should be a symbol or ‘#f’, PORT | |
99 | either a positive, exact integer or ‘#f’, and the rest of the fields | |
100 | are either strings or ‘#f’. If VALIDATE? is true, also run some | |
101 | consistency checks to make sure that the constructed URI is a valid URI | |
102 | reference (either an absolute URI or a relative reference)." | |
103 | (if validate? | |
104 | (validate-uri scheme userinfo host port path query fragment | |
105 | #:reference? #t)) | |
106 | (make-uri scheme userinfo host port path query fragment)) | |
107 | ||
73124c6c AW |
108 | ;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC |
109 | ;; 3490), and non-ASCII host names. | |
110 | ;; | |
111 | (define ipv4-regexp | |
1868309a | 112 | (make-regexp "^([0-9.]+)$")) |
73124c6c | 113 | (define ipv6-regexp |
3fabb2d2 | 114 | (make-regexp "^([0-9a-fA-F:.]+)$")) |
73124c6c AW |
115 | (define domain-label-regexp |
116 | (make-regexp "^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$")) | |
117 | (define top-label-regexp | |
4f6e8ba7 | 118 | (make-regexp "^[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?$")) |
15c9af8c | 119 | |
73124c6c AW |
120 | (define (valid-host? host) |
121 | (cond | |
122 | ((regexp-exec ipv4-regexp host) | |
7ea70f35 | 123 | (false-if-exception (inet-pton AF_INET host))) |
73124c6c | 124 | ((regexp-exec ipv6-regexp host) |
7ea70f35 | 125 | (false-if-exception (inet-pton AF_INET6 host))) |
15c9af8c | 126 | (else |
7ea70f35 AW |
127 | (let lp ((start 0)) |
128 | (let ((end (string-index host #\. start))) | |
129 | (if end | |
130 | (and (regexp-exec domain-label-regexp | |
131 | (substring host start end)) | |
132 | (lp (1+ end))) | |
133 | (regexp-exec top-label-regexp host start))))))) | |
73124c6c AW |
134 | |
135 | (define userinfo-pat | |
136 | "[a-zA-Z0-9_.!~*'();:&=+$,-]+") | |
137 | (define host-pat | |
3fabb2d2 AW |
138 | "[a-zA-Z0-9.-]+") |
139 | (define ipv6-host-pat | |
140 | "[0-9a-fA-F:.]+") | |
73124c6c AW |
141 | (define port-pat |
142 | "[0-9]*") | |
143 | (define authority-regexp | |
144 | (make-regexp | |
3fabb2d2 AW |
145 | (format #f "^//((~a)@)?((~a)|(\\[(~a)\\]))(:(~a))?$" |
146 | userinfo-pat host-pat ipv6-host-pat port-pat))) | |
73124c6c AW |
147 | |
148 | (define (parse-authority authority fail) | |
679eea4f AW |
149 | (if (equal? authority "//") |
150 | ;; Allow empty authorities: file:///etc/hosts is a synonym of | |
151 | ;; file:/etc/hosts. | |
152 | (values #f #f #f) | |
153 | (let ((m (regexp-exec authority-regexp authority))) | |
3fabb2d2 AW |
154 | (if (and m (valid-host? (or (match:substring m 4) |
155 | (match:substring m 6)))) | |
679eea4f | 156 | (values (match:substring m 2) |
3fabb2d2 AW |
157 | (or (match:substring m 4) |
158 | (match:substring m 6)) | |
159 | (let ((port (match:substring m 8))) | |
679eea4f AW |
160 | (and port (not (string-null? port)) |
161 | (string->number port)))) | |
162 | (fail))))) | |
73124c6c AW |
163 | |
164 | ||
165 | ;;; RFC 3986, #3. | |
166 | ;;; | |
167 | ;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] | |
168 | ;;; | |
169 | ;;; hier-part = "//" authority path-abempty | |
170 | ;;; / path-absolute | |
171 | ;;; / path-rootless | |
172 | ;;; / path-empty | |
18c44b29 AW |
173 | ;;; |
174 | ;;; A URI-reference is the same as URI, but where the scheme is | |
175 | ;;; optional. If the scheme is not present, its colon isn't present | |
176 | ;;; either. | |
73124c6c AW |
177 | |
178 | (define scheme-pat | |
179 | "[a-zA-Z][a-zA-Z0-9+.-]*") | |
180 | (define authority-pat | |
181 | "[^/?#]*") | |
182 | (define path-pat | |
183 | "[^?#]*") | |
184 | (define query-pat | |
185 | "[^#]*") | |
186 | (define fragment-pat | |
187 | ".*") | |
188 | (define uri-pat | |
4e81e9d9 | 189 | (format #f "^((~a):)?(//~a)?(~a)(\\?(~a))?(#(~a))?$" |
73124c6c AW |
190 | scheme-pat authority-pat path-pat query-pat fragment-pat)) |
191 | (define uri-regexp | |
192 | (make-regexp uri-pat)) | |
193 | ||
18c44b29 AW |
194 | (define (string->uri-reference string) |
195 | "Parse the URI reference written as STRING into a URI object. Return | |
196 | ‘#f’ if the string could not be parsed." | |
73124c6c AW |
197 | (% (let ((m (regexp-exec uri-regexp string))) |
198 | (if (not m) (abort)) | |
4e81e9d9 DH |
199 | (let ((scheme (let ((str (match:substring m 2))) |
200 | (and str (string->symbol (string-downcase str))))) | |
201 | (authority (match:substring m 3)) | |
202 | (path (match:substring m 4)) | |
203 | (query (match:substring m 6)) | |
18c44b29 | 204 | (fragment (match:substring m 8))) |
73124c6c AW |
205 | (call-with-values |
206 | (lambda () | |
207 | (if authority | |
208 | (parse-authority authority abort) | |
209 | (values #f #f #f))) | |
210 | (lambda (userinfo host port) | |
211 | (make-uri scheme userinfo host port path query fragment))))) | |
212 | (lambda (k) | |
213 | #f))) | |
214 | ||
4e81e9d9 | 215 | (define (string->uri string) |
18c44b29 | 216 | "Parse STRING into an absolute URI object. Return ‘#f’ if the string |
4e81e9d9 | 217 | could not be parsed." |
18c44b29 | 218 | (let ((uri (string->uri-reference string))) |
4e81e9d9 DH |
219 | (and uri (uri-scheme uri) uri))) |
220 | ||
4eb7c8f0 AW |
221 | (define *default-ports* (make-hash-table)) |
222 | ||
223 | (define (declare-default-port! scheme port) | |
06883ae0 | 224 | "Declare a default port for the given URI scheme." |
4eb7c8f0 AW |
225 | (hashq-set! *default-ports* scheme port)) |
226 | ||
227 | (define (default-port? scheme port) | |
228 | (or (not port) | |
229 | (eqv? port (hashq-ref *default-ports* scheme)))) | |
230 | ||
231 | (declare-default-port! 'http 80) | |
232 | (declare-default-port! 'https 443) | |
233 | ||
8745c33a | 234 | (define (uri->string uri) |
06883ae0 DH |
235 | "Serialize URI to a string. If the URI has a port that is the |
236 | default port for its scheme, the port is not included in the | |
237 | serialization." | |
4e81e9d9 | 238 | (let* ((scheme (uri-scheme uri)) |
73124c6c AW |
239 | (userinfo (uri-userinfo uri)) |
240 | (host (uri-host uri)) | |
241 | (port (uri-port uri)) | |
242 | (path (uri-path uri)) | |
243 | (query (uri-query uri)) | |
244 | (fragment (uri-fragment uri))) | |
245 | (string-append | |
4e81e9d9 DH |
246 | (if scheme |
247 | (string-append (symbol->string scheme) ":") | |
248 | "") | |
73124c6c AW |
249 | (if host |
250 | (string-append "//" | |
251 | (if userinfo (string-append userinfo "@") | |
252 | "") | |
3fabb2d2 AW |
253 | (if (string-index host #\:) |
254 | (string-append "[" host "]") | |
255 | host) | |
4eb7c8f0 AW |
256 | (if (default-port? (uri-scheme uri) port) |
257 | "" | |
258 | (string-append ":" (number->string port)))) | |
73124c6c AW |
259 | "") |
260 | path | |
261 | (if query | |
262 | (string-append "?" query) | |
263 | "") | |
264 | (if fragment | |
265 | (string-append "#" fragment) | |
266 | "")))) | |
267 | ||
268 | ||
a964aa62 AW |
269 | ;; like call-with-output-string, but actually closes the port (doh) |
270 | (define (call-with-output-string* proc) | |
271 | (let ((port (open-output-string))) | |
272 | (proc port) | |
273 | (let ((str (get-output-string port))) | |
274 | (close-port port) | |
275 | str))) | |
276 | ||
277 | (define (call-with-output-bytevector* proc) | |
278 | (call-with-values | |
279 | (lambda () | |
280 | (open-bytevector-output-port)) | |
281 | (lambda (port get-bytevector) | |
282 | (proc port) | |
283 | (let ((bv (get-bytevector))) | |
284 | (close-port port) | |
285 | bv)))) | |
286 | ||
7d6b8b75 AW |
287 | (define (call-with-encoded-output-string encoding proc) |
288 | (if (string-ci=? encoding "utf-8") | |
a964aa62 AW |
289 | (string->utf8 (call-with-output-string* proc)) |
290 | (call-with-output-bytevector* | |
291 | (lambda (port) | |
292 | (set-port-encoding! port encoding) | |
293 | (proc port))))) | |
c7857da6 | 294 | |
7d6b8b75 AW |
295 | (define (encode-string str encoding) |
296 | (if (string-ci=? encoding "utf-8") | |
c7857da6 | 297 | (string->utf8 str) |
7d6b8b75 | 298 | (call-with-encoded-output-string encoding |
c7857da6 AW |
299 | (lambda (port) |
300 | (display str port))))) | |
301 | ||
7d6b8b75 AW |
302 | (define (decode-string bv encoding) |
303 | (if (string-ci=? encoding "utf-8") | |
c7857da6 AW |
304 | (utf8->string bv) |
305 | (let ((p (open-bytevector-input-port bv))) | |
7d6b8b75 | 306 | (set-port-encoding! p encoding) |
5a35d42a | 307 | (let ((res (read-string p))) |
a964aa62 AW |
308 | (close-port p) |
309 | res)))) | |
c7857da6 AW |
310 | |
311 | ||
73124c6c AW |
312 | ;; A note on characters and bytes: URIs are defined to be sequences of |
313 | ;; characters in a subset of ASCII. Those characters may encode a | |
314 | ;; sequence of bytes (octets), which in turn may encode sequences of | |
315 | ;; characters in other character sets. | |
316 | ;; | |
317 | ||
06883ae0 DH |
318 | ;; Return a new string made from uri-decoding STR. Specifically, |
319 | ;; turn ‘+’ into space, and hex-encoded ‘%XX’ strings into | |
73124c6c AW |
320 | ;; their eight-bit characters. |
321 | ;; | |
322 | (define hex-chars | |
323 | (string->char-set "0123456789abcdefABCDEF")) | |
324 | ||
7d6b8b75 | 325 | (define* (uri-decode str #:key (encoding "utf-8")) |
06883ae0 DH |
326 | "Percent-decode the given STR, according to ENCODING, |
327 | which should be the name of a character encoding. | |
277bbe96 AW |
328 | |
329 | Note that this function should not generally be applied to a full URI | |
dc871261 | 330 | string. For paths, use ‘split-and-decode-uri-path’ instead. For query |
06883ae0 | 331 | strings, split the query on ‘&’ and ‘=’ boundaries, and decode |
277bbe96 AW |
332 | the components separately. |
333 | ||
dc871261 DH |
334 | Note also that percent-encoded strings encode _bytes_, not characters. |
335 | There is no guarantee that a given byte sequence is a valid string | |
336 | encoding. Therefore this routine may signal an error if the decoded | |
337 | bytes are not valid for the given encoding. Pass ‘#f’ for ENCODING if | |
338 | you want decoded bytes as a bytevector directly. ‘set-port-encoding!’, | |
339 | for more information on character encodings. | |
06883ae0 DH |
340 | |
341 | Returns a string of the decoded characters, or a bytevector if | |
342 | ENCODING was ‘#f’." | |
a964aa62 AW |
343 | (let* ((len (string-length str)) |
344 | (bv | |
345 | (call-with-output-bytevector* | |
346 | (lambda (port) | |
347 | (let lp ((i 0)) | |
348 | (if (< i len) | |
349 | (let ((ch (string-ref str i))) | |
350 | (cond | |
351 | ((eqv? ch #\+) | |
352 | (put-u8 port (char->integer #\space)) | |
353 | (lp (1+ i))) | |
354 | ((and (< (+ i 2) len) (eqv? ch #\%) | |
355 | (let ((a (string-ref str (+ i 1))) | |
356 | (b (string-ref str (+ i 2)))) | |
357 | (and (char-set-contains? hex-chars a) | |
358 | (char-set-contains? hex-chars b) | |
359 | (string->number (string a b) 16)))) | |
360 | => (lambda (u8) | |
361 | (put-u8 port u8) | |
362 | (lp (+ i 3)))) | |
363 | ((< (char->integer ch) 128) | |
364 | (put-u8 port (char->integer ch)) | |
365 | (lp (1+ i))) | |
366 | (else | |
367 | (uri-error "Invalid character in encoded URI ~a: ~s" | |
368 | str ch)))))))))) | |
369 | (if encoding | |
370 | (decode-string bv encoding) | |
371 | ;; Otherwise return raw bytevector | |
372 | bv))) | |
373 | ||
73124c6c AW |
374 | (define ascii-alnum-chars |
375 | (string->char-set | |
376 | "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")) | |
377 | ||
378 | ;; RFC 3986, #2.2. | |
379 | (define gen-delims | |
380 | (string->char-set ":/?#[]@")) | |
381 | (define sub-delims | |
382 | (string->char-set "!$&'()*+,l=")) | |
15c9af8c | 383 | (define reserved-chars |
73124c6c | 384 | (char-set-union gen-delims sub-delims)) |
15c9af8c | 385 | |
73124c6c AW |
386 | ;; RFC 3986, #2.3 |
387 | (define unreserved-chars | |
388 | (char-set-union ascii-alnum-chars | |
389 | (string->char-set "-._~"))) | |
15c9af8c | 390 | |
06883ae0 DH |
391 | ;; Return a new string made from uri-encoding STR, unconditionally |
392 | ;; transforming any characters not in UNESCAPED-CHARS. | |
73124c6c | 393 | ;; |
7d6b8b75 | 394 | (define* (uri-encode str #:key (encoding "utf-8") |
73124c6c | 395 | (unescaped-chars unreserved-chars)) |
06883ae0 DH |
396 | "Percent-encode any character not in the character set, |
397 | UNESCAPED-CHARS. | |
398 | ||
399 | The default character set includes alphanumerics from ASCII, as well as | |
dc871261 DH |
400 | the special characters ‘-’, ‘.’, ‘_’, and ‘~’. Any other character will |
401 | be percent-encoded, by writing out the character to a bytevector within | |
402 | the given ENCODING, then encoding each byte as ‘%HH’, where HH is the | |
6fe2803b | 403 | uppercase hexadecimal representation of the byte." |
866210bf IP |
404 | (define (needs-escaped? ch) |
405 | (not (char-set-contains? unescaped-chars ch))) | |
406 | (if (string-index str needs-escaped?) | |
a964aa62 | 407 | (call-with-output-string* |
91b320fe AW |
408 | (lambda (port) |
409 | (string-for-each | |
410 | (lambda (ch) | |
411 | (if (char-set-contains? unescaped-chars ch) | |
412 | (display ch port) | |
413 | (let* ((bv (encode-string (string ch) encoding)) | |
414 | (len (bytevector-length bv))) | |
415 | (let lp ((i 0)) | |
416 | (if (< i len) | |
417 | (let ((byte (bytevector-u8-ref bv i))) | |
418 | (display #\% port) | |
b401fe71 IP |
419 | (when (< byte 16) |
420 | (display #\0 port)) | |
6fe2803b ACF |
421 | (display (string-upcase (number->string byte 16)) |
422 | port) | |
91b320fe AW |
423 | (lp (1+ i)))))))) |
424 | str))) | |
425 | str)) | |
15c9af8c | 426 | |
73124c6c | 427 | (define (split-and-decode-uri-path path) |
06883ae0 DH |
428 | "Split PATH into its components, and decode each component, |
429 | removing empty components. | |
277bbe96 | 430 | |
06883ae0 DH |
431 | For example, ‘\"/foo/bar%20baz/\"’ decodes to the two-element list, |
432 | ‘(\"foo\" \"bar baz\")’." | |
15c9af8c | 433 | (filter (lambda (x) (not (string-null? x))) |
73124c6c | 434 | (map uri-decode (string-split path #\/)))) |
15c9af8c | 435 | |
73124c6c | 436 | (define (encode-and-join-uri-path parts) |
06883ae0 DH |
437 | "URI-encode each element of PARTS, which should be a list of |
438 | strings, and join the parts together with ‘/’ as a delimiter. | |
439 | ||
440 | For example, the list ‘(\"scrambled eggs\" \"biscuits&gravy\")’ | |
441 | encodes as ‘\"scrambled%20eggs/biscuits%26gravy\"’." | |
73124c6c | 442 | (string-join (map uri-encode parts) "/")) |