Commit | Line | Data |
---|---|---|
73124c6c AW |
1 | ;;;; (web uri) --- URI manipulation tools |
2 | ;;;; | |
5a35d42a | 3 | ;;;; Copyright (C) 1997,2001,2002,2010,2011,2012,2013 Free Software Foundation, Inc. |
73124c6c AW |
4 | ;;;; |
5 | ;;;; This library is free software; you can redistribute it and/or | |
6 | ;;;; modify it under the terms of the GNU Lesser General Public | |
7 | ;;;; License as published by the Free Software Foundation; either | |
8 | ;;;; version 3 of the License, or (at your option) any later version. | |
6fe2803b | 9 | ;;;; |
73124c6c AW |
10 | ;;;; This library is distributed in the hope that it will be useful, |
11 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | ;;;; Lesser General Public License for more details. | |
6fe2803b | 14 | ;;;; |
73124c6c AW |
15 | ;;;; You should have received a copy of the GNU Lesser General Public |
16 | ;;;; License along with this library; if not, write to the Free Software | |
17 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 | ;;;; | |
15c9af8c AW |
19 | |
20 | ;;; Commentary: | |
21 | ||
277bbe96 | 22 | ;; A data type for Universal Resource Identifiers, as defined in RFC |
6fe2803b | 23 | ;; 3986. |
15c9af8c AW |
24 | |
25 | ;;; Code: | |
26 | ||
73124c6c | 27 | (define-module (web uri) |
277bbe96 AW |
28 | #:use-module (srfi srfi-9) |
29 | #:use-module (ice-9 regex) | |
30 | #:use-module (ice-9 rdelim) | |
31 | #:use-module (ice-9 control) | |
32 | #:use-module (rnrs bytevectors) | |
6854c324 | 33 | #:use-module (ice-9 binary-ports) |
73124c6c AW |
34 | #:export (uri? |
35 | uri-scheme uri-userinfo uri-host uri-port | |
36 | uri-path uri-query uri-fragment | |
37 | ||
38 | build-uri | |
4eb7c8f0 | 39 | declare-default-port! |
8745c33a | 40 | string->uri uri->string |
73124c6c AW |
41 | uri-decode uri-encode |
42 | split-and-decode-uri-path | |
277bbe96 | 43 | encode-and-join-uri-path)) |
73124c6c AW |
44 | |
45 | (define-record-type <uri> | |
46 | (make-uri scheme userinfo host port path query fragment) | |
47 | uri? | |
48 | (scheme uri-scheme) | |
49 | (userinfo uri-userinfo) | |
50 | (host uri-host) | |
51 | (port uri-port) | |
52 | (path uri-path) | |
53 | (query uri-query) | |
54 | (fragment uri-fragment)) | |
55 | ||
dc871261 DH |
56 | (define (absolute-uri? obj) |
57 | (and (uri? obj) (uri-scheme obj) #t)) | |
4e81e9d9 | 58 | |
5a2f7fb3 AW |
59 | (define (uri-error message . args) |
60 | (throw 'uri-error message args)) | |
61 | ||
73124c6c AW |
62 | (define (positive-exact-integer? port) |
63 | (and (number? port) (exact? port) (integer? port) (positive? port))) | |
64 | ||
65 | (define (validate-uri scheme userinfo host port path query fragment) | |
15c9af8c | 66 | (cond |
73124c6c | 67 | ((not (symbol? scheme)) |
5a2f7fb3 | 68 | (uri-error "Expected a symbol for the URI scheme: ~s" scheme)) |
73124c6c | 69 | ((and (or userinfo port) (not host)) |
5a2f7fb3 | 70 | (uri-error "Expected a host, given userinfo or port")) |
73124c6c | 71 | ((and port (not (positive-exact-integer? port))) |
5a2f7fb3 | 72 | (uri-error "Expected port to be an integer: ~s" port)) |
73124c6c | 73 | ((and host (or (not (string? host)) (not (valid-host? host)))) |
5a2f7fb3 | 74 | (uri-error "Expected valid host: ~s" host)) |
73124c6c | 75 | ((and userinfo (not (string? userinfo))) |
5a2f7fb3 | 76 | (uri-error "Expected string for userinfo: ~s" userinfo)) |
73124c6c | 77 | ((not (string? path)) |
5a2f7fb3 | 78 | (uri-error "Expected string for path: ~s" path)) |
73124c6c AW |
79 | ((and host (not (string-null? path)) |
80 | (not (eqv? (string-ref path 0) #\/))) | |
5a2f7fb3 | 81 | (uri-error "Expected path of absolute URI to start with a /: ~a" path)))) |
15c9af8c | 82 | |
73124c6c AW |
83 | (define* (build-uri scheme #:key userinfo host port (path "") query fragment |
84 | (validate? #t)) | |
89064755 DH |
85 | "Construct a URI object. SCHEME should be a symbol, PORT |
86 | either a positive, exact integer or ‘#f’, and the rest of the | |
87 | fields are either strings or ‘#f’. If VALIDATE? is true, | |
88 | also run some consistency checks to make sure that the constructed URI | |
89 | is valid." | |
73124c6c AW |
90 | (if validate? |
91 | (validate-uri scheme userinfo host port path query fragment)) | |
92 | (make-uri scheme userinfo host port path query fragment)) | |
15c9af8c | 93 | |
73124c6c AW |
94 | ;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC |
95 | ;; 3490), and non-ASCII host names. | |
96 | ;; | |
97 | (define ipv4-regexp | |
1868309a | 98 | (make-regexp "^([0-9.]+)$")) |
73124c6c | 99 | (define ipv6-regexp |
3fabb2d2 | 100 | (make-regexp "^([0-9a-fA-F:.]+)$")) |
73124c6c AW |
101 | (define domain-label-regexp |
102 | (make-regexp "^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$")) | |
103 | (define top-label-regexp | |
4f6e8ba7 | 104 | (make-regexp "^[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?$")) |
15c9af8c | 105 | |
73124c6c AW |
106 | (define (valid-host? host) |
107 | (cond | |
108 | ((regexp-exec ipv4-regexp host) | |
7ea70f35 | 109 | (false-if-exception (inet-pton AF_INET host))) |
73124c6c | 110 | ((regexp-exec ipv6-regexp host) |
7ea70f35 | 111 | (false-if-exception (inet-pton AF_INET6 host))) |
15c9af8c | 112 | (else |
7ea70f35 AW |
113 | (let lp ((start 0)) |
114 | (let ((end (string-index host #\. start))) | |
115 | (if end | |
116 | (and (regexp-exec domain-label-regexp | |
117 | (substring host start end)) | |
118 | (lp (1+ end))) | |
119 | (regexp-exec top-label-regexp host start))))))) | |
73124c6c AW |
120 | |
121 | (define userinfo-pat | |
122 | "[a-zA-Z0-9_.!~*'();:&=+$,-]+") | |
123 | (define host-pat | |
3fabb2d2 AW |
124 | "[a-zA-Z0-9.-]+") |
125 | (define ipv6-host-pat | |
126 | "[0-9a-fA-F:.]+") | |
73124c6c AW |
127 | (define port-pat |
128 | "[0-9]*") | |
129 | (define authority-regexp | |
130 | (make-regexp | |
3fabb2d2 AW |
131 | (format #f "^//((~a)@)?((~a)|(\\[(~a)\\]))(:(~a))?$" |
132 | userinfo-pat host-pat ipv6-host-pat port-pat))) | |
73124c6c AW |
133 | |
134 | (define (parse-authority authority fail) | |
679eea4f AW |
135 | (if (equal? authority "//") |
136 | ;; Allow empty authorities: file:///etc/hosts is a synonym of | |
137 | ;; file:/etc/hosts. | |
138 | (values #f #f #f) | |
139 | (let ((m (regexp-exec authority-regexp authority))) | |
3fabb2d2 AW |
140 | (if (and m (valid-host? (or (match:substring m 4) |
141 | (match:substring m 6)))) | |
679eea4f | 142 | (values (match:substring m 2) |
3fabb2d2 AW |
143 | (or (match:substring m 4) |
144 | (match:substring m 6)) | |
145 | (let ((port (match:substring m 8))) | |
679eea4f AW |
146 | (and port (not (string-null? port)) |
147 | (string->number port)))) | |
148 | (fail))))) | |
73124c6c AW |
149 | |
150 | ||
151 | ;;; RFC 3986, #3. | |
152 | ;;; | |
153 | ;;; URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] | |
154 | ;;; | |
155 | ;;; hier-part = "//" authority path-abempty | |
156 | ;;; / path-absolute | |
157 | ;;; / path-rootless | |
158 | ;;; / path-empty | |
159 | ||
160 | (define scheme-pat | |
161 | "[a-zA-Z][a-zA-Z0-9+.-]*") | |
162 | (define authority-pat | |
163 | "[^/?#]*") | |
164 | (define path-pat | |
165 | "[^?#]*") | |
166 | (define query-pat | |
167 | "[^#]*") | |
168 | (define fragment-pat | |
169 | ".*") | |
170 | (define uri-pat | |
4e81e9d9 | 171 | (format #f "^((~a):)?(//~a)?(~a)(\\?(~a))?(#(~a))?$" |
73124c6c AW |
172 | scheme-pat authority-pat path-pat query-pat fragment-pat)) |
173 | (define uri-regexp | |
174 | (make-regexp uri-pat)) | |
175 | ||
4e81e9d9 | 176 | (define (string->uri* string) |
06883ae0 | 177 | "Parse STRING into a URI object. Return ‘#f’ if the string |
277bbe96 | 178 | could not be parsed." |
73124c6c AW |
179 | (% (let ((m (regexp-exec uri-regexp string))) |
180 | (if (not m) (abort)) | |
4e81e9d9 DH |
181 | (let ((scheme (let ((str (match:substring m 2))) |
182 | (and str (string->symbol (string-downcase str))))) | |
183 | (authority (match:substring m 3)) | |
184 | (path (match:substring m 4)) | |
185 | (query (match:substring m 6)) | |
73124c6c AW |
186 | (fragment (match:substring m 7))) |
187 | (call-with-values | |
188 | (lambda () | |
189 | (if authority | |
190 | (parse-authority authority abort) | |
191 | (values #f #f #f))) | |
192 | (lambda (userinfo host port) | |
193 | (make-uri scheme userinfo host port path query fragment))))) | |
194 | (lambda (k) | |
195 | #f))) | |
196 | ||
4e81e9d9 DH |
197 | (define (string->uri string) |
198 | "Parse STRING into a URI object. Return ‘#f’ if the string | |
199 | could not be parsed." | |
200 | (let ((uri (string->uri* string))) | |
201 | (and uri (uri-scheme uri) uri))) | |
202 | ||
4eb7c8f0 AW |
203 | (define *default-ports* (make-hash-table)) |
204 | ||
205 | (define (declare-default-port! scheme port) | |
06883ae0 | 206 | "Declare a default port for the given URI scheme." |
4eb7c8f0 AW |
207 | (hashq-set! *default-ports* scheme port)) |
208 | ||
209 | (define (default-port? scheme port) | |
210 | (or (not port) | |
211 | (eqv? port (hashq-ref *default-ports* scheme)))) | |
212 | ||
213 | (declare-default-port! 'http 80) | |
214 | (declare-default-port! 'https 443) | |
215 | ||
8745c33a | 216 | (define (uri->string uri) |
06883ae0 DH |
217 | "Serialize URI to a string. If the URI has a port that is the |
218 | default port for its scheme, the port is not included in the | |
219 | serialization." | |
4e81e9d9 | 220 | (let* ((scheme (uri-scheme uri)) |
73124c6c AW |
221 | (userinfo (uri-userinfo uri)) |
222 | (host (uri-host uri)) | |
223 | (port (uri-port uri)) | |
224 | (path (uri-path uri)) | |
225 | (query (uri-query uri)) | |
226 | (fragment (uri-fragment uri))) | |
227 | (string-append | |
4e81e9d9 DH |
228 | (if scheme |
229 | (string-append (symbol->string scheme) ":") | |
230 | "") | |
73124c6c AW |
231 | (if host |
232 | (string-append "//" | |
233 | (if userinfo (string-append userinfo "@") | |
234 | "") | |
3fabb2d2 AW |
235 | (if (string-index host #\:) |
236 | (string-append "[" host "]") | |
237 | host) | |
4eb7c8f0 AW |
238 | (if (default-port? (uri-scheme uri) port) |
239 | "" | |
240 | (string-append ":" (number->string port)))) | |
73124c6c AW |
241 | "") |
242 | path | |
243 | (if query | |
244 | (string-append "?" query) | |
245 | "") | |
246 | (if fragment | |
247 | (string-append "#" fragment) | |
248 | "")))) | |
249 | ||
250 | ||
a964aa62 AW |
251 | ;; like call-with-output-string, but actually closes the port (doh) |
252 | (define (call-with-output-string* proc) | |
253 | (let ((port (open-output-string))) | |
254 | (proc port) | |
255 | (let ((str (get-output-string port))) | |
256 | (close-port port) | |
257 | str))) | |
258 | ||
259 | (define (call-with-output-bytevector* proc) | |
260 | (call-with-values | |
261 | (lambda () | |
262 | (open-bytevector-output-port)) | |
263 | (lambda (port get-bytevector) | |
264 | (proc port) | |
265 | (let ((bv (get-bytevector))) | |
266 | (close-port port) | |
267 | bv)))) | |
268 | ||
7d6b8b75 AW |
269 | (define (call-with-encoded-output-string encoding proc) |
270 | (if (string-ci=? encoding "utf-8") | |
a964aa62 AW |
271 | (string->utf8 (call-with-output-string* proc)) |
272 | (call-with-output-bytevector* | |
273 | (lambda (port) | |
274 | (set-port-encoding! port encoding) | |
275 | (proc port))))) | |
c7857da6 | 276 | |
7d6b8b75 AW |
277 | (define (encode-string str encoding) |
278 | (if (string-ci=? encoding "utf-8") | |
c7857da6 | 279 | (string->utf8 str) |
7d6b8b75 | 280 | (call-with-encoded-output-string encoding |
c7857da6 AW |
281 | (lambda (port) |
282 | (display str port))))) | |
283 | ||
7d6b8b75 AW |
284 | (define (decode-string bv encoding) |
285 | (if (string-ci=? encoding "utf-8") | |
c7857da6 AW |
286 | (utf8->string bv) |
287 | (let ((p (open-bytevector-input-port bv))) | |
7d6b8b75 | 288 | (set-port-encoding! p encoding) |
5a35d42a | 289 | (let ((res (read-string p))) |
a964aa62 AW |
290 | (close-port p) |
291 | res)))) | |
c7857da6 AW |
292 | |
293 | ||
73124c6c AW |
294 | ;; A note on characters and bytes: URIs are defined to be sequences of |
295 | ;; characters in a subset of ASCII. Those characters may encode a | |
296 | ;; sequence of bytes (octets), which in turn may encode sequences of | |
297 | ;; characters in other character sets. | |
298 | ;; | |
299 | ||
06883ae0 DH |
300 | ;; Return a new string made from uri-decoding STR. Specifically, |
301 | ;; turn ‘+’ into space, and hex-encoded ‘%XX’ strings into | |
73124c6c AW |
302 | ;; their eight-bit characters. |
303 | ;; | |
304 | (define hex-chars | |
305 | (string->char-set "0123456789abcdefABCDEF")) | |
306 | ||
7d6b8b75 | 307 | (define* (uri-decode str #:key (encoding "utf-8")) |
06883ae0 DH |
308 | "Percent-decode the given STR, according to ENCODING, |
309 | which should be the name of a character encoding. | |
277bbe96 AW |
310 | |
311 | Note that this function should not generally be applied to a full URI | |
dc871261 | 312 | string. For paths, use ‘split-and-decode-uri-path’ instead. For query |
06883ae0 | 313 | strings, split the query on ‘&’ and ‘=’ boundaries, and decode |
277bbe96 AW |
314 | the components separately. |
315 | ||
dc871261 DH |
316 | Note also that percent-encoded strings encode _bytes_, not characters. |
317 | There is no guarantee that a given byte sequence is a valid string | |
318 | encoding. Therefore this routine may signal an error if the decoded | |
319 | bytes are not valid for the given encoding. Pass ‘#f’ for ENCODING if | |
320 | you want decoded bytes as a bytevector directly. ‘set-port-encoding!’, | |
321 | for more information on character encodings. | |
06883ae0 DH |
322 | |
323 | Returns a string of the decoded characters, or a bytevector if | |
324 | ENCODING was ‘#f’." | |
a964aa62 AW |
325 | (let* ((len (string-length str)) |
326 | (bv | |
327 | (call-with-output-bytevector* | |
328 | (lambda (port) | |
329 | (let lp ((i 0)) | |
330 | (if (< i len) | |
331 | (let ((ch (string-ref str i))) | |
332 | (cond | |
333 | ((eqv? ch #\+) | |
334 | (put-u8 port (char->integer #\space)) | |
335 | (lp (1+ i))) | |
336 | ((and (< (+ i 2) len) (eqv? ch #\%) | |
337 | (let ((a (string-ref str (+ i 1))) | |
338 | (b (string-ref str (+ i 2)))) | |
339 | (and (char-set-contains? hex-chars a) | |
340 | (char-set-contains? hex-chars b) | |
341 | (string->number (string a b) 16)))) | |
342 | => (lambda (u8) | |
343 | (put-u8 port u8) | |
344 | (lp (+ i 3)))) | |
345 | ((< (char->integer ch) 128) | |
346 | (put-u8 port (char->integer ch)) | |
347 | (lp (1+ i))) | |
348 | (else | |
349 | (uri-error "Invalid character in encoded URI ~a: ~s" | |
350 | str ch)))))))))) | |
351 | (if encoding | |
352 | (decode-string bv encoding) | |
353 | ;; Otherwise return raw bytevector | |
354 | bv))) | |
355 | ||
73124c6c AW |
356 | (define ascii-alnum-chars |
357 | (string->char-set | |
358 | "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")) | |
359 | ||
360 | ;; RFC 3986, #2.2. | |
361 | (define gen-delims | |
362 | (string->char-set ":/?#[]@")) | |
363 | (define sub-delims | |
364 | (string->char-set "!$&'()*+,l=")) | |
15c9af8c | 365 | (define reserved-chars |
73124c6c | 366 | (char-set-union gen-delims sub-delims)) |
15c9af8c | 367 | |
73124c6c AW |
368 | ;; RFC 3986, #2.3 |
369 | (define unreserved-chars | |
370 | (char-set-union ascii-alnum-chars | |
371 | (string->char-set "-._~"))) | |
15c9af8c | 372 | |
06883ae0 DH |
373 | ;; Return a new string made from uri-encoding STR, unconditionally |
374 | ;; transforming any characters not in UNESCAPED-CHARS. | |
73124c6c | 375 | ;; |
7d6b8b75 | 376 | (define* (uri-encode str #:key (encoding "utf-8") |
73124c6c | 377 | (unescaped-chars unreserved-chars)) |
06883ae0 DH |
378 | "Percent-encode any character not in the character set, |
379 | UNESCAPED-CHARS. | |
380 | ||
381 | The default character set includes alphanumerics from ASCII, as well as | |
dc871261 DH |
382 | the special characters ‘-’, ‘.’, ‘_’, and ‘~’. Any other character will |
383 | be percent-encoded, by writing out the character to a bytevector within | |
384 | the given ENCODING, then encoding each byte as ‘%HH’, where HH is the | |
6fe2803b | 385 | uppercase hexadecimal representation of the byte." |
866210bf IP |
386 | (define (needs-escaped? ch) |
387 | (not (char-set-contains? unescaped-chars ch))) | |
388 | (if (string-index str needs-escaped?) | |
a964aa62 | 389 | (call-with-output-string* |
91b320fe AW |
390 | (lambda (port) |
391 | (string-for-each | |
392 | (lambda (ch) | |
393 | (if (char-set-contains? unescaped-chars ch) | |
394 | (display ch port) | |
395 | (let* ((bv (encode-string (string ch) encoding)) | |
396 | (len (bytevector-length bv))) | |
397 | (let lp ((i 0)) | |
398 | (if (< i len) | |
399 | (let ((byte (bytevector-u8-ref bv i))) | |
400 | (display #\% port) | |
b401fe71 IP |
401 | (when (< byte 16) |
402 | (display #\0 port)) | |
6fe2803b ACF |
403 | (display (string-upcase (number->string byte 16)) |
404 | port) | |
91b320fe AW |
405 | (lp (1+ i)))))))) |
406 | str))) | |
407 | str)) | |
15c9af8c | 408 | |
73124c6c | 409 | (define (split-and-decode-uri-path path) |
06883ae0 DH |
410 | "Split PATH into its components, and decode each component, |
411 | removing empty components. | |
277bbe96 | 412 | |
06883ae0 DH |
413 | For example, ‘\"/foo/bar%20baz/\"’ decodes to the two-element list, |
414 | ‘(\"foo\" \"bar baz\")’." | |
15c9af8c | 415 | (filter (lambda (x) (not (string-null? x))) |
73124c6c | 416 | (map uri-decode (string-split path #\/)))) |
15c9af8c | 417 | |
73124c6c | 418 | (define (encode-and-join-uri-path parts) |
06883ae0 DH |
419 | "URI-encode each element of PARTS, which should be a list of |
420 | strings, and join the parts together with ‘/’ as a delimiter. | |
421 | ||
422 | For example, the list ‘(\"scrambled eggs\" \"biscuits&gravy\")’ | |
423 | encodes as ‘\"scrambled%20eggs/biscuits%26gravy\"’." | |
73124c6c | 424 | (string-join (map uri-encode parts) "/")) |