Commit | Line | Data |
---|---|---|
7f918cf1 CE |
1 | (* Copyright (C) 2009 Matthew Fluet. |
2 | * Copyright (C) 1999-2006 Henry Cejtin, Matthew Fluet, Suresh | |
3 | * Jagannathan, and Stephen Weeks. | |
4 | * | |
5 | * MLton is released under a BSD-style license. | |
6 | * See the file MLton-LICENSE for details. | |
7 | *) | |
8 | ||
9 | structure Url: URL = | |
10 | struct | |
11 | ||
12 | val escapeQuery = ref true | |
13 | ||
14 | structure Char = | |
15 | struct | |
16 | open Char | |
17 | ||
18 | val radix: int = 16 | |
19 | ||
20 | fun fromHexChars (hi: t, lo: t) = | |
21 | chr (toHexDigit hi * radix + toHexDigit lo) | |
22 | ||
23 | fun toHexChars (c: t): t * t = | |
24 | let | |
25 | val (hi, lo) = Int.divMod (ord c, radix) | |
26 | in | |
27 | (fromHexDigit hi, fromHexDigit lo) | |
28 | end | |
29 | ||
30 | fun escapeHex (c: t): string = | |
31 | let | |
32 | val (hi, lo) = toHexChars c | |
33 | in | |
34 | implode [#"%", hi, lo] | |
35 | end | |
36 | ||
37 | end | |
38 | ||
39 | fun unescape (s: string): string = | |
40 | let | |
41 | fun sub i = String.sub (s, i) | |
42 | val n = String.size s | |
43 | fun loop (i, cs) = | |
44 | if i >= n | |
45 | then implode (rev cs) | |
46 | else let val c = sub i | |
47 | in if c = #"%" | |
48 | then loop (i + 3, | |
49 | Char.fromHexChars (sub (i + 1), sub (i + 2)) :: cs) | |
50 | else loop (i + 1, c :: cs) | |
51 | end | |
52 | in loop (0, []) | |
53 | end | |
54 | ||
55 | val shouldEscape: char -> bool = | |
56 | Char.memoize (fn c => | |
57 | not (Char.isGraph c) | |
58 | orelse c = Char.dquote | |
59 | orelse String.contains ("?<>#% {}|\\^ []`", c)) | |
60 | ||
61 | val shouldEscape = | |
62 | Trace.trace ("Url.shouldEscape", Char.layout, Bool.layout) shouldEscape | |
63 | ||
64 | fun escape s = | |
65 | String.translate | |
66 | (s, fn c => | |
67 | if shouldEscape c | |
68 | then Char.escapeHex c | |
69 | else str c) | |
70 | ||
71 | structure Scheme = | |
72 | struct | |
73 | datatype t = | |
74 | File | |
75 | | Ftp | |
76 | | Gopher | |
77 | | Http | |
78 | | Https | |
79 | | Telnet | |
80 | ||
81 | val map = | |
82 | [("file", File), | |
83 | ("ftp", Ftp), | |
84 | ("gopher", Gopher), | |
85 | ("http", Http), | |
86 | ("https", Https), | |
87 | ("telnet", Telnet)] | |
88 | ||
89 | val fromString = | |
90 | String.memoizeList (fn _ => Error.bug "Url.Scheme.fromString", map) | |
91 | ||
92 | val equals: t * t -> bool = op = | |
93 | ||
94 | fun toString s = | |
95 | #1 (valOf (List.peek (map, fn (_, s') => equals (s, s')))) | |
96 | ||
97 | val layout = Layout.str o toString | |
98 | end | |
99 | ||
100 | structure Authority = | |
101 | struct | |
102 | type t = {user: string option, | |
103 | host: string, | |
104 | port: int option} | |
105 | ||
106 | fun layout ({user, host, port}: t) = | |
107 | Layout.record [("user", Option.layout String.layout user), | |
108 | ("host", String.layout host), | |
109 | ("port", Option.layout Int.layout port)] | |
110 | ||
111 | fun canonicalize {user, host, port} = | |
112 | {user = Option.map (user, String.toLower), | |
113 | host = String.toLower host, | |
114 | port = port} | |
115 | ||
116 | fun equals ({user = u, host = h, port = p}: t, | |
117 | {user = u', host = h', port = p'}: t): bool = | |
118 | Option.equals (u, u', String.equals) | |
119 | andalso String.toLower h = String.toLower h' | |
120 | andalso Option.equals (p, p', Port.equals) | |
121 | ||
122 | val equals = | |
123 | Trace.trace2 ("Url.Authority.equals", layout, layout, Bool.layout) equals | |
124 | end | |
125 | ||
126 | (* The numbers in comments are rule numbers from Section 5.2 of RFC 2396. *) | |
127 | (* canonicalizePath (p1, p2, f) | |
128 | * Assume p1 is already canonicalized. | |
129 | *) | |
130 | fun canonicalizePath (p1: string list, p2: string list, f: string) = | |
131 | let | |
132 | fun loop (r, ac) = | |
133 | case r of | |
134 | [] => | |
135 | (case f of | |
136 | "." => (rev ac, "") (* 6d *) | |
137 | | ".." => (case ac of | |
138 | [] => ([], "..") | |
139 | | ".." :: _ => (rev ac, "..") | |
140 | | _ :: ac => (rev ac, "")) (* 6f *) | |
141 | | _ => (rev ac, f)) | |
142 | | "" :: r => loop (r, ac) | |
143 | | "." :: r => loop (r, ac) (* 6c *) | |
144 | | ".." :: r => loop (r, | |
145 | case ac of | |
146 | [] => [".."] | |
147 | | ".." :: _ => ".." :: ac | |
148 | | _ :: ac => ac) (* 6e *) | |
149 | | s :: r => loop (r, s :: ac) | |
150 | in loop (p2, rev p1) | |
151 | end | |
152 | ||
153 | structure Path = | |
154 | struct | |
155 | type t = {file: string, | |
156 | isAbsolute: bool, | |
157 | path: string list} | |
158 | ||
159 | local | |
160 | fun make f (p: t) = f p | |
161 | in | |
162 | val file = make #file | |
163 | val isAbsolute = make #isAbsolute | |
164 | val path = make #path | |
165 | end | |
166 | ||
167 | val root = {isAbsolute = true, | |
168 | path = [], | |
169 | file = ""} | |
170 | ||
171 | fun canonicalize {isAbsolute = i, path = p, file = f} = | |
172 | let val (p, f) = canonicalizePath ([], p, f) | |
173 | in {isAbsolute = i, path = p, file = f} | |
174 | end | |
175 | ||
176 | fun toString ({isAbsolute, path, file}) = | |
177 | concat [if isAbsolute then "/" else "", | |
178 | escape (concat (List.separate (path @ [file], "/")))] | |
179 | ||
180 | val layout = Layout.str o toString | |
181 | end | |
182 | ||
183 | datatype t = | |
184 | T of {authority: Authority.t option, | |
185 | fragment: string option, | |
186 | path: Path.t option, | |
187 | query: string option, | |
188 | scheme: Scheme.t option} (* NONE in relative urls *) | |
189 | | JavaScript of string | |
190 | | MailTo of string | |
191 | | News of string | |
192 | | Opaque of {scheme: string, | |
193 | rest: string} | |
194 | ||
195 | fun addQuery (u: t, q) = | |
196 | case u of | |
197 | T {authority, fragment, path, query, scheme}=> | |
198 | if isSome query | |
199 | then Error.bug "Url.addQuery" | |
200 | else | |
201 | T {authority = authority, | |
202 | fragment = fragment, | |
203 | path = path, | |
204 | query = SOME q, | |
205 | scheme = scheme} | |
206 | | _ => Error.bug "Url.addQuery" | |
207 | ||
208 | fun host (u: t): string = | |
209 | case u of | |
210 | T {authority = SOME {host, ...}, ...} => host | |
211 | | _ => Error.bug "Url.host" | |
212 | ||
213 | fun path (u: t): Path.t = | |
214 | case u of | |
215 | T {path = SOME p, ...} => p | |
216 | | _ => Error.bug "Url.path" | |
217 | ||
218 | fun mo (opt, f) = | |
219 | case opt of | |
220 | NONE => "" | |
221 | | SOME x => f x | |
222 | ||
223 | fun toString url = | |
224 | case url of | |
225 | T {scheme, authority, path, query, fragment} => | |
226 | concat [mo (scheme, fn s => concat [Scheme.toString s, ":"]), | |
227 | mo (authority, fn {user, host, port} => | |
228 | concat ["//", | |
229 | mo (user, fn u => concat [escape u, "@"]), | |
230 | host, | |
231 | mo (port, fn p => concat [":", Int.toString p])]), | |
232 | mo (path, Path.toString), | |
233 | mo (query, fn q => concat ["?", if !escapeQuery then escape q | |
234 | else q]), | |
235 | mo (fragment, fn f => concat ["#", escape f]) | |
236 | ] | |
237 | | JavaScript s => concat ["javascript:", escape s] | |
238 | | MailTo email => concat ["mailto:", escape email] | |
239 | | News group => concat ["news:", escape group] | |
240 | | Opaque {scheme, rest} => concat [scheme, ":", escape rest] | |
241 | ||
242 | val layout = Layout.str o toString | |
243 | ||
244 | val toString = | |
245 | Trace.trace ("Url.toString", layout, String.layout) toString | |
246 | ||
247 | val layout = | |
248 | fn T {scheme, authority, path, query, fragment} => | |
249 | Layout.record [("scheme", Option.layout Scheme.layout scheme), | |
250 | ("authority", Option.layout Authority.layout authority), | |
251 | ("path", Option.layout Path.layout path), | |
252 | ("query", Option.layout String.layout query), | |
253 | ("fragment", Option.layout String.layout fragment)] | |
254 | | u => layout u | |
255 | ||
256 | val equals: t * t -> bool = op = | |
257 | ||
258 | structure Regexp = | |
259 | struct | |
260 | open Regexp | |
261 | ||
262 | val digit = isChar Char.isDigit | |
263 | val upalpha = isChar Char.isUpper | |
264 | val lowalpha = isChar Char.isLower | |
265 | val alpha = isChar Char.isAlpha | |
266 | val alphanum = isChar Char.isAlphaNum | |
267 | val hex = isChar Char.isHexDigit | |
268 | val escaped = seq [char #"%", hex, hex] | |
269 | val mark = oneOf "-_.!~*' ()" | |
270 | val unreserved = or [alphanum, mark] | |
271 | val reserved = oneOf ";/?:@&=+$," | |
272 | val printable = isChar Char.isPrint | |
273 | (*val urlc = or [reserved, unreserved, escaped]*) | |
274 | (* It's pointless to follow the spec on urlc, which rules out lots of | |
275 | * printable characters. Lot's of sites use printable characters outside | |
276 | * the spec, and browsers handle them, so we should too. | |
277 | *) | |
278 | val urlc = printable | |
279 | val fragment' = Save.new () | |
280 | val fragment = save (star urlc, fragment') | |
281 | val query' = Save.new () | |
282 | (* The official definition of query says urlc*, but this doesn't work with | |
283 | * our expanded meaning of urlc = printable, since then the query consumes | |
284 | * the fragment. | |
285 | *) | |
286 | (* val query = save (star urlc, query') *) | |
287 | val query = save (star (isChar (fn c => Char.isPrint c | |
288 | andalso c <> #"#")), | |
289 | query') | |
290 | val port' = Save.new () | |
291 | val port = save (star digit, port') | |
292 | val IPv4address = seq [oneOrMore digit, char #".", | |
293 | oneOrMore digit, char #".", | |
294 | oneOrMore digit, char #".", | |
295 | oneOrMore digit] | |
296 | val toplabel = or [alpha, | |
297 | seq [alpha, star (or [alphanum, char #"-"]), alphanum]] | |
298 | val domainlabel = or [alphanum, | |
299 | seq [alphanum, | |
300 | star (or [alphanum, char #"-"]), | |
301 | alphanum]] | |
302 | val hostname = seq [star (seq [domainlabel, char #"."]), | |
303 | toplabel, | |
304 | optional (char #".")] | |
305 | val host' = Save.new () | |
306 | val host = save (or [hostname, IPv4address], host') | |
307 | val hostport = seq [host, optional (seq [char #":", port])] | |
308 | val userinfo' = Save.new () | |
309 | val userinfo = | |
310 | save (star (or [unreserved, escaped, oneOf ";:&=+$"]), userinfo') | |
311 | val server = optional (seq [optional (seq [userinfo, char #"@"]), | |
312 | hostport]) | |
313 | val regName' = Save.new () | |
314 | val regName = | |
315 | save (oneOrMore (or [unreserved, | |
316 | escaped, | |
317 | oneOf "$,;:@&=+"]), | |
318 | regName') | |
319 | val authority = or [server, regName] | |
320 | val scheme' = Save.new () | |
321 | val scheme = | |
322 | save (seq [alpha, star (or [alpha, digit, oneOf "+-."])], scheme') | |
323 | val relSegment' = Save.new () | |
324 | val relSegment = | |
325 | save (oneOrMore (or [unreserved, escaped, oneOf ";@&=+$,"]), | |
326 | relSegment') | |
327 | (* val pchar = or [unreserved, escaped, oneOf ":@&=+$,", wrong] *) | |
328 | (* val param = star pchar *) | |
329 | (* val segment = seq [star pchar, star (seq [char #";", param])] *) | |
330 | (* val pathSegments = seq [segment, star (seq [char #"/", segment])] *) | |
331 | val pathSegments' = Save.new () | |
332 | val pathSegments = | |
333 | save (star (isChar (fn c => (Char.isPrint c andalso | |
334 | not (String.contains ("?#", c))))), | |
335 | pathSegments') | |
336 | val absPath = seq [char #"/", pathSegments] | |
337 | val relPath = seq [relSegment, optional absPath] | |
338 | val netPath = seq [string "//", authority, optional absPath] | |
339 | val urlcNoSlash = or [unreserved, escaped, oneOf ";?:@&=+$,"] | |
340 | val opaquePart' = Save.new () | |
341 | val opaquePart = save (seq [urlcNoSlash, star urlc], opaquePart') | |
342 | val hierPart = seq [or [netPath, absPath], | |
343 | optional (seq [char #"?", query])] | |
344 | (* netPath occurs before absPath in the following regexp because | |
345 | * you want urls like //foo.com/z to be a netPath with host foo.com and | |
346 | * not as an absPath. Fortunately, the regexp library returns the | |
347 | * first matching choice in an or. | |
348 | *) | |
349 | val relativeUrl = | |
350 | seq [or [netPath, absPath, relPath, | |
351 | null (* null added for empty urls -- these are | |
352 | * not in RFC 2396 as far as I can tell, but | |
353 | * some of their examples use them. | |
354 | *) | |
355 | ], | |
356 | optional (seq [char #"?", query])] | |
357 | val absoluteUrl = seq [scheme, char #":", or [hierPart, opaquePart]] | |
358 | val url = seq [optional (or [absoluteUrl, relativeUrl]), | |
359 | optional (seq [char #"#", fragment])] | |
360 | val url = Promise.lazy (fn () => compileDFA url) | |
361 | ||
362 | fun peekQuery (m: Match.t): string option = | |
363 | Option.map (Match.peek (m, query'), fn ss => | |
364 | let | |
365 | val s = Substring.toString ss | |
366 | in | |
367 | if !escapeQuery | |
368 | then unescape s | |
369 | else s | |
370 | end) | |
371 | ||
372 | fun getAbsPath (m: Match.t): Path.t = | |
373 | case Match.peek (m, pathSegments') of | |
374 | NONE => Error.bug "Url.Regexp.getAbsPath" | |
375 | | SOME ss => | |
376 | let | |
377 | val s = Substring.toString ss | |
378 | val (p, f) = | |
379 | List.splitLast | |
380 | (String.fields (unescape s, fn c => c = #"/")) | |
381 | in {isAbsolute = true, path = p, file = f} | |
382 | end | |
383 | end | |
384 | ||
385 | fun getMatch (m: Regexp.Match.t): t = | |
386 | let open Regexp | |
387 | val {peek, lookup, exists, ...} = Match.stringFuns m | |
388 | in if exists opaquePart' | |
389 | then | |
390 | let | |
391 | val scheme = String.toLower (lookup scheme') | |
392 | val rest = unescape (lookup opaquePart') | |
393 | in case scheme of | |
394 | "javascript" => JavaScript rest | |
395 | | "mailto" => MailTo rest | |
396 | | "news" => News rest | |
397 | | _ => Opaque {scheme = scheme, rest = rest} | |
398 | end | |
399 | else | |
400 | let | |
401 | val authority = | |
402 | if exists host' | |
403 | then | |
404 | SOME {user = Option.map (peek userinfo', unescape), | |
405 | host = lookup host', | |
406 | port = Option.map (peek port', | |
407 | valOf o Int.fromString)} | |
408 | else NONE | |
409 | fun split ss = String.fields (unescape ss, fn c => c = #"/") | |
410 | val path = | |
411 | case (Option.map (peek relSegment', unescape), | |
412 | Option.map (peek pathSegments', split)) of | |
413 | (NONE, NONE) => NONE | |
414 | | (SOME file, NONE) => SOME {isAbsolute = false, | |
415 | path = [], | |
416 | file = file} | |
417 | | (NONE, SOME ss) => | |
418 | let val (p, f) = List.splitLast ss | |
419 | in SOME {isAbsolute = true, | |
420 | path = p, file = f} | |
421 | end | |
422 | | (SOME s, SOME ss) => | |
423 | let val (p, f) = List.splitLast ss | |
424 | in SOME {isAbsolute = false, | |
425 | path = s :: p, file = f} | |
426 | end | |
427 | in T {scheme = Option.map (peek scheme', Scheme.fromString), | |
428 | authority = authority, | |
429 | path = path, | |
430 | query = peekQuery m, | |
431 | fragment = Option.map (peek fragment', unescape)} | |
432 | end | |
433 | end | |
434 | ||
435 | fun fromString (urlString: string): t option = | |
436 | Option.map (Regexp.Compiled.matchAll (Regexp.url(), urlString), getMatch) | |
437 | ||
438 | val fromString = | |
439 | Trace.trace ("Url.fromString", String.layout, Option.layout layout) | |
440 | fromString | |
441 | ||
442 | fun equals (u: t, u': t): bool = u = u' | |
443 | ||
444 | val mailto = MailTo | |
445 | val news = News | |
446 | ||
447 | (* ------------------------------------------------- *) | |
448 | (* relativize *) | |
449 | (* ------------------------------------------------- *) | |
450 | ||
451 | fun relativize {base = b, relative = r} = | |
452 | case (b, r) of | |
453 | (T {scheme = SOME s, authority = SOME a, path = p, ...}, | |
454 | T {scheme = SOME s', authority = SOME a', path = p', query = q', | |
455 | fragment = f'}) => | |
456 | if Scheme.equals (s, s') | |
457 | andalso Authority.equals (a, a') | |
458 | then let | |
459 | fun some (p, f) = | |
460 | let | |
461 | val (p, f) = | |
462 | case (p, f) of | |
463 | ([], "") => ([], ".") | |
464 | | _ => (p, f) | |
465 | in SOME {isAbsolute = false, path = p, file = f} | |
466 | end | |
467 | val p': Path.t option = | |
468 | case (p, p') of | |
469 | (NONE, NONE) => NONE | |
470 | | (NONE, SOME {path, file, ...}) => some (path, file) | |
471 | | (SOME {path, ...}, NONE) => | |
472 | some (List.map (path, fn _ => ".."), "") | |
473 | | (SOME {path = p, ...}, SOME {path = p', file, ...}) => | |
474 | let | |
475 | val (p, p') = | |
476 | List.removeCommonPrefix (p, p', String.equals) | |
477 | ||
478 | in some (List.map (p, fn _ => "..") @ p', file) | |
479 | end | |
480 | in SOME (T {scheme = NONE, authority = NONE, path = p', query = q', | |
481 | fragment = f'}) | |
482 | end | |
483 | else NONE | |
484 | | _ => NONE | |
485 | ||
486 | val relativize = | |
487 | Trace.trace ("Url.relativize", | |
488 | fn {base = b, relative = r} => Layout.tuple [layout b, layout r], | |
489 | Option.layout layout) | |
490 | relativize | |
491 | ||
492 | (* ------------------------------------------------- *) | |
493 | (* resolve *) | |
494 | (* ------------------------------------------------- *) | |
495 | ||
496 | (* The numbers in comments are rule numbers from Section 5.2 of RFC 2396. *) | |
497 | fun resolve {base, relative} = | |
498 | case (base, relative) of | |
499 | (_, T {scheme = SOME _, ...}) => relative (* 3 *) | |
500 | | (T {scheme = s, authority = a, path = p, query = q, ...}, | |
501 | T {authority = a', path = p', query = q', fragment = f', ...}) => | |
502 | let | |
503 | val (a, p, q) = | |
504 | case (a', p', q') of | |
505 | (SOME _, _, _) => (a', p', q') (* 4 *) | |
506 | | (_, NONE, NONE) => (a, p, q) (* 2 *) | |
507 | | (_, NONE, SOME _) => (* 6 *) | |
508 | let | |
509 | val p = | |
510 | Option.map (p, fn {isAbsolute, path, file} => | |
511 | {isAbsolute = isAbsolute, | |
512 | path = path, | |
513 | file = ""}) | |
514 | in (a, p, q') | |
515 | end | |
516 | | (_, SOME {isAbsolute = true, ...}, _) => (a, p', q') (* 5 *) | |
517 | | (_, SOME {isAbsolute = false, path = p', file = f'}, _) => (* 6 *) | |
518 | let | |
519 | val (p', f') = | |
520 | case p of | |
521 | NONE => (p', f') | |
522 | | SOME {path, ...} => canonicalizePath (path, p', f') | |
523 | in (a, SOME {isAbsolute = true, path = p', file = f'}, q') | |
524 | end | |
525 | in T {scheme = s, authority = a, path = p, query = q, fragment = f'} | |
526 | end | |
527 | | _ => relative | |
528 | ||
529 | val resolve = | |
530 | Trace.trace | |
531 | ("Url.resolve", | |
532 | fn {base = b, relative = r} => Layout.tuple [layout b, layout r], | |
533 | layout) | |
534 | resolve | |
535 | ||
536 | (* ------------------------------------------------- *) | |
537 | (* canonicalize *) | |
538 | (* ------------------------------------------------- *) | |
539 | ||
540 | fun canonicalize (u: t): t = | |
541 | case u of | |
542 | T {scheme, authority, path, query, fragment} => | |
543 | T {scheme = scheme, | |
544 | authority = Option.map (authority, Authority.canonicalize), | |
545 | path = Option.map (path, Path.canonicalize), | |
546 | query = query, | |
547 | fragment = fragment} | |
548 | | _ => u | |
549 | ||
550 | ||
551 | end |