Import Upstream version 20180207
[hcoop/debian/mlton.git] / lib / mlton / basic / url.sml
CommitLineData
7f918cf1
CE
1(* Copyright (C) 2009 Matthew Fluet.
2 * Copyright (C) 1999-2006 Henry Cejtin, Matthew Fluet, Suresh
3 * Jagannathan, and Stephen Weeks.
4 *
5 * MLton is released under a BSD-style license.
6 * See the file MLton-LICENSE for details.
7 *)
8
9structure Url: URL =
10struct
11
12val escapeQuery = ref true
13
14structure Char =
15 struct
16 open Char
17
18 val radix: int = 16
19
20 fun fromHexChars (hi: t, lo: t) =
21 chr (toHexDigit hi * radix + toHexDigit lo)
22
23 fun toHexChars (c: t): t * t =
24 let
25 val (hi, lo) = Int.divMod (ord c, radix)
26 in
27 (fromHexDigit hi, fromHexDigit lo)
28 end
29
30 fun escapeHex (c: t): string =
31 let
32 val (hi, lo) = toHexChars c
33 in
34 implode [#"%", hi, lo]
35 end
36
37 end
38
39fun unescape (s: string): string =
40 let
41 fun sub i = String.sub (s, i)
42 val n = String.size s
43 fun loop (i, cs) =
44 if i >= n
45 then implode (rev cs)
46 else let val c = sub i
47 in if c = #"%"
48 then loop (i + 3,
49 Char.fromHexChars (sub (i + 1), sub (i + 2)) :: cs)
50 else loop (i + 1, c :: cs)
51 end
52 in loop (0, [])
53 end
54
55val shouldEscape: char -> bool =
56 Char.memoize (fn c =>
57 not (Char.isGraph c)
58 orelse c = Char.dquote
59 orelse String.contains ("?<>#% {}|\\^ []`", c))
60
61val shouldEscape =
62 Trace.trace ("Url.shouldEscape", Char.layout, Bool.layout) shouldEscape
63
64fun escape s =
65 String.translate
66 (s, fn c =>
67 if shouldEscape c
68 then Char.escapeHex c
69 else str c)
70
71structure Scheme =
72 struct
73 datatype t =
74 File
75 | Ftp
76 | Gopher
77 | Http
78 | Https
79 | Telnet
80
81 val map =
82 [("file", File),
83 ("ftp", Ftp),
84 ("gopher", Gopher),
85 ("http", Http),
86 ("https", Https),
87 ("telnet", Telnet)]
88
89 val fromString =
90 String.memoizeList (fn _ => Error.bug "Url.Scheme.fromString", map)
91
92 val equals: t * t -> bool = op =
93
94 fun toString s =
95 #1 (valOf (List.peek (map, fn (_, s') => equals (s, s'))))
96
97 val layout = Layout.str o toString
98 end
99
100structure Authority =
101 struct
102 type t = {user: string option,
103 host: string,
104 port: int option}
105
106 fun layout ({user, host, port}: t) =
107 Layout.record [("user", Option.layout String.layout user),
108 ("host", String.layout host),
109 ("port", Option.layout Int.layout port)]
110
111 fun canonicalize {user, host, port} =
112 {user = Option.map (user, String.toLower),
113 host = String.toLower host,
114 port = port}
115
116 fun equals ({user = u, host = h, port = p}: t,
117 {user = u', host = h', port = p'}: t): bool =
118 Option.equals (u, u', String.equals)
119 andalso String.toLower h = String.toLower h'
120 andalso Option.equals (p, p', Port.equals)
121
122 val equals =
123 Trace.trace2 ("Url.Authority.equals", layout, layout, Bool.layout) equals
124 end
125
126(* The numbers in comments are rule numbers from Section 5.2 of RFC 2396. *)
127(* canonicalizePath (p1, p2, f)
128 * Assume p1 is already canonicalized.
129 *)
130fun canonicalizePath (p1: string list, p2: string list, f: string) =
131 let
132 fun loop (r, ac) =
133 case r of
134 [] =>
135 (case f of
136 "." => (rev ac, "") (* 6d *)
137 | ".." => (case ac of
138 [] => ([], "..")
139 | ".." :: _ => (rev ac, "..")
140 | _ :: ac => (rev ac, "")) (* 6f *)
141 | _ => (rev ac, f))
142 | "" :: r => loop (r, ac)
143 | "." :: r => loop (r, ac) (* 6c *)
144 | ".." :: r => loop (r,
145 case ac of
146 [] => [".."]
147 | ".." :: _ => ".." :: ac
148 | _ :: ac => ac) (* 6e *)
149 | s :: r => loop (r, s :: ac)
150 in loop (p2, rev p1)
151 end
152
153structure Path =
154 struct
155 type t = {file: string,
156 isAbsolute: bool,
157 path: string list}
158
159 local
160 fun make f (p: t) = f p
161 in
162 val file = make #file
163 val isAbsolute = make #isAbsolute
164 val path = make #path
165 end
166
167 val root = {isAbsolute = true,
168 path = [],
169 file = ""}
170
171 fun canonicalize {isAbsolute = i, path = p, file = f} =
172 let val (p, f) = canonicalizePath ([], p, f)
173 in {isAbsolute = i, path = p, file = f}
174 end
175
176 fun toString ({isAbsolute, path, file}) =
177 concat [if isAbsolute then "/" else "",
178 escape (concat (List.separate (path @ [file], "/")))]
179
180 val layout = Layout.str o toString
181 end
182
183datatype t =
184 T of {authority: Authority.t option,
185 fragment: string option,
186 path: Path.t option,
187 query: string option,
188 scheme: Scheme.t option} (* NONE in relative urls *)
189 | JavaScript of string
190 | MailTo of string
191 | News of string
192 | Opaque of {scheme: string,
193 rest: string}
194
195fun addQuery (u: t, q) =
196 case u of
197 T {authority, fragment, path, query, scheme}=>
198 if isSome query
199 then Error.bug "Url.addQuery"
200 else
201 T {authority = authority,
202 fragment = fragment,
203 path = path,
204 query = SOME q,
205 scheme = scheme}
206 | _ => Error.bug "Url.addQuery"
207
208fun host (u: t): string =
209 case u of
210 T {authority = SOME {host, ...}, ...} => host
211 | _ => Error.bug "Url.host"
212
213fun path (u: t): Path.t =
214 case u of
215 T {path = SOME p, ...} => p
216 | _ => Error.bug "Url.path"
217
218fun mo (opt, f) =
219 case opt of
220 NONE => ""
221 | SOME x => f x
222
223fun toString url =
224 case url of
225 T {scheme, authority, path, query, fragment} =>
226 concat [mo (scheme, fn s => concat [Scheme.toString s, ":"]),
227 mo (authority, fn {user, host, port} =>
228 concat ["//",
229 mo (user, fn u => concat [escape u, "@"]),
230 host,
231 mo (port, fn p => concat [":", Int.toString p])]),
232 mo (path, Path.toString),
233 mo (query, fn q => concat ["?", if !escapeQuery then escape q
234 else q]),
235 mo (fragment, fn f => concat ["#", escape f])
236 ]
237 | JavaScript s => concat ["javascript:", escape s]
238 | MailTo email => concat ["mailto:", escape email]
239 | News group => concat ["news:", escape group]
240 | Opaque {scheme, rest} => concat [scheme, ":", escape rest]
241
242val layout = Layout.str o toString
243
244val toString =
245 Trace.trace ("Url.toString", layout, String.layout) toString
246
247val layout =
248 fn T {scheme, authority, path, query, fragment} =>
249 Layout.record [("scheme", Option.layout Scheme.layout scheme),
250 ("authority", Option.layout Authority.layout authority),
251 ("path", Option.layout Path.layout path),
252 ("query", Option.layout String.layout query),
253 ("fragment", Option.layout String.layout fragment)]
254 | u => layout u
255
256val equals: t * t -> bool = op =
257
258structure Regexp =
259 struct
260 open Regexp
261
262 val digit = isChar Char.isDigit
263 val upalpha = isChar Char.isUpper
264 val lowalpha = isChar Char.isLower
265 val alpha = isChar Char.isAlpha
266 val alphanum = isChar Char.isAlphaNum
267 val hex = isChar Char.isHexDigit
268 val escaped = seq [char #"%", hex, hex]
269 val mark = oneOf "-_.!~*' ()"
270 val unreserved = or [alphanum, mark]
271 val reserved = oneOf ";/?:@&=+$,"
272 val printable = isChar Char.isPrint
273 (*val urlc = or [reserved, unreserved, escaped]*)
274 (* It's pointless to follow the spec on urlc, which rules out lots of
275 * printable characters. Lot's of sites use printable characters outside
276 * the spec, and browsers handle them, so we should too.
277 *)
278 val urlc = printable
279 val fragment' = Save.new ()
280 val fragment = save (star urlc, fragment')
281 val query' = Save.new ()
282(* The official definition of query says urlc*, but this doesn't work with
283 * our expanded meaning of urlc = printable, since then the query consumes
284 * the fragment.
285 *)
286(* val query = save (star urlc, query') *)
287 val query = save (star (isChar (fn c => Char.isPrint c
288 andalso c <> #"#")),
289 query')
290 val port' = Save.new ()
291 val port = save (star digit, port')
292 val IPv4address = seq [oneOrMore digit, char #".",
293 oneOrMore digit, char #".",
294 oneOrMore digit, char #".",
295 oneOrMore digit]
296 val toplabel = or [alpha,
297 seq [alpha, star (or [alphanum, char #"-"]), alphanum]]
298 val domainlabel = or [alphanum,
299 seq [alphanum,
300 star (or [alphanum, char #"-"]),
301 alphanum]]
302 val hostname = seq [star (seq [domainlabel, char #"."]),
303 toplabel,
304 optional (char #".")]
305 val host' = Save.new ()
306 val host = save (or [hostname, IPv4address], host')
307 val hostport = seq [host, optional (seq [char #":", port])]
308 val userinfo' = Save.new ()
309 val userinfo =
310 save (star (or [unreserved, escaped, oneOf ";:&=+$"]), userinfo')
311 val server = optional (seq [optional (seq [userinfo, char #"@"]),
312 hostport])
313 val regName' = Save.new ()
314 val regName =
315 save (oneOrMore (or [unreserved,
316 escaped,
317 oneOf "$,;:@&=+"]),
318 regName')
319 val authority = or [server, regName]
320 val scheme' = Save.new ()
321 val scheme =
322 save (seq [alpha, star (or [alpha, digit, oneOf "+-."])], scheme')
323 val relSegment' = Save.new ()
324 val relSegment =
325 save (oneOrMore (or [unreserved, escaped, oneOf ";@&=+$,"]),
326 relSegment')
327 (* val pchar = or [unreserved, escaped, oneOf ":@&=+$,", wrong] *)
328 (* val param = star pchar *)
329 (* val segment = seq [star pchar, star (seq [char #";", param])] *)
330 (* val pathSegments = seq [segment, star (seq [char #"/", segment])] *)
331 val pathSegments' = Save.new ()
332 val pathSegments =
333 save (star (isChar (fn c => (Char.isPrint c andalso
334 not (String.contains ("?#", c))))),
335 pathSegments')
336 val absPath = seq [char #"/", pathSegments]
337 val relPath = seq [relSegment, optional absPath]
338 val netPath = seq [string "//", authority, optional absPath]
339 val urlcNoSlash = or [unreserved, escaped, oneOf ";?:@&=+$,"]
340 val opaquePart' = Save.new ()
341 val opaquePart = save (seq [urlcNoSlash, star urlc], opaquePart')
342 val hierPart = seq [or [netPath, absPath],
343 optional (seq [char #"?", query])]
344 (* netPath occurs before absPath in the following regexp because
345 * you want urls like //foo.com/z to be a netPath with host foo.com and
346 * not as an absPath. Fortunately, the regexp library returns the
347 * first matching choice in an or.
348 *)
349 val relativeUrl =
350 seq [or [netPath, absPath, relPath,
351 null (* null added for empty urls -- these are
352 * not in RFC 2396 as far as I can tell, but
353 * some of their examples use them.
354 *)
355 ],
356 optional (seq [char #"?", query])]
357 val absoluteUrl = seq [scheme, char #":", or [hierPart, opaquePart]]
358 val url = seq [optional (or [absoluteUrl, relativeUrl]),
359 optional (seq [char #"#", fragment])]
360 val url = Promise.lazy (fn () => compileDFA url)
361
362 fun peekQuery (m: Match.t): string option =
363 Option.map (Match.peek (m, query'), fn ss =>
364 let
365 val s = Substring.toString ss
366 in
367 if !escapeQuery
368 then unescape s
369 else s
370 end)
371
372 fun getAbsPath (m: Match.t): Path.t =
373 case Match.peek (m, pathSegments') of
374 NONE => Error.bug "Url.Regexp.getAbsPath"
375 | SOME ss =>
376 let
377 val s = Substring.toString ss
378 val (p, f) =
379 List.splitLast
380 (String.fields (unescape s, fn c => c = #"/"))
381 in {isAbsolute = true, path = p, file = f}
382 end
383 end
384
385fun getMatch (m: Regexp.Match.t): t =
386 let open Regexp
387 val {peek, lookup, exists, ...} = Match.stringFuns m
388 in if exists opaquePart'
389 then
390 let
391 val scheme = String.toLower (lookup scheme')
392 val rest = unescape (lookup opaquePart')
393 in case scheme of
394 "javascript" => JavaScript rest
395 | "mailto" => MailTo rest
396 | "news" => News rest
397 | _ => Opaque {scheme = scheme, rest = rest}
398 end
399 else
400 let
401 val authority =
402 if exists host'
403 then
404 SOME {user = Option.map (peek userinfo', unescape),
405 host = lookup host',
406 port = Option.map (peek port',
407 valOf o Int.fromString)}
408 else NONE
409 fun split ss = String.fields (unescape ss, fn c => c = #"/")
410 val path =
411 case (Option.map (peek relSegment', unescape),
412 Option.map (peek pathSegments', split)) of
413 (NONE, NONE) => NONE
414 | (SOME file, NONE) => SOME {isAbsolute = false,
415 path = [],
416 file = file}
417 | (NONE, SOME ss) =>
418 let val (p, f) = List.splitLast ss
419 in SOME {isAbsolute = true,
420 path = p, file = f}
421 end
422 | (SOME s, SOME ss) =>
423 let val (p, f) = List.splitLast ss
424 in SOME {isAbsolute = false,
425 path = s :: p, file = f}
426 end
427 in T {scheme = Option.map (peek scheme', Scheme.fromString),
428 authority = authority,
429 path = path,
430 query = peekQuery m,
431 fragment = Option.map (peek fragment', unescape)}
432 end
433 end
434
435fun fromString (urlString: string): t option =
436 Option.map (Regexp.Compiled.matchAll (Regexp.url(), urlString), getMatch)
437
438val fromString =
439 Trace.trace ("Url.fromString", String.layout, Option.layout layout)
440 fromString
441
442fun equals (u: t, u': t): bool = u = u'
443
444val mailto = MailTo
445val news = News
446
447(* ------------------------------------------------- *)
448(* relativize *)
449(* ------------------------------------------------- *)
450
451fun relativize {base = b, relative = r} =
452 case (b, r) of
453 (T {scheme = SOME s, authority = SOME a, path = p, ...},
454 T {scheme = SOME s', authority = SOME a', path = p', query = q',
455 fragment = f'}) =>
456 if Scheme.equals (s, s')
457 andalso Authority.equals (a, a')
458 then let
459 fun some (p, f) =
460 let
461 val (p, f) =
462 case (p, f) of
463 ([], "") => ([], ".")
464 | _ => (p, f)
465 in SOME {isAbsolute = false, path = p, file = f}
466 end
467 val p': Path.t option =
468 case (p, p') of
469 (NONE, NONE) => NONE
470 | (NONE, SOME {path, file, ...}) => some (path, file)
471 | (SOME {path, ...}, NONE) =>
472 some (List.map (path, fn _ => ".."), "")
473 | (SOME {path = p, ...}, SOME {path = p', file, ...}) =>
474 let
475 val (p, p') =
476 List.removeCommonPrefix (p, p', String.equals)
477
478 in some (List.map (p, fn _ => "..") @ p', file)
479 end
480 in SOME (T {scheme = NONE, authority = NONE, path = p', query = q',
481 fragment = f'})
482 end
483 else NONE
484 | _ => NONE
485
486val relativize =
487 Trace.trace ("Url.relativize",
488 fn {base = b, relative = r} => Layout.tuple [layout b, layout r],
489 Option.layout layout)
490 relativize
491
492(* ------------------------------------------------- *)
493(* resolve *)
494(* ------------------------------------------------- *)
495
496(* The numbers in comments are rule numbers from Section 5.2 of RFC 2396. *)
497fun resolve {base, relative} =
498 case (base, relative) of
499 (_, T {scheme = SOME _, ...}) => relative (* 3 *)
500 | (T {scheme = s, authority = a, path = p, query = q, ...},
501 T {authority = a', path = p', query = q', fragment = f', ...}) =>
502 let
503 val (a, p, q) =
504 case (a', p', q') of
505 (SOME _, _, _) => (a', p', q') (* 4 *)
506 | (_, NONE, NONE) => (a, p, q) (* 2 *)
507 | (_, NONE, SOME _) => (* 6 *)
508 let
509 val p =
510 Option.map (p, fn {isAbsolute, path, file} =>
511 {isAbsolute = isAbsolute,
512 path = path,
513 file = ""})
514 in (a, p, q')
515 end
516 | (_, SOME {isAbsolute = true, ...}, _) => (a, p', q') (* 5 *)
517 | (_, SOME {isAbsolute = false, path = p', file = f'}, _) => (* 6 *)
518 let
519 val (p', f') =
520 case p of
521 NONE => (p', f')
522 | SOME {path, ...} => canonicalizePath (path, p', f')
523 in (a, SOME {isAbsolute = true, path = p', file = f'}, q')
524 end
525 in T {scheme = s, authority = a, path = p, query = q, fragment = f'}
526 end
527 | _ => relative
528
529val resolve =
530 Trace.trace
531 ("Url.resolve",
532 fn {base = b, relative = r} => Layout.tuple [layout b, layout r],
533 layout)
534 resolve
535
536(* ------------------------------------------------- *)
537(* canonicalize *)
538(* ------------------------------------------------- *)
539
540fun canonicalize (u: t): t =
541 case u of
542 T {scheme, authority, path, query, fragment} =>
543 T {scheme = scheme,
544 authority = Option.map (authority, Authority.canonicalize),
545 path = Option.map (path, Path.canonicalize),
546 query = query,
547 fragment = fragment}
548 | _ => u
549
550
551end