Commit | Line | Data |
---|---|---|
7f918cf1 CE |
1 | (* Copyright (C) 1999-2007 Henry Cejtin, Matthew Fluet, Suresh |
2 | * Jagannathan, and Stephen Weeks. | |
3 | * Copyright (C) 1997-2000 NEC Research Institute. | |
4 | * | |
5 | * MLton is released under a BSD-style license. | |
6 | * See the file MLton-LICENSE for details. | |
7 | *) | |
8 | ||
9 | signature CHAR_ARG = | |
10 | sig | |
11 | structure PreChar : PRE_CHAR | |
12 | structure CharVector: EQTYPE_MONO_VECTOR_EXTRA | |
13 | structure CharArray: MONO_ARRAY_EXTRA | |
14 | sharing type PreChar.char = CharVector.elem = CharArray.elem | |
15 | sharing type PreChar.string = CharVector.vector = CharArray.vector | |
16 | end | |
17 | ||
18 | functor CharFn(Arg : CHAR_ARG) | |
19 | :> CHAR_EXTRA | |
20 | where type char = Arg.PreChar.char | |
21 | where type string = Arg.PreChar.string = | |
22 | struct | |
23 | open Arg.PreChar | |
24 | ||
25 | type string = Arg.CharVector.vector | |
26 | val maxOrd: int = numChars - 1 | |
27 | ||
28 | val fromString = Arg.CharVector.fromPoly o | |
29 | Vector.map (fn x => fromChar x) o | |
30 | String.toPoly | |
31 | ||
32 | fun succ c = | |
33 | if Primitive.Controls.safe | |
34 | andalso c = maxChar | |
35 | then raise Chr | |
36 | else chrUnsafe (Int.+ (ord c, 1)) | |
37 | ||
38 | fun pred c = | |
39 | if Primitive.Controls.safe | |
40 | andalso c = minChar | |
41 | then raise Chr | |
42 | else chrUnsafe (Int.- (ord c, 1)) | |
43 | ||
44 | fun chrOpt c = | |
45 | if Primitive.Controls.safe | |
46 | andalso Int.gtu (c, maxOrd) | |
47 | then NONE | |
48 | else SOME (chrUnsafe c) | |
49 | ||
50 | fun chr c = | |
51 | case chrOpt c of | |
52 | NONE => raise Chr | |
53 | | SOME c => c | |
54 | ||
55 | (* To implement character classes, we cannot use lookup tables on the | |
56 | * order of the number of characters. We don't want to scan the string | |
57 | * each time, so instead we'll sort it and use binary search. | |
58 | *) | |
59 | fun contains s = | |
60 | let | |
61 | val a = Array.tabulate (Arg.CharVector.length s, | |
62 | fn i => Arg.CharVector.sub (s, i)) | |
63 | val () = Heap.heapSort (a, op <) | |
64 | in | |
65 | fn c => | |
66 | let | |
67 | val x = Heap.binarySearch (a, fn d => d < c) | |
68 | in | |
69 | if x = Array.length a then false else | |
70 | Array.sub (a, x) = c | |
71 | end | |
72 | end | |
73 | ||
74 | fun notContains s = not o contains s | |
75 | ||
76 | val c = fromChar | |
77 | val ( la, lA, lf, lF, lz, lZ, l0, l9, lSPACE,lBANG, lTIL, lTAB, lCR, lDEL) = | |
78 | (c#"a", c#"A", c#"f", c#"F", c#"z", c#"Z", c#"0", c#"9", c#" ", c#"!", c#"~", c#"\t", c#"\r", c#"\127") | |
79 | ||
80 | (* Range comparisons don't need tables! It's faster to just compare. *) | |
81 | fun isLower c = la <= c andalso c <= lz | |
82 | fun isUpper c = c <= lZ andalso lA <= c (* More discriminating first! *) | |
83 | fun isDigit c = c <= l9 andalso l0 <= c (* More discriminating first! *) | |
84 | fun isGraph c = lBANG <= c andalso c <= lTIL | |
85 | fun isPrint c = lSPACE <= c andalso c <= lTIL | |
86 | fun isCntrl c = c < lSPACE orelse c = lDEL | |
87 | fun isAscii c = c <= lDEL | |
88 | ||
89 | (* These take advantage of ASCII ordering to minimize comparisons. *) | |
90 | fun isAlpha c = if la <= c then c <= lz else lA <= c andalso c <= lZ | |
91 | fun isAlphaNum c = | |
92 | if lA <= c then | |
93 | if la <= c then c <= lz else c <= lZ | |
94 | else | |
95 | l0 <= c andalso c <= l9 | |
96 | fun isHexDigit c = | |
97 | if lA <= c then | |
98 | if la <= c then c <= lf else c <= lF | |
99 | else | |
100 | l0 <= c andalso c <= l9 | |
101 | fun isSpace c = if lCR < c then c = lSPACE else lTAB <= c | |
102 | fun isPunct c = isGraph c andalso not (isAlphaNum c) | |
103 | ||
104 | local | |
105 | fun make (test, diff) c = | |
106 | if test c then chrUnsafe (Int.+? (ord c, diff)) else c | |
107 | val diff = Int.- (ord lA, ord la) | |
108 | in | |
109 | val toLower = make (isUpper, Int.~ diff) | |
110 | val toUpper = make (isLower, diff) | |
111 | end | |
112 | ||
113 | fun control reader state = | |
114 | case reader state of | |
115 | NONE => NONE | |
116 | | SOME (c, state) => | |
117 | if Char.<= (#"@", c) andalso Char.<= (c, #"_") | |
118 | then SOME (chr (Int.-? (Char.ord c, Char.ord #"@")), state) | |
119 | else NONE | |
120 | ||
121 | fun formatChar reader state = | |
122 | case reader state of | |
123 | NONE => NONE | |
124 | | SOME (c, state) => | |
125 | if StringCvt.isSpace c | |
126 | then SOME ((), state) | |
127 | else NONE | |
128 | ||
129 | fun formatChars reader = | |
130 | let | |
131 | fun loop state = | |
132 | case formatChar reader state of | |
133 | NONE => state | |
134 | | SOME ((), state) => loop state | |
135 | in | |
136 | loop | |
137 | end | |
138 | ||
139 | val 'a formatSequences: (Char.char, 'a) StringCvt.reader -> 'a -> 'a = | |
140 | fn reader => | |
141 | let | |
142 | fun loop state = | |
143 | case reader state of | |
144 | SOME (#"\\", state1) => | |
145 | (case formatChar reader state1 of | |
146 | NONE => state | |
147 | | SOME ((), state2) => | |
148 | let | |
149 | val state3 = formatChars reader state2 | |
150 | in | |
151 | case reader state3 of | |
152 | SOME (#"\\", state4) => loop state4 | |
153 | | _ => state | |
154 | end) | |
155 | | _ => state | |
156 | in | |
157 | loop | |
158 | end | |
159 | ||
160 | fun 'a scan (reader: (Char.char, 'a) StringCvt.reader) | |
161 | : (char, 'a) StringCvt.reader = | |
162 | let | |
163 | val escape : (char, 'a) StringCvt.reader = | |
164 | fn state => | |
165 | case reader state of | |
166 | NONE => NONE | |
167 | | SOME (c, state') => | |
168 | let | |
169 | fun yes c = SOME (fromChar c, state') | |
170 | in | |
171 | case c of | |
172 | #"a" => yes #"\a" | |
173 | | #"b" => yes #"\b" | |
174 | | #"t" => yes #"\t" | |
175 | | #"n" => yes #"\n" | |
176 | | #"v" => yes #"\v" | |
177 | | #"f" => yes #"\f" | |
178 | | #"r" => yes #"\r" | |
179 | | #"\\" => yes #"\\" | |
180 | | #"\"" => yes #"\"" | |
181 | | #"^" => control reader state' | |
182 | | #"u" => | |
183 | Reader.mapOpt chrOpt | |
184 | (StringCvt.digitsExact (StringCvt.HEX, 4) reader) | |
185 | state' | |
186 | | #"U" => | |
187 | Reader.mapOpt chrOpt | |
188 | (StringCvt.digitsExact (StringCvt.HEX, 8) reader) | |
189 | state' | |
190 | | _ => (* 3 decimal digits *) | |
191 | Reader.mapOpt chrOpt | |
192 | (StringCvt.digitsExact (StringCvt.DEC, 3) | |
193 | reader) | |
194 | state | |
195 | end | |
196 | val main: (char, 'a) StringCvt.reader = | |
197 | fn state => | |
198 | let | |
199 | val state = formatSequences reader state | |
200 | in | |
201 | case reader state of | |
202 | NONE => NONE | |
203 | | SOME (c, state) => | |
204 | (* isPrint doesn't exist. yuck: *) | |
205 | if Char.>= (c, #" ") andalso Char.<= (c, #"~") | |
206 | then | |
207 | case c of | |
208 | #"\\" => escape state | |
209 | | #"\"" => NONE | |
210 | | _ => SOME (fromChar c, formatSequences reader state) | |
211 | else NONE | |
212 | end | |
213 | in | |
214 | main | |
215 | end | |
216 | ||
217 | val fromString = StringCvt.scanString scan | |
218 | ||
219 | fun 'a scanC (reader: (Char.char, 'a) StringCvt.reader) | |
220 | : (char, 'a) StringCvt.reader = | |
221 | let | |
222 | val rec escape = | |
223 | fn state => | |
224 | case reader state of | |
225 | NONE => NONE | |
226 | | SOME (c, state') => | |
227 | let fun yes c = SOME (fromChar c, state') | |
228 | in case c of | |
229 | #"a" => yes #"\a" | |
230 | | #"b" => yes #"\b" | |
231 | | #"t" => yes #"\t" | |
232 | | #"n" => yes #"\n" | |
233 | | #"v" => yes #"\v" | |
234 | | #"f" => yes #"\f" | |
235 | | #"r" => yes #"\r" | |
236 | | #"?" => yes #"?" | |
237 | | #"\\" => yes #"\\" | |
238 | | #"\"" => yes #"\"" | |
239 | | #"'" => yes #"'" | |
240 | | #"^" => control reader state' | |
241 | | #"x" => | |
242 | Reader.mapOpt chrOpt | |
243 | (StringCvt.digits StringCvt.HEX reader) | |
244 | state' | |
245 | | #"u" => | |
246 | Reader.mapOpt chrOpt | |
247 | (StringCvt.digitsExact (StringCvt.HEX, 4) reader) | |
248 | state' | |
249 | | #"U" => | |
250 | Reader.mapOpt chrOpt | |
251 | (StringCvt.digitsExact (StringCvt.HEX, 8) reader) | |
252 | state' | |
253 | | _ => | |
254 | Reader.mapOpt chrOpt | |
255 | (StringCvt.digitsPlus (StringCvt.OCT, 3) reader) | |
256 | state | |
257 | end | |
258 | and main = | |
259 | fn NONE => NONE | |
260 | | SOME (c, state) => | |
261 | (* yuck. isPrint is not defined yet: *) | |
262 | if Char.>= (c, #" ") andalso Char.<= (c, #"~") | |
263 | then | |
264 | case c of | |
265 | #"\\" => escape state | |
266 | | _ => SOME (fromChar c, state) | |
267 | else NONE | |
268 | in | |
269 | main o reader | |
270 | end | |
271 | ||
272 | val fromCString = StringCvt.scanString scanC | |
273 | ||
274 | fun padLeft (s: String.string, n: int): String.string = | |
275 | let | |
276 | val m = String.size s | |
277 | val diff = Int.-? (n, m) | |
278 | in if Int.> (diff, 0) | |
279 | then String.concat [String.new (diff, #"0"), s] | |
280 | else if diff = 0 | |
281 | then s | |
282 | else raise Fail "padLeft" | |
283 | end | |
284 | ||
285 | fun unicodeEscape ord = | |
286 | if Int.< (ord, 65536) | |
287 | then String.concat | |
288 | ["\\u", padLeft (Int.fmt StringCvt.HEX ord, 4)] | |
289 | else String.concat | |
290 | ["\\U", padLeft (Int.fmt StringCvt.HEX ord, 8)] | |
291 | ||
292 | fun toString c = | |
293 | let | |
294 | val ord = ord c | |
295 | in | |
296 | if isPrint c | |
297 | then | |
298 | case ord of | |
299 | 92 (* #"\\" *) => "\\\\" | |
300 | | 34 (* #"\"" *) => "\\\"" | |
301 | | _ => String.new (1, Char.chrUnsafe ord) | |
302 | (* ^^^^ safe b/c isPrint < 128 *) | |
303 | else | |
304 | case ord of | |
305 | 7 (* #"\a" *) => "\\a" | |
306 | | 8 (* #"\b" *) => "\\b" | |
307 | | 9 (* #"\t" *) => "\\t" | |
308 | | 10 (* #"\n" *) => "\\n" | |
309 | | 11 (* #"\v" *) => "\\v" | |
310 | | 12 (* #"\f" *) => "\\f" | |
311 | | 13 (* #"\r" *) => "\\r" | |
312 | | _ => | |
313 | if Int.< (ord, 32) | |
314 | then String.concat | |
315 | ["\\^", String.new | |
316 | (1, Char.chrUnsafe | |
317 | (Int.+? (ord, 64 (* #"@" *) )))] | |
318 | else if Int.< (ord, 256) | |
319 | then String.concat | |
320 | ["\\", padLeft (Int.fmt StringCvt.DEC ord, 3)] | |
321 | else unicodeEscape ord | |
322 | end | |
323 | ||
324 | fun toCString c = | |
325 | let | |
326 | val ord = ord c | |
327 | in | |
328 | if isPrint c | |
329 | then | |
330 | case ord of | |
331 | 92 (* #"\\" *) => "\\\\" | |
332 | | 34 (* #"\"" *) => "\\\"" | |
333 | | 63 (* #"?" *) => "\\?" | |
334 | | 39 (* #"'" *) => "\\'" | |
335 | | _ => String.new (1, Char.chrUnsafe ord) | |
336 | else | |
337 | case ord of | |
338 | 7 (* #"\a" *) => "\\a" | |
339 | | 8 (* #"\b" *) => "\\b" | |
340 | | 9 (* #"\t" *) => "\\t" | |
341 | | 10 (* #"\n" *) => "\\n" | |
342 | | 11 (* #"\v" *) => "\\v" | |
343 | | 12 (* #"\f" *) => "\\f" | |
344 | | 13 (* #"\r" *) => "\\r" | |
345 | | _ => | |
346 | if Int.< (ord, 256) | |
347 | then String.concat | |
348 | ["\\", padLeft (Int.fmt StringCvt.OCT ord, 3)] | |
349 | else unicodeEscape ord | |
350 | end | |
351 | end | |
352 | ||
353 | structure CharArg : CHAR_ARG = | |
354 | struct | |
355 | structure PreChar = Char | |
356 | structure CharVector = CharVector | |
357 | structure CharArray = CharArray | |
358 | end | |
359 | ||
360 | structure WideCharArg : CHAR_ARG = | |
361 | struct | |
362 | structure PreChar = WideChar | |
363 | structure CharVector = WideCharVector | |
364 | structure CharArray = WideCharArray | |
365 | end | |
366 | ||
367 | structure Char : CHAR_EXTRA = CharFn(CharArg) | |
368 | structure WideChar : CHAR_EXTRA = CharFn(WideCharArg) |