Import Upstream version 20180207
[hcoop/debian/mlton.git] / basis-library / text / char.sml
1 (* Copyright (C) 1999-2007 Henry Cejtin, Matthew Fluet, Suresh
2 * Jagannathan, and Stephen Weeks.
3 * Copyright (C) 1997-2000 NEC Research Institute.
4 *
5 * MLton is released under a BSD-style license.
6 * See the file MLton-LICENSE for details.
7 *)
8
9 signature CHAR_ARG =
10 sig
11 structure PreChar : PRE_CHAR
12 structure CharVector: EQTYPE_MONO_VECTOR_EXTRA
13 structure CharArray: MONO_ARRAY_EXTRA
14 sharing type PreChar.char = CharVector.elem = CharArray.elem
15 sharing type PreChar.string = CharVector.vector = CharArray.vector
16 end
17
18 functor CharFn(Arg : CHAR_ARG)
19 :> CHAR_EXTRA
20 where type char = Arg.PreChar.char
21 where type string = Arg.PreChar.string =
22 struct
23 open Arg.PreChar
24
25 type string = Arg.CharVector.vector
26 val maxOrd: int = numChars - 1
27
28 val fromString = Arg.CharVector.fromPoly o
29 Vector.map (fn x => fromChar x) o
30 String.toPoly
31
32 fun succ c =
33 if Primitive.Controls.safe
34 andalso c = maxChar
35 then raise Chr
36 else chrUnsafe (Int.+ (ord c, 1))
37
38 fun pred c =
39 if Primitive.Controls.safe
40 andalso c = minChar
41 then raise Chr
42 else chrUnsafe (Int.- (ord c, 1))
43
44 fun chrOpt c =
45 if Primitive.Controls.safe
46 andalso Int.gtu (c, maxOrd)
47 then NONE
48 else SOME (chrUnsafe c)
49
50 fun chr c =
51 case chrOpt c of
52 NONE => raise Chr
53 | SOME c => c
54
55 (* To implement character classes, we cannot use lookup tables on the
56 * order of the number of characters. We don't want to scan the string
57 * each time, so instead we'll sort it and use binary search.
58 *)
59 fun contains s =
60 let
61 val a = Array.tabulate (Arg.CharVector.length s,
62 fn i => Arg.CharVector.sub (s, i))
63 val () = Heap.heapSort (a, op <)
64 in
65 fn c =>
66 let
67 val x = Heap.binarySearch (a, fn d => d < c)
68 in
69 if x = Array.length a then false else
70 Array.sub (a, x) = c
71 end
72 end
73
74 fun notContains s = not o contains s
75
76 val c = fromChar
77 val ( la, lA, lf, lF, lz, lZ, l0, l9, lSPACE,lBANG, lTIL, lTAB, lCR, lDEL) =
78 (c#"a", c#"A", c#"f", c#"F", c#"z", c#"Z", c#"0", c#"9", c#" ", c#"!", c#"~", c#"\t", c#"\r", c#"\127")
79
80 (* Range comparisons don't need tables! It's faster to just compare. *)
81 fun isLower c = la <= c andalso c <= lz
82 fun isUpper c = c <= lZ andalso lA <= c (* More discriminating first! *)
83 fun isDigit c = c <= l9 andalso l0 <= c (* More discriminating first! *)
84 fun isGraph c = lBANG <= c andalso c <= lTIL
85 fun isPrint c = lSPACE <= c andalso c <= lTIL
86 fun isCntrl c = c < lSPACE orelse c = lDEL
87 fun isAscii c = c <= lDEL
88
89 (* These take advantage of ASCII ordering to minimize comparisons. *)
90 fun isAlpha c = if la <= c then c <= lz else lA <= c andalso c <= lZ
91 fun isAlphaNum c =
92 if lA <= c then
93 if la <= c then c <= lz else c <= lZ
94 else
95 l0 <= c andalso c <= l9
96 fun isHexDigit c =
97 if lA <= c then
98 if la <= c then c <= lf else c <= lF
99 else
100 l0 <= c andalso c <= l9
101 fun isSpace c = if lCR < c then c = lSPACE else lTAB <= c
102 fun isPunct c = isGraph c andalso not (isAlphaNum c)
103
104 local
105 fun make (test, diff) c =
106 if test c then chrUnsafe (Int.+? (ord c, diff)) else c
107 val diff = Int.- (ord lA, ord la)
108 in
109 val toLower = make (isUpper, Int.~ diff)
110 val toUpper = make (isLower, diff)
111 end
112
113 fun control reader state =
114 case reader state of
115 NONE => NONE
116 | SOME (c, state) =>
117 if Char.<= (#"@", c) andalso Char.<= (c, #"_")
118 then SOME (chr (Int.-? (Char.ord c, Char.ord #"@")), state)
119 else NONE
120
121 fun formatChar reader state =
122 case reader state of
123 NONE => NONE
124 | SOME (c, state) =>
125 if StringCvt.isSpace c
126 then SOME ((), state)
127 else NONE
128
129 fun formatChars reader =
130 let
131 fun loop state =
132 case formatChar reader state of
133 NONE => state
134 | SOME ((), state) => loop state
135 in
136 loop
137 end
138
139 val 'a formatSequences: (Char.char, 'a) StringCvt.reader -> 'a -> 'a =
140 fn reader =>
141 let
142 fun loop state =
143 case reader state of
144 SOME (#"\\", state1) =>
145 (case formatChar reader state1 of
146 NONE => state
147 | SOME ((), state2) =>
148 let
149 val state3 = formatChars reader state2
150 in
151 case reader state3 of
152 SOME (#"\\", state4) => loop state4
153 | _ => state
154 end)
155 | _ => state
156 in
157 loop
158 end
159
160 fun 'a scan (reader: (Char.char, 'a) StringCvt.reader)
161 : (char, 'a) StringCvt.reader =
162 let
163 val escape : (char, 'a) StringCvt.reader =
164 fn state =>
165 case reader state of
166 NONE => NONE
167 | SOME (c, state') =>
168 let
169 fun yes c = SOME (fromChar c, state')
170 in
171 case c of
172 #"a" => yes #"\a"
173 | #"b" => yes #"\b"
174 | #"t" => yes #"\t"
175 | #"n" => yes #"\n"
176 | #"v" => yes #"\v"
177 | #"f" => yes #"\f"
178 | #"r" => yes #"\r"
179 | #"\\" => yes #"\\"
180 | #"\"" => yes #"\""
181 | #"^" => control reader state'
182 | #"u" =>
183 Reader.mapOpt chrOpt
184 (StringCvt.digitsExact (StringCvt.HEX, 4) reader)
185 state'
186 | #"U" =>
187 Reader.mapOpt chrOpt
188 (StringCvt.digitsExact (StringCvt.HEX, 8) reader)
189 state'
190 | _ => (* 3 decimal digits *)
191 Reader.mapOpt chrOpt
192 (StringCvt.digitsExact (StringCvt.DEC, 3)
193 reader)
194 state
195 end
196 val main: (char, 'a) StringCvt.reader =
197 fn state =>
198 let
199 val state = formatSequences reader state
200 in
201 case reader state of
202 NONE => NONE
203 | SOME (c, state) =>
204 (* isPrint doesn't exist. yuck: *)
205 if Char.>= (c, #" ") andalso Char.<= (c, #"~")
206 then
207 case c of
208 #"\\" => escape state
209 | #"\"" => NONE
210 | _ => SOME (fromChar c, formatSequences reader state)
211 else NONE
212 end
213 in
214 main
215 end
216
217 val fromString = StringCvt.scanString scan
218
219 fun 'a scanC (reader: (Char.char, 'a) StringCvt.reader)
220 : (char, 'a) StringCvt.reader =
221 let
222 val rec escape =
223 fn state =>
224 case reader state of
225 NONE => NONE
226 | SOME (c, state') =>
227 let fun yes c = SOME (fromChar c, state')
228 in case c of
229 #"a" => yes #"\a"
230 | #"b" => yes #"\b"
231 | #"t" => yes #"\t"
232 | #"n" => yes #"\n"
233 | #"v" => yes #"\v"
234 | #"f" => yes #"\f"
235 | #"r" => yes #"\r"
236 | #"?" => yes #"?"
237 | #"\\" => yes #"\\"
238 | #"\"" => yes #"\""
239 | #"'" => yes #"'"
240 | #"^" => control reader state'
241 | #"x" =>
242 Reader.mapOpt chrOpt
243 (StringCvt.digits StringCvt.HEX reader)
244 state'
245 | #"u" =>
246 Reader.mapOpt chrOpt
247 (StringCvt.digitsExact (StringCvt.HEX, 4) reader)
248 state'
249 | #"U" =>
250 Reader.mapOpt chrOpt
251 (StringCvt.digitsExact (StringCvt.HEX, 8) reader)
252 state'
253 | _ =>
254 Reader.mapOpt chrOpt
255 (StringCvt.digitsPlus (StringCvt.OCT, 3) reader)
256 state
257 end
258 and main =
259 fn NONE => NONE
260 | SOME (c, state) =>
261 (* yuck. isPrint is not defined yet: *)
262 if Char.>= (c, #" ") andalso Char.<= (c, #"~")
263 then
264 case c of
265 #"\\" => escape state
266 | _ => SOME (fromChar c, state)
267 else NONE
268 in
269 main o reader
270 end
271
272 val fromCString = StringCvt.scanString scanC
273
274 fun padLeft (s: String.string, n: int): String.string =
275 let
276 val m = String.size s
277 val diff = Int.-? (n, m)
278 in if Int.> (diff, 0)
279 then String.concat [String.new (diff, #"0"), s]
280 else if diff = 0
281 then s
282 else raise Fail "padLeft"
283 end
284
285 fun unicodeEscape ord =
286 if Int.< (ord, 65536)
287 then String.concat
288 ["\\u", padLeft (Int.fmt StringCvt.HEX ord, 4)]
289 else String.concat
290 ["\\U", padLeft (Int.fmt StringCvt.HEX ord, 8)]
291
292 fun toString c =
293 let
294 val ord = ord c
295 in
296 if isPrint c
297 then
298 case ord of
299 92 (* #"\\" *) => "\\\\"
300 | 34 (* #"\"" *) => "\\\""
301 | _ => String.new (1, Char.chrUnsafe ord)
302 (* ^^^^ safe b/c isPrint < 128 *)
303 else
304 case ord of
305 7 (* #"\a" *) => "\\a"
306 | 8 (* #"\b" *) => "\\b"
307 | 9 (* #"\t" *) => "\\t"
308 | 10 (* #"\n" *) => "\\n"
309 | 11 (* #"\v" *) => "\\v"
310 | 12 (* #"\f" *) => "\\f"
311 | 13 (* #"\r" *) => "\\r"
312 | _ =>
313 if Int.< (ord, 32)
314 then String.concat
315 ["\\^", String.new
316 (1, Char.chrUnsafe
317 (Int.+? (ord, 64 (* #"@" *) )))]
318 else if Int.< (ord, 256)
319 then String.concat
320 ["\\", padLeft (Int.fmt StringCvt.DEC ord, 3)]
321 else unicodeEscape ord
322 end
323
324 fun toCString c =
325 let
326 val ord = ord c
327 in
328 if isPrint c
329 then
330 case ord of
331 92 (* #"\\" *) => "\\\\"
332 | 34 (* #"\"" *) => "\\\""
333 | 63 (* #"?" *) => "\\?"
334 | 39 (* #"'" *) => "\\'"
335 | _ => String.new (1, Char.chrUnsafe ord)
336 else
337 case ord of
338 7 (* #"\a" *) => "\\a"
339 | 8 (* #"\b" *) => "\\b"
340 | 9 (* #"\t" *) => "\\t"
341 | 10 (* #"\n" *) => "\\n"
342 | 11 (* #"\v" *) => "\\v"
343 | 12 (* #"\f" *) => "\\f"
344 | 13 (* #"\r" *) => "\\r"
345 | _ =>
346 if Int.< (ord, 256)
347 then String.concat
348 ["\\", padLeft (Int.fmt StringCvt.OCT ord, 3)]
349 else unicodeEscape ord
350 end
351 end
352
353 structure CharArg : CHAR_ARG =
354 struct
355 structure PreChar = Char
356 structure CharVector = CharVector
357 structure CharArray = CharArray
358 end
359
360 structure WideCharArg : CHAR_ARG =
361 struct
362 structure PreChar = WideChar
363 structure CharVector = WideCharVector
364 structure CharArray = WideCharArray
365 end
366
367 structure Char : CHAR_EXTRA = CharFn(CharArg)
368 structure WideChar : CHAR_EXTRA = CharFn(WideCharArg)