1 (* Copyright (C
) 1999-2007 Henry Cejtin
, Matthew Fluet
, Suresh
2 * Jagannathan
, and Stephen Weeks
.
3 * Copyright (C
) 1997-2000 NEC Research Institute
.
5 * MLton is released under a BSD
-style license
.
6 * See the file MLton
-LICENSE for details
.
11 structure PreChar
: PRE_CHAR
12 structure CharVector
: EQTYPE_MONO_VECTOR_EXTRA
13 structure CharArray
: MONO_ARRAY_EXTRA
14 sharing type PreChar
.char
= CharVector
.elem
= CharArray
.elem
15 sharing type PreChar
.string = CharVector
.vector
= CharArray
.vector
18 functor CharFn(Arg
: CHAR_ARG
)
20 where type char
= Arg
.PreChar
.char
21 where type string = Arg
.PreChar
.string =
25 type string = Arg
.CharVector
.vector
26 val maxOrd
: int = numChars
- 1
28 val fromString
= Arg
.CharVector
.fromPoly
o
29 Vector.map (fn x
=> fromChar x
) o
33 if Primitive
.Controls
.safe
36 else chrUnsafe (Int.+ (ord c
, 1))
39 if Primitive
.Controls
.safe
42 else chrUnsafe (Int.- (ord c
, 1))
45 if Primitive
.Controls
.safe
46 andalso Int.gtu (c
, maxOrd
)
48 else SOME (chrUnsafe c
)
55 (* To implement character classes
, we cannot use lookup tables on the
56 * order
of the number
of characters
. We don
't want to scan the
string
57 * each time
, so instead we
'll sort it
and use binary search
.
61 val a
= Array
.tabulate (Arg
.CharVector
.length s
,
62 fn i
=> Arg
.CharVector
.sub (s
, i
))
63 val () = Heap
.heapSort (a
, op <)
67 val x
= Heap
.binarySearch (a
, fn d
=> d
< c
)
69 if x
= Array
.length a
then false else
74 fun notContains s
= not
o contains s
77 val ( la
, lA
, lf
, lF
, lz
, lZ
, l0
, l9
, lSPACE
,lBANG
, lTIL
, lTAB
, lCR
, lDEL
) =
78 (c#
"a", c#
"A", c#
"f", c#
"F", c#
"z", c#
"Z", c#
"0", c#
"9", c#
" ", c#
"!", c#
"~", c#
"\t", c#
"\r", c#
"\127")
80 (* Range comparisons don
't need tables
! It
's faster to just compare
. *)
81 fun isLower c
= la
<= c
andalso c
<= lz
82 fun isUpper c
= c
<= lZ
andalso lA
<= c (* More discriminating first
! *)
83 fun isDigit c
= c
<= l9
andalso l0
<= c (* More discriminating first
! *)
84 fun isGraph c
= lBANG
<= c
andalso c
<= lTIL
85 fun isPrint c
= lSPACE
<= c
andalso c
<= lTIL
86 fun isCntrl c
= c
< lSPACE
orelse c
= lDEL
87 fun isAscii c
= c
<= lDEL
89 (* These take advantage
of ASCII ordering to minimize comparisons
. *)
90 fun isAlpha c
= if la
<= c
then c
<= lz
else lA
<= c
andalso c
<= lZ
93 if la
<= c
then c
<= lz
else c
<= lZ
95 l0
<= c
andalso c
<= l9
98 if la
<= c
then c
<= lf
else c
<= lF
100 l0
<= c
andalso c
<= l9
101 fun isSpace c
= if lCR
< c
then c
= lSPACE
else lTAB
<= c
102 fun isPunct c
= isGraph c
andalso not (isAlphaNum c
)
105 fun make (test
, diff
) c
=
106 if test c
then chrUnsafe (Int.+?
(ord c
, diff
)) else c
107 val diff
= Int.- (ord lA
, ord la
)
109 val toLower
= make (isUpper
, Int.~ diff
)
110 val toUpper
= make (isLower
, diff
)
113 fun control reader state
=
117 if Char.<= (#
"@", c
) andalso Char.<= (c
, #
"_")
118 then SOME (chr (Int.-?
(Char.ord c
, Char.ord #
"@")), state
)
121 fun formatChar reader state
=
125 if StringCvt.isSpace c
126 then SOME ((), state
)
129 fun formatChars reader
=
132 case formatChar reader state
of
134 |
SOME ((), state
) => loop state
139 val 'a formatSequences
: (Char.char
, 'a
) StringCvt.reader
-> 'a
-> 'a
=
144 SOME (#
"\\", state1
) =>
145 (case formatChar reader state1
of
147 |
SOME ((), state2
) =>
149 val state3
= formatChars reader state2
151 case reader state3
of
152 SOME (#
"\\", state4
) => loop state4
160 fun 'a
scan (reader
: (Char.char
, 'a
) StringCvt.reader
)
161 : (char
, 'a
) StringCvt.reader
=
163 val escape
: (char
, 'a
) StringCvt.reader
=
167 |
SOME (c
, state
') =>
169 fun yes c
= SOME (fromChar c
, state
')
181 | #
"^" => control reader state
'
184 (StringCvt.digitsExact (StringCvt.HEX
, 4) reader
)
188 (StringCvt.digitsExact (StringCvt.HEX
, 8) reader
)
190 | _
=> (* 3 decimal digits
*)
192 (StringCvt.digitsExact (StringCvt.DEC
, 3)
196 val main
: (char
, 'a
) StringCvt.reader
=
199 val state
= formatSequences reader state
204 (* isPrint doesn
't exist
. yuck
: *)
205 if Char.>= (c
, #
" ") andalso Char.<= (c
, #
"~")
208 #
"\\" => escape state
210 | _
=> SOME (fromChar c
, formatSequences reader state
)
217 val fromString
= StringCvt.scanString scan
219 fun 'a
scanC (reader
: (Char.char
, 'a
) StringCvt.reader
)
220 : (char
, 'a
) StringCvt.reader
=
226 |
SOME (c
, state
') =>
227 let fun yes c
= SOME (fromChar c
, state
')
240 | #
"^" => control reader state
'
243 (StringCvt.digits
StringCvt.HEX reader
)
247 (StringCvt.digitsExact (StringCvt.HEX
, 4) reader
)
251 (StringCvt.digitsExact (StringCvt.HEX
, 8) reader
)
255 (StringCvt.digitsPlus (StringCvt.OCT
, 3) reader
)
261 (* yuck
. isPrint is not defined yet
: *)
262 if Char.>= (c
, #
" ") andalso Char.<= (c
, #
"~")
265 #
"\\" => escape state
266 | _
=> SOME (fromChar c
, state
)
272 val fromCString
= StringCvt.scanString scanC
274 fun padLeft (s
: String.string, n
: int): String.string =
276 val m
= String.size s
277 val diff
= Int.-?
(n
, m
)
278 in if Int.> (diff
, 0)
279 then String.concat
[String.new (diff
, #
"0"), s
]
282 else raise Fail
"padLeft"
285 fun unicodeEscape ord
=
286 if Int.< (ord
, 65536)
288 ["\\u", padLeft (Int.fmt
StringCvt.HEX ord
, 4)]
290 ["\\U", padLeft (Int.fmt
StringCvt.HEX ord
, 8)]
299 92 (* #
"\\" *) => "\\\\"
300 |
34 (* #
"\"" *) => "\\\""
301 | _
=> String.new (1, Char.chrUnsafe ord
)
302 (* ^^^^ safe b
/c isPrint
< 128 *)
305 7 (* #
"\a" *) => "\\a"
306 |
8 (* #
"\b" *) => "\\b"
307 |
9 (* #
"\t" *) => "\\t"
308 |
10 (* #
"\n" *) => "\\n"
309 |
11 (* #
"\v" *) => "\\v"
310 |
12 (* #
"\f" *) => "\\f"
311 |
13 (* #
"\r" *) => "\\r"
317 (Int.+?
(ord
, 64 (* #
"@" *) )))]
318 else if Int.< (ord
, 256)
320 ["\\", padLeft (Int.fmt
StringCvt.DEC ord
, 3)]
321 else unicodeEscape ord
331 92 (* #
"\\" *) => "\\\\"
332 |
34 (* #
"\"" *) => "\\\""
333 |
63 (* #
"?" *) => "\\?"
334 |
39 (* #
"'" *) => "\\'"
335 | _
=> String.new (1, Char.chrUnsafe ord
)
338 7 (* #
"\a" *) => "\\a"
339 |
8 (* #
"\b" *) => "\\b"
340 |
9 (* #
"\t" *) => "\\t"
341 |
10 (* #
"\n" *) => "\\n"
342 |
11 (* #
"\v" *) => "\\v"
343 |
12 (* #
"\f" *) => "\\f"
344 |
13 (* #
"\r" *) => "\\r"
348 ["\\", padLeft (Int.fmt
StringCvt.OCT ord
, 3)]
349 else unicodeEscape ord
353 structure CharArg
: CHAR_ARG
=
355 structure PreChar
= Char
356 structure CharVector
= CharVector
357 structure CharArray
= CharArray
360 structure WideCharArg
: CHAR_ARG
=
362 structure PreChar
= WideChar
363 structure CharVector
= WideCharVector
364 structure CharArray
= WideCharArray
367 structure Char : CHAR_EXTRA
= CharFn(CharArg
)
368 structure WideChar
: CHAR_EXTRA
= CharFn(WideCharArg
)