Import Debian changes 20180207-1
[hcoop/debian/mlton.git] / benchmark / tests / fxp.sml
1 (* MLton 20010629 (built Fri Jun 29 11:14:21 2001 on starlinux) *)
2 (* created this file on Fri Jun 29 13:02:19 2001. *)
3 (* Do not edit this file. *)
4 (* Flag settings: *)
5 (* aux: false *)
6 (* chunk: chunk per function *)
7 (* debug: false *)
8 (* defines: [NODEBUG,MLton_safe=TRUE,MLton_detectOverflow=TRUE] *)
9 (* detect overflow: true *)
10 (* fixed heap: None *)
11 (* indentation: 3 *)
12 (* includes: [mlton.h] *)
13 (* inline: NonRecursive {product = 320,small = 60} *)
14 (* input file: fxp.cm *)
15 (* instrument: false *)
16 (* instrument Sxml: false *)
17 (* keep Cps: false *)
18 (* native: true *)
19 (* native commented: 0 *)
20 (* native copy prop: true *)
21 (* future: 64 *)
22 (* native ieee fp: false *)
23 (* native live transfer: true *)
24 (* native move hoist: true *)
25 (* native optimize: 1 *)
26 (* native split: Some (20000) *)
27 (* polyvariance: Some ({rounds = 2,small = 30,product = 300}) *)
28 (* print at fun entry: false *)
29 (* profile: false *)
30 (* safe: true *)
31 (* show types: false *)
32 (* static: false *)
33 (* use basis library: true *)
34 (* verbosity: Silent *)
35 (* start of ../../Util/utilTime.sml *)
36
37
38
39 (*--------------------------------------------------------------------------*)
40 (* Structure: UtilTime *)
41 (* *)
42 (* Depends on: *)
43 (* *)
44 (* Exceptions raised by functions in this structure: *)
45 (* time : none *)
46 (* timeN : none *)
47 (*--------------------------------------------------------------------------*)
48 signature UtilTime =
49 sig
50 val time : ('a -> 'b) -> 'a -> 'b * {usr:Time.time, sys:Time.time, gc:Time.time}
51 val timeN : int -> ('a -> 'b) -> 'a -> 'b * {usr:Time.time, sys:Time.time, gc:Time.time}
52 end
53
54 structure UtilTime : UtilTime =
55 struct
56 (*--------------------------------------------------------------------*)
57 (* run f on x, and measure the runtime. return the result and time. *)
58 (*--------------------------------------------------------------------*)
59 fun time f x = let val timer = Timer.startCPUTimer ()
60 val y = f x
61 val ptime = Timer.checkCPUTimer timer
62 in (y,ptime)
63 end
64
65 (*--------------------------------------------------------------------*)
66 (* run f n times on x, and measure the runtime. return the time. *)
67 (*--------------------------------------------------------------------*)
68 fun timeN n f x =
69 let fun iter m = if m<=1 then f x else (ignore (f x); iter (m-1))
70 in time iter n
71 end
72 end
73 (* stop of ../../Util/utilTime.sml *)
74 (* start of ../../Util/utilString.sml *)
75 (*--------------------------------------------------------------------------*)
76 (* Structure: UtilString *)
77 (*--------------------------------------------------------------------------*)
78 signature UtilString =
79 sig
80 val quoteString : char -> string -> string
81
82 val numberNth : int -> string
83 val prependAnA : string -> string
84
85 val nBlanks : int -> string
86 val padxLeft : char -> string * int -> string
87 val padxRight : char -> string * int -> string
88
89 val breakLines : int -> string -> string list
90
91 val toUpperFirst : string -> string
92 val toUpperString : string -> string
93
94 val Int2String : int -> string
95
96 val Bool2xString : string * string -> bool -> string
97 val Bool2String : bool -> string
98
99 val Option2xString : string * (('a -> string) -> 'a -> string)
100 -> ('a -> string) -> 'a option -> string
101 val Option2String0 : ('a -> string) -> 'a option -> string
102 val Option2String : ('a -> string) -> 'a option -> string
103
104 val List2xString : string * string * string -> ('a -> string) -> 'a list -> string
105 val List2String0 : ('a -> string) -> 'a list -> string
106 val List2String : ('a -> string) -> 'a list -> string
107
108 val Vector2xString : string * string * string -> ('a -> string) -> 'a vector -> string
109 val Vector2String : ('a -> string) -> 'a vector -> string
110 end
111
112 structure UtilString : UtilString =
113 struct
114 fun quoteString q s = let val quote = String.implode [q] in quote^s^quote end
115
116 (*--------------------------------------------------------------------*)
117 (* generate a string with the ordinal number of n, by appending *)
118 (* "st", "nd", "rd" or "th" to the number. *)
119 (*--------------------------------------------------------------------*)
120 fun numberNth n =
121 let val suffix = case n mod 9
122 of 1 => "st"
123 | 2 => "nd"
124 | 3 => "rd"
125 | _ => "th"
126 in Int.toString n^suffix
127 end
128
129 (*--------------------------------------------------------------------*)
130 (* is the single character c represented by a word starting with a *)
131 (* vocal in the alphabet? (l~ell->true, k~kay->false) *)
132 (*--------------------------------------------------------------------*)
133 fun vocalLetter c =
134 case Char.toLower c
135 of #"a" => true
136 | #"f" => true
137 | #"h" => true
138 | #"i" => true
139 | #"l" => true
140 | #"m" => true
141 | #"n" => true
142 | #"o" => true
143 | #"r" => true
144 | #"s" => true
145 | #"x" => true
146 | #"8" => true
147 | _ => false
148
149 (*--------------------------------------------------------------------*)
150 (* is character c a vocal? *)
151 (*--------------------------------------------------------------------*)
152 fun isVocal c =
153 case Char.toLower c
154 of #"a" => true
155 | #"e" => true
156 | #"i" => true
157 | #"o" => true
158 | #"u" => true
159 | _ => false
160
161 (*--------------------------------------------------------------------*)
162 (* does a word require "an" as undefinite article? true if: *)
163 (* - it is a single letter that starts with a vocal in the alphabet *)
164 (* - its first two letters are capitals, i.e. it is an abbreviation, *)
165 (* and its first letter starts with a vocal in the alphabet *)
166 (* - it has more than one letter, is not an abbreviation, and either *)
167 (* + it starts with a, i or o *)
168 (* + it starts with e and the second letter is not a u (europe) *)
169 (* + it starts with a u and continues with a vocal (very unlikely, *)
170 (* only in c.s., like uuencoded or uid *)
171 (* + it starts with a u, continues with a consonant not followed by *)
172 (* an i (like in unicode); that is something like un-... *)
173 (* This ruleset is not complete since it does not cover, e.g., the *)
174 (* word uninvented, but sufficient for most cases. *)
175 (* (Is english pronounciation decidable at all?) *)
176 (*--------------------------------------------------------------------*)
177 fun extendsAtoAn word =
178 case String.explode word
179 of nil => false
180 | [c] => vocalLetter c
181 | c1::c2::cs => if not (Char.isLower c1 orelse Char.isLower c2)
182 then vocalLetter c1
183 else case Char.toLower c1
184 of #"a" => true
185 | #"i" => true
186 | #"o" => true
187 | #"e" => Char.toLower c2 <> #"u"
188 | #"u" => if isVocal c2 then false
189 else (case cs
190 of nil => true
191 | c3::_ => Char.toLower c3 <> #"i")
192 | _ => false
193
194 (*--------------------------------------------------------------------*)
195 (* add an undefinite article to a word. *)
196 (*--------------------------------------------------------------------*)
197 fun prependAnA word = if extendsAtoAn word then "an "^word else "a "^word
198
199 (*--------------------------------------------------------------------*)
200 (* generate a list/string of n times character c. *)
201 (*--------------------------------------------------------------------*)
202 fun nCharsC c n = if n>0 then c::nCharsC c (n-1) else nil
203 fun nChars c n = String.implode (nCharsC c n)
204 val nBlanks = nChars #" "
205
206 (*--------------------------------------------------------------------*)
207 (* add a minimal number of characters c to the left/right of a string *)
208 (* in order to make its length at least n. *)
209 (*--------------------------------------------------------------------*)
210 fun padxLeft c (s,n) = (nChars c (n-String.size s))^s
211 fun padxRight c (s,n) = s^(nChars c (n-String.size s))
212 val padLeft = padxLeft #" "
213 val padRight = padxRight #" "
214
215 (*--------------------------------------------------------------------*)
216 (* break a string into several lines of length width. *)
217 (*--------------------------------------------------------------------*)
218 fun breakLines width str =
219 let
220 val tokens = String.tokens (fn c => #" "=c) str
221 fun makeLine(toks,lines) = if null toks then lines
222 else (String.concat (rev toks))::lines
223 fun doit w (toks,lines) nil = makeLine(toks,lines)
224 | doit w (toks,lines) (one::rest) =
225 let
226 val l = String.size one
227 val w1 = w+l
228 in
229 if w1<width then doit (w1+1) (" "::one::toks,lines) rest
230 else if w1=width then doit 0 (nil,makeLine(one::toks,lines)) rest
231 else if l>=width then doit 0 (nil,one::makeLine(toks,lines)) rest
232 else doit (l+1) ([" ",one],makeLine(toks,lines)) rest
233 end
234 in List.rev (doit 0 (nil,nil) tokens)
235 end
236
237 (*--------------------------------------------------------------------*)
238 (* convert the first/all characters of a string to upper case *)
239 (*--------------------------------------------------------------------*)
240 fun toUpperFirst str =
241 case String.explode str
242 of nil => ""
243 | c::cs => String.implode (Char.toUpper c::cs)
244 fun toUpperString str =
245 String.implode(map Char.toUpper (String.explode str))
246
247 (*--------------------------------------------------------------------*)
248 (* return a string representation of an int, char or unit. *)
249 (*--------------------------------------------------------------------*)
250 val Int2String = Int.toString
251 val Char2String = Char.toString
252 fun Unit2String() = "()"
253
254 (*--------------------------------------------------------------------*)
255 (* return a string representation of a boolean. *)
256 (*--------------------------------------------------------------------*)
257 fun Bool2xString (t,f) b = if b then t else f
258 val Bool2String = Bool2xString ("true","false")
259
260 (*--------------------------------------------------------------------*)
261 (* return a string representation of an option. *)
262 (* the first arg is a string for the NONE case, the second a function *)
263 (* that converts x to a string, given a function for doing so. *)
264 (*--------------------------------------------------------------------*)
265 fun Option2xString (none,Some2String) x2String opt =
266 case opt
267 of NONE => none
268 | SOME x => Some2String x2String x
269 fun Option2String0 x2String = Option2xString ("",fn f => fn x => f x) x2String
270 fun Option2String x2String = Option2xString ("NONE",fn f => fn x => "SOME "^f x) x2String
271
272 (*--------------------------------------------------------------------*)
273 (* return a string representation of list; start with pre, separate *)
274 (* with sep and finish with post; use X2String for each element. *)
275 (*--------------------------------------------------------------------*)
276 fun List2xString (pre,sep,post) X2String nil = pre^post
277 | List2xString (pre,sep,post) X2String l =
278 let fun doit nil _ = [post]
279 | doit (x::r) str = str::X2String x::doit r sep
280 in String.concat (doit l pre)
281 end
282 fun List2String X2String nil = "[]"
283 | List2String X2String l =
284 let fun doit nil _ = ["]"]
285 | doit (x::r) str = str::X2String x::doit r ","
286 in String.concat (doit l "[")
287 end
288 fun List2String0 X2String nil = ""
289 | List2String0 X2String l =
290 let fun doit nil _ = nil
291 | doit (x::r) str = str::X2String x::doit r " "
292 in String.concat (doit l "")
293 end
294
295 (* a compiler bug in smlnj 110 makes the following uncompilable: *)
296 (* fun List2String X2String xs = List2xString ("[",",","]") X2String xs *)
297 (* fun List2String0 X2String xs = List2xString (""," ","") X2String xs *)
298
299 (*--------------------------------------------------------------------*)
300 (* return a string representation of list; start with pre, separate *)
301 (* with sep and finish with post; use X2String for each element. *)
302 (*--------------------------------------------------------------------*)
303 fun Vector2xString (pre,sep,post) X2String vec =
304 if Vector.length vec=0 then pre^post
305 else String.concat
306 (pre::X2String(Vector.sub(vec,0))::
307 Vector.foldri (fn (_,x,yet) => sep::X2String x::yet) [post] (vec,1,NONE))
308 fun Vector2String X2String vec = Vector2xString ("#[",",","]") X2String vec
309 end
310 (* stop of ../../Util/utilString.sml *)
311 (* start of ../../Util/utilCompare.sml *)
312 signature UtilCompare =
313 sig
314 type 'a Comparer = 'a * 'a -> order
315
316 val comparePair : 'a Comparer * 'b Comparer -> ('a * 'b) Comparer
317 val compareTriple : 'a Comparer * 'b Comparer * 'c Comparer -> ('a * 'b * 'c) Comparer
318
319 val compareOption : 'a Comparer -> 'a option Comparer
320 val compareList : 'a Comparer -> 'a list Comparer
321 val compareVector : 'a Comparer -> 'a vector Comparer
322
323 val compareInt : int Comparer
324 val compareIntPair : (int * int) Comparer
325 val compareIntTriple : (int * int * int) Comparer
326
327 val compareWord : word Comparer
328 val compareWordPair : (word * word) Comparer
329 val compareWordTriple : (word * word * word) Comparer
330 end
331
332 structure UtilCompare : UtilCompare =
333 struct
334 type 'a Comparer = 'a * 'a -> order
335
336 fun comparePair (compareA,compareB) ((a1,b1),(a2,b2)) =
337 case compareA(a1,a2)
338 of EQUAL => compareB(b1,b2)
339 | order => order
340 fun compareTriple (compareA,compareB,compareC) ((a1,b1,c1),(a2,b2,c2)) =
341 case compareA(a1,a2)
342 of EQUAL => (case compareB(b1,b2)
343 of EQUAL => compareC(c1,c2)
344 | order => order)
345 | order => order
346
347 val compareInt = Int.compare
348 fun compareIntPair((x1,y1),(x2,y2)) =
349 case Int.compare(x1,x2)
350 of EQUAL => Int.compare (y1,y2)
351 | order => order
352 fun compareIntTriple((x1,y1,z1),(x2,y2,z2)) =
353 case Int.compare(x1,x2)
354 of EQUAL => (case Int.compare (y1,y2)
355 of EQUAL => Int.compare (z1,z2)
356 | order => order)
357 | order => order
358
359 val compareWord = Word.compare
360 fun compareWordPair((x1,y1),(x2,y2)) =
361 case Word.compare(x1,x2)
362 of EQUAL => Word.compare (y1,y2)
363 | order => order
364 fun compareWordTriple((x1,y1,z1),(x2,y2,z2)) =
365 case Word.compare(x1,x2)
366 of EQUAL => (case Word.compare (y1,y2)
367 of EQUAL => Word.compare (z1,z2)
368 | order => order)
369 | order => order
370
371 fun compareOption compareA opts =
372 case opts
373 of (NONE,NONE) => EQUAL
374 | (NONE,SOME x) => LESS
375 | (SOME x,NONE) => GREATER
376 | (SOME x,SOME y) => compareA(x,y)
377
378 fun compareList compA ll =
379 let fun doit (nil,nil) = EQUAL
380 | doit (nil,_) = LESS
381 | doit (_,nil) = GREATER
382 | doit (a1::as1,a2::as2) = case compA(a1,a2)
383 of EQUAL => doit(as1,as2)
384 | order => order
385 in doit ll
386 end
387
388 fun compareVector compA (vec1,vec2) =
389 let val (l,l2) = (Vector.length vec1,Vector.length vec2)
390 in case Int.compare(l,l2)
391 of EQUAL => let fun doit i = if i>=l then EQUAL
392 else case compA(Vector.sub(vec1,i),Vector.sub(vec2,i))
393 of EQUAL => doit (i+1)
394 | order => order
395 in doit 0
396 end
397 | order => order
398 end
399 end
400
401 (* stop of ../../Util/utilCompare.sml *)
402 (* start of ../../Util/utilHash.sml *)
403 signature UtilHash =
404 sig
405 val hashPair : ('a -> word) * ('b -> word) -> 'a * 'b -> word
406 val hashTriple : ('a -> word) * ('b -> word) * ('c -> word) -> 'a * 'b * 'c -> word
407
408 val hashOption : ('a -> word) -> 'a option -> word
409 val hashList : ('a -> word) -> 'a list -> word
410 val hashVector : ('a -> word) -> 'a vector -> word
411
412 val hashString : string -> word
413
414 val hashInt : int -> word
415 val hashIntPair : int * int -> word
416 val hashIntTriple : int * int * int -> word
417
418 val hashWord : word -> word
419 val hashWordPair : word * word -> word
420 val hashWordTriple : word * word * word -> word
421 end
422
423 structure UtilHash : UtilHash =
424 struct
425 fun hashPair (hashA,hashB) (a,b) =
426 0w1327 * hashA a + 0w3853 * hashB b
427 fun hashTriple (hashA,hashB,hashC) (a,b,c) =
428 0w1327 * hashA a + 0w3853 * hashB b + 0w2851 * hashC c
429
430 val hashInt =
431 Word.fromInt
432 fun hashIntPair (i,j) =
433 0w1327 * Word.fromInt i + 0w3853 * Word.fromInt j
434 fun hashIntTriple (i,j,k) =
435 0w1327 * Word.fromInt i + 0w3853 * Word.fromInt j + 0w2851 * Word.fromInt k
436
437 fun hashWord w = w
438 fun hashWordPair (i,j) = 0w1327 * i + 0w3853 * j
439 fun hashWordTriple (i,j,k) = 0w1327 * i + 0w3853 * j + 0w2851 * k
440
441 val hashChar = Word.fromInt o ord
442 fun hashString s =
443 case String.size s
444 of 0 => 0wx0
445 | 1 => 0w1 + hashChar(String.sub(s,0))
446 | 2 => let val w1 = String.sub(s,0)
447 val w2 = String.sub(s,1)
448 in 0w2 + hashChar w1 * 0wx1327 + hashChar w2
449 end
450 | n => let val w1 = String.sub(s,0)
451 val w2 = String.sub(s,1)
452 val wn = String.sub(s,n-1)
453 in 0w3 + hashChar w1 * 0wx3853 + hashChar w2 * 0wx1327 + hashChar wn
454 end
455
456
457 fun hashOption hashA opt =
458 case opt
459 of NONE => 0w0
460 | SOME a => 0w1 + hashA a
461
462 fun hashList hashA l =
463 case l
464 of nil => 0wx0
465 | [a] => 0w1 + hashA a
466 | a1::a2::_ => 0w2 + 0w3853 * hashA a1 + 0wx1327 * hashA a2
467
468 fun hashVector hashA cv =
469 case Vector.length cv
470 of 0 => 0wx0
471 | 1 => 0w1 + hashA(Vector.sub(cv,0))
472 | 2 => let val w1 = Vector.sub(cv,0)
473 val w2 = Vector.sub(cv,1)
474 in 0w2 + hashA w1 * 0wx1327 + hashA w2
475 end
476 | n => let val w1 = Vector.sub(cv,0)
477 val w2 = Vector.sub(cv,1)
478 val wn = Vector.sub(cv,n-1)
479 in 0w3 + hashA w1 * 0wx3853 + hashA w2 * 0wx1327 + hashA wn
480 end
481 end
482
483 (* stop of ../../Util/utilHash.sml *)
484 (* start of ../../Util/SymDict/key.sml *)
485
486
487
488 (*--------------------------------------------------------------------------*)
489 (* In order to be used as a dictinary/symbol table key, a type must have a *)
490 (* null value, hash to words, must be comparable and printable. *)
491 (*--------------------------------------------------------------------------*)
492 signature Key =
493 sig
494 type Key
495
496 val null : Key
497 val hash : Key -> word
498 val compare : Key * Key -> order
499 val toString : Key -> string
500 end
501 (* stop of ../../Util/SymDict/key.sml *)
502 (* start of ../../Util/utilInt.sml *)
503 (*--------------------------------------------------------------------------*)
504 (* Structure: UtilInt *)
505 (* *)
506 (* Depends on: *)
507 (* *)
508 (* Exceptions raised by functions in this structure: *)
509 (* appInterval : none *)
510 (* insertInt : none *)
511 (* insertNewInt : none *)
512 (* nextPowerTwo : none *)
513 (*--------------------------------------------------------------------------*)
514 signature UtilInt =
515 sig
516 val intervalList : (int * int) -> int list
517 val appInterval : (int -> unit) -> (int * int) -> unit
518 val insertInt : int * int list -> int list
519 val insertNewInt : int * int list -> int list option
520 val powerOfTwo : int -> int
521 val nextPowerTwo : int -> int
522 end
523
524 structure UtilInt : UtilInt =
525 struct
526 (*--------------------------------------------------------------------*)
527 (* generate the list [n,...,m] *)
528 (*--------------------------------------------------------------------*)
529 fun intervalList(n,m) = if n>m then nil else n::intervalList(n+1,m)
530
531 (*--------------------------------------------------------------------*)
532 (* apply f to each number in [n...m] *)
533 (*--------------------------------------------------------------------*)
534 fun appInterval f (n,m) =
535 let fun doit i =
536 if i>m then ()
537 else let val _ = f i
538 in doit (i+1)
539 end
540 in doit n
541 end
542
543 (*--------------------------------------------------------------------*)
544 (* insert an integer into a sorted list without duplicates. *)
545 (*--------------------------------------------------------------------*)
546 fun insertInt (x:int,l) =
547 let fun go nil = [x]
548 | go (l as y::ys) = case Int.compare (x,y)
549 of LESS => x::l
550 | EQUAL => l
551 | GREATER => y::go ys
552 in go l
553 end
554
555 (*--------------------------------------------------------------------*)
556 (* insert an integer into a sorted list if it is not yet in it. *)
557 (*--------------------------------------------------------------------*)
558 fun insertNewInt (x:int,l) =
559 let
560 fun go nil = SOME [x]
561 | go (l as y::ys) = case Int.compare (x,y)
562 of LESS => SOME(x::l)
563 | EQUAL => NONE
564 | GREATER => case go ys
565 of NONE => NONE
566 | SOME xys => SOME(y::xys)
567 in go l
568 end
569
570 (*--------------------------------------------------------------------*)
571 (* compute the power to the base of two. *)
572 (*--------------------------------------------------------------------*)
573 fun powerOfTwo n =
574 if n=0 then 1
575 else if n mod 2=0 then let val x=powerOfTwo (n div 2) in x*x end
576 else let val x=powerOfTwo (n-1) in 2*x end
577
578 (*--------------------------------------------------------------------*)
579 (* find the smallest p with 2^p >= n. *)
580 (*--------------------------------------------------------------------*)
581 fun nextPowerTwo n =
582 let fun doit (p,m) =
583 if m>=n then p
584 else if m*m<2*n then doit (2*p,m*m)
585 else doit (1+p,2*m)
586 in doit (1,2)
587 end
588 end
589 (* stop of ../../Util/utilInt.sml *)
590 (* start of ../../Util/utilError.sml *)
591
592
593
594
595
596 signature UtilError =
597 sig
598 exception InternalError of string * string * string
599 exception NoSuchFile of string * string
600
601 val formatMessage : int * int -> string list -> string
602 end
603
604 structure UtilError : UtilError =
605 struct
606 open UtilString
607
608 exception InternalError of string * string * string
609 exception NoSuchFile of string * string
610
611 fun formatMessage (indentWidth,lineWidth) strs =
612 let
613 val indent = nBlanks indentWidth
614 val nl = "\n"^indent
615 val blank = " "
616 val dot = "."
617
618 fun isSep c = #" "=c orelse #"\n"=c orelse #"\t"=c
619
620 fun go (w,yet) nil = List.rev ("\n"::yet)
621 | go (w,yet) (x::xs) =
622 let
623 val y = if null xs then x^dot else x
624 val l = String.size y
625 val w1 = w+l
626 val (w2,yet2) = if w1<=lineWidth then (w1,y::yet)
627 else (indentWidth+l,y::nl::yet)
628 val (w3,yet3) = if null xs then (w2,yet2)
629 else (if w2<lineWidth then (w2+1,blank::yet2)
630 else (indentWidth,nl::yet2))
631 in go (w3,yet3) xs
632 end
633
634 val tokens = List.concat (map (String.tokens isSep) strs)
635 val fragments = go (0,nil) tokens
636 in
637 String.concat fragments
638 end
639 end
640 (* stop of ../../Util/utilError.sml *)
641 (* start of ../../Util/SymDict/dict.sml *)
642 (*--------------------------------------------------------------------------*)
643 (* Functor: Dict *)
644 (* *)
645 (* Depends on: *)
646 (* Chars *)
647 (* *)
648 (* Exceptions raised by functions in this functor: *)
649 (* addByIndex : NoSuchIndex *)
650 (* addByKey : InternalError *)
651 (* getByIndex : NoSuchIndex *)
652 (* getByKey : InternalError *)
653 (* getIndex : InternalError *)
654 (* getKey : NoSuchIndex *)
655 (* hasIndex : none *)
656 (* makeDict : none *)
657 (* nullDict : none *)
658 (* printDict : none *)
659 (* usedIndices : none *)
660 (*--------------------------------------------------------------------------*)
661 (* A dictionary maps keys to consecutive integers and additionally holds *)
662 (* a value of arbitrary type for each entry. *)
663 (*--------------------------------------------------------------------------*)
664 signature Dict =
665 sig
666 type Key
667 type 'a Dict
668
669 exception NoSuchIndex
670
671 val nullDict : string * 'a -> 'a Dict
672 val makeDict : string * int * 'a -> 'a Dict
673 val clearDict : 'a Dict * int option -> unit
674
675 val hasIndex : 'a Dict * Key -> int option
676 val getIndex : 'a Dict * Key -> int
677 val getKey : 'a Dict * int -> Key
678
679 val getByIndex : 'a Dict * int -> 'a
680 val getByKey : 'a Dict * Key -> 'a
681
682 val setByIndex : 'a Dict * int * 'a -> unit
683 val setByKey : 'a Dict * Key * 'a -> unit
684
685 val usedIndices : 'a Dict -> int
686
687 val extractDict : 'a Dict -> (Key * 'a) array
688 val printDict : ('a -> string) -> 'a Dict -> unit
689 end
690
691 functor Dict (structure Key : Key) : Dict =
692 struct
693 open UtilError UtilInt
694
695 type Key = Key.Key
696
697 exception NoSuchIndex
698
699 (*--------------------------------------------------------------------*)
700 (* a dictionary can have at most size MAX_WIDTH. This is because *)
701 (* arrays may at most have Array.maxLen elements. We only use powers *)
702 (* of two as sizes, so we are really only interested in the position *)
703 (* of maxLen's highest bit. That would be the maximal width for hash *)
704 (* tables, and thus we must decrease it by one for obtaining the max *)
705 (* table width. *)
706 (*--------------------------------------------------------------------*)
707 fun highestBit w = if w=0w0 then 0 else 1+highestBit(Word.>>(w,0w1))
708 val MAX_WIDTH = highestBit (Word.fromInt Array.maxLen)-1
709
710 type Bucket = (Key * int) list
711 val nullBucket = nil : Bucket
712
713 (*--------------------------------------------------------------------*)
714 (* buckets are unsorted - they are probably small, so comparing the *)
715 (* keys might be overkill. *)
716 (*--------------------------------------------------------------------*)
717 fun addToBucket (ni as (key,_),bucket) =
718 let
719 fun doit nil = [ni]
720 | doit (nis as (ni' as (key',_))::rest) =
721 case Key.compare (key',key)
722 of LESS => ni'::doit rest
723 | EQUAL => ni::rest
724 | GREATER => ni::nis
725 in
726 doit bucket
727 end
728 fun searchBucket (key,bucket) =
729 let
730 fun doit nil = NONE
731 | doit ((key',i)::rest) =
732 case Key.compare (key',key)
733 of LESS => doit rest
734 | EQUAL => SOME i
735 | GREATER => NONE
736 in
737 doit bucket
738 end
739
740 (*--------------------------------------------------------------------*)
741 (* a dictionary consists of *)
742 (* - a string desc saying what is stored in this dictionary *)
743 (* - an array tab holding for each index its key and value *)
744 (* - a hash table, i.e. Bucket array, of double size than tab *)
745 (* - a hashFun mapping Key to the range of the hash table *)
746 (* - an integer width for computing table sizes *)
747 (* - an integer size wich is the size of the value table *)
748 (* - an integer count holding the next free index *)
749 (* - a default value for the value table *)
750 (*--------------------------------------------------------------------*)
751 type 'a Dict = {desc : string,
752 tab : (Key * 'a) array ref,
753 hashTab : Bucket array ref,
754 hashFun : (Key -> int) ref,
755 width : int ref, (* bit width *)
756 size : int ref, (* tab size=2^width, hash size is double *)
757 count : int ref, (* number of entries *)
758 def : 'a (* default for values *)
759 }
760 fun nullDict (desc,def) = {desc = desc,
761 tab = ref (Array.array(1,(Key.null,def))),
762 hashTab = ref (Array.array(2,nullBucket)),
763 hashFun = ref (fn _ => 0),
764 count = ref 0,
765 size = ref 1,
766 width = ref 0,
767 def = def}
768
769 (*--------------------------------------------------------------------*)
770 (* how many entries are in the dictionary? *)
771 (*--------------------------------------------------------------------*)
772 fun usedIndices ({count,...}:'a Dict) = !count
773
774 (*--------------------------------------------------------------------*)
775 (* what is the table load, i.e. percentage of number of entries to *)
776 (* hash table size = 100*count/(2*size) = 50*count/size. *)
777 (*--------------------------------------------------------------------*)
778 fun hashRatio({count,size,...}:'a Dict) = 50 * !count div !size
779 handle Div => 100
780
781 (*--------------------------------------------------------------------*)
782 (* this is the hash function. Key.hash hashes data to arbitrary *)
783 (* words, that are mapped to the hash range by this function, where *)
784 (* mask is the bitmask corresponding to the size of the hash table: *)
785 (* 1. square the word produced by Key.hash *)
786 (* 2. take the width bits from the middle of the square, these are *)
787 (* the bit-places influenced by all input bit-places: *)
788 (* - shift to the right by half of the destination width *)
789 (* - mask out all bits to the left of destination *)
790 (* this is a simple strategy but experiences good results. *)
791 (*--------------------------------------------------------------------*)
792 fun square (x:word) = Word.*(x,x)
793 fun hashKey(half,mask) x =
794 Word.toInt(Word.andb(mask,Word.>>(square(Key.hash x),half)))
795 fun makeHashFun(size,width) =
796 let
797 val mask = 0w2*Word.fromInt size-0w1
798 val half = Word.fromInt((width+1) div 2)
799 in
800 hashKey(half,mask)
801 end
802
803 (*--------------------------------------------------------------------*)
804 (* create a new dictionary for 2^w, but at least 2 and at most 2^m *)
805 (* entries, where m is the value of MAX_WIDTH. *)
806 (*--------------------------------------------------------------------*)
807 fun makeDict (desc,w,def) =
808 let
809 val width= Int.min(Int.max(1,w),MAX_WIDTH)
810 val size = Word.toInt(Word.<<(0w1,Word.fromInt(width-1)))
811 in {desc = desc,
812 tab = ref (Array.array(size,(Key.null,def))),
813 hashTab = ref (Array.array(2*size,nullBucket)),
814 hashFun = ref (makeHashFun(size,width)),
815 width = ref width,
816 size = ref size,
817 count = ref 0,
818 def = def}
819 end
820
821 (*--------------------------------------------------------------------*)
822 (* clear a dictionary. If the 2nd arg is SOME w, use w for resizing. *)
823 (*--------------------------------------------------------------------*)
824 fun clearDict (dict:'a Dict,widthOpt) =
825 case widthOpt
826 of NONE =>
827 let
828 val {tab=ref tab,hashTab=ref hashTab,size,count,def,...} = dict
829 val _ = appInterval (fn i => Array.update(tab,i,(Key.null,def))) (0,!count-1)
830 val _ = appInterval (fn i => Array.update(hashTab,i,nullBucket)) (0,!size*2-1)
831 in
832 count := 0
833 end
834 | SOME w =>
835 let
836 val {tab,hashTab,hashFun,width,size,count,def,...} = dict
837 val newWidth = Int.min(Int.max(1,w),MAX_WIDTH)
838 val newSize = Word.toInt(Word.<<(0w1,Word.fromInt(newWidth-1)))
839 val _ = tab := (Array.array(newSize,(Key.null,def)))
840 val _ = hashTab := (Array.array(2*newSize,nullBucket))
841 val _ = hashFun := (makeHashFun(newSize,newWidth))
842 val _ = width := newWidth
843 val _ = size := newSize
844 in
845 count := 0
846 end
847
848 (*--------------------------------------------------------------------*)
849 (* grow a dictionary to the double size. raise InternalError if the *)
850 (* dictionary already has maximal size. *)
851 (*--------------------------------------------------------------------*)
852 fun growDictionary ({desc,tab,hashTab,hashFun,width,size,count,def}:'a Dict) =
853 let
854 val oldTab = !tab
855 val _ = if !width < MAX_WIDTH then width := !width+1
856 else raise InternalError
857 ("Dict","growDictionary",
858 String.concat ["growing the ",desc," dictionary ",
859 "exceeded the system maximum size of ",
860 Int.toString Array.maxLen," for arrays"])
861 val _ = size := !size*2
862 val _ = tab := Array.array(!size,(Key.null,def))
863 val _ = hashTab := Array.array(!size*2,nullBucket)
864 val _ = hashFun := makeHashFun(!size,!width)
865
866 fun addTo (i,kv as (key,_)) =
867 let
868 val idx = !hashFun key
869 val _ = Array.update(!hashTab,idx,addToBucket((key,i),Array.sub(!hashTab,idx)))
870 val _ = Array.update(!tab,i,kv)
871 in ()
872 end
873 in
874 Array.appi addTo (oldTab,0,NONE)
875 end
876
877 (*--------------------------------------------------------------------*)
878 (* lookup the key for an index of the dictionary. *)
879 (*--------------------------------------------------------------------*)
880 fun getKey({tab,count,...}:'a Dict,idx) =
881 if !count>idx then #1(Array.sub(!tab,idx))
882 else raise NoSuchIndex
883
884 (*--------------------------------------------------------------------*)
885 (* map a Key to its index in the dictionary. if it is not in the *)
886 (* dictionary yet, add a new entry with a new index. grow the table *)
887 (* if there is no more free index in the dictionary. *)
888 (*--------------------------------------------------------------------*)
889 fun getIndex(dict as {tab,hashTab,hashFun,size,count,def,...}:'a Dict,key) =
890 let
891 val k = !hashFun key
892 val bucket = Array.sub(!hashTab,k)
893 in
894 case searchBucket(key,bucket)
895 of SOME idx => idx
896 | NONE => let val idx = !count
897 val (k',buck') = if !size>idx then (k,bucket)
898 else let val _ = growDictionary dict
899 val k' = !hashFun key
900 val buck' = Array.sub(!hashTab,k')
901 in (k',buck')
902 end
903 val _ = Array.update(!hashTab,k',addToBucket((key,idx),buck'))
904 val _ = Array.update(!tab,idx,(key,def))
905 val _ = count := idx+1
906 in idx
907 end
908 end
909
910 (*--------------------------------------------------------------------*)
911 (* does a Key have an entry in a dictionary? *)
912 (*--------------------------------------------------------------------*)
913 fun hasIndex({hashTab,hashFun,...}:'a Dict,key) =
914 let
915 val idx = !hashFun key
916 val bucket = Array.sub(!hashTab,idx)
917 in
918 searchBucket(key,bucket)
919 end
920
921 (*--------------------------------------------------------------------*)
922 (* get the value stored for index idx *)
923 (*--------------------------------------------------------------------*)
924 fun getByIndex({tab,count,...}:'a Dict,idx) =
925 if !count>idx then #2(Array.sub(!tab,idx))
926 else raise NoSuchIndex
927
928 (*--------------------------------------------------------------------*)
929 (* get the value stored for a key *)
930 (*--------------------------------------------------------------------*)
931 fun getByKey(dict,key) =
932 getByIndex(dict,getIndex(dict,key))
933
934 (*--------------------------------------------------------------------*)
935 (* enter a value for index idx. *)
936 (*--------------------------------------------------------------------*)
937 fun setByIndex({tab,count,...}:'a Dict,idx,a) =
938 if !count>idx then let val (key,_) = Array.sub(!tab,idx)
939 in Array.update(!tab,idx,(key,a))
940 end
941 else raise NoSuchIndex
942
943 (*--------------------------------------------------------------------*)
944 (* enter a value for a key. *)
945 (*--------------------------------------------------------------------*)
946 fun setByKey(dict,key,v) =
947 setByIndex(dict,getIndex(dict,key),v)
948
949 (*--------------------------------------------------------------------*)
950 (* extract the contents of the dictionary to an array. *)
951 (*--------------------------------------------------------------------*)
952 fun extractDict({count,tab,...}:'a Dict) =
953 Array.tabulate(!count,fn i => Array.sub(!tab,i))
954
955 (*--------------------------------------------------------------------*)
956 (* print the contents of the dictionary. *)
957 (*--------------------------------------------------------------------*)
958 fun printDict X2String ({desc,tab,count,...}:'a Dict) =
959 (print (desc^" dictionary:\n");
960 Array.appi
961 (fn (n,(key,value)) =>
962 print (" "^Int.toString n^": "^Key.toString key^" = "^X2String value^"\n"))
963 (!tab,0,SOME (!count)))
964 end
965 (* stop of ../../Util/SymDict/dict.sml *)
966 (* start of ../../Util/SymDict/symbolTable.sml *)
967
968
969
970
971
972
973 (*--------------------------------------------------------------------------*)
974 (* Functor: SymbolTable *)
975 (* *)
976 (* Exceptions raised by functions in this structure: *)
977 (* getSymIndex : Key.InternalError *)
978 (* getSymKey : NoSuchSymbol *)
979 (* hasSymIndex : none *)
980 (* makeSymTable : none *)
981 (* nullSymTable : none *)
982 (* printSymTable : none *)
983 (* usedSymbols : none *)
984 (*--------------------------------------------------------------------------*)
985 (* A symbol table maps Keys to consecutive integers. *)
986 (*--------------------------------------------------------------------------*)
987 signature SymTable =
988 sig
989 type Key
990 type SymTable
991
992 exception NoSuchSymbol
993
994 val nullSymTable : string -> SymTable
995 val makeSymTable : string * int -> SymTable
996 val clearSymTable : SymTable * int option -> unit
997
998 val hasSymIndex : SymTable * Key -> int option
999 val getSymIndex : SymTable * Key -> int
1000 val getSymKey : SymTable * int -> Key
1001 val usedSymbols : SymTable -> int
1002
1003 val assignSymIndex : SymTable * Key * int -> unit
1004 val reserveSymIndex : SymTable -> int
1005
1006 val extractSymTable : SymTable -> Key vector
1007 val printSymTable : SymTable -> unit
1008 end
1009
1010 functor SymTable (structure Key : Key) : SymTable =
1011 struct
1012 open UtilError UtilInt
1013
1014 exception NoSuchSymbol
1015
1016 type Key = Key.Key
1017
1018 (*--------------------------------------------------------------------*)
1019 (* a symbol table can have at most size MAX_WIDTH. This is because *)
1020 (* arrays may at most have Array.maxLen elements. We only use powers *)
1021 (* of two as sizes, so we are really only interested in the position *)
1022 (* of maxLen's highest bit. That would be the maximal width for hash *)
1023 (* tables, and thus we must decrease it by one for obtaining the max *)
1024 (* table width. *)
1025 (*--------------------------------------------------------------------*)
1026 fun highestBit w = if w=0w0 then 0 else 1+highestBit(Word.>>(w,0w1))
1027 val MAX_WIDTH = highestBit (Word.fromInt Array.maxLen)-1
1028
1029 type Bucket = (Key * int) list
1030 val nullBucket = nil : Bucket
1031
1032 (*--------------------------------------------------------------------*)
1033 (* buckets are sorted - though they are probably small. *)
1034 (*--------------------------------------------------------------------*)
1035 fun addToBucket (ni as (key,_),bucket) =
1036 let
1037 fun doit nil = [ni]
1038 | doit (nis as (ni' as (key',_))::rest) =
1039 case Key.compare (key',key)
1040 of LESS => ni'::doit rest
1041 | EQUAL => ni::rest
1042 | GREATER => ni::nis
1043 in
1044 doit bucket
1045 end
1046 fun searchBucket (key,bucket) =
1047 let
1048 fun doit nil = NONE
1049 | doit ((key',i)::rest) =
1050 case Key.compare (key',key)
1051 of LESS => doit rest
1052 | EQUAL => SOME i
1053 | GREATER => NONE
1054 in
1055 doit bucket
1056 end
1057
1058 (*--------------------------------------------------------------------*)
1059 (* a symbol table consists of *)
1060 (* - an array tab holding for each index its key *)
1061 (* - a hash table, i.e. Bucket array, of double size than tab *)
1062 (* - a hashFun mapping Key to the range of the hash table *)
1063 (* - an integer width for computing table sizes *)
1064 (* - an integer size wich is the size of the value table *)
1065 (* - an integer count holding the next free index *)
1066 (*--------------------------------------------------------------------*)
1067 type SymTable = {desc : string,
1068 tab : Key array ref,
1069 hash : Bucket array ref,
1070 hashFun : (Key -> int) ref,
1071 width : int ref, (* bit width *)
1072 size : int ref, (* tab size=2^width, hash size is double *)
1073 count : int ref (* number of entries *)
1074 }
1075
1076 fun nullSymTable desc = {desc = desc,
1077 tab = ref (Array.array(1,Key.null)),
1078 hash = ref (Array.array(2,nullBucket)),
1079 hashFun = ref (fn _ => 0),
1080 count = ref 0,
1081 size = ref 1,
1082 width = ref 0} : SymTable
1083
1084 (*--------------------------------------------------------------------*)
1085 (* how many entries are in the symtable? *)
1086 (*--------------------------------------------------------------------*)
1087 fun usedSymbols ({count,...}:SymTable) = !count
1088
1089 (*--------------------------------------------------------------------*)
1090 (* what is the table load, i.e. percentage of number of entries to *)
1091 (* hash table size = 100*count/(2*size) = 50*count/size. *)
1092 (*--------------------------------------------------------------------*)
1093 fun hashRatio({count,size,...}:SymTable) = 50 * !count div !size
1094 handle Div => 100
1095
1096 (*--------------------------------------------------------------------*)
1097 (* this is the hash function. Key.hash hashes data to arbitrary *)
1098 (* words, that are mapped to the hash range by this function, where *)
1099 (* mask is the bitmask corresponding to the size of the hash table: *)
1100 (* 1. square the word produced by Key.hash *)
1101 (* 2. take the width bits from the middle of the square, these are *)
1102 (* the bit-places influenced by all input bit-places: *)
1103 (* - shift to the right by half of the destination width *)
1104 (* - mask out all bits to the left of destination *)
1105 (* this is a simple strategy but experiences good results. *)
1106 (*--------------------------------------------------------------------*)
1107 fun square (x:word) = Word.*(x,x)
1108 fun hashKey(half,mask) x =
1109 Word.toInt(Word.andb(mask,Word.>>(square(Key.hash x),half)))
1110 fun makeHashFun(size,width) =
1111 let
1112 val mask = Word.fromInt(2*size-1)
1113 val half = Word.fromInt((width+1) div 2)
1114 in
1115 hashKey(half,mask)
1116 end
1117
1118 (*--------------------------------------------------------------------*)
1119 (* create a new symtable for 2^w, but at least 2 and at most 2^m *)
1120 (* entries, where m is the value of MAX_WIDTH. *)
1121 (*--------------------------------------------------------------------*)
1122 fun makeSymTable (desc,w) =
1123 let
1124 val width= Int.min(Int.max(1,w),MAX_WIDTH)
1125 val size = Word.toInt(Word.<<(0w1,Word.fromInt(width-1)))
1126 in {desc = desc,
1127 tab = ref (Array.array(size,Key.null)),
1128 hash = ref (Array.array(2*size,nullBucket)),
1129 hashFun = ref (makeHashFun(size,width)),
1130 width = ref width,
1131 size = ref size,
1132 count = ref 0}
1133 end
1134
1135 (*--------------------------------------------------------------------*)
1136 (* clear a dictionary. If the 2nd arg is SOME w, use w for resizing. *)
1137 (*--------------------------------------------------------------------*)
1138 fun clearSymTable (symTab:SymTable,widthOpt) =
1139 case widthOpt
1140 of NONE =>
1141 let
1142 val {tab=ref tab,hash=ref hash,size,count,...} = symTab
1143 val _ = appInterval (fn i => Array.update(tab,i,Key.null)) (0,!count-1)
1144 val _ = appInterval (fn i => Array.update(hash,i,nullBucket)) (0,!size*2-1)
1145 in
1146 count := 0
1147 end
1148 | SOME w =>
1149 let
1150 val {tab,hash,hashFun,width,size,count,...} = symTab
1151 val newWidth = Int.min(Int.max(1,w),MAX_WIDTH)
1152 val newSize = Word.toInt(Word.<<(0w1,Word.fromInt(newWidth-1)))
1153 val _ = tab := (Array.array(newSize,Key.null))
1154 val _ = hash := (Array.array(2*newSize,nullBucket))
1155 val _ = hashFun := (makeHashFun(newSize,newWidth))
1156 val _ = width := newWidth
1157 val _ = size := newSize
1158 in
1159 count := 0
1160 end
1161
1162 (*--------------------------------------------------------------------*)
1163 (* grow a symtable to the double size. raise InternalError if the *)
1164 (* table already has maximal size. *)
1165 (*--------------------------------------------------------------------*)
1166 fun growTable ({desc,tab,hash,hashFun,width,size,count}:SymTable) =
1167 let
1168 val newWidth = if !width < MAX_WIDTH then !width+1
1169 else raise InternalError
1170 ("SymTable","growTable",
1171 String.concat ["growing the ",desc," symbol table ",
1172 "exceeded the system maximum size of ",
1173 Int.toString Array.maxLen," for arrays"])
1174 val newSize = !size*2
1175
1176 val oldTab = !tab
1177 val newTab = Array.array(newSize,Key.null)
1178 val newHash = Array.array(2*newSize,nullBucket)
1179 val newHashFun = makeHashFun(newSize,newWidth)
1180
1181 fun addToNew (inv as (i,key)) =
1182 let
1183 val idx = newHashFun key
1184 val _ = Array.update(newHash,idx,addToBucket((key,i),Array.sub(newHash,idx)))
1185 val _ = Array.update(newTab,i,key)
1186 in ()
1187 end
1188 val _ = Array.appi addToNew (!tab,0,NONE)
1189
1190 val _ = tab := newTab
1191 val _ = hash := newHash
1192 val _ = size := newSize
1193 val _ = width := newWidth
1194 val _ = hashFun := newHashFun
1195 in ()
1196 end
1197
1198 (*--------------------------------------------------------------------*)
1199 (* lookup the key for an index of the symbol table. *)
1200 (*--------------------------------------------------------------------*)
1201 fun getSymKey({tab,count,...}:SymTable,idx) =
1202 if !count>idx then Array.sub(!tab,idx)
1203 else raise NoSuchSymbol
1204
1205 (*--------------------------------------------------------------------*)
1206 (* map a Key to its index in the symbol table. if it is not in the *)
1207 (* symbol table yet, add a new entry with a new index. grow the table *)
1208 (* if there is no more free index in the table. *)
1209 (*--------------------------------------------------------------------*)
1210 fun getSymIndex(st as {tab,hash,hashFun,size,count,...}:SymTable,key) =
1211 let
1212 val idx = !hashFun key
1213 val bucket = Array.sub(!hash,idx)
1214 in
1215 case searchBucket(key,bucket)
1216 of SOME i => i
1217 | NONE => let val i = !count
1218 val (idx',buck') = if !size>i then (idx,bucket)
1219 else let val _ = growTable st
1220 val idx' = !hashFun key
1221 val buck' = Array.sub(!hash,idx')
1222 in (idx',buck')
1223 end
1224 val _ = Array.update(!hash,idx',addToBucket((key,i),buck'))
1225 val _ = Array.update(!tab,i,key)
1226 val _ = count := i+1
1227 in i
1228 end
1229 end
1230
1231 (*--------------------------------------------------------------------*)
1232 (* does a Key have an entry in a symbol table? *)
1233 (*--------------------------------------------------------------------*)
1234 fun hasSymIndex({hash,hashFun,...}:SymTable,key) =
1235 let
1236 val idx = !hashFun key
1237 val buck = Array.sub(!hash,idx)
1238 in
1239 searchBucket(key,buck)
1240 end
1241
1242 (*--------------------------------------------------------------------*)
1243 (* reserve an index for a (yet unknown) key. *)
1244 (*--------------------------------------------------------------------*)
1245 fun reserveSymIndex(st as {size,count=count as ref i,...}:SymTable) =
1246 let
1247 val _ = if !size>i then () else growTable st
1248 val _ = count := i+1
1249 in i
1250 end
1251
1252 (*--------------------------------------------------------------------*)
1253 (* assign an index to a (previously reserved) index. *)
1254 (*--------------------------------------------------------------------*)
1255 fun assignSymIndex(st as {count,hash,hashFun,tab,...}:SymTable,key,i) =
1256 if !count<=i then raise NoSuchSymbol
1257 else let val idx = !hashFun key
1258 val buck = Array.sub(!hash,idx)
1259 val newBuck = addToBucket((key,i),buck)
1260 val _ = Array.update(!hash,idx,newBuck)
1261 val _ = Array.update(!tab,i,key)
1262 in ()
1263 end
1264
1265 (*--------------------------------------------------------------------*)
1266 (* extract the contents of a symbol table to a vector. *)
1267 (*--------------------------------------------------------------------*)
1268 fun extractSymTable({count,tab,...}:SymTable) =
1269 Array.extract(!tab,0,SOME(!count))
1270
1271 (*--------------------------------------------------------------------*)
1272 (* print the contents of the symbol table. *)
1273 (*--------------------------------------------------------------------*)
1274 fun printSymTable ({desc,tab,count,...}:SymTable) =
1275 (print (desc^" table:\n");
1276 Array.appi
1277 (fn (n,key) =>
1278 print (" "^Int.toString n^": "^Key.toString key^"\n"))
1279 (!tab,0,SOME (!count)))
1280 end
1281 (* stop of ../../Util/SymDict/symbolTable.sml *)
1282 (* start of ../../Util/SymDict/intListDict.sml *)
1283
1284
1285
1286
1287
1288
1289 structure KeyIntList : Key =
1290 struct
1291 type Key = int list
1292
1293 val null = nil
1294 val hash = UtilHash.hashList Word.fromInt
1295 val compare = UtilCompare.compareList Int.compare
1296 val toString = UtilString.List2String Int.toString
1297 end
1298
1299 structure IntListDict = Dict (structure Key = KeyIntList)
1300 structure IntListSymTab = SymTable (structure Key = KeyIntList)
1301
1302
1303 (* stop of ../../Util/SymDict/intListDict.sml *)
1304 (* start of ../../Util/SymDict/intDict.sml *)
1305
1306
1307
1308
1309
1310
1311
1312 structure KeyInt : Key =
1313 struct
1314 type Key = int
1315
1316 val null = 0
1317 val hash = Word.fromInt
1318 val compare = Int.compare
1319 val toString = Int.toString
1320 end
1321
1322 structure IntDict = Dict (structure Key = KeyInt)
1323 structure IntSymTab = SymTable (structure Key = KeyInt)
1324
1325
1326 (* stop of ../../Util/SymDict/intDict.sml *)
1327 (* start of ../../Unicode/Chars/uniChar.sml *)
1328 (*--------------------------------------------------------------------------*)
1329 (* Structure: UniChar *)
1330 (* *)
1331 (* Depends on: *)
1332 (* UtilString *)
1333 (* *)
1334 (* Exceptions raised by functions in this structure: *)
1335 (*--------------------------------------------------------------------------*)
1336 signature UniChar =
1337 sig
1338 structure Chars : WORD
1339
1340 type Char = Chars.word
1341 type Data = Char list
1342 type Vector = Char vector
1343
1344 val nullData : Data
1345 val nullVector : Vector
1346
1347 val hashChar : Char -> word
1348 val hashData : Data -> word
1349 val hashVector : Vector -> word
1350
1351 val compareChar : Char * Char -> order
1352 val compareData : Data * Data -> order
1353 val compareVector : Vector * Vector -> order
1354
1355 val char2Char : char -> Char
1356 val Char2char : Char -> char
1357
1358 val Char2Uni : Char -> string
1359 val Char2String : Char -> string
1360
1361 val String2Data : string -> Data
1362 val Data2String : Data -> string
1363 val Latin2String : Data -> string
1364
1365 val Data2Vector : Data -> Vector
1366 val Vector2Data : Vector -> Data
1367
1368 val String2Vector : string -> Vector
1369 val Vector2String : Vector -> string
1370
1371 val quoteUni : Char -> string -> string
1372 val quoteChar : Char -> Char -> string
1373 val quoteData : Char -> Data -> string
1374 val quoteVector : Char -> Vector -> string
1375 end
1376
1377 structure UniChar : UniChar =
1378 struct
1379 val O_VECTOR_PRINTLEN = 48
1380
1381 structure Chars = Word
1382
1383 val _ = if Chars.wordSize > 21 then ()
1384 else let val str = ("UniChar: Chars.wordSize is too small.\n"^
1385 "Cannot compile on this system!\n" )
1386 val _ = print str
1387 in raise Fail str
1388 end
1389
1390 type Char = Chars.word
1391 type Data = Char list
1392
1393 type CharInterval = Char * Char
1394 type CharRange = CharInterval list
1395
1396 type Vector = Char vector
1397
1398 val nullChar = 0wx0:Char
1399 val nullData = nil:Data
1400 val nullVector = Vector.fromList nullData
1401
1402 val hashChar = Word.fromLargeWord o Chars.toLargeWord
1403 val hashData = UtilHash.hashList hashChar
1404 val hashVector = UtilHash.hashVector hashChar
1405
1406 val compareChar = Chars.compare
1407 val compareData = UtilCompare.compareList compareChar
1408 val compareVector = UtilCompare.compareVector compareChar
1409
1410 val char2Char = Chars.fromLargeWord o Word8.toLargeWord o Byte.charToByte
1411 val Char2char = Byte.byteToChar o Word8.fromLargeWord o Chars.toLargeWord
1412
1413 fun Char2Uni c =
1414 "U+"^UtilString.toUpperString(StringCvt.padLeft #"0" 4 (Chars.toString c))
1415 fun Char2String c =
1416 case c
1417 of 0wx9 => "\\t"
1418 | 0wxA => "\\n"
1419 | _ => if c<0wx100 then String.implode [Char2char c]
1420 else Char2Uni c
1421
1422 fun String2Data s = map char2Char (String.explode s)
1423 fun Data2String cs = String.concat (map Char2String cs)
1424 fun Latin2String cs = String.implode (map Char2char cs)
1425
1426 val Data2Vector = Vector.fromList
1427 fun String2Vector s = Vector.tabulate(String.size s,fn i => char2Char(String.sub(s,i)))
1428
1429 fun Vector2Data vec = Vector.foldr (op ::) nil vec
1430 fun Vector2String vec =
1431 let
1432 val maxlen = O_VECTOR_PRINTLEN
1433 val len = Vector.length vec
1434 in
1435 if len<=maxlen orelse maxlen=0
1436 then Data2String (Vector2Data vec)
1437 else let
1438 val cs1 = Vector.foldri
1439 (fn (_,c,cs) => c::cs) nil (vec,0,SOME (maxlen div 2))
1440 val cs2 = Vector.foldri
1441 (fn (_,c,cs) => c::cs) nil (vec,len-3-maxlen div 2,NONE)
1442 in Data2String cs1^"..."^Data2String cs2
1443 end
1444 end
1445
1446 fun quoteUni q s = let val sQ = Char2String q in sQ^s^sQ end
1447 fun quoteChar q c = if c=0wx0 then "entity end" else quoteUni q (Char2String c)
1448 fun quoteData q cs = quoteUni q (Data2String cs)
1449 fun quoteVector q v = quoteUni q (Vector2String v)
1450 end
1451
1452
1453 (* stop of ../../Unicode/Chars/uniChar.sml *)
1454 (* start of ../../Unicode/Chars/charVecDict.sml *)
1455 structure KeyVector : Key =
1456 struct
1457 type Key = UniChar.Vector
1458
1459 val null = UniChar.nullVector
1460 val compare = UniChar.compareVector
1461 val toString = UniChar.Vector2String
1462 val hash = UniChar.hashVector
1463 end
1464
1465 structure VectorDict = Dict (structure Key = KeyVector)
1466 (* stop of ../../Unicode/Chars/charVecDict.sml *)
1467 (* start of ../../Util/SymDict/stringDict.sml *)
1468
1469
1470
1471
1472
1473
1474
1475
1476 structure KeyString : Key =
1477 struct
1478 type Key = string
1479
1480 val null = ""
1481 val hash = UtilHash.hashString
1482 val compare = String.compare
1483
1484 fun toString str = str
1485 end
1486
1487 structure StringDict = Dict (structure Key = KeyString)
1488 (* stop of ../../Util/SymDict/stringDict.sml *)
1489 (* start of ../../Unicode/encoding.sml *)
1490
1491
1492 signature Encoding =
1493 sig
1494 datatype Encoding =
1495 NOENC | ASCII | EBCDIC | LATIN1
1496 | UCS4B | UCS4L | UCS4SB | UCS4SL
1497 | UCS2B | UCS2L | UTF16B | UTF16L
1498 | UTF8
1499
1500 val UCS2 : Encoding
1501 val UCS4 : Encoding
1502 val UTF16 : Encoding
1503
1504 val encodingName : Encoding -> string
1505 val isEncoding : string -> Encoding
1506 val switchEncoding : Encoding * Encoding -> Encoding
1507 end
1508
1509 structure Encoding : Encoding =
1510 struct
1511 open StringDict
1512
1513 datatype Encoding =
1514 NOENC | ASCII | EBCDIC | LATIN1
1515 | UCS4B | UCS4L | UCS4SB | UCS4SL
1516 | UCS2B | UCS2L | UTF16B | UTF16L
1517 | UTF8
1518
1519 val UCS2 = UCS2B
1520 val UCS4 = UCS4B
1521 val UTF16 = UTF16B
1522
1523 fun encodingName enc =
1524 case enc
1525 of NOENC => "NONE"
1526 | ASCII => "ASCII"
1527 | EBCDIC => "EBCDIC"
1528 | LATIN1 => "ISO-8859-1"
1529 | UCS2B => "UCS-2"
1530 | UCS2L => "UCS-2"
1531 | UCS4B => "UCS-4"
1532 | UCS4L => "UCS-4"
1533 | UCS4SB => "UCS-4"
1534 | UCS4SL => "UCS-4"
1535 | UTF8 => "UTF-8"
1536 | UTF16B => "UTF-16"
1537 | UTF16L => "UTF-16"
1538
1539 val encDict = makeDict("encoding",6,NOENC)
1540 val encAliases =
1541 [(ASCII,["ANSI_X3.4-1968","ANSI_X3.4-1986","ASCII","US-ASCII","US",
1542 "ISO646-US","ISO-IR-6","ISO_646.IRV:1991","IBM367","CP367"]),
1543 (EBCDIC,["EBCDIC"]),
1544 (LATIN1,["ISO_8859-1:1987","ISO-8859-1","ISO_8859-1",
1545 "ISO-IR-100","CP819","IBM819","L1","LATIN1"]),
1546 (UCS2,["UCS-2","ISO-10646-UCS-2"]),
1547 (UCS4,["UCS-4","ISO-10646-UCS-4"]),
1548 (UTF16,["UTF-16"]),
1549 (UTF8,["UTF-8"])
1550 ]
1551 val _ = app (fn (x,ys) => app (fn y => setByKey(encDict,y,x)) ys) encAliases
1552 fun isEncoding name = getByKey(encDict,name)
1553
1554 fun compatAscii new =
1555 case new
1556 of ASCII => new
1557 | LATIN1 => new
1558 | UTF8 => new
1559 | _ => NOENC
1560 fun compatUcs4 (old,new) =
1561 if new=UCS4 then old else NOENC
1562
1563 fun switchEncoding(old,new) =
1564 case old
1565 of NOENC => NOENC
1566 | ASCII => compatAscii new
1567 | EBCDIC => if new=EBCDIC then new else NOENC
1568 | LATIN1 => compatAscii new
1569 | UCS4B => compatUcs4(old,new)
1570 | UCS4L => compatUcs4(old,new)
1571 | UCS4SB => compatUcs4(old,new)
1572 | UCS4SL => compatUcs4(old,new)
1573 | UTF16B => if new=UTF16 then old else if new=UCS2 then UCS2B else NOENC
1574 | UTF16L => if new=UTF16 then old else if new=UCS2 then UCS2L else NOENC
1575 | UCS2B => if new=UCS2 then old else if new=UTF16 then UTF16B else NOENC
1576 | UCS2L => if new=UCS2 then old else if new=UTF16 then UTF16L else NOENC
1577 | UTF8 => compatAscii new
1578 end
1579 (* stop of ../../Unicode/encoding.sml *)
1580 (* start of ../../Unicode/Encode/encodeBasic.sml *)
1581
1582
1583
1584
1585
1586
1587 (*--------------------------------------------------------------------------*)
1588 (* Structure: EncodeBasic *)
1589 (* *)
1590 (* Exceptions raised by functions in this structure: *)
1591 (* closeFile : none *)
1592 (* fileName : none *)
1593 (* openFile : NoSuchFile *)
1594 (* writeByte : Io *)
1595 (*--------------------------------------------------------------------------*)
1596 signature EncodeBasic =
1597 sig
1598 type File
1599
1600 val stdOutFile : File
1601 val closeFile : File -> unit
1602 val fileName : File -> string
1603 val openFile : string -> File
1604 val writeByte : File * Word8.word -> File
1605 end
1606
1607 structure EncodeBasic : EncodeBasic =
1608 struct
1609 open UtilError
1610
1611 type outstream = TextIO.outstream
1612 val closeOut = TextIO.closeOut
1613 val openOut = TextIO.openOut
1614 val output1 = TextIO.output1
1615 val stdOut = TextIO.stdOut
1616
1617 type File = string * outstream
1618
1619 val stdOutFile = ("-",stdOut)
1620
1621 fun closeFile(fname,s) = if fname="-" then () else closeOut s
1622 fun fileName(fname,_) = if fname="-" then "<stdout>" else fname
1623 fun openFile fname =
1624 if fname = "-" then (fname,stdOut)
1625 else (fname,openOut fname)
1626 handle IO.Io {name,cause,...} => raise NoSuchFile(name,exnMessage cause)
1627 fun writeByte (f as (_,s),b) = f before output1(s,chr(Word8.toInt b))
1628 end
1629 (* stop of ../../Unicode/Encode/encodeBasic.sml *)
1630 (* start of ../../Unicode/Encode/encodeError.sml *)
1631
1632
1633
1634
1635
1636
1637
1638 signature EncodeError =
1639 sig
1640 datatype EncodeError =
1641 ERR_ILLEGAL_CHAR of UniChar.Char * string
1642
1643 val encodeMessage : EncodeError -> string list
1644
1645 exception EncodeError of EncodeBasic.File * EncodeError
1646 end
1647
1648 structure EncodeError : EncodeError =
1649 struct
1650 open
1651 UtilString
1652 UniChar
1653
1654 datatype EncodeError =
1655 ERR_ILLEGAL_CHAR of UniChar.Char * string
1656
1657 fun encodeMessage err =
1658 case err
1659 of ERR_ILLEGAL_CHAR(c,what) => [Char2Uni c,"is not",prependAnA what,"character"]
1660
1661 exception EncodeError of EncodeBasic.File * EncodeError
1662 end
1663 (* stop of ../../Unicode/Encode/encodeError.sml *)
1664 (* start of ../../Unicode/Encode/encodeMisc.sml *)
1665 (*
1666 require "basis.__word";
1667 require "basis.__word8";
1668 require "basis.__word8_vector";
1669
1670 require "chars";
1671 require "encodeBasic";
1672 require "encodeError";
1673 *)
1674 signature EncodeMisc =
1675 sig
1676 val writeCharAscii : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1677 val writeCharEbcdic : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1678 val writeCharLatin1 : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1679 val writeCharUcs4B : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1680 val writeCharUcs4L : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1681 val writeCharUcs4SB : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1682 val writeCharUcs4SL : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1683 val writeCharUtf8 : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1684 val writeCharUtf16B : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1685 val writeCharUtf16L : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1686 val writeCharUcs2B : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1687 val writeCharUcs2L : UniChar.Char * EncodeBasic.File -> EncodeBasic.File
1688
1689 val validCharAscii : UniChar.Char -> bool
1690 val validCharEbcdic : UniChar.Char -> bool
1691 val validCharLatin1 : UniChar.Char -> bool
1692 end
1693
1694 structure EncodeMisc : EncodeMisc =
1695 struct
1696 open UniChar EncodeBasic EncodeError
1697
1698 infix 8 >>
1699 infix 7 &&
1700 infix 6 ||
1701
1702 val op && = Chars.andb
1703 val op >> = Chars.>>
1704 val op || = Word8.orb
1705
1706 fun splitSurrogates (c : Char) =
1707 (((c-0wx10000) >> 0w10)+0wxD800,c && 0wx3FF + 0wxDC00)
1708
1709 fun Char2Byte c = Word8.fromLargeWord(Chars.toLargeWord c)
1710
1711 (*---------------------------------------------------------------------*)
1712 (* Ascii *)
1713 (*---------------------------------------------------------------------*)
1714 fun validCharAscii (c : Char) = c<0wx80
1715 fun writeCharAscii(c,f) =
1716 if c<0wx80 then writeByte(f,Char2Byte c)
1717 else raise EncodeError(f,ERR_ILLEGAL_CHAR(c,"ASCII"))
1718
1719 (*---------------------------------------------------------------------*)
1720 (* Ebcdic *)
1721 (*---------------------------------------------------------------------*)
1722 val latin2ebcdicTab = Word8Vector.fromList
1723 [0wx00,0wx01,0wx02,0wx03,0wx37,0wx2D,0wx2E,0wx2F,
1724 0wx16,0wx05,0wx25,0wx0B,0wx0C,0wx0D,0wx0E,0wx0F,
1725 0wx10,0wx11,0wx12,0wx13,0wx3C,0wx3D,0wx32,0wx26,
1726 0wx18,0wx19,0wx3F,0wx27,0wx1C,0wx1D,0wx1E,0wx1F,
1727 0wx40,0wx4F,0wx7F,0wx7B,0wx5B,0wx6C,0wx50,0wx7D,
1728 0wx4D,0wx5D,0wx5C,0wx4E,0wx6B,0wx60,0wx4B,0wx61,
1729 0wxF0,0wxF1,0wxF2,0wxF3,0wxF4,0wxF5,0wxF6,0wxF7,
1730 0wxF8,0wxF9,0wx7A,0wx5E,0wx4C,0wx7E,0wx6E,0wx6F,
1731 0wx7C,0wxC1,0wxC2,0wxC3,0wxC4,0wxC5,0wxC6,0wxC7,
1732 0wxC8,0wxC9,0wxD1,0wxD2,0wxD3,0wxD4,0wxD5,0wxD6,
1733 0wxD7,0wxD8,0wxD9,0wxE2,0wxE3,0wxE4,0wxE5,0wxE6,
1734 0wxE7,0wxE8,0wxE9,0wx4A,0wxE0,0wx5A,0wx5F,0wx6D,
1735 0wx79,0wx81,0wx82,0wx83,0wx84,0wx85,0wx86,0wx87,
1736 0wx88,0wx89,0wx91,0wx92,0wx93,0wx94,0wx95,0wx96,
1737 0wx97,0wx98,0wx99,0wxA2,0wxA3,0wxA4,0wxA5,0wxA6,
1738 0wxA7,0wxA8,0wxA9,0wxC0,0wx6A,0wxD0,0wxA1,0wx07,
1739 0wx20,0wx21,0wx22,0wx23,0wx24,0wx15,0wx06,0wx17,
1740 0wx28,0wx29,0wx2A,0wx2B,0wx2C,0wx09,0wx0A,0wx1B,
1741 0wx30,0wx31,0wx1A,0wx33,0wx34,0wx35,0wx36,0wx08,
1742 0wx38,0wx39,0wx3A,0wx3B,0wx04,0wx14,0wx3E,0wxE1,
1743 0wx41,0wx42,0wx43,0wx44,0wx45,0wx46,0wx47,0wx48,
1744 0wx49,0wx51,0wx52,0wx53,0wx54,0wx55,0wx56,0wx57,
1745 0wx58,0wx59,0wx62,0wx63,0wx64,0wx65,0wx66,0wx67,
1746 0wx68,0wx69,0wx70,0wx71,0wx72,0wx73,0wx74,0wx75,
1747 0wx76,0wx77,0wx78,0wx80,0wx8A,0wx8B,0wx8C,0wx8D,
1748 0wx8E,0wx8F,0wx90,0wx9A,0wx9B,0wx9C,0wx9D,0wx9E,
1749 0wx9F,0wxA0,0wxAA,0wxAB,0wxAC,0wxAD,0wxAE,0wxAF,
1750 0wxB0,0wxB1,0wxB2,0wxB3,0wxB4,0wxB5,0wxB6,0wxB7,
1751 0wxB8,0wxB9,0wxBA,0wxBB,0wxBC,0wxBD,0wxBE,0wxBF,
1752 0wxCA,0wxCB,0wxCC,0wxCD,0wxCE,0wxCF,0wxDA,0wxDB,
1753 0wxDC,0wxDD,0wxDE,0wxDF,0wxEA,0wxEB,0wxEC,0wxED,
1754 0wxEE,0wxEF,0wxFA,0wxFB,0wxFC,0wxFD,0wxFE,0wxFF
1755 ]
1756 fun validCharEbcdic (c : Char) = c<0wx100
1757 fun writeCharEbcdic(c,f) =
1758 if c<0wx100 then writeByte(f,Word8Vector.sub(latin2ebcdicTab,Chars.toInt c))
1759 else raise EncodeError(f,ERR_ILLEGAL_CHAR(c,"EBCDIC"))
1760
1761 (*---------------------------------------------------------------------*)
1762 (* Latin1 *)
1763 (*---------------------------------------------------------------------*)
1764 fun validCharLatin1 (c : Char) = c<0wx100
1765 fun writeCharLatin1(c,f) =
1766 if c<0wx100 then writeByte(f,Char2Byte c)
1767 else raise EncodeError(f,ERR_ILLEGAL_CHAR(c,"LATIN-1"))
1768
1769 (*---------------------------------------------------------------------*)
1770 (* UCS-4 *)
1771 (*---------------------------------------------------------------------*)
1772 fun ucs4Bytes c = (Char2Byte(c >> 0w24),
1773 Char2Byte(c >> 0w16),
1774 Char2Byte(c >> 0w8),
1775 Char2Byte c)
1776 fun writeCharUcs4 perm =
1777 fn (c,f) => let val bytes = ucs4Bytes c
1778 val (b1,b2,b3,b4) = perm bytes
1779 val f1 = writeByte(f,b1)
1780 val f2 = writeByte(f1,b2)
1781 val f3 = writeByte(f2,b3)
1782 val f4 = writeByte(f3,b4)
1783 in f4
1784 end
1785 fun permUcs4B x = x
1786 fun permUcs4L (b1,b2,b3,b4) = (b4,b3,b2,b1)
1787 fun permUcs4SB (b1,b2,b3,b4) = (b2,b1,b4,b3)
1788 fun permUcs4SL (b1,b2,b3,b4) = (b3,b4,b1,b2)
1789
1790 val writeCharUcs4B = writeCharUcs4 permUcs4B
1791 val writeCharUcs4L = writeCharUcs4 permUcs4L
1792 val writeCharUcs4SB = writeCharUcs4 permUcs4SB
1793 val writeCharUcs4SL = writeCharUcs4 permUcs4SL
1794
1795 (*---------------------------------------------------------------------*)
1796 (* UTF-8 *)
1797 (*---------------------------------------------------------------------*)
1798 fun writeCharUtf8(c,f) =
1799 if c<0wx80 then writeByte(f,Char2Byte c)
1800 else if c<0wx800
1801 then let val f1 = writeByte(f,0wxC0 || Char2Byte(c >> 0w6))
1802 val f2 = writeByte(f1,0wx80 || Char2Byte(c && 0wx3F))
1803 in f2
1804 end
1805 else if c<0wx10000
1806 then let val f1 = writeByte(f, 0wxE0 || Char2Byte(c >> 0w12))
1807 val f2 = writeByte(f1,0wx80 || Char2Byte((c >> 0w6) && 0wx3F))
1808 val f3 = writeByte(f2,0wx80 || Char2Byte(c && 0wx3F))
1809 in f3
1810 end
1811 else if c<0wx200000
1812 then let val f1 = writeByte(f, 0wxF0 || Char2Byte(c >> 0w18))
1813 val f2 = writeByte(f1,0wx80 || Char2Byte((c >> 0w12) && 0wx3F))
1814 val f3 = writeByte(f2,0wx80 || Char2Byte((c >> 0w6) && 0wx3F))
1815 val f4 = writeByte(f3,0wx80 || Char2Byte(c && 0wx3F))
1816 in f4
1817 end
1818 else if c<0wx4000000
1819 then let val f1 = writeByte(f, 0wxF8 || Char2Byte(c >> 0w24))
1820 val f2 = writeByte(f1,0wx80 || Char2Byte((c >> 0w18) && 0wx3F))
1821 val f3 = writeByte(f2,0wx80 || Char2Byte((c >> 0w12) && 0wx3F))
1822 val f4 = writeByte(f3,0wx80 || Char2Byte((c >> 0w6) && 0wx3F))
1823 val f5 = writeByte(f4,0wx80 || Char2Byte(c && 0wx3F))
1824 in f5
1825 end
1826 else let val f1 = writeByte(f, 0wxFC || Char2Byte(c >> 0w30))
1827 val f2 = writeByte(f1,0wx80 || Char2Byte((c >> 0w24) && 0wx3F))
1828 val f3 = writeByte(f2,0wx80 || Char2Byte((c >> 0w18) && 0wx3F))
1829 val f4 = writeByte(f3,0wx80 || Char2Byte((c >> 0w12) && 0wx3F))
1830 val f5 = writeByte(f4,0wx80 || Char2Byte((c >> 0w6) && 0wx3F))
1831 val f6 = writeByte(f5,0wx80 || Char2Byte(c && 0wx3F))
1832 in f6
1833 end
1834
1835 (*---------------------------------------------------------------------*)
1836 (* UTF-16 *)
1837 (*---------------------------------------------------------------------*)
1838 fun oneUtf16 isL (c,f) =
1839 let val (b1,b2) = (Char2Byte(c >> 0w8),Char2Byte c)
1840 in if isL then writeByte(writeByte(f,b2),b1)
1841 else writeByte(writeByte(f,b1),b2)
1842 end
1843 fun writeCharUtf16 isL =
1844 fn (c,f) =>
1845 if c<0wx10000 then oneUtf16 isL (c,f)
1846 else let val (hi,lo) = splitSurrogates c
1847 val f1 = oneUtf16 isL (hi,f)
1848 val f2 = oneUtf16 isL (lo,f1)
1849 in f2
1850 end
1851 val writeCharUtf16B = writeCharUtf16 false
1852 val writeCharUtf16L = writeCharUtf16 true
1853
1854 (*---------------------------------------------------------------------*)
1855 (* UCS-2 *)
1856 (*---------------------------------------------------------------------*)
1857 fun writeCharUcs2 isL =
1858 fn (c,f) =>
1859 if c<0wx10000
1860 then let val (b1,b2) = (Char2Byte(c >> 0w8),Char2Byte c)
1861 in if isL then writeByte(writeByte(f,b2),b1)
1862 else writeByte(writeByte(f,b1),b2)
1863 end
1864 else raise EncodeError(f,ERR_ILLEGAL_CHAR(c,"UCS-2"))
1865
1866 val writeCharUcs2B = writeCharUcs2 false
1867 val writeCharUcs2L = writeCharUcs2 true
1868
1869 end
1870 (* stop of ../../Unicode/Encode/encodeMisc.sml *)
1871 (* start of ../../Unicode/Encode/encode.sml *)
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882 signature Encode =
1883 sig
1884 include EncodeError
1885
1886 type File
1887 type EncFile
1888
1889 val encNoFile : EncFile
1890 val encStdOut : EncFile
1891 val encOpenFile : string * Encoding.Encoding * string -> EncFile
1892 val encCloseFile : EncFile -> unit
1893 val encAdapt : EncFile * File -> EncFile
1894
1895 val encPutChar : EncFile * UniChar.Char -> EncFile
1896 val encValidChar : EncFile * UniChar.Char -> bool
1897 end
1898
1899 structure Encode : Encode =
1900 struct
1901 open
1902 Encoding UtilError
1903 EncodeBasic EncodeError EncodeMisc
1904
1905 type EncFile = Encoding * File
1906
1907 val encNoFile = (NOENC,stdOutFile)
1908 val encStdOut = (LATIN1,stdOutFile)
1909
1910 fun encAdapt((enc,_),f) = (enc,f)
1911
1912 fun encValidChar((enc,_),c) =
1913 case enc
1914 of ASCII => validCharAscii c
1915 | EBCDIC => validCharEbcdic c
1916 | LATIN1 => validCharLatin1 c
1917 | _ => true
1918
1919 fun encPutChar((enc,f),c) =
1920 let val f1 =
1921 case enc
1922 of NOENC => f
1923 | ASCII => (writeCharAscii(c,f))
1924 | EBCDIC => (writeCharEbcdic(c,f))
1925 | LATIN1 => (writeCharLatin1(c,f))
1926 | UCS2B => (writeCharUcs2B(c,f))
1927 | UCS2L => (writeCharUcs2L(c,f))
1928 | UCS4B => (writeCharUcs4B(c,f))
1929 | UCS4L => (writeCharUcs4L(c,f))
1930 | UCS4SB => (writeCharUcs4SB(c,f))
1931 | UCS4SL => (writeCharUcs4SL(c,f))
1932 | UTF8 => (writeCharUtf8(c,f))
1933 | UTF16B => (writeCharUtf16B(c,f))
1934 | UTF16L => (writeCharUtf16L(c,f))
1935 in (enc,f1)
1936 end
1937
1938 fun encCloseFile(_,f) = closeFile f
1939
1940 fun encOpenFile (fname,enc,name) =
1941 let
1942 val outEnc =
1943 case enc
1944 of NOENC =>
1945 (case isEncoding name
1946 of NOENC => raise NoSuchFile(fname,"Unsupported encoding \""^name^"\"")
1947 | enc => enc)
1948 | enc => enc
1949 val f = openFile fname
1950 val f1 = case outEnc
1951 of UTF16B => writeByte(writeByte(f,0wxFE),0wxFF)
1952 | UTF16L => writeByte(writeByte(f,0wxFF),0wxFE)
1953 | _ => f
1954 in (outEnc,f1)
1955 end
1956 end
1957
1958 (* stop of ../../Unicode/Encode/encode.sml *)
1959 (* start of nullHard.sml *)
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981 (*
1982 structure NullHard =
1983 struct
1984 fun parseNull uri = NullParse.parseDocument uri NONE NullHooks.nullStart
1985
1986 open
1987 NullCatOptions NullOptions Options NullParserOptions Uri
1988
1989 val usage = List.concat [parserUsage,[("","")],catalogUsage,[("","")],nullUsage]
1990
1991 exception Exit of OS.Process.status
1992
1993 fun null(prog,args) =
1994 let
1995 val prog = "fxp"
1996 val hadError = ref false
1997
1998 fun optError msg =
1999 let val _ = TextIO.output(TextIO.stdErr,msg^".\n")
2000 in hadError := true
2001 end
2002 fun exitError msg =
2003 let val _ = TextIO.output(TextIO.stdErr,msg^".\n")
2004 in raise Exit OS.Process.failure
2005 end
2006 fun exitHelp prog =
2007 let val _ = printUsage TextIO.stdOut prog usage
2008 in raise Exit OS.Process.success
2009 end
2010 fun exitVersion prog =
2011 let val _ = app print [prog," version ",Version.FXP_VERSION,"\n"]
2012 in raise Exit OS.Process.success
2013 end
2014
2015 fun summOpt prog = "For a summary of options type "^prog^" --help"
2016 fun noFile(f,cause) = "can't open file '"^f^"': "^exnMessage cause
2017
2018 val opts = parseOptions args
2019 val _ = setParserDefaults()
2020 val opts1 = setParserOptions (opts,optError)
2021 val _ = setCatalogDefaults()
2022 val opts2 = setCatalogOptions (opts1,optError)
2023 val _ = setNullDefaults()
2024 val (vers,help,err,file) = setNullOptions (opts2,optError)
2025 val _ = if !hadError then exitError (summOpt prog) else ()
2026 val _ = if vers then exitVersion prog else ()
2027 val _ = if help then exitHelp prog else ()
2028 val _ = case err
2029 of SOME "-" => O_ERROR_DEVICE := TextIO.stdErr
2030 | SOME f => (O_ERROR_DEVICE := TextIO.openOut f
2031 handle IO.Io {cause,...} => exitError(noFile(f,cause)))
2032 | NONE => ()
2033 val f = valOf file handle Option => "-"
2034 val uri = if f="-" then NONE else SOME(String2Uri f)
2035 val status = parseNull uri
2036 val _ = if isSome err then TextIO.closeOut (!O_ERROR_DEVICE) else ()
2037 in status
2038 end
2039 handle Exit status => status
2040 | exn =>
2041 let val _ = TextIO.output
2042 (TextIO.stdErr,prog^": Unexpected exception: "^exnMessage exn^".\n")
2043 in OS.Process.failure
2044 end
2045 end
2046 *)
2047 structure NullHard = struct end
2048 (* stop of nullHard.sml *)
2049 (* start of ../../Util/options.sml *)
2050 signature Options=
2051 sig
2052 datatype Option =
2053 OPT_LONG of string * string option
2054 | OPT_SHORT of char list
2055 | OPT_NEG of char list
2056 | OPT_NOOPT
2057 | OPT_STRING of string
2058 val parseOptions : string list -> Option list
2059
2060 datatype UsageItem =
2061 U_SEP
2062 | U_TITLE of string
2063 | U_ITEM of string list * string
2064 type Usage = UsageItem list
2065 val printUsage : TextIO.outstream -> string -> Usage -> unit
2066 end
2067
2068 structure Options : Options =
2069 struct
2070 exception BadOption of string
2071
2072 datatype Option =
2073 OPT_LONG of string * string option
2074 | OPT_SHORT of char list
2075 | OPT_NEG of char list
2076 | OPT_NOOPT
2077 | OPT_STRING of string
2078
2079 datatype UsageItem =
2080 U_SEP
2081 | U_TITLE of string
2082 | U_ITEM of string list * string
2083 type Usage = UsageItem list
2084
2085 fun parseOptions ss =
2086 let
2087 fun doOne opt =
2088 if String.isPrefix "--" opt
2089 then let val opt1 = Substring.extract(opt,2,NONE)
2090 val (key0,opt2) = Substring.splitl (fn c => #"="<>c) opt1
2091 val key = if Substring.isEmpty key0 then raise BadOption opt
2092 else Substring.string key0
2093 val valOpt = if Substring.isPrefix "=" opt2
2094 then let val val0 = Substring.triml 1 opt2
2095 in if Substring.isEmpty val0
2096 then raise BadOption opt
2097 else SOME(Substring.string val0)
2098 end
2099 else NONE
2100 in OPT_LONG(key,valOpt)
2101 end
2102 handle BadOption s => if opt="--" then OPT_NOOPT else OPT_STRING opt
2103 else if String.isPrefix "-" opt
2104 then let val chars = tl(String.explode opt)
2105 (* val _ = app (fn c => if Char.isAlphaNum c then ()
2106 else raise BadOption opt) chars *)
2107 in case chars
2108 of nil => OPT_STRING opt
2109 | #"n"::(cs as _::_) => OPT_NEG cs
2110 | _ => OPT_SHORT chars
2111 end
2112 handle BadOption s => OPT_STRING opt
2113 else OPT_STRING opt
2114
2115 fun doAll nil = nil
2116 | doAll (s::ss) = let val opt = doOne s
2117 in case opt
2118 of OPT_NOOPT => opt::map OPT_STRING ss
2119 | _ => opt::doAll ss
2120 end
2121 in doAll ss
2122 end
2123
2124 fun printUsage stream prog usage =
2125 let
2126 val KEY_WIDTH = 30
2127 val LINE_WIDTH = 80
2128 val EMPTY_KEY = UtilString.nBlanks KEY_WIDTH
2129 fun appendKeys col nil = if col>KEY_WIDTH then "\n"^EMPTY_KEY
2130 else UtilString.nBlanks (KEY_WIDTH-col)
2131 | appendKeys col [key] = key^" "^appendKeys (col+1+String.size key) nil
2132 | appendKeys col (key::keys) = let val col1 = col+2+String.size key
2133 in if col1>KEY_WIDTH
2134 then key^",\n"^appendKeys 0 keys
2135 else key^", "^appendKeys col1 keys
2136 end
2137 fun makeKey keylist = appendKeys 0 keylist
2138 val makeText = UtilString.breakLines(LINE_WIDTH-KEY_WIDTH)
2139 fun format (keylist,text) =
2140 let val key = makeKey keylist
2141 in case makeText text
2142 of nil => [key]
2143 | line::lines => key^line::map (fn line => EMPTY_KEY^line) lines
2144 end
2145 val _ = app (fn x => TextIO.output(stream,x))
2146 ["Usage: ",prog," [option ...] file\n","where option is one of:\n\n"]
2147 val _ = app (fn item => app (fn x => TextIO.output(stream,x^"\n"))
2148 (case item
2149 of U_SEP => [""]
2150 | U_TITLE txt => ["",txt]
2151 | U_ITEM option => format option)) usage
2152 in ()
2153 end
2154 end
2155 (* stop of ../../Util/options.sml *)
2156 (* start of ../../config.sml *)
2157 structure Config =
2158 struct
2159 (*---------------------------------------------------------------------*)
2160 (* The OS command for retrieving a URI from the internet and storing *)
2161 (* it in a local file, where *)
2162 (* %1 is replaced by the URI. *)
2163 (* %2 is replaced by the local filename. *)
2164 (* It is recommended that the command exits with failure in case the *)
2165 (* URI cannot be retrieved. If the command generates a HTML error *)
2166 (* message instead (like, e.g., lynx), this HTML file is considered *)
2167 (* to be XML and will probably cause a mess of parsing errors. If you *)
2168 (* don't need URI retrieval, use "exit 1" which always fails. *)
2169 (* Sensible values are, e.g.: *)
2170 (* val retrieveCommand = "wget -qO %2 %1" *)
2171 (* val retrieveCommand = "got_it -o %2 %1" *)
2172 (* val retrieveCommand = "urlget -s -o %2 %1" *)
2173 (*---------------------------------------------------------------------*)
2174 val retrieveCommand = "wget -qO %2 %1"
2175 end
2176 (* stop of ../../config.sml *)
2177 (* start of ../../Unicode/Chars/charClasses.sml *)
2178 (*--------------------------------------------------------------------------*)
2179 (* Structure: CharClasses *)
2180 (* *)
2181 (* Notes: *)
2182 (* This implementation uses the UNSAFE array operations, and does NO *)
2183 (* range checks. This is for efficiency reasons. *)
2184 (* If class=makeCharClass(lo,hi) then a filed of size hi-lo+1 is allo- *)
2185 (* cated. In order to lookup a character, first make sure it in [lo..hi], *)
2186 (* then subtract lo before calling inCharClass! *)
2187 (* The same holds for addChar. *)
2188 (* *)
2189 (* Depends on: *)
2190 (* UniChar *)
2191 (* UtilInt *)
2192 (* *)
2193 (* Exceptions raised by functions in this structure: *)
2194 (* addChar : none *)
2195 (* addCharClass : none *)
2196 (* inCharClass : none *)
2197 (* makeCharClass : none *)
2198 (*--------------------------------------------------------------------------*)
2199 signature CharClasses =
2200 sig
2201 type CharClass
2202 type MutableClass
2203 type CharInterval = UniChar.Char * UniChar.Char
2204 type CharRange = CharInterval list
2205
2206 val initialize : CharInterval -> MutableClass
2207 val finalize : MutableClass -> CharClass
2208
2209 val addChar : MutableClass * UniChar.Char * UniChar.Char * UniChar.Char -> unit
2210 val addCharRange : MutableClass * UniChar.Char * UniChar.Char * CharRange -> CharRange
2211
2212 val inCharClass : UniChar.Char * CharClass -> bool
2213 end
2214
2215 structure CharClasses : CharClasses =
2216 struct
2217 open UniChar
2218
2219 type CharInterval = Char * Char
2220 type CharRange = CharInterval list
2221
2222 val Char2Word = Word.fromLargeWord o Chars.toLargeWord
2223
2224 (*--------------------------------------------------------------------*)
2225 (* helpers *)
2226 (*--------------------------------------------------------------------*)
2227 infix 5 >> >>> <<<
2228 infix 6 || |||
2229 infix 6 --
2230 infix 7 & && &&&
2231 val op >> = Chars.>>
2232 val op -- = Chars.-
2233 val op || = Chars.orb
2234 val op && = Chars.andb
2235 val op >>> = Word32.>>
2236 val op <<< = Word32.<<
2237 val op &&& = Word32.andb
2238 val op ||| = Word32.orb
2239 val op & = Word.andb
2240
2241 val max32 = Word32.notb 0wx0
2242
2243 (*--------------------------------------------------------------------*)
2244 (* a char class is an array of words, interpreted as bitvectors. *)
2245 (*--------------------------------------------------------------------*)
2246 type MutableClass = Word32.word array
2247 type CharClass = Word32.word vector
2248
2249 (*--------------------------------------------------------------------*)
2250 (* each word in a char class holds 32 entries. Thus the for a char c *)
2251 (* is c div 32 == c >> 5. The bitmask is a word of zeros, only the *)
2252 (* significant bit for c, i.e. the (c && 31==0x1F)th bit set to one. *)
2253 (*--------------------------------------------------------------------*)
2254 fun indexMask c = let val idx = Chars.toInt(c>>0w5)
2255 val mask = 0wx1 <<< Char2Word c & 0w31
2256 in (idx,mask)
2257 end
2258
2259 (*--------------------------------------------------------------------*)
2260 (* generate index and mask, then lookup. *)
2261 (*--------------------------------------------------------------------*)
2262 fun inCharClass(c,vec) = let val (idx,mask) = indexMask c
2263 in mask &&& Vector.sub(vec,idx) <> 0wx0
2264 end
2265
2266 (*--------------------------------------------------------------------*)
2267 (* generate a CharClass large enough to hold (max-min+1) characters. *)
2268 (*--------------------------------------------------------------------*)
2269 fun initialize(min,max) =
2270 Array.array((Chars.toInt max-Chars.toInt min+1) div 32+1,0wx0):MutableClass
2271 fun finalize arr = Array.extract(arr,0,NONE)
2272
2273 (*--------------------------------------------------------------------*)
2274 (* add a single character to a CharClass. *)
2275 (*--------------------------------------------------------------------*)
2276 fun addChar(cls,min,max,c) =
2277 let
2278 val (idx,new) = indexMask c
2279 val old = Array.sub(cls,idx)
2280 in
2281 Array.update(cls,idx,old|||new)
2282 end
2283
2284 (*--------------------------------------------------------------------*)
2285 (* add a full range of characters to a CharClass. *)
2286 (* this is the only function that computes the offset before access *)
2287 (* to the array. *)
2288 (*--------------------------------------------------------------------*)
2289 fun addCharRange(cls,min,max,range) =
2290 let
2291 fun doOne (lo,hi) =
2292 let
2293 val (l,h) = (lo-min,hi-min)
2294 val (idxL,idxH) = ((Chars.toInt l) div 32,(Chars.toInt h) div 32)
2295 val (bitL,bitH) = (Char2Word l & 0w31,Char2Word h & 0w31)
2296 in
2297 if idxL=idxH then
2298 let
2299 val new = (max32>>>(0w31-bitH+bitL))<<<bitL
2300 val old = Array.sub(cls,idxL)
2301 val _ = Array.update(cls,idxL,old|||new)
2302 in ()
2303 end
2304 else if idxL<idxH then
2305 let
2306 val newL = max32<<<bitL
2307 val newH = max32>>>(0w31-bitH)
2308 val oldL = Array.sub(cls,idxL)
2309 val oldH = Array.sub(cls,idxH)
2310 val _ = Array.update(cls,idxL,oldL|||newL)
2311 val _ = Array.update(cls,idxH,oldH|||newH)
2312 val _ = UtilInt.appInterval (fn i => Array.update(cls,i,max32))
2313 (idxL+1,idxH-1)
2314 in ()
2315 end
2316 else ()
2317 end
2318 fun doAll nil = nil
2319 | doAll ((lh as (lo,hi))::lhs) =
2320 if hi<lo then doAll lhs
2321 else if hi<min then doAll lhs
2322 else if lo>max then lh::doAll lhs
2323 else if lo<min andalso hi<=max
2324 then (doOne(min,hi); doAll lhs)
2325 else if lo>=min andalso hi<=max
2326 then (doOne lh; doAll lhs)
2327 else if lo>=min andalso hi>max
2328 then (doOne(lo,max); (max+0w1,hi)::lhs)
2329 else (doOne(min,max); (max+0w1,hi)::lhs)
2330 val _ = doAll range
2331 in
2332 doAll range
2333 end
2334 end
2335
2336 (* stop of ../../Unicode/Chars/charClasses.sml *)
2337 (* start of ../../Unicode/Chars/uniRanges.sml *)
2338
2339
2340
2341
2342 structure UniRanges =
2343 struct
2344 val digitRange = [(0wx0030,0wx0039),
2345 (0wx0660,0wx0669),
2346 (0wx06F0,0wx06F9),
2347 (0wx0966,0wx096F),
2348 (0wx09E6,0wx09EF),
2349 (0wx0A66,0wx0A6F),
2350 (0wx0AE6,0wx0AEF),
2351 (0wx0B66,0wx0B6F),
2352 (0wx0BE7,0wx0BEF),
2353 (0wx0C66,0wx0C6F),
2354 (0wx0CE6,0wx0CEF),
2355 (0wx0D66,0wx0D6F),
2356 (0wx0E50,0wx0E59),
2357 (0wx0ED0,0wx0ED9),
2358 (0wx0F20,0wx0F29)
2359 ] : CharClasses.CharRange
2360
2361 val digitRange09 = [(0wx0030,0wx0039),
2362 (0wx0660,0wx0669),
2363 (0wx06F0,0wx06F9),
2364 (0wx0E50,0wx0E59),
2365 (0wx0ED0,0wx0ED9),
2366 (0wx0F20,0wx0F29)
2367 ] : CharClasses.CharRange
2368
2369 val digitRange6F = [(0wx0966,0wx096F),
2370 (0wx09E6,0wx09EF),
2371 (0wx0A66,0wx0A6F),
2372 (0wx0AE6,0wx0AEF),
2373 (0wx0B66,0wx0B6F),
2374 (0wx0BE7,0wx0BEF),
2375 (0wx0C66,0wx0C6F),
2376 (0wx0CE6,0wx0CEF),
2377 (0wx0D66,0wx0D6F)
2378 ] : CharClasses.CharRange
2379
2380 val baseRange = [(0wx0041,0wx005A),
2381 (0wx0061,0wx007A),
2382 (0wx00C0,0wx00D6),
2383 (0wx00D8,0wx00F6),
2384 (0wx00F8,0wx00FF),
2385 (0wx0100,0wx0131),
2386 (0wx0134,0wx013E),
2387 (0wx0141,0wx0148),
2388 (0wx014A,0wx017E),
2389 (0wx0180,0wx01C3),
2390 (0wx01CD,0wx01F0),
2391 (0wx01F4,0wx01F5),
2392 (0wx01FA,0wx0217),
2393 (0wx0250,0wx02A8),
2394 (0wx02BB,0wx02C1),
2395 (0wx0386,0wx0386),
2396 (0wx0388,0wx038A),
2397 (0wx038C,0wx038C),
2398 (0wx038E,0wx03A1),
2399 (0wx03A3,0wx03CE),
2400 (0wx03D0,0wx03D6),
2401 (0wx03DA,0wx03DA),
2402 (0wx03DC,0wx03DC),
2403 (0wx03DE,0wx03DE),
2404 (0wx03E0,0wx03E0),
2405 (0wx03E2,0wx03F3),
2406 (0wx0401,0wx040C),
2407 (0wx040E,0wx044F),
2408 (0wx0451,0wx045C),
2409 (0wx045E,0wx0481),
2410 (0wx0490,0wx04C4),
2411 (0wx04C7,0wx04C8),
2412 (0wx04CB,0wx04CC),
2413 (0wx04D0,0wx04EB),
2414 (0wx04EE,0wx04F5),
2415 (0wx04F8,0wx04F9),
2416 (0wx0531,0wx0556),
2417 (0wx0559,0wx0559),
2418 (0wx0561,0wx0586),
2419 (0wx05D0,0wx05EA),
2420 (0wx05F0,0wx05F2),
2421 (0wx0621,0wx063A),
2422 (0wx0641,0wx064A),
2423 (0wx0671,0wx06B7),
2424 (0wx06BA,0wx06BE),
2425 (0wx06C0,0wx06CE),
2426 (0wx06D0,0wx06D3),
2427 (0wx06D5,0wx06D5),
2428 (0wx06E5,0wx06E6),
2429 (0wx0905,0wx0939),
2430 (0wx093D,0wx093D),
2431 (0wx0958,0wx0961),
2432 (0wx0985,0wx098C),
2433 (0wx098F,0wx0990),
2434 (0wx0993,0wx09A8),
2435 (0wx09AA,0wx09B0),
2436 (0wx09B2,0wx09B2),
2437 (0wx09B6,0wx09B9),
2438 (0wx09DC,0wx09DD),
2439 (0wx09DF,0wx09E1),
2440 (0wx09F0,0wx09F1),
2441 (0wx0A05,0wx0A0A),
2442 (0wx0A0F,0wx0A10),
2443 (0wx0A13,0wx0A28),
2444 (0wx0A2A,0wx0A30),
2445 (0wx0A32,0wx0A33),
2446 (0wx0A35,0wx0A36),
2447 (0wx0A38,0wx0A39),
2448 (0wx0A59,0wx0A5C),
2449 (0wx0A5E,0wx0A5E),
2450 (0wx0A72,0wx0A74),
2451 (0wx0A85,0wx0A8B),
2452 (0wx0A8D,0wx0A8D),
2453 (0wx0A8F,0wx0A91),
2454 (0wx0A93,0wx0AA8),
2455 (0wx0AAA,0wx0AB0),
2456 (0wx0AB2,0wx0AB3),
2457 (0wx0AB5,0wx0AB9),
2458 (0wx0ABD,0wx0ABD),
2459 (0wx0AE0,0wx0AE0),
2460 (0wx0B05,0wx0B0C),
2461 (0wx0B0F,0wx0B10),
2462 (0wx0B13,0wx0B28),
2463 (0wx0B2A,0wx0B30),
2464 (0wx0B32,0wx0B33),
2465 (0wx0B36,0wx0B39),
2466 (0wx0B3D,0wx0B3D),
2467 (0wx0B5C,0wx0B5D),
2468 (0wx0B5F,0wx0B61),
2469 (0wx0B85,0wx0B8A),
2470 (0wx0B8E,0wx0B90),
2471 (0wx0B92,0wx0B95),
2472 (0wx0B99,0wx0B9A),
2473 (0wx0B9C,0wx0B9C),
2474 (0wx0B9E,0wx0B9F),
2475 (0wx0BA3,0wx0BA4),
2476 (0wx0BA8,0wx0BAA),
2477 (0wx0BAE,0wx0BB5),
2478 (0wx0BB7,0wx0BB9),
2479 (0wx0C05,0wx0C0C),
2480 (0wx0C0E,0wx0C10),
2481 (0wx0C12,0wx0C28),
2482 (0wx0C2A,0wx0C33),
2483 (0wx0C35,0wx0C39),
2484 (0wx0C60,0wx0C61),
2485 (0wx0C85,0wx0C8C),
2486 (0wx0C8E,0wx0C90),
2487 (0wx0C92,0wx0CA8),
2488 (0wx0CAA,0wx0CB3),
2489 (0wx0CB5,0wx0CB9),
2490 (0wx0CDE,0wx0CDE),
2491 (0wx0CE0,0wx0CE1),
2492 (0wx0D05,0wx0D0C),
2493 (0wx0D0E,0wx0D10),
2494 (0wx0D12,0wx0D28),
2495 (0wx0D2A,0wx0D39),
2496 (0wx0D60,0wx0D61),
2497 (0wx0E01,0wx0E2E),
2498 (0wx0E30,0wx0E30),
2499 (0wx0E32,0wx0E33),
2500 (0wx0E40,0wx0E45),
2501 (0wx0E81,0wx0E82),
2502 (0wx0E84,0wx0E84),
2503 (0wx0E87,0wx0E88),
2504 (0wx0E8A,0wx0E8A),
2505 (0wx0E8D,0wx0E8D),
2506 (0wx0E94,0wx0E97),
2507 (0wx0E99,0wx0E9F),
2508 (0wx0EA1,0wx0EA3),
2509 (0wx0EA5,0wx0EA5),
2510 (0wx0EA7,0wx0EA7),
2511 (0wx0EAA,0wx0EAB),
2512 (0wx0EAD,0wx0EAE),
2513 (0wx0EB0,0wx0EB0),
2514 (0wx0EB2,0wx0EB3),
2515 (0wx0EBD,0wx0EBD),
2516 (0wx0EC0,0wx0EC4),
2517 (0wx0F40,0wx0F47),
2518 (0wx0F49,0wx0F69),
2519 (0wx10A0,0wx10C5),
2520 (0wx10D0,0wx10F6),
2521 (0wx1100,0wx1100),
2522 (0wx1102,0wx1103),
2523 (0wx1105,0wx1107),
2524 (0wx1109,0wx1109),
2525 (0wx110B,0wx110C),
2526 (0wx110E,0wx1112),
2527 (0wx113C,0wx113C),
2528 (0wx113E,0wx113E),
2529 (0wx1140,0wx1140),
2530 (0wx114C,0wx114C),
2531 (0wx114E,0wx114E),
2532 (0wx1150,0wx1150),
2533 (0wx1154,0wx1155),
2534 (0wx1159,0wx1159),
2535 (0wx115F,0wx1161),
2536 (0wx1163,0wx1163),
2537 (0wx1165,0wx1165),
2538 (0wx1167,0wx1167),
2539 (0wx1169,0wx1169),
2540 (0wx116D,0wx116E),
2541 (0wx1172,0wx1173),
2542 (0wx1175,0wx1175),
2543 (0wx119E,0wx119E),
2544 (0wx11A8,0wx11A8),
2545 (0wx11AB,0wx11AB),
2546 (0wx11AE,0wx11AF),
2547 (0wx11B7,0wx11B8),
2548 (0wx11BA,0wx11BA),
2549 (0wx11BC,0wx11C2),
2550 (0wx11EB,0wx11EB),
2551 (0wx11F0,0wx11F0),
2552 (0wx11F9,0wx11F9),
2553 (0wx1E00,0wx1E9B),
2554 (0wx1EA0,0wx1EF9),
2555 (0wx1F00,0wx1F15),
2556 (0wx1F18,0wx1F1D),
2557 (0wx1F20,0wx1F45),
2558 (0wx1F48,0wx1F4D),
2559 (0wx1F50,0wx1F57),
2560 (0wx1F59,0wx1F59),
2561 (0wx1F5B,0wx1F5B),
2562 (0wx1F5D,0wx1F5D),
2563 (0wx1F5F,0wx1F7D),
2564 (0wx1F80,0wx1FB4),
2565 (0wx1FB6,0wx1FBC),
2566 (0wx1FBE,0wx1FBE),
2567 (0wx1FC2,0wx1FC4),
2568 (0wx1FC6,0wx1FCC),
2569 (0wx1FD0,0wx1FD3),
2570 (0wx1FD6,0wx1FDB),
2571 (0wx1FE0,0wx1FEC),
2572 (0wx1FF2,0wx1FF4),
2573 (0wx1FF6,0wx1FFC),
2574 (0wx2126,0wx2126),
2575 (0wx212A,0wx212B),
2576 (0wx212E,0wx212E),
2577 (0wx2180,0wx2182),
2578 (0wx3041,0wx3094),
2579 (0wx30A1,0wx30FA),
2580 (0wx3105,0wx312C),
2581 (0wxAC00,0wxD7A3)
2582 ] : CharClasses.CharRange
2583
2584 val ideoRange = [(0wx3007,0wx3007),
2585 (0wx3021,0wx3029),
2586 (0wx4E00,0wx9FA5)
2587 ] : CharClasses.CharRange
2588
2589 val combRange = [(0wx0300,0wx0345),
2590 (0wx0360,0wx0361),
2591 (0wx0483,0wx0486),
2592 (0wx0591,0wx05A1),
2593 (0wx05A3,0wx05B9),
2594 (0wx05BB,0wx05BD),
2595 (0wx05BF,0wx05BF),
2596 (0wx05C1,0wx05C2),
2597 (0wx05C4,0wx05C4),
2598 (0wx064B,0wx0652),
2599 (0wx0670,0wx0670),
2600 (0wx06D6,0wx06DC),
2601 (0wx06DD,0wx06DF),
2602 (0wx06E0,0wx06E4),
2603 (0wx06E7,0wx06E8),
2604 (0wx06EA,0wx06ED),
2605 (0wx0901,0wx0903),
2606 (0wx093C,0wx093C),
2607 (0wx093E,0wx094C),
2608 (0wx094D,0wx094D),
2609 (0wx0951,0wx0954),
2610 (0wx0962,0wx0963),
2611 (0wx0981,0wx0983),
2612 (0wx09BC,0wx09BC),
2613 (0wx09BE,0wx09BE),
2614 (0wx09BF,0wx09BF),
2615 (0wx09C0,0wx09C4),
2616 (0wx09C7,0wx09C8),
2617 (0wx09CB,0wx09CD),
2618 (0wx09D7,0wx09D7),
2619 (0wx09E2,0wx09E3),
2620 (0wx0A02,0wx0A02),
2621 (0wx0A3C,0wx0A3C),
2622 (0wx0A3E,0wx0A3E),
2623 (0wx0A3F,0wx0A3F),
2624 (0wx0A40,0wx0A42),
2625 (0wx0A47,0wx0A48),
2626 (0wx0A4B,0wx0A4D),
2627 (0wx0A70,0wx0A71),
2628 (0wx0A81,0wx0A83),
2629 (0wx0ABC,0wx0ABC),
2630 (0wx0ABE,0wx0AC5),
2631 (0wx0AC7,0wx0AC9),
2632 (0wx0ACB,0wx0ACD),
2633 (0wx0B01,0wx0B03),
2634 (0wx0B3C,0wx0B3C),
2635 (0wx0B3E,0wx0B43),
2636 (0wx0B47,0wx0B48),
2637 (0wx0B4B,0wx0B4D),
2638 (0wx0B56,0wx0B57),
2639 (0wx0B82,0wx0B83),
2640 (0wx0BBE,0wx0BC2),
2641 (0wx0BC6,0wx0BC8),
2642 (0wx0BCA,0wx0BCD),
2643 (0wx0BD7,0wx0BD7),
2644 (0wx0C01,0wx0C03),
2645 (0wx0C3E,0wx0C44),
2646 (0wx0C46,0wx0C48),
2647 (0wx0C4A,0wx0C4D),
2648 (0wx0C55,0wx0C56),
2649 (0wx0C82,0wx0C83),
2650 (0wx0CBE,0wx0CC4),
2651 (0wx0CC6,0wx0CC8),
2652 (0wx0CCA,0wx0CCD),
2653 (0wx0CD5,0wx0CD6),
2654 (0wx0D02,0wx0D03),
2655 (0wx0D3E,0wx0D43),
2656 (0wx0D46,0wx0D48),
2657 (0wx0D4A,0wx0D4D),
2658 (0wx0D57,0wx0D57),
2659 (0wx0E31,0wx0E31),
2660 (0wx0E34,0wx0E3A),
2661 (0wx0E47,0wx0E4E),
2662 (0wx0EB1,0wx0EB1),
2663 (0wx0EB4,0wx0EB9),
2664 (0wx0EBB,0wx0EBC),
2665 (0wx0EC8,0wx0ECD),
2666 (0wx0F18,0wx0F19),
2667 (0wx0F35,0wx0F35),
2668 (0wx0F37,0wx0F37),
2669 (0wx0F39,0wx0F39),
2670 (0wx0F3E,0wx0F3E),
2671 (0wx0F3F,0wx0F3F),
2672 (0wx0F71,0wx0F84),
2673 (0wx0F86,0wx0F8B),
2674 (0wx0F90,0wx0F95),
2675 (0wx0F97,0wx0F97),
2676 (0wx0F99,0wx0FAD),
2677 (0wx0FB1,0wx0FB7),
2678 (0wx0FB9,0wx0FB9),
2679 (0wx20D0,0wx20DC),
2680 (0wx20E1,0wx20E1),
2681 (0wx302A,0wx302F),
2682 (0wx3099,0wx3099),
2683 (0wx309A,0wx309A)
2684 ] : CharClasses.CharRange
2685
2686 val extRange = [(0wx00B7,0wx00B7),
2687 (0wx02D0,0wx02D0),
2688 (0wx02D1,0wx02D1),
2689 (0wx0387,0wx0387),
2690 (0wx0640,0wx0640),
2691 (0wx0E46,0wx0E46),
2692 (0wx0EC6,0wx0EC6),
2693 (0wx3005,0wx3005),
2694 (0wx3031,0wx3035),
2695 (0wx309D,0wx309E),
2696 (0wx30FC,0wx30FE)
2697 ] : CharClasses.CharRange
2698
2699 val nmsRange = List.concat
2700 [[(0wx3A,0wx3A),(0wx5F,0wx5F)](* :_ *),
2701 baseRange,
2702 ideoRange]
2703
2704 val nameRange = List.concat
2705 [[(0wx2D,0wx2D),(0wx2E,0wx2E)](* -. *),
2706 digitRange,
2707 combRange,
2708 extRange,
2709 nmsRange]
2710
2711 val pubidRange = List.concat
2712 [map (fn c => (c,c)) [0wx0A,0wx0D,0wx20], (* space,cr,lf *)
2713 map (fn c => (c,c)) (UniChar.String2Data "-'()+,./:=?;!*#@$_%"),
2714 [(0wx0030,0wx0039),(0wx0041,0wx005A),(0wx0061,0wx007A)] (* [0-9A-Za-z] *)
2715 ] : CharClasses.CharRange
2716
2717 val encRange =
2718 [(0wx002D,0wx002E), (* -. *)
2719 (0wx0030,0wx0039), (* 0-9 *)
2720 (0wx0041,0wx005A), (* A-Z *)
2721 (0wx005F,0wx005F), (* _ *)
2722 (0wx0061,0wx007A) (* a-z *)
2723 ] : CharClasses.CharRange
2724 end
2725 (* stop of ../../Unicode/Chars/uniRanges.sml *)
2726 (* start of ../../Unicode/Chars/uniClasses.sml *)
2727
2728
2729
2730
2731 (*--------------------------------------------------------------------------*)
2732 (* Structure: UniClasses *)
2733 (* *)
2734 (* Notes: *)
2735 (* read CharClasses in order to understand how CharClasses are handled. *)
2736 (* *)
2737 (* Depends on: *)
2738 (* UniChar *)
2739 (* CharClasses *)
2740 (* *)
2741 (* Exceptions raised by functions in this structure: *)
2742 (* decValue : none *)
2743 (* hexValue : none *)
2744 (* isAsciiLetter : none *)
2745 (* isEnc : none *)
2746 (* isEncS : none *)
2747 (* isName : none *)
2748 (* isNms : none *)
2749 (* isPubid : none *)
2750 (* isS : none *)
2751 (* isXml : none *)
2752 (* isUnicode : none *)
2753 (* isVers : none *)
2754 (*--------------------------------------------------------------------------*)
2755 signature UniClasses =
2756 sig
2757 val isName : UniChar.Char -> bool
2758 val isNms : UniChar.Char -> bool
2759 val isPubid : UniChar.Char -> bool
2760 val isS : UniChar.Char -> bool
2761 val isEnc : UniChar.Char -> bool
2762 val isEncS : UniChar.Char -> bool
2763 val isVers : UniChar.Char -> bool
2764 val isDec : UniChar.Char -> bool
2765 val isHex : UniChar.Char -> bool
2766 val isXml : UniChar.Char -> bool
2767 val isUnicode : UniChar.Char -> bool
2768
2769 val decValue : UniChar.Char -> UniChar.Char option
2770 val hexValue : UniChar.Char -> UniChar.Char option
2771
2772 val isAsciiLetter : UniChar.Char -> bool
2773 end
2774
2775 structure UniClasses : UniClasses =
2776 struct
2777 open UniChar CharClasses UniRanges
2778
2779 (*--------------------------------------------------------------------*)
2780 (* initialize the character classes. *)
2781 (*--------------------------------------------------------------------*)
2782 local
2783 val nmsTemp = initialize(0wx0000,0wx3FFF)
2784 val restNms = addCharRange(nmsTemp,0wx0000,0wx3FFF,nmsRange)
2785 val _ = if restNms=[(0wxAC00,0wxD7A3),(0wx4E00,0wx9FA5)] then ()
2786 else print ("Warning: extra characters after computing nms char class.\n")
2787
2788 val nameTemp = initialize(0wx0000,0wxFFFF)
2789 val restName = addCharRange(nameTemp,0wx0000,0wx3FFF,nameRange)
2790 val _ = if restName=[(0wxAC00,0wxD7A3),(0wx4E00,0wx9FA5)] then ()
2791 else print ("Warning: extra characters after computing name char class.\n")
2792
2793 val pubTemp = initialize(0wx0000,0wx007F)
2794 val restPubid = addCharRange(pubTemp,0wx0000,0wx007F,pubidRange)
2795 val _ = if restPubid=nil then ()
2796 else print ("Warning: extra characters after computing pubid char class.\n")
2797
2798 val encTemp = initialize(0wx0000,0wx007F)
2799 val restEnc = addCharRange(encTemp,0wx0000,0wx007F,encRange)
2800 val _ = if restEnc=nil then ()
2801 else print ("Warning: extra characters after computing enc char class.\n")
2802 in
2803 val nmsClass = finalize nmsTemp
2804 val nameClass = finalize nameTemp
2805 val pubClass = finalize pubTemp
2806 val encClass = finalize encTemp
2807 end
2808
2809 (*--------------------------------------------------------------------*)
2810 (* is a character a name start char? *)
2811 (*--------------------------------------------------------------------*)
2812 fun isNms c = if c<0wx4000 then inCharClass(c,nmsClass)
2813 else
2814 c>=0wx4E00 andalso c<=0wx9FA5 orelse
2815 c>=0wxAC00 andalso c<=0wxD7A3
2816
2817 (*--------------------------------------------------------------------*)
2818 (* is a character a name char? *)
2819 (*--------------------------------------------------------------------*)
2820 fun isName c = if c<0wx4000 then inCharClass(c,nameClass)
2821 else
2822 c>=0wx4E00 andalso c<=0wx9FA5 orelse
2823 c>=0wxAC00 andalso c<=0wxD7A3
2824
2825 (*--------------------------------------------------------------------*)
2826 (* is a character a pubid char? *)
2827 (*--------------------------------------------------------------------*)
2828 fun isPubid c = c<0wx80 andalso inCharClass(c,pubClass)
2829
2830 (*--------------------------------------------------------------------*)
2831 (* is a character valid in an encoding name, at its start, or in a *)
2832 (* version number? *)
2833 (*--------------------------------------------------------------------*)
2834 fun isEnc c =
2835 c<0wx80 andalso inCharClass(c,encClass)
2836 fun isEncS (c:UniChar.Char) =
2837 c>=0wx41 andalso c<=0wx5A orelse
2838 c>=0wx61 andalso c<=0wx7A
2839 fun isVers c =
2840 isEnc c orelse c=0wx3A (* #":" *)
2841
2842 (*--------------------------------------------------------------------*)
2843 (* these are the valid Unicode characters (including surrogates). *)
2844 (*--------------------------------------------------------------------*)
2845 fun isUnicode (c:UniChar.Char) = c<=0wx10FFFF
2846
2847 (*--------------------------------------------------------------------*)
2848 (* XML characters if not checked for Unicode char in advance. *)
2849 (*--------------------------------------------------------------------*)
2850 fun isXml (c:UniChar.Char) =
2851 c>=0wx0020 andalso c<=0wxD7FF orelse
2852 c>=0wxE000 andalso c<=0wxFFFD orelse
2853 c>=0wx10000 andalso c<=0wx10FFFF orelse
2854 c=0wx9 orelse c=0wxA orelse c=0wxD
2855
2856 (*--------------------------------------------------------------------*)
2857 (* the frontend supresses 0wxD (carriage return), but its is still *)
2858 (* present when encoding is recognized. *)
2859 (*--------------------------------------------------------------------*)
2860 fun isS (c:UniChar.Char) =
2861 case c
2862 of 0wx09 => true
2863 | 0wx0A => true
2864 | 0wx0D => true
2865 | 0wx20 => true
2866 | _ => false
2867
2868 (*--------------------------------------------------------------------*)
2869 (* is this character an ascii decimal/hexadecimal digit? *)
2870 (*--------------------------------------------------------------------*)
2871 fun isDec (c:UniChar.Char) =
2872 c>=0wx30 andalso c<=0wx39
2873 fun isHex (c:UniChar.Char) =
2874 c>=0wx30 andalso c<=0wx39 orelse
2875 c>=0wx41 andalso c<=0wx46 orelse
2876 c>=0wx61 andalso c<=0wx66
2877
2878 (*--------------------------------------------------------------------*)
2879 (* calculate the decimal/hexadecimal value of an ascii (hex-)digit. *)
2880 (*--------------------------------------------------------------------*)
2881 fun decValue (c:UniChar.Char) =
2882 let val v = c-0wx30
2883 in if v<=0wx9 then SOME v else NONE
2884 end
2885 fun hexValue (c:UniChar.Char) =
2886 let val v = c-0wx30
2887 in if v<=0wx9 then SOME v
2888 else (if c>=0wx41 andalso c<=0wx46 then SOME(c-0wx37)
2889 else if c>=0wx61 andalso c<=0wx66 then SOME(c-0wx57)
2890 else NONE)
2891 end
2892
2893 (*--------------------------------------------------------------------*)
2894 (* is c in [a-z]+[A-Z]? *)
2895 (*--------------------------------------------------------------------*)
2896 fun isAsciiLetter (c:UniChar.Char) =
2897 c>=0wx41 andalso c<=0wx5A orelse
2898 c>=0wx61 andalso c<=0wx7A
2899 end
2900 (* stop of ../../Unicode/Chars/uniClasses.sml *)
2901 (* start of ../../Unicode/Uri/uriDecode.sml *)
2902 signature UriDecode =
2903 sig
2904 val decodeUriLatin : string -> string
2905 val decodeUriUtf8 : string -> string
2906 end
2907
2908 structure UriDecode : UriDecode =
2909 struct
2910 open UniChar UtilInt
2911
2912 infix 8 << <<<
2913 infix 7 &&
2914 infix 6 || |||
2915
2916 val op && = Word8.andb
2917 val op << = Word8.<<
2918 val op <<< = Chars.<<
2919 val op || = Word8.orb
2920 val op ||| = Chars.orb
2921
2922 val Byte2Char = Chars.fromLargeWord o Word8.toLargeWord
2923
2924 fun hexValue c =
2925 if #"0"<=c andalso #"9">=c then SOME (Byte.charToByte c-0wx30)
2926 else if #"A"<=c andalso #"F">=c then SOME (Byte.charToByte c-0wx37)
2927 else if #"a"<=c andalso #"f">=c then SOME (Byte.charToByte c-0wx57)
2928 else NONE
2929
2930 exception Failed of char list
2931
2932 fun getQuads cs =
2933 case cs
2934 of c1::c2::cs1 => (case (hexValue c1,hexValue c2)
2935 of (SOME b1,SOME b2) => ((b1 << 0w4 || b2),cs1)
2936 | _ => raise Failed cs1)
2937 | _ => raise Failed nil
2938
2939 (*--------------------------------------------------------------------*)
2940 (* decode UTF-8 *)
2941 (*--------------------------------------------------------------------*)
2942 val byte1switch = Array.array(256,1) (* 1 byte *)
2943 val _ = appInterval (fn i => Array.update(byte1switch,i,0)) (0x80,0xBF) (* Error *)
2944 val _ = appInterval (fn i => Array.update(byte1switch,i,2)) (0xC0,0xDF) (* 2 bytes *)
2945 val _ = appInterval (fn i => Array.update(byte1switch,i,3)) (0xE0,0xEF) (* 3 bytes *)
2946 val _ = appInterval (fn i => Array.update(byte1switch,i,4)) (0xF0,0xF7) (* 4 bytes *)
2947 val _ = appInterval (fn i => Array.update(byte1switch,i,5)) (0xF8,0xFB) (* 5 bytes *)
2948 val _ = appInterval (fn i => Array.update(byte1switch,i,6)) (0xFC,0xFD) (* 6 bytes *)
2949
2950 val diff2 = 0wx00003080
2951 val diff3 = diff2 <<< 0wx6 ||| 0wx00020080
2952 val diff4 = diff3 <<< 0wx6 ||| 0wx00400080
2953 val diff5 = diff4 <<< 0wx6 ||| 0wx08000080
2954 val diff6 = diff5 <<< 0wx6 ||| 0wx00000080
2955 val diffsByLen = Vector.fromList [0w0,0w0,diff2,diff3,diff4,diff5,diff6]
2956
2957 fun getByte cs =
2958 case cs
2959 of #"%"::cs1 => getQuads cs1
2960 | c::cs1 => (Byte.charToByte c,cs1)
2961 | nil => raise Failed nil
2962
2963 fun getBytes(b,cs,n) =
2964 let
2965 fun do_err (cs,m) =
2966 if n<m then raise Failed cs
2967 else let val (_,cs1) = getByte cs
2968 in do_err (cs1,m+1)
2969 end
2970 fun doit (w,cs,m) =
2971 if n<m then (w,cs)
2972 else let val (b,cs1) = getByte cs handle Failed cs => do_err(cs,m+1)
2973 val w1 = if b && 0wxC0 = 0wx80 then w <<< 0w6 + Byte2Char b
2974 else do_err(cs1,m+1)
2975 in doit (w1,cs1,m+1)
2976 end
2977 val (w,cs1) = doit (Byte2Char b,cs,2)
2978 val diff = Vector.sub(diffsByLen,n)
2979 val c = w-diff
2980 in
2981 if c<0wx100 then (Char2char c,cs1)
2982 else raise Failed cs1
2983 end
2984
2985 fun getCharUtf8 cs =
2986 let val (b,cs1) = getQuads cs
2987 in case Array.sub(byte1switch,Word8.toInt b)
2988 of 0 (* error *) => raise Failed cs1
2989 | 1 (* 1 byte *) => (Byte.byteToChar b,cs1)
2990 | n (* n bytes *) => getBytes(b,cs1,n)
2991 end
2992
2993 fun decodeUriUtf8 str =
2994 let
2995 val cs = String.explode str
2996
2997 fun doit yet nil = yet
2998 | doit yet (c::cs) =
2999 if #"%"<>c then doit (c::yet) cs
3000 else let val (yet1,cs1) = let val (ch,cs1) = getCharUtf8 cs
3001 in (ch::yet,cs1)
3002 end
3003 handle Failed cs => (yet,cs)
3004 in doit yet1 cs1
3005 end
3006 in
3007 String.implode(rev(doit nil cs))
3008 end
3009
3010 (*--------------------------------------------------------------------*)
3011 (* decode Latin *)
3012 (*--------------------------------------------------------------------*)
3013 fun getChar cs =
3014 case cs
3015 of #"%"::cs1 => let val (b,cs2) = getQuads cs1
3016 in (Byte.byteToChar b,cs2)
3017 end
3018 | c::cs1 => (c,cs1)
3019 | nil => raise Failed nil
3020
3021 fun decodeUriLatin str =
3022 let
3023 val cs = String.explode str
3024
3025 fun doit yet nil = yet
3026 | doit yet (c::cs) =
3027 let val (yet1,cs1) = let val (ch,cs1) = getChar cs
3028 in (ch::yet,cs1)
3029 end
3030 handle Failed cs => (yet,cs)
3031 in doit yet1 cs1
3032 end
3033 in
3034 String.implode(rev(doit nil cs))
3035 end
3036 end
3037 (* stop of ../../Unicode/Uri/uriDecode.sml *)
3038 (* start of ../../Unicode/Uri/uriEncode.sml *)
3039 signature UriEncode =
3040 sig
3041 val Data2UriUtf8 : UniChar.Data -> string
3042 val Data2UriLatin : UniChar.Data -> string
3043
3044 val Vector2UriUtf8 : UniChar.Vector -> string
3045 val Vector2UriLatin : UniChar.Vector -> string
3046
3047 val String2UriUtf8 : string -> string
3048 val String2UriLatin : string -> string
3049 end
3050
3051 structure UriEncode : UriEncode =
3052 struct
3053
3054 open UniChar UniClasses
3055
3056 infix 8 >> >>>
3057 infix 7 && &&&
3058 infix 6 ||
3059
3060 val op && = Word8.andb
3061 val op &&& = Chars.andb
3062 val op >> = Word8.>>
3063 val op >>> = Chars.>>
3064 val op || = Word8.orb
3065
3066 val Char2Byte = Word8.fromLargeWord o Chars.toLargeWord
3067
3068 fun encodeCharUtf8 c =
3069 if c<0wx80 then [Char2Byte c]
3070 else if c<0wx800
3071 then [0wxC0 || Char2Byte(c >>> 0w6),
3072 0wx80 || Char2Byte(c &&& 0wx3F)]
3073 else if c<0wx10000
3074 then [0wxE0 || Char2Byte(c >>> 0w12),
3075 0wx80 || Char2Byte((c >>> 0w6) &&& 0wx3F),
3076 0wx80 || Char2Byte(c &&& 0wx3F)]
3077 else if c<0wx200000
3078 then [0wxF0 || Char2Byte(c >>> 0w18),
3079 0wx80 || Char2Byte((c >>> 0w12) &&& 0wx3F),
3080 0wx80 || Char2Byte((c >>> 0w6) &&& 0wx3F),
3081 0wx80 || Char2Byte(c &&& 0wx3F)]
3082 else if c<0wx4000000
3083 then [0wxF8 || Char2Byte(c >>> 0w24),
3084 0wx80 || Char2Byte((c >>> 0w18) &&& 0wx3F),
3085 0wx80 || Char2Byte((c >>> 0w12) &&& 0wx3F),
3086 0wx80 || Char2Byte((c >>> 0w6) &&& 0wx3F),
3087 0wx80 || Char2Byte(c &&& 0wx3F)]
3088 else [0wxFC || Char2Byte(c >>> 0w30),
3089 0wx80 || Char2Byte((c >>> 0w24) &&& 0wx3F),
3090 0wx80 || Char2Byte((c >>> 0w18) &&& 0wx3F),
3091 0wx80 || Char2Byte((c >>> 0w12) &&& 0wx3F),
3092 0wx80 || Char2Byte((c >>> 0w6) &&& 0wx3F),
3093 0wx80 || Char2Byte(c &&& 0wx3F)]
3094
3095 fun Byte2Cc b =
3096 let fun Quad2C b = if b<0wxA then Byte.byteToChar(b+0wx30) else Byte.byteToChar(b+0wx37)
3097 in (Quad2C(b >> 0w4),Quad2C(b && 0wx0F))
3098 end
3099
3100 fun precedesHex (i,cv) =
3101 if Vector.length cv <= i+2 then false
3102 else let val (c1,c2) = (Vector.sub(cv,i+1),Vector.sub(cv,i+2))
3103 in isHex c1 andalso isHex c2
3104 end
3105
3106 fun Vector2UriUtf8 cv =
3107 let val revd = Vector.foldli
3108 (fn (i,c,s) => if c<0wx80 andalso (c<>0wx25 orelse precedesHex(i,cv))
3109 then Char2char c::s
3110 else foldl (fn (b,s) => let val (c1,c2) = Byte2Cc b
3111 in c2::c1:: #"%"::s
3112 end)
3113 s (encodeCharUtf8 c))
3114 nil (cv,0,NONE)
3115 in String.implode (rev revd)
3116 end
3117
3118 fun Vector2UriLatin cv =
3119 let val revd = Vector.foldli
3120 (fn (i,c,s) => if c<0wx80 andalso (c<>0wx25 orelse precedesHex(i,cv))
3121 then Char2char c::s
3122 else (if c>= 0w100 then s
3123 else let val (c1,c2) = Byte2Cc (Char2Byte c)
3124 in c2::c1:: #"%"::s
3125 end))
3126 nil (cv,0,NONE)
3127 in String.implode (rev revd)
3128 end
3129
3130 val Data2UriUtf8 = Vector2UriUtf8 o Data2Vector
3131 val Data2UriLatin = Vector2UriLatin o Data2Vector
3132
3133 val String2UriUtf8 = Vector2UriUtf8 o String2Vector
3134 val String2UriLatin = Vector2UriLatin o String2Vector
3135 end
3136
3137 (* stop of ../../Unicode/Uri/uriEncode.sml *)
3138 (* start of ../../Unicode/Uri/uri.sml *)
3139 (*
3140 require "basis.__array";
3141 require "basis.__byte";
3142 require "basis.__string";
3143 require "basis.__vector";
3144 require "basis.__word";
3145 require "basis.__word8";
3146
3147 require "util.unsafe";
3148 require "util.utilInt";
3149
3150 require "chars";
3151 require "naming";
3152 *)
3153 signature Uri =
3154 sig
3155 eqtype Uri
3156
3157 val emptyUri : Uri
3158
3159 val hashUri : Uri -> word
3160 val compareUri : Uri * Uri -> order
3161
3162 val uriJoin : Uri * Uri -> Uri
3163 val uriSuffix : Uri -> string
3164
3165 val Data2Uri : UniChar.Data -> Uri
3166 val Vector2Uri : UniChar.Vector -> Uri
3167 val String2Uri : string -> Uri
3168 val Uri2String : Uri -> string
3169
3170 val retrieveUri : Uri -> string * string * bool
3171 end
3172
3173 structure Uri :> Uri =
3174 struct
3175 open UniChar UniClasses UriDecode UriEncode UtilError UtilInt
3176
3177 (*--------------------------------------------------------------------*)
3178 (* decoding *)
3179 (*--------------------------------------------------------------------*)
3180 type Uri = string
3181
3182 val emptyUri = ""
3183
3184 val Vector2Uri = Vector2UriUtf8
3185 val Data2Uri = Data2UriUtf8
3186 val String2Uri = String2UriUtf8
3187 val Uri2String = decodeUriUtf8
3188
3189 val slash = "/"
3190
3191 fun uriSuffix s =
3192 let fun search i = if i<0 then NONE else case String.sub(s,i)
3193 of #"." => SOME i
3194 | #"/" => NONE
3195 | _ => search (i-1)
3196 in case search (String.size s-1)
3197 of NONE => ""
3198 | SOME i => String.extract(s,i+1,NONE)
3199 end
3200
3201 fun isScheme c =
3202 Char.isAlphaNum c orelse #"+"=c orelse #"-"=c orelse #"."=c
3203
3204 fun uriAbsolute uri =
3205 let fun search i =
3206 if i>=String.size uri then false
3207 else let val c=String.sub(uri,i)
3208 in if #":"=c then true else if isScheme c then search (i+1)
3209 else false
3210 end
3211 in
3212 if uri="" then false
3213 else if Char.isAlpha (String.sub(uri,0)) then search 1
3214 else false
3215 end
3216 fun uriRelative uri = not (uriAbsolute uri)
3217
3218 fun uriLocal uri =
3219 if String.isPrefix "file:" uri
3220 then SOME(String.extract(uri,5,NONE))
3221 else if uriRelative uri then SOME uri
3222 else NONE
3223
3224 fun uriPath s =
3225 let
3226 fun search (i,hadSlash) =
3227 if i<0 then if hadSlash then SOME 0 else NONE
3228 else case String.sub(s,i)
3229 of #"/" => if hadSlash then NONE else search(i-1,true)
3230 | _ => if hadSlash then SOME(i+1) else search(i-1,false)
3231 val len = String.size s
3232 val posOpt = search(len-1,false)
3233 in case posOpt
3234 of NONE => emptyUri
3235 | SOME i => if i=0 then slash
3236 else String.extract(s,0,SOME(i+1))
3237 end
3238
3239 fun uriAuth uri =
3240 let
3241 fun searchScheme i =
3242 if i>=String.size uri then NONE
3243 else let val c=String.sub(uri,i)
3244 in if #":"=c then SOME i else if isScheme c then searchScheme (i+1)
3245 else NONE
3246 end
3247 fun searchSlash i =
3248 if i>=String.size uri then NONE
3249 else let val c=String.sub(uri,i)
3250 in if #"/"=c then SOME i else searchSlash (i+1)
3251 end
3252 in
3253 if uri="" then ""
3254 else if not (Char.isAlpha(String.sub(uri,0))) then ""
3255 else case searchScheme 1
3256 of NONE => ""
3257 | SOME i =>
3258 if String.size uri<=i+2 then String.extract(uri,0,SOME(i+1))
3259 else if #"/"=String.sub(uri,i+1) andalso #"/"=String.sub(uri,i+2)
3260 then case searchSlash (i+3)
3261 of NONE => uri
3262 | SOME j => String.extract(uri,0,SOME j)
3263 else String.extract(uri,0,SOME(i+1))
3264 end
3265
3266 fun uriScheme uri =
3267 let
3268 fun searchScheme i =
3269 if i>=String.size uri then NONE
3270 else let val c=String.sub(uri,i)
3271 in if #":"=c then SOME i else if isScheme c then searchScheme (i+1)
3272 else NONE
3273 end
3274 in
3275 if uri="" then ""
3276 else if not (Char.isAlpha(String.sub(uri,0))) then ""
3277 else case searchScheme 1
3278 of NONE => ""
3279 | SOME i => String.extract(uri,0,SOME(i+1))
3280 end
3281
3282 fun uriJoin(abs,rel) =
3283 if rel="" then uriPath abs
3284 else if abs="" then rel
3285 else if String.isPrefix "//" rel then uriScheme abs^rel
3286 else if #"/"=String.sub(rel,0) then uriAuth abs^rel
3287 else if uriAbsolute rel then rel
3288 else uriPath abs^rel
3289
3290 val compareUri = String.compare
3291 val hashUri = UtilHash.hashString
3292
3293 fun convertCommand str (src,dst) =
3294 let
3295 val s = Substring.all str
3296 fun doit ss s =
3297 if Substring.isEmpty s then ss
3298 else let val (sl,sr) = Substring.splitr (fn c => #"%"<>c) s
3299 in if Substring.isEmpty sl then sr::ss
3300 else let val sl' = Substring.trimr 1 sl
3301 in case Substring.first sr
3302 of SOME #"1" => let val sr' = Substring.triml 1 sr
3303 in doit (Substring.all src::sr'::ss) sl'
3304 end
3305 | SOME #"2" => let val sr' = Substring.triml 1 sr
3306 in doit (Substring.all dst::sr'::ss) sl'
3307 end
3308 | _ => doit (Substring.all "%"::sr::ss) sl'
3309 end
3310 end
3311 val ss = doit nil s
3312 val s = Substring.concat ss
3313 in s
3314 end
3315
3316 fun retrieveRemote uri =
3317 let
3318 val tmp = OS.FileSys.tmpName()
3319 val cmd = convertCommand Config.retrieveCommand (uri,tmp)
3320 val status = OS.Process.system cmd
3321 val _ = if status = OS.Process.success then ()
3322 else let val _ = (OS.FileSys.remove tmp
3323 handle OS.SysErr _ => ())
3324 val cmd = convertCommand
3325 Config.retrieveCommand ("<uri>",tmp)
3326 in raise NoSuchFile (uri,"command '"^cmd^"' failed")
3327 end
3328 in (Uri2String uri,tmp,true)
3329 end
3330
3331 fun retrieveUri uri =
3332 case uriLocal uri
3333 of SOME f => (Uri2String uri,Uri2String f,false)
3334 | NONE => retrieveRemote uri
3335 end
3336 (* stop of ../../Unicode/Uri/uri.sml *)
3337 (* start of ../../Parser/version.sml *)
3338 structure Version =
3339 struct
3340 val FXP_VERSION = "1.4.4"
3341 end
3342 (* stop of ../../Parser/version.sml *)
3343 (* start of ../../Util/utilList.sml *)
3344
3345
3346
3347 (*--------------------------------------------------------------------------*)
3348 (* Structure: UtilList *)
3349 (* *)
3350 (* Depends on: *)
3351 (* *)
3352 (* Exceptions raised by functions in this structure: *)
3353 (* member : none *)
3354 (* findAndDelete : none *)
3355 (*--------------------------------------------------------------------------*)
3356 signature UtilList =
3357 sig
3358 val split : ('a -> bool) -> 'a list -> 'a list list
3359 val member : ''a -> ''a list -> bool
3360 val mapAllPairs : ('a * 'b -> 'c) -> 'a list * 'b list -> 'c list
3361 val findAndMap : ('a -> 'b option) -> 'a list -> 'b option
3362 val findAndDelete : ('a -> bool) -> 'a list -> ('a option * 'a list)
3363
3364 val sort : ('a * 'a -> order) -> 'a list -> 'a list
3365 val merge : ('a * 'a -> order) -> 'a list * 'a list -> 'a list
3366 val diff : ('a * 'a -> order) -> 'a list * 'a list -> 'a list
3367 val cap : ('a * 'a -> order) -> 'a list * 'a list -> 'a list
3368 val sub : ('a * 'a -> order) -> 'a list * 'a list -> bool
3369 val insert : ('a * 'a -> order) -> 'a * 'a list -> 'a list
3370 val delete : ('a * 'a -> order) -> 'a * 'a list -> 'a list
3371 val elem : ('a * 'a -> order) -> 'a * 'a list -> bool
3372 end
3373
3374 structure UtilList : UtilList =
3375 struct
3376 (*--------------------------------------------------------------------*)
3377 (* split a list into a list of lists at each element fullfilling p. *)
3378 (*--------------------------------------------------------------------*)
3379 fun split p l =
3380 let val (one,ls) = foldr
3381 (fn (a,(curr,ls)) => if p a then (nil,curr::ls) else (a::curr,ls))
3382 (nil,nil) l
3383 in one::ls
3384 end
3385
3386 (*--------------------------------------------------------------------*)
3387 (* is x a member of l? *)
3388 (*--------------------------------------------------------------------*)
3389 fun member x l = List.exists (fn y => x=y) l
3390
3391 (*--------------------------------------------------------------------*)
3392 (* for [a1,...,an] and [b1,...,bk], generate *)
3393 (* [f(a1,b1),f(a1,b2),...,f(an,bk-1),f(an,bk)]. *)
3394 (*--------------------------------------------------------------------*)
3395 fun mapAllPairs f (ass,bs) =
3396 foldr
3397 (fn (a,cs) => foldr (fn (b,cs) => f(a,b)::cs) cs bs)
3398 nil ass
3399
3400 (*--------------------------------------------------------------------*)
3401 (* find the first element x of l such that f x = SOME y, and return *)
3402 (* f x. If there is no such x, return NONE. *)
3403 (*--------------------------------------------------------------------*)
3404 fun findAndMap _ nil = NONE
3405 | findAndMap f (x::xs) = case f x of NONE => findAndMap f xs | y => y
3406
3407 (*--------------------------------------------------------------------*)
3408 (* find the first element x of l such that f x = true, delete it from *)
3409 (* l, and return SOME x with the modified list. If there is no such x *)
3410 (* return (NONE,l). *)
3411 (*--------------------------------------------------------------------*)
3412 fun findAndDelete _ nil = (NONE,nil)
3413 | findAndDelete f (x::xs) =
3414 if f x then (SOME x,xs)
3415 else let val (y,ys) = findAndDelete f xs in (y,x::ys) end
3416
3417 (*--------------------------------------------------------------------*)
3418 (* given a function that compares elements, merge two sorted lists. *)
3419 (*--------------------------------------------------------------------*)
3420 fun merge comp (l1,l2) =
3421 let
3422 fun go (nil,l) = l
3423 | go (l,nil) = l
3424 | go (l1 as (x1::r1),l2 as (x2::r2)) =
3425 case comp(x1,x2)
3426 of LESS => x1::go(r1,l2)
3427 | EQUAL => go(l1,r2)
3428 | GREATER => x2::go(l1,r2)
3429 in go(l1,l2)
3430 end
3431
3432 (*--------------------------------------------------------------------*)
3433 (* given a comparing function, compute the intersection of two *)
3434 (* ordered lists. *)
3435 (*--------------------------------------------------------------------*)
3436 fun cap comp (l1,l2) =
3437 let
3438 fun go (nil,l) = nil
3439 | go (l,nil) = nil
3440 | go (l1 as (x1::r1),l2 as (x2::r2)) =
3441 case comp(x1,x2)
3442 of LESS => go(r1,l2)
3443 | EQUAL => x1::go(r1,r2)
3444 | GREATER => go(l1,r2)
3445 in go(l1,l2)
3446 end
3447
3448 (*--------------------------------------------------------------------*)
3449 (* given a comparing function, compute the difference of two *)
3450 (* ordered lists. *)
3451 (*--------------------------------------------------------------------*)
3452 fun diff comp (l1,l2) =
3453 let
3454 fun go (nil,l) = nil
3455 | go (l,nil) = l
3456 | go (l1 as (x1::r1),l2 as (x2::r2)) =
3457 case comp(x1,x2)
3458 of LESS => x1::go(r1,l2)
3459 | EQUAL => go(r1,r2)
3460 | GREATER => go(l1,r2)
3461 in go(l1,l2)
3462 end
3463
3464 (*--------------------------------------------------------------------*)
3465 (* given a comparing function, find out whether an ordered list is *)
3466 (* contained in an other ordered list. *)
3467 (*--------------------------------------------------------------------*)
3468 fun sub comp (l1,l2) =
3469 let
3470 fun go (nil,l) = true
3471 | go (l,nil) = false
3472 | go (l1 as (x1::r1),l2 as (x2::r2)) =
3473 case comp(x1,x2)
3474 of LESS => false
3475 | EQUAL => go(r1,r2)
3476 | GREATER => go(l1,r2)
3477 in go(l1,l2)
3478 end
3479
3480 (*--------------------------------------------------------------------*)
3481 (* given a function that compares elements, insert an element into an *)
3482 (* ordered list. *)
3483 (*--------------------------------------------------------------------*)
3484 fun insert comp (x,l) =
3485 let
3486 fun go nil = [x]
3487 | go (l as y::ys) =
3488 case comp(x,y)
3489 of LESS => x::l
3490 | EQUAL => l
3491 | GREATER => y::go ys
3492 in go l
3493 end
3494
3495 (*--------------------------------------------------------------------*)
3496 (* given a function that compares elements, delete an element from *)
3497 (* an ordered list. *)
3498 (*--------------------------------------------------------------------*)
3499 fun delete comp (x,l) =
3500 let
3501 fun go nil = [x]
3502 | go (l as y::ys) =
3503 case comp(x,y)
3504 of LESS => l
3505 | EQUAL => ys
3506 | GREATER => y::go ys
3507 in go l
3508 end
3509
3510 (*--------------------------------------------------------------------*)
3511 (* given a function that compares elements, insert an element into an *)
3512 (* ordered list. *)
3513 (*--------------------------------------------------------------------*)
3514 fun elem comp (x,l) =
3515 let
3516 fun go nil = false
3517 | go (l as y::ys) =
3518 case comp(x,y)
3519 of LESS => false
3520 | EQUAL => true
3521 | GREATER => go ys
3522 in go l
3523 end
3524
3525 (*--------------------------------------------------------------------*)
3526 (* merge-sort a list of elements comparable with the function in the *)
3527 (* 1st argument. Preserve duplicate elements. *)
3528 (*--------------------------------------------------------------------*)
3529 fun sort _ nil = nil
3530 | sort comp l =
3531 let fun mergeOne (x::y::l) = merge comp (x,y)::mergeOne l
3532 | mergeOne l = l
3533 fun mergeAll [l] = l
3534 | mergeAll ls = mergeAll (mergeOne ls)
3535 val singles = map (fn x => [x]) l
3536 in
3537 mergeAll singles
3538 end
3539
3540 end
3541
3542 (* stop of ../../Util/utilList.sml *)
3543 (* start of ../../Parser/Dfa/dfaOptions.sml *)
3544 signature DfaOptions =
3545 sig
3546 val O_DFA_INITIAL_WIDTH : int ref
3547 val O_DFA_MAX_STATES : int ref
3548 val O_DFA_WARN_TOO_LARGE : bool ref
3549
3550 val setDfaDefaults : unit -> unit
3551 val setDfaOptions : Options.Option list * (string -> unit) -> Options.Option list
3552
3553 val dfaUsage : Options.Usage
3554 end
3555
3556 functor DfaOptions () : DfaOptions =
3557 struct
3558 open Options UtilInt
3559
3560 val O_DFA_INITIAL_WIDTH = ref 4
3561 val O_DFA_MAX_STATES = ref 256
3562 val O_DFA_WARN_TOO_LARGE = ref true
3563
3564 fun setDfaDefaults() =
3565 let
3566 val _ = O_DFA_INITIAL_WIDTH := 4
3567 val _ = O_DFA_MAX_STATES := 256
3568 val _ = O_DFA_WARN_TOO_LARGE := true
3569 in ()
3570 end
3571
3572 val dfaUsage =
3573 [U_ITEM(["--dfa-initial-size=n"],"Initial size of DFA transition tables (16)"),
3574 U_ITEM(["--dfa-initial-width=n"],"Same as --dfa-initial-size=2^n (4)"),
3575 U_ITEM(["--dfa-max-size=n"],"Maximal size of DFAs for ambiguous content models (256)"),
3576 U_ITEM(["--dfa-warn-size[=(yes|no)]"],"Warn about too large DFAs (yes)")
3577 ]
3578
3579 fun setDfaOptions(opts,doError) =
3580 let
3581 exception Failed of string option
3582
3583 fun getNat str =
3584 if str="" then raise Failed NONE
3585 else let val cs = String.explode str
3586 in foldl (fn (c,n) => if #"0">c orelse #"9"<c then raise Failed NONE
3587 else 10*n+ord c-48) 0 cs
3588 handle Overflow => raise Failed
3589 (SOME("number "^str^" is too large for this system"))
3590 end
3591
3592 val yesNo = "'yes' or 'no'"
3593 fun tooLarge n = String.concat ["number ",n," is too large for this system"]
3594 fun mustHave key = String.concat ["option --",key," must have an argument"]
3595 fun mustBe key what = String.concat
3596 ["the argument to option --",key," must be ",what]
3597
3598 fun do_yesno(key,valOpt,flag) =
3599 case valOpt
3600 of NONE => flag := true
3601 | SOME "yes" => flag := true
3602 | SOME "no" => flag := false
3603 | SOME s => doError (mustBe key yesNo)
3604
3605 fun do_num(key,valOpt,flag) =
3606 case valOpt
3607 of NONE => doError (mustHave key)
3608 | SOME s => flag := getNat s
3609 handle Failed NONE => doError (mustBe key "a number")
3610 | Failed (SOME s) => doError s
3611
3612 fun do_dfa_ts(key,valOpt,toWidth) =
3613 case valOpt
3614 of NONE => doError (mustHave key)
3615 | SOME s => O_DFA_INITIAL_WIDTH := toWidth (getNat s)
3616 handle Failed NONE => doError (mustBe key "a number")
3617 | Failed (SOME s) => doError s
3618
3619 fun do_long(key,valOpt) =
3620 case key
3621 of "dfa-initial-size" => true before do_dfa_ts(key,valOpt,nextPowerTwo)
3622 | "dfa-initial-width" => true before do_dfa_ts(key,valOpt,fn i => i)
3623 | "dfa-max-size" => true before do_num(key,valOpt,O_DFA_MAX_STATES)
3624 | "dfa-warn-size" => true before do_yesno(key,valOpt,O_DFA_WARN_TOO_LARGE)
3625 | _ => false
3626
3627 and doit nil = nil
3628 | doit (opt::opts) =
3629 case opt
3630 of OPT_NOOPT => opts
3631 | OPT_LONG(key,value) => if do_long(key,value) then doit opts
3632 else opt::doit opts
3633 | OPT_NEG _ => opt::doit opts
3634 | OPT_SHORT _ => opt::doit opts
3635 | OPT_STRING _ => opt::doit opts
3636 in doit opts
3637 end
3638 end
3639
3640
3641 (* stop of ../../Parser/Dfa/dfaOptions.sml *)
3642 (* start of ../../Parser/Params/parserOptions.sml *)
3643 (*--------------------------------------------------------------------------*)
3644 (* Structure: ParserOptions *)
3645 (* *)
3646 (* Depends on: none *)
3647 (*--------------------------------------------------------------------------*)
3648 signature ParserOptions =
3649 sig
3650 structure DfaOptions : DfaOptions
3651
3652 val O_CHECK_ISO639 : bool ref
3653 val O_CHECK_LANGID : bool ref
3654 val O_CHECK_PREDEFINED : bool ref
3655 val O_CHECK_RESERVED : bool ref
3656 val O_CHECK_VERSION : bool ref
3657
3658 val O_WARN_MULT_ENUM : bool ref
3659 val O_WARN_XML_DECL : bool ref
3660 val O_WARN_ATT_NO_ELEM : bool ref
3661 val O_WARN_MULT_ENT_DECL : bool ref
3662 val O_WARN_MULT_NOT_DECL : bool ref
3663 val O_WARN_MULT_ATT_DEF : bool ref
3664 val O_WARN_MULT_ATT_DECL : bool ref
3665 val O_WARN_SHOULD_DECLARE : bool ref
3666 val O_WARN_NON_ASCII_URI : bool ref
3667
3668 val O_ERROR_MINIMIZE : bool ref
3669
3670 val O_VALIDATE : bool ref
3671 val O_COMPATIBILITY : bool ref
3672 val O_INTEROPERABILITY : bool ref
3673
3674 val O_INCLUDE_EXT_PARSED : bool ref
3675 val O_INCLUDE_PARAM_ENTS : bool ref
3676
3677 val setParserDefaults : unit -> unit
3678 val setParserOptions : Options.Option list * (string -> unit) -> Options.Option list
3679
3680 val parserUsage : Options.Usage
3681 end
3682
3683 functor ParserOptions () : ParserOptions =
3684 struct
3685 structure DfaOptions = DfaOptions ()
3686
3687 open DfaOptions Options UtilInt UtilList
3688
3689 val O_CHECK_VERSION = ref true (* check for conforming xml version? *)
3690 val O_CHECK_ISO639 = ref true (* check whether a two-letter LangCode *)
3691 (* is acording to ISO 639? *)
3692 val O_CHECK_LANGID = ref true (* check whether a LangCode fullfills *)
3693 (* IETF RFC 1766? *)
3694 val O_CHECK_RESERVED = ref false(* check for names starting with xml? *)
3695 val O_CHECK_PREDEFINED = ref true (* check declarations of predefined *)
3696 val O_WARN_MULT_ENUM = ref true (* check whether a token occurs *)
3697 (* twice in the enumerated attribute *)
3698 (* types of the same element *)
3699 val O_WARN_XML_DECL = ref false (* warn if the XML decl is missing? *)
3700
3701 val O_WARN_ATT_NO_ELEM = ref true (* warn for undeclared elements *)
3702 (* in att def list declarations? *)
3703
3704 val O_WARN_MULT_ENT_DECL = ref true (* warn about redefined entities *)
3705 val O_WARN_MULT_NOT_DECL = ref true (* warn about redefined notations*)
3706 val O_WARN_SHOULD_DECLARE = ref true (* warn if predefined entities *)
3707 (* are not declared in the dtd *)
3708
3709 val O_WARN_MULT_ATT_DEF = ref true (* warn if an attributes is defd *)
3710 (* twice for the same element? *)
3711 val O_WARN_MULT_ATT_DECL = ref true (* warn if there are multiple att *)
3712 (* def lists for one element? *)
3713 val O_WARN_NON_ASCII_URI = ref true (* warn about non-ascii chars in *)
3714 (* system identifiers? *)
3715
3716 val O_ERROR_MINIMIZE = ref true (* try to avoid repeating errors? *)
3717
3718 val O_VALIDATE = ref true
3719 val O_COMPATIBILITY = ref true
3720 val O_INTEROPERABILITY = ref false
3721
3722 val O_INCLUDE_EXT_PARSED = ref false
3723 val O_INCLUDE_PARAM_ENTS = ref false
3724
3725 fun setParserDefaults() =
3726 let
3727 val _ = setDfaDefaults()
3728
3729 val _ = O_CHECK_ISO639 := false
3730 val _ = O_CHECK_LANGID := false
3731 val _ = O_CHECK_PREDEFINED := true
3732 val _ = O_CHECK_RESERVED := false
3733 val _ = O_CHECK_VERSION := true
3734
3735 val _ = O_WARN_MULT_ENUM := true
3736 val _ = O_WARN_XML_DECL := false
3737 val _ = O_WARN_ATT_NO_ELEM := false
3738 val _ = O_WARN_MULT_ENT_DECL := false
3739 val _ = O_WARN_MULT_NOT_DECL := false
3740 val _ = O_WARN_MULT_ATT_DEF := false
3741 val _ = O_WARN_MULT_ATT_DECL := false
3742 val _ = O_WARN_SHOULD_DECLARE := true
3743 val _ = O_WARN_NON_ASCII_URI := true
3744
3745 val _ = O_VALIDATE := true
3746 val _ = O_COMPATIBILITY := true
3747 val _ = O_INTEROPERABILITY := false
3748
3749 val _ = O_ERROR_MINIMIZE := true
3750
3751 val _ = O_INCLUDE_EXT_PARSED := false
3752 val _ = O_INCLUDE_PARAM_ENTS := false
3753 in ()
3754 end
3755
3756 val parserUsage =
3757 [U_ITEM(["-[n]v","--validate[=(yes|no)]"],"Turn on or off validation (yes)"),
3758 U_ITEM(["-[n]c","--compat[=(yes|no)]","--compatibility[=(yes|no)]"],
3759 "Turn on or off compatibility checking (yes)"),
3760 U_ITEM(["-[n]i","--interop[=(yes|no)]","--interoperability[=(yes|no)]"],
3761 "Turn on or off interoperability checking (no)"),
3762 U_SEP,
3763 U_ITEM(["--few-errors[=(yes|no)]"],"Report fewer errors (no)"),
3764 U_ITEM(["--check-reserved[=(yes|no)]"],
3765 "Checking for reserved names (no)"),
3766 U_ITEM(["--check-predef[=(yes|no)]","--check-predefined[=(yes|no)]"],
3767 "Check declaration of predefined entities (yes)"),
3768 U_ITEM(["--check-lang-id[=(yes|no)]"],"Checking language identifiers (no)"),
3769 U_ITEM(["--check-iso639[=(yes|no)]"],"Check ISO 639 language codes (no)"),
3770 U_ITEM(["--check-xml-version[=(yes|no)]"], "Check XML version number (yes)"),
3771 U_SEP,
3772 U_ITEM(["--warn-xml-decl[=(yes|no)]"],"Warn if there is no XML declaration (no)"),
3773 U_ITEM(["--warn-att-elem[=(yes|no)]"],
3774 "Warn about attlist declarations for undeclared elements (no)"),
3775 U_ITEM(["--warn-predefined[=(yes|no)]"],
3776 "Warn if the predefined entities are not declared (no)"),
3777 U_ITEM(["--warn-mult-decl[=<arg>]"],"Warn about multiple declarations (none)"),
3778 U_ITEM(["--warn-uri[=(yes|no)]"],"Warn about non-ASCII characters in URIs (yes)"),
3779 U_ITEM(["--warn[=all]"],"Warn about nearly everything"),
3780 U_ITEM(["--warn=none"],"Do not print warnings"),
3781 U_SEP,
3782 U_ITEM(["--include-ext[=(yes|no)]","--include-external[=(yes|no)]"],
3783 "Include external entities in non-validating mode (no)"),
3784 U_ITEM(["--include-par[=(yes|no)]","--include-parameter[=(yes|no)]"],
3785 "Include parameter entities and external subset in "^
3786 "non-validating mode (no)"),
3787 U_SEP]
3788 @dfaUsage
3789
3790 fun setParserOptions(opts,doError) =
3791 let
3792 datatype What = ATT | ATTLIST | ENT | NOT
3793
3794 exception Failed of string option
3795
3796 fun getNat str =
3797 if str="" then raise Failed NONE
3798 else let val cs = String.explode str
3799 in foldl (fn (c,n) => if #"0">c orelse #"9"<c then raise Failed NONE
3800 else 10*n+ord c-48) 0 cs
3801 handle Overflow => raise Failed
3802 (SOME("number "^str^" is too large for this system"))
3803 end
3804
3805 val allNone = "'all' or 'none'"
3806 val yesNo = "'yes' or 'no'"
3807 val yesNoWhat = "'yes', 'no' or a list of 'att', 'attlist', 'ent' and 'not'"
3808 fun errorMustBe(key,what) = doError
3809 (String.concat ["the argument to option --",key," must be ",what])
3810 fun errorNoArg key = doError
3811 (String.concat ["option --",key," has no argument"])
3812
3813 fun do_mult_decl(key,valOpt) =
3814 let
3815 val all = [ATT,ATTLIST,ENT,NOT]
3816 fun setFlags whats = app (fn (what,flag) => flag := member what whats)
3817 [(ATT,O_WARN_MULT_ATT_DEF),(ATTLIST,O_WARN_MULT_ATT_DECL),
3818 (ENT,O_WARN_MULT_ENT_DECL),(NOT,O_WARN_MULT_NOT_DECL)]
3819 in case valOpt
3820 of NONE => setFlags all
3821 | SOME "yes" => setFlags all
3822 | SOME "no" => setFlags nil
3823 | SOME s => let val fields = String.fields (fn c => #","=c) s
3824 val whats = map
3825 (fn s => case s
3826 of "att" => ATT
3827 | "attlist" => ATTLIST
3828 | "ent" => ENT
3829 | "not" => NOT
3830 | _ => raise Failed NONE) fields
3831 in setFlags whats
3832 end
3833 handle Failed _ => errorMustBe(key,yesNoWhat)
3834 end
3835
3836 fun do_noarg(key,valOpt,flag) =
3837 case valOpt
3838 of NONE => flag := true
3839 | SOME _ => errorNoArg key
3840
3841 fun do_yesno(key,valOpt,flag) =
3842 case valOpt
3843 of NONE => flag := true
3844 | SOME "yes" => flag := true
3845 | SOME "no" => flag := false
3846 | SOME s => errorMustBe(key,yesNo)
3847
3848 fun do_num(key,valOpt,flag) =
3849 case valOpt
3850 of NONE => errorMustBe(key,"a number")
3851 | SOME s => flag := getNat s
3852 handle Failed NONE => errorMustBe(key,"a number")
3853 | Failed (SOME s) => doError s
3854
3855 fun do_warn(key,valOpt) =
3856 let val all = [O_WARN_MULT_ENUM,O_WARN_ATT_NO_ELEM,
3857 O_WARN_MULT_ENT_DECL,O_WARN_MULT_NOT_DECL,O_WARN_MULT_ATT_DEF,
3858 O_WARN_MULT_ATT_DECL,O_WARN_SHOULD_DECLARE,O_WARN_XML_DECL]
3859 fun setFlags value = app (fn flag => flag := value) all
3860 in case valOpt
3861 of NONE => setFlags true
3862 | SOME "all" => setFlags true
3863 | SOME "none" => setFlags false
3864 | SOME _ => errorMustBe(key,allNone)
3865 end
3866
3867 fun do_long(key,valOpt) =
3868 case key
3869 of "validate" => true before do_yesno(key,valOpt,O_VALIDATE)
3870 | "compat" => true before do_yesno(key,valOpt,O_COMPATIBILITY)
3871 | "compatibility" => true before do_yesno(key,valOpt,O_COMPATIBILITY)
3872 | "interop" => true before do_yesno(key,valOpt,O_INTEROPERABILITY)
3873 | "interoperability" => true before do_yesno(key,valOpt,O_INTEROPERABILITY)
3874
3875 | "few-errors" => true before do_yesno(key,valOpt,O_ERROR_MINIMIZE)
3876
3877 | "check-reserved" => true before do_yesno(key,valOpt,O_CHECK_RESERVED)
3878 | "check-predef" => true before do_yesno(key,valOpt,O_CHECK_PREDEFINED)
3879 | "check-predefined" => true before do_yesno(key,valOpt,O_CHECK_PREDEFINED)
3880 | "check-lang-id" => true before do_yesno(key,valOpt,O_CHECK_LANGID)
3881 | "check-iso639" => true before do_yesno(key,valOpt,O_CHECK_ISO639)
3882 | "check-xml-version" => true before do_yesno(key,valOpt,O_CHECK_VERSION)
3883
3884 | "warn" => true before do_warn(key,valOpt)
3885 | "warn-xml-decl" => true before do_yesno(key,valOpt,O_WARN_XML_DECL)
3886 | "warn-att-elem" => true before do_yesno(key,valOpt,O_WARN_ATT_NO_ELEM)
3887 | "warn-predefined" => true before do_yesno(key,valOpt,O_WARN_SHOULD_DECLARE)
3888 | "warn-mult-decl" => true before do_mult_decl(key,valOpt)
3889 | "warn-uri" => true before do_yesno(key,valOpt,O_WARN_NON_ASCII_URI)
3890
3891 | "include-ext" => true before do_yesno(key,valOpt,O_INCLUDE_EXT_PARSED)
3892 | "include-external" => true before do_yesno(key,valOpt,O_INCLUDE_EXT_PARSED)
3893 | "include-par" => true before do_yesno(key,valOpt,O_INCLUDE_PARAM_ENTS)
3894 | "include-parameter" => true before do_yesno(key,valOpt,O_INCLUDE_PARAM_ENTS)
3895
3896 | _ => false
3897
3898 fun do_short cs =
3899 let fun doOne c =
3900 case c
3901 of #"v" => false before O_VALIDATE := true
3902 | #"c" => false before O_COMPATIBILITY := true
3903 | #"i" => false before O_INTEROPERABILITY := true
3904 | _ => true
3905 in List.filter doOne cs
3906 end
3907
3908 fun do_neg cs =
3909 let fun doOne c =
3910 case c
3911 of #"v" => false before O_VALIDATE := false
3912 | #"c" => false before O_COMPATIBILITY := false
3913 | #"i" => false before O_INTEROPERABILITY := false
3914 | _ => true
3915 in List.filter doOne cs
3916 end
3917
3918 and doit nil = nil
3919 | doit (opt::opts) =
3920 case opt
3921 of OPT_NOOPT => opts
3922 | OPT_LONG(key,value) => if do_long(key,value) then doit opts
3923 else opt::doit opts
3924 | OPT_SHORT cs => (case do_short cs
3925 of nil => doit opts
3926 | rest => OPT_SHORT rest::doit opts)
3927 | OPT_NEG cs => (case do_neg cs
3928 of nil => doit opts
3929 | rest => OPT_NEG rest::doit opts)
3930 | OPT_STRING s => opt::doit opts
3931
3932 val opts1 = setDfaOptions (opts,doError)
3933 in
3934 doit opts1
3935 end
3936 end
3937 (* stop of ../../Parser/Params/parserOptions.sml *)
3938 (* start of ../../Util/intLists.sml *)
3939 signature IntLists =
3940 sig
3941 type IntList = int list
3942
3943 val emptyIntList : IntList
3944 val singleIntList : int -> IntList
3945 val fullIntList : int -> IntList
3946
3947 val isEmptyIntList : IntList -> bool
3948 val inIntList : int * IntList -> bool
3949 val subIntList : IntList * IntList -> bool
3950
3951 val compareIntLists: IntList * IntList -> order
3952 val hashIntList : IntList -> word
3953
3954 val addIntList : int * IntList -> IntList
3955 val delIntList : int * IntList -> IntList
3956
3957 val cupIntLists : IntList * IntList -> IntList
3958 val capIntLists : IntList * IntList -> IntList
3959 val diffIntLists : IntList * IntList -> IntList
3960
3961 val IntList2String : IntList -> string
3962 end
3963
3964 structure IntLists : IntLists =
3965 struct
3966 open UtilCompare UtilHash UtilInt UtilList UtilString
3967
3968 type IntList = int list
3969
3970 val emptyIntList = nil : IntList
3971
3972 fun fullIntList n = intervalList(0,n)
3973 fun singleIntList n = [n]
3974 val isEmptyIntList = null
3975
3976 val inIntList = elem Int.compare
3977 val subIntList = sub Int.compare
3978 val addIntList = insert Int.compare
3979 val delIntList = delete Int.compare
3980 val capIntLists = cap Int.compare
3981 val cupIntLists = merge Int.compare
3982 val diffIntLists = diff Int.compare
3983 val compareIntLists = compareList Int.compare
3984 val hashIntList = hashList hashInt
3985
3986 val IntList2String = List2String Int.toString
3987 end
3988 (* stop of ../../Util/intLists.sml *)
3989 (* start of ../../Unicode/Chars/dataDict.sml *)
3990
3991
3992
3993
3994
3995
3996
3997
3998 structure KeyData : Key =
3999 struct
4000 type Key = UniChar.Data
4001
4002 val null = UniChar.nullData
4003 val hash = UniChar.hashData
4004 val compare = UniChar.compareData
4005 val toString = UniChar.Data2String
4006 end
4007
4008 structure DataDict = Dict (structure Key = KeyData)
4009 structure DataSymTab = SymTable (structure Key = KeyData)
4010
4011
4012 (* stop of ../../Unicode/Chars/dataDict.sml *)
4013 (* start of ../../Parser/Dfa/dfaData.sml *)
4014
4015
4016
4017 (*--------------------------------------------------------------------------*)
4018 (* Structure: DfaData *)
4019 (* *)
4020 (* Depends on: *)
4021 (* *)
4022 (* Exceptions raised by functions in this structure: *)
4023 (* boundsFollow : none *)
4024 (* mergeFirst : ConflictFirst *)
4025 (* mergeFollow : ConflictFollow *)
4026 (*--------------------------------------------------------------------------*)
4027 signature DfaData =
4028 sig
4029 type Dfa
4030
4031 datatype ContentModel =
4032 CM_ELEM of int
4033 | CM_OPT of ContentModel
4034 | CM_REP of ContentModel
4035 | CM_PLUS of ContentModel
4036 | CM_ALT of ContentModel list
4037 | CM_SEQ of ContentModel list
4038 end
4039
4040 structure DfaBase =
4041 struct
4042 (*--- visible to the parser ---*)
4043 datatype ContentModel =
4044 CM_ELEM of int
4045 | CM_OPT of ContentModel
4046 | CM_REP of ContentModel
4047 | CM_PLUS of ContentModel
4048 | CM_ALT of ContentModel list
4049 | CM_SEQ of ContentModel list
4050
4051 type Sigma = int
4052 type State = int
4053
4054 val dfaDontCare = ~2
4055 val dfaError = ~1
4056 val dfaInitial = 0
4057
4058 exception DfaTooLarge of int
4059 exception Ambiguous of Sigma * int * int
4060 exception ConflictFirst of Sigma * State * State
4061 exception ConflictFollow of Sigma * State * State
4062
4063 type Empty = bool
4064 type First = (State * Sigma) list
4065 type Follow = First
4066
4067 type Info = State * Empty * First
4068
4069 datatype CM' =
4070 ELEM of Sigma
4071 | OPT of CM
4072 | REP of CM
4073 | PLUS of CM
4074 | ALT of CM list
4075 | SEQ of CM list
4076 withtype CM = CM' * Info
4077
4078 type Row = Sigma * Sigma * State vector * bool
4079 val nullRow : Row = (1,0,Vector.fromList nil,false)
4080
4081 type Dfa = Row vector
4082
4083 val emptyDfa : Dfa = Vector.fromList [(1,0,Vector.fromList nil,true)]
4084 end
4085
4086 structure DfaData = DfaBase : DfaData
4087 (* stop of ../../Parser/Dfa/dfaData.sml *)
4088 (* start of ../../Unicode/Decode/decodeFile.sml *)
4089 (*--------------------------------------------------------------------------*)
4090 (* Structure: DecodeBasic *)
4091 (* *)
4092 (* Exceptions raised by functions in this structure: *)
4093 (* closeFile : none *)
4094 (* filePos : none *)
4095 (* fileName : none *)
4096 (* nextByte : EndOfFile *)
4097 (* openFile : NoSuchFile *)
4098 (*--------------------------------------------------------------------------*)
4099 signature DecodeFile =
4100 sig
4101 structure Bytes : WORD
4102
4103 type File
4104 type Byte = Bytes.word
4105
4106 exception EndOfFile of File
4107
4108 val Char2Byte : UniChar.Char -> Byte
4109 val Byte2Char : Byte -> UniChar.Char
4110 val Byte2Hex : Byte -> string
4111
4112 val openFile : Uri.Uri option -> File
4113 val closeFile : File -> unit
4114
4115 val getByte : File -> Byte * File
4116 val ungetBytes : File * Byte list -> File
4117
4118 val fileUri : File -> Uri.Uri
4119 val fileName : File -> string
4120 end
4121
4122 structure DecodeFile : DecodeFile =
4123 struct
4124 open
4125 UniChar Uri UtilError
4126
4127 structure Bytes = Word8
4128 type Byte = Bytes.word
4129
4130 fun Byte2Char b = Chars.fromLargeWord(Bytes.toLargeWord b)
4131 fun Byte2Hex b =
4132 "0x"^UtilString.toUpperString(StringCvt.padLeft #"0" 2 (Bytes.toString b))
4133 fun Char2Byte c = Bytes.fromLargeWord(Chars.toLargeWord c)
4134
4135 type instream = TextIO.instream
4136 val closeIn = TextIO.closeIn
4137 val input = TextIO.input
4138 val input1 = TextIO.input1
4139 val openIn = TextIO.openIn
4140 val stdIn = TextIO.stdIn
4141
4142 (*--------------------------------------------------------------------*)
4143 (* a file type is stdin or a uri with its string representation and *)
4144 (* the file it is mapped to. *)
4145 (* a file position is a stream, an int position and a file type. *)
4146 (* a file is a file position, a buffer, its size and current index. *)
4147 (*--------------------------------------------------------------------*)
4148 datatype FileType = STD | FNAME of (Uri * string * string * bool)
4149 type FilePos = FileType * instream * int
4150 type File = FilePos * Word8Vector.vector * int * int
4151
4152 exception EndOfFile of File
4153 val nullVec = Word8Vector.fromList nil
4154
4155 (*--------------------------------------------------------------------*)
4156 (* return the uri of a file. *)
4157 (*--------------------------------------------------------------------*)
4158 fun fileUri ((typ,_,_),_,_,_) =
4159 case typ
4160 of STD => emptyUri
4161 | FNAME(uri,_,_,_) => uri
4162 (*--------------------------------------------------------------------*)
4163 (* return the uri string name of a file. *)
4164 (*--------------------------------------------------------------------*)
4165 fun fileName ((typ,_,_),_,_,_) =
4166 case typ
4167 of STD => "<stdin>"
4168 | FNAME(_,str,_,_) => str
4169 (*--------------------------------------------------------------------*)
4170 (* return the uri string and the position in the the file. *)
4171 (*--------------------------------------------------------------------*)
4172 fun filePos ((typ,_,p),_,s,i) =
4173 case typ
4174 of STD => ("<stdin>",p+i-s)
4175 | FNAME(_,str,_,_) => (str,p+i-s)
4176
4177 (*--------------------------------------------------------------------*)
4178 (* open a file; report IO errors by raising NoSuchFile. *)
4179 (*--------------------------------------------------------------------*)
4180 fun openFile uriOpt =
4181 let val (typ,stream) =
4182 case uriOpt
4183 of NONE => (STD,stdIn)
4184 | SOME uri => let val (str,fname,tmp) = retrieveUri uri
4185 in (FNAME(uri,str,fname,tmp),openIn fname)
4186 end
4187 handle IO.Io {name,cause,...}
4188 => raise NoSuchFile(name,exnMessage cause)
4189 in ((typ,stream,0),nullVec,0,0)
4190 end
4191
4192 (*--------------------------------------------------------------------*)
4193 (* close the file; ignore IO errors. *)
4194 (*--------------------------------------------------------------------*)
4195 fun closeStream (typ,stream,_) =
4196 case typ
4197 of STD => ()
4198 | FNAME(_,uri,fname,tmp)
4199 => let val _ = closeIn stream handle IO.Io _ => ()
4200 val _ = (if tmp andalso OS.FileSys.access(fname,nil)
4201 then OS.FileSys.remove fname else ())
4202 handle exn as OS.SysErr _ =>
4203 TextIO.output(TextIO.stdErr,String.concat
4204 ["Error removing temporary file ",fname,"for URI",uri,
4205 "(",exnMessage exn,")\n"])
4206
4207 in ()
4208 end
4209 fun closeFile (tsp,_,_,_) = closeStream tsp
4210
4211 (*--------------------------------------------------------------------*)
4212 (* read a byte from the file; if at the end of buffer, reload it. *)
4213 (* if a reload fails or returns an IO error, raise EndOfFile. --------*)
4214 (*--------------------------------------------------------------------*)
4215 fun getByte (tsp,vec,s,i) =
4216 if i<s then (Word8Vector.sub(vec,i),(tsp,vec,s,i+1))
4217 else let val (typ,stream,pos) = tsp
4218 val v = Byte.stringToBytes (input stream) handle IO.Io _ => nullVec
4219 val s = Word8Vector.length v
4220 in if s=0 then let val _ = closeStream tsp
4221 in raise EndOfFile(tsp,v,0,0)
4222 end
4223 else (Word8Vector.sub(v,0),((typ,stream,pos+s),v,s,1))
4224 end
4225
4226 (*--------------------------------------------------------------------*)
4227 (* un-get some bytes. this should only happen while checking for a *)
4228 (* byte-order mark or xml/text declaration. It should be efficient in *)
4229 (* that case, otherwise might be very space-consuming. *)
4230 (*--------------------------------------------------------------------*)
4231 fun ungetBytes ((tsp,vec,s,i),bs) =
4232 let val len = length bs
4233 in if len<=i then (tsp,vec,s,i-len)
4234 else let val diff = len-i
4235 val vec0 = Word8Vector.fromList(List.take(bs,diff))
4236 in (tsp,Word8Vector.concat [vec0,vec],s+diff,0)
4237 end
4238 end
4239 end
4240 (* stop of ../../Unicode/Decode/decodeFile.sml *)
4241 (* start of ../../Unicode/Decode/decodeError.sml *)
4242
4243
4244
4245
4246
4247 (*--------------------------------------------------------------------------*)
4248 (* Structure: DecodeError *)
4249 (* *)
4250 (* Exceptions raised by functions in this structure: *)
4251 (* decodeMessage : none *)
4252 (*--------------------------------------------------------------------------*)
4253 signature DecodeError =
4254 sig
4255 datatype DecodeError =
4256 ERR_ILLEGAL_CHAR of DecodeFile.Byte * string
4257 | ERR_NON_UNI_UCS4 of UniChar.Char
4258 | ERR_EOF_UCS4 of int * DecodeFile.Byte list
4259 | ERR_NON_DIRECT_UTF7 of DecodeFile.Byte
4260 | ERR_PADDING_UTF7 of UniChar.Char
4261 | ERR_ILLFORMED_UTF8 of DecodeFile.Byte * int * int
4262 | ERR_ILLEGAL_UTF8 of DecodeFile.Byte
4263 | ERR_INVALID_UTF8_SEQ of DecodeFile.Byte list
4264 | ERR_EOF_UTF8 of int * int
4265 | ERR_NON_UNI_UTF8 of UniChar.Char * int
4266 | ERR_EOF_UCS2 of DecodeFile.Byte
4267 | ERR_EOF_UTF16 of DecodeFile.Byte
4268 | ERR_LOW_SURROGATE of UniChar.Char
4269 | ERR_HIGH_SURROGATE of UniChar.Char * UniChar.Char
4270 | ERR_EOF_SURROGATE of UniChar.Char
4271 | ERR_NO_ENC_DECL of string
4272 | ERR_UNSUPPORTED_ENC of string
4273 | ERR_INCOMPATIBLE_ENC of string * string
4274
4275 val decodeMessage : DecodeError -> string list
4276
4277 exception DecodeError of DecodeFile.File * bool * DecodeError
4278 end
4279
4280 structure DecodeError : DecodeError =
4281 struct
4282 open
4283 DecodeFile UtilString UniChar
4284
4285 datatype DecodeError =
4286 ERR_ILLEGAL_CHAR of DecodeFile.Byte * string
4287 | ERR_NON_UNI_UCS4 of UniChar.Char
4288 | ERR_EOF_UCS4 of int * DecodeFile.Byte list
4289 | ERR_NON_DIRECT_UTF7 of DecodeFile.Byte
4290 | ERR_PADDING_UTF7 of UniChar.Char
4291 | ERR_ILLFORMED_UTF8 of DecodeFile.Byte * int * int
4292 | ERR_ILLEGAL_UTF8 of DecodeFile.Byte
4293 | ERR_INVALID_UTF8_SEQ of DecodeFile.Byte list
4294 | ERR_EOF_UTF8 of int * int
4295 | ERR_NON_UNI_UTF8 of UniChar.Char * int
4296 | ERR_EOF_UCS2 of DecodeFile.Byte
4297 | ERR_EOF_UTF16 of DecodeFile.Byte
4298 | ERR_LOW_SURROGATE of UniChar.Char
4299 | ERR_HIGH_SURROGATE of UniChar.Char * UniChar.Char
4300 | ERR_EOF_SURROGATE of UniChar.Char
4301 | ERR_NO_ENC_DECL of string
4302 | ERR_UNSUPPORTED_ENC of string
4303 | ERR_INCOMPATIBLE_ENC of string * string
4304
4305 fun Char2Hex c = "0x"^UtilString.toUpperString(StringCvt.padLeft #"0" 4 (Chars.toString c))
4306
4307 fun decodeMessage err =
4308 case err
4309 of ERR_ILLEGAL_CHAR(b,what) =>
4310 [Byte2Hex b,"is not",prependAnA what,"character"]
4311
4312 | ERR_NON_UNI_UCS4 c =>
4313 ["UCS-4 coded non-Unicode character",Char2Uni c]
4314 | ERR_EOF_UCS4(pos,bytes) =>
4315 ["End of file after",Int2String pos,"bytes of UCS-4 character",
4316 "starting with ",List2String0 Byte2Hex bytes]
4317
4318 | ERR_NON_DIRECT_UTF7 b =>
4319 ["Indirect UTF-7 character ",Byte2Hex b,"in non-shifted mode"]
4320 | ERR_PADDING_UTF7 pad =>
4321 ["Non-zero padding",Char2Hex pad,"at end of UTF-7 shifted sequence"]
4322
4323 | ERR_ILLFORMED_UTF8 (b,len,pos) =>
4324 [numberNth pos,"byte",Byte2Hex b,"of a",Int2String len^"-byte",
4325 "UTF-8 sequence does not start with bits 10"]
4326 | ERR_ILLEGAL_UTF8 b =>
4327 ["Byte",Byte2Hex b,"is neither ASCII nor does it start",
4328 "a valid multi-byte UTF-8 sequence"]
4329 | ERR_EOF_UTF8 (len,pos) =>
4330 ["End of file terminates a ",Int2String len^"-byte",
4331 "UTF-8 sequence before the ",numberNth pos,"byte"]
4332 | ERR_NON_UNI_UTF8 (c,len) =>
4333 [Int2String len^"-byte UTF-8 sequence decodes to non-Unicode character",Char2Uni c]
4334 | ERR_INVALID_UTF8_SEQ bs =>
4335 ["Invalid UTF-8 sequence",List2xString (""," ","") Byte2Hex bs]
4336
4337 | ERR_EOF_UCS2 b =>
4338 ["End of file before second byte of UCS-2 character starting with",Byte2Hex b]
4339 | ERR_EOF_UTF16 b =>
4340 ["End of file before second byte of UTF-16 character starting with",Byte2Hex b]
4341
4342 | ERR_LOW_SURROGATE c =>
4343 ["Low surrogate",Char2Uni c,"without preceding high surrogate"]
4344 | ERR_HIGH_SURROGATE (c,c1) =>
4345 ["High surrogate",Char2Uni c,"followed by",Char2Uni c1,"instead of low surrogate"]
4346 | ERR_EOF_SURROGATE c =>
4347 ["High surrogate",Char2Uni c,"followed by the end of file"]
4348
4349 | ERR_NO_ENC_DECL auto =>
4350 ["Couldn't parse encoding declaration but auto-detected encoding",auto,"required so"]
4351 | ERR_UNSUPPORTED_ENC enc =>
4352 ["Unsupported encoding",enc]
4353 | ERR_INCOMPATIBLE_ENC (enc,auto) =>
4354 ["Encoding",enc,"is incompatible with auto-detected encoding",auto]
4355
4356 exception DecodeError of File * bool * DecodeError
4357 end
4358
4359
4360 (* stop of ../../Unicode/Decode/decodeError.sml *)
4361 (* start of ../../Unicode/Decode/decodeUtil.sml *)
4362 (*
4363 require "basis.__word";
4364 require "basis.__word8";
4365
4366 require "chars";
4367 require "decodeBasic";
4368 require "decodeError";
4369 *)
4370
4371 (*--------------------------------------------------------------------------*)
4372 (* Structure: DecodeUtil *)
4373 (* *)
4374 (* Exceptions raised by functions in this structure: *)
4375 (* combineSurrogates : none *)
4376 (* combineUcs4big : none *)
4377 (* combineUcs4little : none *)
4378 (* combineUcs4strangeBig : none *)
4379 (* combineUcs4strangeLittle : none *)
4380 (* combineUtf16big : none *)
4381 (* combineUtf16little : none *)
4382 (* isLowSurrogate : none *)
4383 (* isHighSurrogate : none *)
4384 (* isSurrogate : none *)
4385 (*--------------------------------------------------------------------------*)
4386 signature DecodeUtil =
4387 sig
4388 val isSurrogate : UniChar.Char -> bool
4389 val isLowSurrogate : UniChar.Char -> bool
4390 val isHighSurrogate : UniChar.Char -> bool
4391 val combineSurrogates : UniChar.Char * UniChar.Char -> UniChar.Char
4392 end
4393
4394 structure DecodeUtil : DecodeUtil =
4395 struct
4396 open UniChar DecodeFile DecodeError
4397
4398 fun isSurrogate c = Chars.orb(c,0wx7FF)=0wxDFFF
4399 fun isLowSurrogate c = Chars.orb(c,0wx3FF)=0wxDFFF
4400 fun isHighSurrogate c = Chars.orb(c,0wx3FF)=0wxDBFF
4401 fun combineSurrogates(hi,lo) = (hi-0wxD800)*0wx400+lo+0wx2400 : Char
4402 end
4403 (* stop of ../../Unicode/Decode/decodeUtil.sml *)
4404 (* start of ../../Unicode/Decode/decodeUcs2.sml *)
4405
4406
4407
4408
4409
4410
4411
4412 signature DecodeUcs2 =
4413 sig
4414 val getCharUcs2b : DecodeFile.File -> UniChar.Char * DecodeFile.File
4415 val getCharUcs2l : DecodeFile.File -> UniChar.Char * DecodeFile.File
4416 end
4417
4418 structure DecodeUcs2 : DecodeUcs2 =
4419 struct
4420 open
4421 UniChar Encoding
4422 DecodeFile DecodeError DecodeUtil
4423
4424 fun getCharUcs2b f =
4425 let
4426 val (b1,f1) = getByte f
4427 val (b2,f2) = getByte f1 handle exn as EndOfFile f
4428 => raise DecodeError(f,true,ERR_EOF_UCS2 b1)
4429 val c = Chars.orb(Chars.<<(Byte2Char b1,0w8),Byte2Char b2)
4430 in (c,f2)
4431 end
4432
4433 fun getCharUcs2l f =
4434 let
4435 val (b1,f1) = getByte f
4436 val (b2,f2) = getByte f1 handle exn as EndOfFile f
4437 => raise DecodeError(f,true,ERR_EOF_UCS2 b1)
4438 val c = Chars.orb(Chars.<<(Byte2Char b2,0w8),Byte2Char b1)
4439 in (c,f2)
4440 end
4441 end
4442 (* stop of ../../Unicode/Decode/decodeUcs2.sml *)
4443 (* start of ../../Unicode/Decode/decodeMisc.sml *)
4444 signature DecodeMisc =
4445 sig
4446 val getCharAscii : DecodeFile.File -> UniChar.Char * DecodeFile.File
4447 val getCharEbcdic : DecodeFile.File -> UniChar.Char * DecodeFile.File
4448 val getCharEof : DecodeFile.File -> UniChar.Char * DecodeFile.File
4449 val getCharLatin1 : DecodeFile.File -> UniChar.Char * DecodeFile.File
4450 end
4451
4452 structure DecodeMisc : DecodeMisc =
4453 struct
4454 open
4455 UniChar DecodeFile DecodeError
4456
4457 fun getCharEof f = raise EndOfFile f
4458
4459 (*--------------------------------------------------------------------*)
4460 (* ASCII characters must be lower than 0wx80 *)
4461 (*--------------------------------------------------------------------*)
4462 fun getCharAscii f =
4463 let val (b,f1) = getByte f
4464 in if b<0wx80 then (Byte2Char b,f1)
4465 else raise DecodeError(f1,false,ERR_ILLEGAL_CHAR(b,"ASCII"))
4466 end
4467
4468 (*--------------------------------------------------------------------*)
4469 (* LATIN-1 is the first plane of Unicode. *)
4470 (*--------------------------------------------------------------------*)
4471 fun getCharLatin1 f = let val (b,f1) = getByte f
4472 in (Byte2Char b,f1)
4473 end
4474
4475 (*--------------------------------------------------------------------*)
4476 (* EBCDIC is mapped to the first plane of Unicode. *)
4477 (*--------------------------------------------------------------------*)
4478 (* according to rfc-1345 (and gnu recode experiments) *)
4479 val ebcdic2latinTab = Vector.fromList
4480 [0wx00,0wx01,0wx02,0wx03,0wx9C,0wx09,0wx86,0wx7F,
4481 0wx97,0wx8D,0wx8E,0wx0B,0wx0C,0wx0D,0wx0E,0wx0F,
4482 0wx10,0wx11,0wx12,0wx13,0wx9D,0wx85,0wx08,0wx87,
4483 0wx18,0wx19,0wx92,0wx8F,0wx1C,0wx1D,0wx1E,0wx1F,
4484 0wx80,0wx81,0wx82,0wx83,0wx84,0wx0A,0wx17,0wx1B,
4485 0wx88,0wx89,0wx8A,0wx8B,0wx8C,0wx05,0wx06,0wx07,
4486 0wx90,0wx91,0wx16,0wx93,0wx94,0wx95,0wx96,0wx04,
4487 0wx98,0wx99,0wx9A,0wx9B,0wx14,0wx15,0wx9E,0wx1A,
4488 0wx20,0wxA0,0wxA1,0wxA2,0wxA3,0wxA4,0wxA5,0wxA6,
4489 0wxA7,0wxA8,0wx5B,0wx2E,0wx3C,0wx28,0wx2B,0wx21,
4490 0wx26,0wxA9,0wxAA,0wxAB,0wxAC,0wxAD,0wxAE,0wxAF,
4491 0wxB0,0wxB1,0wx5D,0wx24,0wx2A,0wx29,0wx3B,0wx5E,
4492 0wx2D,0wx2F,0wxB2,0wxB3,0wxB4,0wxB5,0wxB6,0wxB7,
4493 0wxB8,0wxB9,0wx7C,0wx2C,0wx25,0wx5F,0wx3E,0wx3F,
4494 0wxBA,0wxBB,0wxBC,0wxBD,0wxBE,0wxBF,0wxC0,0wxC1,
4495 0wxC2,0wx60,0wx3A,0wx23,0wx40,0wx27,0wx3D,0wx22,
4496 0wxC3,0wx61,0wx62,0wx63,0wx64,0wx65,0wx66,0wx67,
4497 0wx68,0wx69,0wxC4,0wxC5,0wxC6,0wxC7,0wxC8,0wxC9,
4498 0wxCA,0wx6A,0wx6B,0wx6C,0wx6D,0wx6E,0wx6F,0wx70,
4499 0wx71,0wx72,0wxCB,0wxCC,0wxCD,0wxCE,0wxCF,0wxD0,
4500 0wxD1,0wx7E,0wx73,0wx74,0wx75,0wx76,0wx77,0wx78,
4501 0wx79,0wx7A,0wxD2,0wxD3,0wxD4,0wxD5,0wxD6,0wxD7,
4502 0wxD8,0wxD9,0wxDA,0wxDB,0wxDC,0wxDD,0wxDE,0wxDF,
4503 0wxE0,0wxE1,0wxE2,0wxE3,0wxE4,0wxE5,0wxE6,0wxE7,
4504 0wx7B,0wx41,0wx42,0wx43,0wx44,0wx45,0wx46,0wx47,
4505 0wx48,0wx49,0wxE8,0wxE9,0wxEA,0wxEB,0wxEC,0wxED,
4506 0wx7D,0wx4A,0wx4B,0wx4C,0wx4D,0wx4E,0wx4F,0wx50,
4507 0wx51,0wx52,0wxEE,0wxEF,0wxF0,0wxF1,0wxF2,0wxF3,
4508 0wx5C,0wx9F,0wx53,0wx54,0wx55,0wx56,0wx57,0wx58,
4509 0wx59,0wx5A,0wxF4,0wxF5,0wxF6,0wxF7,0wxF8,0wxF9,
4510 0wx30,0wx31,0wx32,0wx33,0wx34,0wx35,0wx36,0wx37,
4511 0wx38,0wx39,0wxFA,0wxFB,0wxFC,0wxFD,0wxFE,0wxFF
4512 ]
4513
4514 fun ebcdic2latin b = Vector.sub(ebcdic2latinTab,Word8.toInt b)
4515
4516 fun getCharEbcdic f = let val (b,f1) = getByte f
4517 in (ebcdic2latin b,f1)
4518 end
4519 end
4520 (* stop of ../../Unicode/Decode/decodeMisc.sml *)
4521 (* start of ../../Unicode/Decode/decodeUcs4.sml *)
4522
4523
4524
4525
4526
4527
4528
4529
4530 signature DecodeUcs4 =
4531 sig
4532 val getCharUcs4b : DecodeFile.File -> UniChar.Char * DecodeFile.File
4533 val getCharUcs4l : DecodeFile.File -> UniChar.Char * DecodeFile.File
4534 val getCharUcs4sb : DecodeFile.File -> UniChar.Char * DecodeFile.File
4535 val getCharUcs4sl : DecodeFile.File -> UniChar.Char * DecodeFile.File
4536 end
4537
4538 structure DecodeUcs4 : DecodeUcs4 =
4539 struct
4540 open
4541 UniChar UniClasses
4542 DecodeFile DecodeError DecodeUtil
4543
4544 fun getCharUcs4b f =
4545 let
4546 val (b1,f1) = getByte f
4547 val (b2,f2) = getByte f1 handle EndOfFile f
4548 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1]))
4549 val (b3,f3) = getByte f2 handle EndOfFile f
4550 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1,b2]))
4551 val (b4,f4) = getByte f3 handle EndOfFile f
4552 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1,b2,b3]))
4553 val c = Chars.orb(Chars.orb(Chars.<<(Byte2Char b1,0w24),
4554 Chars.<<(Byte2Char b2,0w16)),
4555 Chars.orb(Chars.<<(Byte2Char b3,0w08),
4556 Byte2Char b4))
4557 in if isUnicode c then (c,f4)
4558 else raise DecodeError(f4,false,ERR_NON_UNI_UCS4 c)
4559 end
4560
4561 fun getCharUcs4l f =
4562 let
4563 val (b1,f1) = getByte f
4564 val (b2,f2) = getByte f1 handle EndOfFile f
4565 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1]))
4566 val (b3,f3) = getByte f2 handle EndOfFile f
4567 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1,b2]))
4568 val (b4,f4) = getByte f3 handle EndOfFile f
4569 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1,b2,b3]))
4570 val c = Chars.orb(Chars.orb(Chars.<<(Byte2Char b4,0w24),
4571 Chars.<<(Byte2Char b3,0w16)),
4572 Chars.orb(Chars.<<(Byte2Char b2,0w08),
4573 Byte2Char b1))
4574 in if isUnicode c then (c,f4)
4575 else raise DecodeError(f4,false,ERR_NON_UNI_UCS4 c)
4576 end
4577
4578 fun getCharUcs4sb f =
4579 let
4580 val (b1,f1) = getByte f
4581 val (b2,f2) = getByte f1 handle EndOfFile f
4582 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1]))
4583 val (b3,f3) = getByte f2 handle EndOfFile f
4584 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1,b2]))
4585 val (b4,f4) = getByte f3 handle EndOfFile f
4586 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1,b2,b3]))
4587 val c = Chars.orb(Chars.orb(Chars.<<(Byte2Char b2,0w24),
4588 Chars.<<(Byte2Char b1,0w16)),
4589 Chars.orb(Chars.<<(Byte2Char b4,0w08),
4590 Byte2Char b3))
4591 in if isUnicode c then (c,f4)
4592 else raise DecodeError(f4,false,ERR_NON_UNI_UCS4 c)
4593 end
4594
4595 fun getCharUcs4sl f =
4596 let
4597 val (b1,f1) = getByte f
4598 val (b2,f2) = getByte f1 handle EndOfFile f
4599 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1]))
4600 val (b3,f3) = getByte f2 handle EndOfFile f
4601 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1,b2]))
4602 val (b4,f4) = getByte f3 handle EndOfFile f
4603 => raise DecodeError(f,true,ERR_EOF_UCS4(1,[b1,b2,b3]))
4604 val c = Chars.orb(Chars.orb(Chars.<<(Byte2Char b3,0w24),
4605 Chars.<<(Byte2Char b4,0w16)),
4606 Chars.orb(Chars.<<(Byte2Char b1,0w08),
4607 Byte2Char b2))
4608 in if isUnicode c then (c,f4)
4609 else raise DecodeError(f4,false,ERR_NON_UNI_UCS4 c)
4610 end
4611 end
4612
4613 (* stop of ../../Unicode/Decode/decodeUcs4.sml *)
4614 (* start of ../../Unicode/Decode/decodeUtf16.sml *)
4615
4616
4617
4618
4619
4620
4621
4622 signature DecodeUtf16 =
4623 sig
4624 val getCharUtf16b : DecodeFile.File -> UniChar.Char * DecodeFile.File
4625 val getCharUtf16l : DecodeFile.File -> UniChar.Char * DecodeFile.File
4626 end
4627
4628 structure DecodeUtf16 : DecodeUtf16 =
4629 struct
4630 open
4631 UniChar Encoding
4632 DecodeFile DecodeError DecodeUtil
4633
4634 fun getCharUtf16b f =
4635 let
4636 val (b1,f1) = getByte f
4637 val (b2,f2) = getByte f1 handle exn as EndOfFile f
4638 => raise DecodeError(f,true,ERR_EOF_UTF16 b1)
4639 val c = Chars.orb(Chars.<<(Byte2Char b1,0w8),Byte2Char b2)
4640 in
4641 if isSurrogate c then (* Chars.orb(c,0wx7FF)=0wxDFFF *)
4642 if isLowSurrogate c then raise DecodeError(f2,false,ERR_LOW_SURROGATE c)
4643 else let
4644 val (b3,f3) = getByte f2 handle exn as EndOfFile f
4645 => raise DecodeError(f,true,ERR_EOF_SURROGATE c)
4646 val (b4,f4) = getByte f3 handle exn as EndOfFile f
4647 => raise DecodeError(f,true,ERR_EOF_UTF16 b3)
4648 val c1 = Chars.orb(Chars.<<(Byte2Char b3,0w8),Byte2Char b4)
4649 in if isLowSurrogate c1 then (combineSurrogates(c,c1),f4)
4650 else raise DecodeError(f4,false,ERR_HIGH_SURROGATE(c,c1))
4651 end
4652 else (c,f2)
4653 end
4654
4655 fun getCharUtf16l f =
4656 let
4657 val (b1,f1) = getByte f
4658 val (b2,f2) = getByte f1 handle exn as EndOfFile f
4659 => raise DecodeError(f,true,ERR_EOF_UTF16 b1)
4660 val c = Chars.orb(Chars.<<(Byte2Char b2,0w8),Byte2Char b1)
4661 in
4662 if isSurrogate c then
4663 if isLowSurrogate c then raise DecodeError(f2,false,ERR_LOW_SURROGATE c)
4664 else let
4665 val (b3,f3) = getByte f2 handle exn as EndOfFile f
4666 => raise DecodeError(f,true,ERR_EOF_SURROGATE c)
4667 val (b4,f4) = getByte f3 handle exn as EndOfFile f
4668 => raise DecodeError(f,true,ERR_EOF_UTF16 b3)
4669 val c1 = Chars.orb(Chars.<<(Byte2Char b4,0w8),Byte2Char b3)
4670 in if isLowSurrogate c1 then (combineSurrogates(c,c1),f4)
4671 else raise DecodeError(f4,false,ERR_HIGH_SURROGATE(c,c1))
4672 end
4673 else (c,f2)
4674 end
4675 end
4676 (* stop of ../../Unicode/Decode/decodeUtf16.sml *)
4677 (* start of ../../Unicode/Decode/decodeUtf8.sml *)
4678 signature DecodeUtf8 =
4679 sig
4680 val getCharUtf8 : DecodeFile.File -> UniChar.Char * DecodeFile.File
4681 end
4682
4683 structure DecodeUtf8 : DecodeUtf8 =
4684 struct
4685 open
4686 UniChar UniClasses UtilError UtilInt
4687 DecodeFile DecodeError DecodeUtil
4688
4689 val THIS_MODULE = "DecodeUtf8"
4690
4691 infix 8 <<<
4692 infix 7 &&
4693 infix 6 |||
4694
4695 val op && = Bytes.andb
4696 val op <<< = Chars.<<
4697 val op ||| = Chars.orb
4698
4699 val byte1switch = Vector.tabulate
4700 (256,fn i =>
4701 if i<0x80 then 1
4702 else if i<0xC0 then 0
4703 else if i<0xE0 then 2
4704 else if i<0xF0 then 3
4705 else if i<0xF8 then 4
4706 else if i<0xFC then 5
4707 else if i<0xFE then 6
4708 else 0)
4709
4710 val diff2 : Char = 0wx00003080
4711 val diff3 : Char = diff2 <<< 0wx6 ||| 0wx00020080
4712 val diff4 : Char = diff3 <<< 0wx6 ||| 0wx00400080
4713 val diff5 : Char = diff4 <<< 0wx6 ||| 0wx08000080
4714 val diff6 : Char = diff5 <<< 0wx6 ||| 0wx00000080
4715
4716 fun getCharUtf8 f =
4717 let val (b1,f1) = getByte f
4718 in if b1<0wx80 then (Byte2Char b1,f1)
4719 else let val n = Vector.sub(byte1switch,Word8.toInt b1)
4720 in case n
4721 of 0 (* error *) => raise DecodeError(f1,false,ERR_ILLEGAL_UTF8 b1)
4722 | 1 => (Byte2Char b1,f1)
4723 | 2 =>
4724 let
4725 val (b2,f2) = getByte f1 handle EndOfFile f
4726 => raise DecodeError(f,true,ERR_EOF_UTF8(n,2))
4727 in if b2 && 0wxC0 <> 0wx80
4728 then raise DecodeError(f2,false,ERR_ILLFORMED_UTF8(b2,n,2))
4729 else let val c = Byte2Char b1 <<< 0w6 + Byte2Char b2 - diff2
4730 in if c>=0wx80 then (c,f2)
4731 else raise DecodeError(f2,false,ERR_INVALID_UTF8_SEQ [b1,b2])
4732 end
4733 end
4734 | 3 =>
4735 let
4736 val (b2,f2) = getByte f1 handle EndOfFile f
4737 => raise DecodeError(f,true,ERR_EOF_UTF8(n,2))
4738 val (b3,f3) = getByte f2 handle EndOfFile f
4739 => raise DecodeError(f,true,ERR_EOF_UTF8(n,3))
4740 in
4741 if b2 && 0wxC0 <> 0wx80
4742 then raise DecodeError(f3,false,ERR_ILLFORMED_UTF8(b2,n,2))
4743 else if b3 && 0wxC0 <> 0wx80
4744 then raise DecodeError(f3,false,ERR_ILLFORMED_UTF8(b2,n,3))
4745 else let val c = (Byte2Char b1 <<< 0w12 +
4746 Byte2Char b2 <<< 0w06 +
4747 Byte2Char b3 - diff3)
4748 in if c>=0wx800 then (c,f3)
4749 else raise DecodeError
4750 (f3,false,ERR_INVALID_UTF8_SEQ [b1,b2,b3])
4751 end
4752 end
4753 | 4 =>
4754 let
4755 val (b2,f2) = getByte f1 handle EndOfFile f
4756 => raise DecodeError(f,true,ERR_EOF_UTF8(n,2))
4757 val (b3,f3) = getByte f2 handle EndOfFile f
4758 => raise DecodeError(f,true,ERR_EOF_UTF8(n,3))
4759 val (b4,f4) = getByte f3 handle EndOfFile f
4760 => raise DecodeError(f,true,ERR_EOF_UTF8(n,4))
4761 in
4762 if b2 && 0wxC0 <> 0wx80
4763 then raise DecodeError(f4,false,ERR_ILLFORMED_UTF8(b2,n,2))
4764 else if b3 && 0wxC0 <> 0wx80
4765 then raise DecodeError(f4,false,ERR_ILLFORMED_UTF8(b2,n,3))
4766 else if b4 && 0wxC0 <> 0wx80
4767 then raise DecodeError(f4,false,ERR_ILLFORMED_UTF8(b2,n,4))
4768 else let val c = (Byte2Char b1 <<< 0w18 +
4769 Byte2Char b2 <<< 0w12 +
4770 Byte2Char b3 <<< 0w06 +
4771 Byte2Char b4 - diff4)
4772 in
4773 if c>=0wx100000 andalso c<=0wx10FFFF then (c,f4)
4774 else if c<0wx10000
4775 then raise DecodeError
4776 (f4,false,ERR_INVALID_UTF8_SEQ [b1,b2,b3,b4])
4777 else raise DecodeError
4778 (f4,false,ERR_NON_UNI_UTF8(c,n))
4779 end
4780 end
4781 | 5 =>
4782 let
4783 val (b2,f2) = getByte f1 handle EndOfFile f
4784 => raise DecodeError(f,true,ERR_EOF_UTF8(n,2))
4785 val (b3,f3) = getByte f2 handle EndOfFile f
4786 => raise DecodeError(f,true,ERR_EOF_UTF8(n,3))
4787 val (b4,f4) = getByte f3 handle EndOfFile f
4788 => raise DecodeError(f,true,ERR_EOF_UTF8(n,4))
4789 val (b5,f5) = getByte f4 handle EndOfFile f
4790 => raise DecodeError(f,true,ERR_EOF_UTF8(n,5))
4791 in
4792 if b2 && 0wxC0 <> 0wx80
4793 then raise DecodeError(f5,false,ERR_ILLFORMED_UTF8(b2,n,2))
4794 else if b3 && 0wxC0 <> 0wx80
4795 then raise DecodeError(f5,false,ERR_ILLFORMED_UTF8(b2,n,3))
4796 else if b4 && 0wxC0 <> 0wx80
4797 then raise DecodeError(f5,false,ERR_ILLFORMED_UTF8(b2,n,4))
4798 else if b5 && 0wxC0 <> 0wx80
4799 then raise DecodeError(f5,false,ERR_ILLFORMED_UTF8(b2,n,5))
4800 else let val c = (Byte2Char b1 <<< 0w24 +
4801 Byte2Char b2 <<< 0w18 +
4802 Byte2Char b3 <<< 0w12 +
4803 Byte2Char b4 <<< 0w06 +
4804 Byte2Char b5 - diff5)
4805 in if c<0wx200000
4806 then raise DecodeError
4807 (f5,false,ERR_INVALID_UTF8_SEQ [b1,b2,b3,b4,b5])
4808 else raise DecodeError
4809 (f5,false,ERR_NON_UNI_UTF8(c,n))
4810 end
4811 end
4812 | 6 =>
4813 let
4814 val (b2,f2) = getByte f1 handle EndOfFile f
4815 => raise DecodeError(f,true,ERR_EOF_UTF8(n,2))
4816 val (b3,f3) = getByte f2 handle EndOfFile f
4817 => raise DecodeError(f,true,ERR_EOF_UTF8(n,3))
4818 val (b4,f4) = getByte f3 handle EndOfFile f
4819 => raise DecodeError(f,true,ERR_EOF_UTF8(n,4))
4820 val (b5,f5) = getByte f4 handle EndOfFile f
4821 => raise DecodeError(f,true,ERR_EOF_UTF8(n,5))
4822 val (b6,f6) = getByte f5 handle EndOfFile f
4823 => raise DecodeError(f,true,ERR_EOF_UTF8(n,6))
4824 in
4825 if b2 && 0wxC0 <> 0wx80
4826 then raise DecodeError(f6,false,ERR_ILLFORMED_UTF8(b2,n,2))
4827 else if b3 && 0wxC0 <> 0wx80
4828 then raise DecodeError(f6,false,ERR_ILLFORMED_UTF8(b2,n,3))
4829 else if b4 && 0wxC0 <> 0wx80
4830 then raise DecodeError(f6,false,ERR_ILLFORMED_UTF8(b2,n,4))
4831 else if b5 && 0wxC0 <> 0wx80
4832 then raise DecodeError(f6,false,ERR_ILLFORMED_UTF8(b2,n,5))
4833 else if b6 && 0wxC0 <> 0wx80
4834 then raise DecodeError(f6,false,ERR_ILLFORMED_UTF8(b2,n,6))
4835 else let val c = (Byte2Char b1 <<< 0w30 +
4836 Byte2Char b2 <<< 0w24 +
4837 Byte2Char b3 <<< 0w18 +
4838 Byte2Char b4 <<< 0w12 +
4839 Byte2Char b5 <<< 0w06 +
4840 Byte2Char b6 - diff6)
4841 in if c<0wx4000000
4842 then raise DecodeError
4843 (f6,false,ERR_INVALID_UTF8_SEQ [b1,b2,b3,b4,b5,b6])
4844 else raise DecodeError
4845 (f6,false,ERR_NON_UNI_UTF8(c,n))
4846 end
4847 end
4848 | _ => raise InternalError(THIS_MODULE,"getCharUtf8",
4849 "byte1switch holds "^Int.toString n^
4850 ">6 for byte "^Bytes.toString b1)
4851 end
4852 end
4853 end
4854 (* stop of ../../Unicode/Decode/decodeUtf8.sml *)
4855 (* start of ../../Unicode/Decode/decode.sml *)
4856 (*--------------------------------------------------------------------------*)
4857 (* Structure: Decode *)
4858 (* *)
4859 (* Exceptions raised by functions in this structure: *)
4860 (* checkEncoding : NoSuchFile *)
4861 (* encCloseFile : none *)
4862 (* encFileName : none *)
4863 (*--------------------------------------------------------------------------*)
4864 signature Decode =
4865 sig
4866 structure Error : DecodeError
4867
4868 type DecFile
4869
4870 exception DecEof of DecFile
4871 exception DecError of DecFile * bool * Error.DecodeError
4872
4873 val decUri : DecFile -> Uri.Uri
4874 val decName : DecFile -> string
4875 val decEncoding : DecFile -> Encoding.Encoding
4876
4877 val decOpenXml : Uri.Uri option -> DecFile
4878 val decOpenUni : Uri.Uri option * Encoding.Encoding -> DecFile
4879 val decClose : DecFile -> DecFile
4880
4881 val decCommit : DecFile -> unit
4882 val decSwitch : DecFile * string -> DecFile
4883
4884 val decGetChar : DecFile -> UniChar.Char * DecFile
4885 val decGetArray : DecFile -> UniChar.Char array -> int * DecFile * Error.DecodeError option
4886 end
4887
4888 structure Decode : Decode =
4889 struct
4890 structure Error = DecodeError
4891 open
4892 UniChar Encoding Error
4893 DecodeFile DecodeMisc DecodeUcs2 DecodeUcs4
4894 DecodeUtf16 DecodeUtf8 DecodeUtil
4895
4896 type DecFile = Encoding * File
4897 exception DecEof of DecFile
4898 exception DecError of DecFile * bool * DecodeError
4899
4900 (*--------------------------------------------------------------------*)
4901 (* close an encoded entity. *)
4902 (*--------------------------------------------------------------------*)
4903 fun decClose (_,f) = (NOENC,f) before closeFile f
4904 (*--------------------------------------------------------------------*)
4905 (* get the uri string of an encoded entity. *)
4906 (*--------------------------------------------------------------------*)
4907 fun decName (_,f) = fileName f
4908 (*--------------------------------------------------------------------*)
4909 (* get the uri of an encoded entity. *)
4910 (*--------------------------------------------------------------------*)
4911 fun decUri (_,f) = fileUri f
4912 (*--------------------------------------------------------------------*)
4913 (* get the encoding of an encoded entity. *)
4914 (*--------------------------------------------------------------------*)
4915 fun decEncoding (enc,_) = enc
4916
4917 (*--------------------------------------------------------------------*)
4918 (* commit the auto-detected encoding. *)
4919 (*--------------------------------------------------------------------*)
4920 fun decCommit (enc,f) =
4921 case enc
4922 of UTF8 => ()
4923 | UTF16B => ()
4924 | UTF16L => ()
4925 | _ => raise DecError((enc,f),false,ERR_NO_ENC_DECL(encodingName enc))
4926
4927 (*--------------------------------------------------------------------*)
4928 (* change to another - compatible - encoding. *)
4929 (*--------------------------------------------------------------------*)
4930 fun decSwitch ((enc,f),decl) =
4931 let
4932 val decEnc = isEncoding decl
4933 val _ = if decEnc<>NOENC then ()
4934 else raise DecError((enc,f),false,ERR_UNSUPPORTED_ENC decl)
4935 val newEnc = switchEncoding(enc,decEnc)
4936 val _ = if decEnc<>NOENC orelse enc=NOENC then ()
4937 else raise DecError((enc,f),false,ERR_INCOMPATIBLE_ENC(encodingName enc,decl))
4938 in (newEnc,f)
4939 end
4940
4941 (*--------------------------------------------------------------------*)
4942 (* get a character from an encoded entity. *)
4943 (*--------------------------------------------------------------------*)
4944 fun decGetChar (enc,f) =
4945 let val (c,f1) =
4946 case enc
4947 of NOENC => raise EndOfFile f
4948 | ASCII => getCharAscii f
4949 | EBCDIC => getCharEbcdic f
4950 | LATIN1 => getCharLatin1 f
4951 | UCS2B => getCharUcs2b f
4952 | UCS2L => getCharUcs2l f
4953 | UCS4B => getCharUcs4b f
4954 | UCS4L => getCharUcs4l f
4955 | UCS4SB => getCharUcs4sb f
4956 | UCS4SL => getCharUcs4sl f
4957 | UTF8 => getCharUtf8 f
4958 | UTF16B => getCharUtf16b f
4959 | UTF16L => getCharUtf16l f
4960 in (c,(enc,f1))
4961 end
4962 handle EndOfFile f => raise DecEof(NOENC,f)
4963 | DecodeError(f,eof,err) => raise DecError((enc,f),eof,err)
4964
4965 (*--------------------------------------------------------------------*)
4966 (* Load new characters, depending on the current entity's encoding. *)
4967 (*--------------------------------------------------------------------*)
4968 fun decGetArray (enc,f) arr =
4969 let
4970 (*--------------------------------------------------------------*)
4971 (* Load the buffer with len new characters, or until the entity *)
4972 (* end is reached. Close the current file in that case. *)
4973 (* Local exception Ended is needed in order to preserve tail *)
4974 (* recursion. *)
4975 (*--------------------------------------------------------------*)
4976 fun loadArray getChar =
4977 let
4978 val ende = Array.length arr
4979 exception Error of int * exn
4980 fun doit (idx,f) =
4981 if idx=ende then (ende,(enc,f),NONE)
4982 else let val (c,f1) = getChar f handle exn => raise Error (idx,exn)
4983 val _ = Array.update(arr,idx,c)
4984 in doit (idx+1,f1)
4985 end
4986 in doit (0,f) handle Error(idx,exn)
4987 => case exn
4988 of EndOfFile f => (idx,(NOENC,f),NONE)
4989 | DecodeError (f,_,err) => (idx,(enc,f),SOME err)
4990 | _ => raise exn
4991 end
4992 in case enc
4993 of NOENC => (0,(NOENC,f),NONE)
4994 | ASCII => loadArray getCharAscii
4995 | EBCDIC => loadArray getCharEbcdic
4996 | LATIN1 => loadArray getCharLatin1
4997 | UCS2B => loadArray getCharUcs2b
4998 | UCS2L => loadArray getCharUcs2l
4999 | UCS4B => loadArray getCharUcs4b
5000 | UCS4L => loadArray getCharUcs4l
5001 | UCS4SB => loadArray getCharUcs4sb
5002 | UCS4SL => loadArray getCharUcs4sl
5003 | UTF8 => loadArray getCharUtf8
5004 | UTF16B => loadArray getCharUtf16b
5005 | UTF16L => loadArray getCharUtf16l
5006 end
5007
5008
5009 (*--------------------------------------------------------------------*)
5010 (* open an XML file and try to auto-detect its encoding. *)
5011 (*--------------------------------------------------------------------*)
5012 (* Auto-detection of the encoding of XML entities according to App. F *)
5013 (* of the XML recommendation. *)
5014 (* *)
5015 (* The file is opened in basic mode and upto four bytes are read from *)
5016 (* it in order to detect the encoding: if they constitute a prefix of *)
5017 (* "<?xml" in a recognized encoding, this encoding is taken. *)
5018 (*--------------------------------------------------------------------*)
5019 (* read upto four bytes from the file, detect the encoding, and unget *)
5020 (* the read bytes. Return the resulting encoded file and its encoding *)
5021 (*--------------------------------------------------------------------*)
5022 (**************************************************************************)
5023 (* NB 24.08.2000 Autodetection of encoding is affected by the *)
5024 (* XML 1.0 Specification Errata (10.08.2000) E 44 *)
5025 (* *)
5026 (* The first four bytes read are interpreted according to: *)
5027 (* *)
5028 (* "Append the following to the second paragraph: *)
5029 (* The notation ## is used to denote any byte value except 00. *)
5030 (* Adjust the itemized list of detection cases to read as follows: *)
5031 (* *)
5032 (* With a Byte Order Mark: *)
5033 (* 00 00 FE FF: UCS-4, big-endian machine (1234 order) *)
5034 (* FF FE 00 00: UCS-4, little-endian machine (4321 order) *)
5035 (* FE FF 00 ##: UTF-16, big-endian *)
5036 (* FF FE ## 00: UTF-16, little-endian *)
5037 (* EF BB BF: UTF-8 *)
5038 (* Without a Byte Order Mark: *)
5039 (* 00 00 00 3C: UCS-4, big-endian machine (1234 order) *)
5040 (* 3C 00 00 00: UCS-4, little-endian machine (4321 order) *)
5041 (* 00 00 3C 00: UCS-4, unusual octet order (2143) *)
5042 (* 00 3C 00 00: UCS-4, unusual octet order (3412) *)
5043 (* 00 3C ## ##, *)
5044 (* 00 25 ## ##, *)
5045 (* 00 20 ## ##, *)
5046 (* 00 09 ## ##, *)
5047 (* 00 0D ## ## or *)
5048 (* 00 0A ## ##: Big-endian UTF-16 or ISO-10646-UCS-2. Note that, absent *)
5049 (* an encoding declaration, these cases are strictly *)
5050 (* speaking in error. *)
5051 (* 3C 00 ## ##, *)
5052 (* 25 00 ## ##, *)
5053 (* 20 00 ## ##, *)
5054 (* 09 00 ## ##, *)
5055 (* 0D 00 ## ## or *)
5056 (* 0A 00 ## ##: Little-endian UTF-16 or ISO-10646-UCS-2. Note that, absent*)
5057 (* an encoding declaration, these cases are strictly *)
5058 (* speaking in error. *)
5059 (* 3C 3F 78 6D: UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, *)
5060 (* EUC, or any other 7-bit, 8-bit, or mixed-width encoding *)
5061 (* which ensures that the characters of ASCII have their *)
5062 (* normal positions, width, and values; the actual encoding *)
5063 (* declaration must be read to detect which of these *)
5064 (* applies, but since all of these encodings use the same *)
5065 (* bit patterns for the ASCII characters, the encoding *)
5066 (* declaration itself may be read reliably *)
5067 (* 4C 6F A7 94: EBCDIC (in some flavor; the full encoding declaration *)
5068 (* must be read to tell which code page is in use) *)
5069 (* other: UTF-8 without an encoding declaration, or else the data stream *)
5070 (* is corrupt, fragmentary, or enclosed in a wrapper of some kind *)
5071 (**************************************************************************)
5072
5073
5074
5075 fun decOpenXml uri =
5076 let
5077 fun get4Bytes (n,f) =
5078 if n=4 then (nil,f)
5079 else let val (b,f1) = getByte f
5080 val (bs,f2) = get4Bytes (n+1,f1)
5081 in (b::bs,f2)
5082 end
5083 handle EndOfFile f => (nil,f)
5084
5085 fun detect bs =
5086 case bs
5087 of
5088 [0wx0,0wx0,0wxFE,0wxFF] => (UCS4B,nil)
5089 | [0wxFF,0wxFE,0wx0,0wx0] => (UCS4L,nil)
5090 | [0wxFE,0wxFF,0wx0,b4] =>
5091 if b4 <> 0wx0 then (UTF16B,[0wx0,b4])
5092 else (UTF8,bs)
5093 | [0wxFF,0wxFE,b3,0wx0] =>
5094 if b3 <> 0wx0 then (UTF16L,[b3,0wx0])
5095 else (UTF8,bs)
5096 | [0wxEF,0wxBB,0wxBF,b4] => (UTF8,[b4])
5097 | [0wx0,0wx0,0wx0,0wx3C] => (UCS4B,bs)
5098 | [0wx3C,0wx0,0wx0,0wx0] => (UCS4L,bs)
5099 | [0wx0,0wx0,0wx3C,0wx0] => (UCS4SB,bs)
5100 | [0wx0,0wx3C,0wx0,0wx0] => (UCS4SL,bs)
5101 | [0wx0,b2,b3,b4] =>
5102 if (b2=0wx3C orelse b2=0wx25 orelse b2=0wx20
5103 orelse b2=0wx09 orelse b2=0wx0D orelse b2=0wx0A)
5104 andalso (b3<>0wx0 orelse b4<>0wx0) then (UTF16B,bs)
5105 else (UTF8,bs)
5106 | [b1,0wx0,b3,b4] =>
5107 if (b1=0wx3C orelse b1=0wx25 orelse b1=0wx20
5108 orelse b1=0wx09 orelse b1=0wx0D orelse b1=0wx0A)
5109 andalso (b3<>0wx0 orelse b4<>0wx0) then (UTF16L,bs)
5110 else (UTF8,bs)
5111 | [0wx4C,0wx6F,0wxA7,0wx94] => (EBCDIC,bs)
5112 | _ => (UTF8,bs)
5113
5114 val f = openFile uri
5115 val (bs,f1) = get4Bytes(0,f)
5116 val (enc,unget) = detect bs
5117 in (enc,ungetBytes(f1,unget))
5118 end
5119
5120 (*--------------------------------------------------------------------*)
5121 (* open a Unicode file. Check whether it starts with a byte order *)
5122 (* mark. If yes, chose UTF16 encoding, otherwise use the default that *)
5123 (* is provided as second argument. *)
5124 (* *)
5125 (* return the encoded file, a list of bytes looked ahead and the *)
5126 (* encoding. *)
5127 (*--------------------------------------------------------------------*)
5128 fun decOpenUni (uri,default) =
5129 let
5130 fun def(f,bs) =
5131 (default,ungetBytes(f,bs))
5132 fun detect f =
5133 let val (b1,f1) = getByte f
5134 in case b1
5135 of 0wxFE => (let val (b2,f2) = getByte f1
5136 in if b2 = 0wxFF then (UTF16B,f2)
5137 else def(f2,[b1,b2])
5138 end handle EndOfFile f => def(f,[b1]))
5139 | 0wxFF => (let val (b2,f2) = getByte f1
5140 in if b2 = 0wxFE then (UTF16L,f2)
5141 else def(f2,[b1,b2])
5142 end handle EndOfFile f => def(f,[b1]))
5143 | _ => def(f1,[b1])
5144 end handle EndOfFile f => def(f,nil)
5145 val f = openFile uri
5146 val (enc,f1) = detect f
5147 in (enc,f1)
5148 end
5149 end
5150
5151 (* stop of ../../Unicode/Decode/decode.sml *)
5152 (* start of ../../Parser/Error/errorData.sml *)
5153 structure ErrorData =
5154 struct
5155 (*--------------------------------------------------------------------*)
5156 (* a position holds the filename, line and column number. *)
5157 (*--------------------------------------------------------------------*)
5158 type Position = string * int * int
5159 val nullPosition = ("",0,0)
5160
5161 datatype ExpItem =
5162 EXP_CHAR of UniChar.Char
5163 | EXP_DATA of UniChar.Data
5164 | EXP_STRING of string
5165 type Expected = ExpItem list
5166 type Found = UniChar.Data
5167
5168 datatype Location =
5169 LOC_NONE
5170 | LOC_AFTER_DTD
5171 | LOC_ATT_DECL
5172 | LOC_ATT_DEFAULT of Position
5173 | LOC_ATT_VALUE
5174 | LOC_CDATA
5175 | LOC_CHOICE
5176 | LOC_COMMENT
5177 | LOC_CONTENT
5178 | LOC_DECL
5179 | LOC_DOC_DECL
5180 | LOC_ELEM_DECL
5181 | LOC_ENCODING
5182 | LOC_ENT_DECL
5183 | LOC_ENT_VALUE
5184 | LOC_EPILOG
5185 | LOC_ETAG
5186 | LOC_IGNORED
5187 | LOC_INCLUDED
5188 | LOC_INT_DECL
5189 | LOC_INT_SUBSET
5190 | LOC_LITERAL
5191 | LOC_MIXED
5192 | LOC_NOT_DECL
5193 | LOC_OUT_COND
5194 | LOC_PROC
5195 | LOC_PROLOG
5196 | LOC_PUB_LIT
5197 | LOC_SEQ
5198 | LOC_STAG
5199 | LOC_SUBSET
5200 | LOC_SYS_LIT
5201 | LOC_TEXT_DECL
5202 | LOC_VERSION
5203 | LOC_XML_DECL
5204
5205 datatype EntityClass =
5206 ENT_GENERAL
5207 | ENT_PARAMETER
5208 | ENT_EXTERNAL
5209 | ENT_UNPARSED
5210
5211 datatype Item =
5212 IT_ATT_NAME
5213 | IT_CDATA
5214 | IT_CHAR of UniChar.Char
5215 | IT_CHAR_REF
5216 | IT_COND
5217 | IT_DATA of UniChar.Data
5218 | IT_DECL
5219 | IT_DTD
5220 | IT_ELEM
5221 | IT_ENT_NAME
5222 | IT_ETAG
5223 | IT_GEN_ENT
5224 | IT_ID_NAME
5225 | IT_LANG_ID
5226 | IT_NAME
5227 | IT_NMTOKEN
5228 | IT_NOT_NAME
5229 | IT_NOTATION
5230 | IT_PAR_ENT
5231 | IT_PAR_REF
5232 | IT_REF
5233 | IT_STAG
5234 | IT_TARGET
5235
5236 datatype Error =
5237 (* syntax errors *)
5238 ERR_EMPTY of Location
5239 | ERR_ENDED_BY_EE of Location
5240 | ERR_EXPECTED of Expected * Found
5241 | ERR_NON_XML_CHAR of UniChar.Char
5242 | ERR_MISSING_WHITE
5243 | ERR_NON_XML_CHARREF of UniChar.Char
5244
5245 (* other well-formedness errors *)
5246 | ERR_CANT_PARSE of Location
5247 | ERR_ELEM_ENT_NESTING of UniChar.Data
5248 | ERR_ELEM_TYPE_MATCH of UniChar.Data * UniChar.Data
5249 | ERR_OMITTED_END_TAG of UniChar.Data
5250 | ERR_IGNORED_END_TAG of UniChar.Data * UniChar.Data
5251 | ERR_ENDED_IN_PROLOG
5252 | ERR_FORBIDDEN_HERE of Item * Location
5253 | ERR_ILLEGAL_ENTITY of EntityClass * UniChar.Data * Location
5254 | ERR_MULTIPLE_DTD
5255 | ERR_MULT_ATT_SPEC of UniChar.Data
5256 | ERR_RECURSIVE_ENTITY of EntityClass * UniChar.Data
5257 | ERR_UNDEC_ENTITY of EntityClass * UniChar.Data
5258
5259 (* validity errors concerning attributes *)
5260 | ERR_AT_LEAST_ONE of Item
5261 | ERR_AT_MOST_ONE of Item
5262 | ERR_ATT_IS_NOT of UniChar.Data * Item
5263 | ERR_EXACTLY_ONE of Item
5264 | ERR_FIXED_VALUE of UniChar.Data * UniChar.Vector * UniChar.Vector
5265 | ERR_ID_DEFAULT
5266 | ERR_MISSING_ATT of UniChar.Data
5267 | ERR_MULT_ID_ELEM of UniChar.Data
5268 | ERR_MUST_BE_AMONG of Item * UniChar.Data * UniChar.Data list
5269 | ERR_MUST_BE_UNPARSED of UniChar.Data * Location
5270 | ERR_REPEATED_ID of UniChar.Data
5271 | ERR_UNDECL_ATT of UniChar.Data * UniChar.Data
5272 | ERR_UNDECL_ID of UniChar.Data * Position list
5273
5274 (* validity errors concerning elements *)
5275 | ERR_BAD_ELEM of UniChar.Data * UniChar.Data
5276 | ERR_ELEM_CONTENT of Item
5277 | ERR_EMPTY_TAG of UniChar.Data
5278 | ERR_ENDED_EARLY of UniChar.Data
5279 | ERR_MULT_MIXED of UniChar.Data
5280 | ERR_NONEMPTY of UniChar.Data
5281 | ERR_REDEC_ELEM of UniChar.Data
5282 | ERR_ROOT_ELEM of UniChar.Data * UniChar.Data
5283
5284 (* other validity errors *)
5285 | ERR_DECL_ENT_NESTING of Location
5286 | ERR_EE_INT_SUBSET
5287 | ERR_GROUP_ENT_NESTING of Location
5288 | ERR_NO_DTD
5289 | ERR_STANDALONE_DEF of UniChar.Data
5290 | ERR_STANDALONE_ELEM of UniChar.Data
5291 | ERR_STANDALONE_ENT of EntityClass *UniChar.Data
5292 | ERR_STANDALONE_NORM of UniChar.Data
5293 | ERR_UNDECLARED of Item * UniChar.Data * Location
5294
5295 (* miscellaneous errors *)
5296 | ERR_DECL_PREDEF of UniChar.Data * UniChar.Vector
5297 | ERR_NO_SUCH_FILE of string * string
5298 | ERR_RESERVED of UniChar.Data * Item
5299 | ERR_VERSION of string
5300 | ERR_XML_SPACE
5301
5302 (* compatibility errors *)
5303 | ERR_AMBIGUOUS of UniChar.Data * int * int
5304 | ERR_MUST_ESCAPE of UniChar.Char
5305
5306 (* interoperability errors *)
5307 | ERR_EMPTY_TAG_INTER of UniChar.Data
5308 | ERR_MUST_BE_EMPTY of UniChar.Data
5309
5310 (* decoding errors *)
5311 | ERR_DECODE_ERROR of Decode.Error.DecodeError
5312
5313 datatype Warning =
5314 WARN_NO_XML_DECL
5315
5316 | WARN_MULT_DECL of Item * UniChar.Data
5317 | WARN_SHOULD_DECLARE of UniChar.Data list
5318
5319 | WARN_ATT_UNDEC_ELEM of UniChar.Data
5320 | WARN_MULT_ATT_DECL of UniChar.Data
5321 | WARN_MULT_ATT_DEF of UniChar.Data * UniChar.Data
5322 | WARN_ENUM_ATTS of UniChar.Data * UniChar.Data list
5323
5324 | WARN_DFA_TOO_LARGE of UniChar.Data * int
5325
5326 | WARN_NON_ASCII_URI of UniChar.Char
5327 end
5328 (* stop of ../../Parser/Error/errorData.sml *)
5329 (* start of ../../Parser/Error/errorString.sml *)
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342 signature ErrorString =
5343 sig
5344 val errorChar2String : UniChar.Char -> string
5345 val errorData2String : UniChar.Data -> string
5346 val errorVector2String : UniChar.Vector -> string
5347
5348 val quoteErrorChar0 : UniChar.Char -> string
5349 val quoteErrorChar : UniChar.Char -> string
5350 val quoteErrorData : UniChar.Data -> string
5351 val quoteErrorString : string -> string
5352 val quoteErrorVector : UniChar.Vector -> string
5353
5354 val Position2String : ErrorData.Position -> string
5355
5356 val Expected2String : ErrorData.Expected -> string
5357 val Found2String : ErrorData.Found -> string
5358
5359 val Item2String : ErrorData.Item -> string
5360 val AnItem2String : ErrorData.Item -> string
5361
5362 val Location2String : ErrorData.Location -> string
5363 val InLocation2String : ErrorData.Location -> string
5364
5365 val EntityClass2String : ErrorData.EntityClass -> string
5366 end
5367
5368 structure ErrorString : ErrorString =
5369 struct
5370 open
5371 ErrorData UniChar UtilString
5372
5373
5374 fun errorChar2String c =
5375 case c
5376 of 0wx9 => "\\t"
5377 | 0wxA => "\\n"
5378 | _ => if c>=0wx20 andalso c<0wx100 then String.implode [Char2char c]
5379 else "U+"^UtilString.toUpperString
5380 (StringCvt.padLeft #"0" 4 (Chars.toString c))
5381
5382 fun errorData2String cs =
5383 String.concat (map errorChar2String cs)
5384 fun errorVector2String vec =
5385 errorData2String (Vector.foldr (op ::) nil vec)
5386
5387 val QUOTE = "'"
5388 fun quoteErrorChar0 c = QUOTE^errorChar2String c^QUOTE
5389 fun quoteErrorChar c = if c=0wx0 then "entity end" else QUOTE^errorChar2String c^QUOTE
5390 fun quoteErrorData cs = QUOTE^errorData2String cs^QUOTE
5391 fun quoteErrorString s = QUOTE^s^QUOTE
5392 fun quoteErrorVector v = QUOTE^errorVector2String v^QUOTE
5393
5394 fun Position2String (fname,l,c) =
5395 if fname="" then ""
5396 else String.concat ["[",fname,":",Int2String l,".",Int2String c,"]"]
5397
5398 fun ExpItem2String exp =
5399 case exp
5400 of EXP_CHAR c => quoteErrorChar c
5401 | EXP_DATA cs => quoteErrorData cs
5402 | EXP_STRING s => s
5403
5404 fun Expected2String exp =
5405 case exp
5406 of nil => "nothing"
5407 | [one] => ExpItem2String one
5408 | _ => let val l=List.length exp
5409 in List2xString ("",", ","") ExpItem2String (List.take (exp,l-1))
5410 ^" or "^ExpItem2String (List.last exp)
5411 end
5412 fun Found2String fnd =
5413 case fnd
5414 of [0wx0] => "entity end"
5415 | cs => quoteErrorData cs
5416
5417 fun Location2String loc =
5418 case loc
5419 of LOC_NONE => "nothing"
5420 | LOC_AFTER_DTD => "document instance"
5421 | LOC_ATT_DECL => "attribute list declaration"
5422 | LOC_ATT_DEFAULT pos => "default value declared at "^Position2String pos
5423 | LOC_ATT_VALUE => "attribute value"
5424 | LOC_CDATA => "CDATA section"
5425 | LOC_CHOICE => "choice list"
5426 | LOC_COMMENT => "comment"
5427 | LOC_CONTENT => "content"
5428 | LOC_DECL => "declaration"
5429 | LOC_DOC_DECL => "document type declaration"
5430 | LOC_ELEM_DECL => "element type declaration"
5431 | LOC_ENCODING => "encoding name"
5432 | LOC_ENT_DECL => "entity declaration"
5433 | LOC_ENT_VALUE => "entity value"
5434 | LOC_EPILOG => "epilog"
5435 | LOC_ETAG => "end-tag"
5436 | LOC_IGNORED => "ignored section"
5437 | LOC_INCLUDED => "included section"
5438 | LOC_INT_DECL => "declaration in the internal subset"
5439 | LOC_INT_SUBSET => "internal subset"
5440 | LOC_LITERAL => "literal"
5441 | LOC_MIXED => "Mixed list"
5442 | LOC_NOT_DECL => "notation declaration"
5443 | LOC_OUT_COND => "outside a conditional section"
5444 | LOC_PROLOG => "prolog"
5445 | LOC_PROC => "processing instruction"
5446 | LOC_PUB_LIT => "public identifier"
5447 | LOC_SEQ => "sequence list"
5448 | LOC_STAG => "start-tag"
5449 | LOC_SUBSET => "declaration subset"
5450 | LOC_SYS_LIT => "system identifier"
5451 | LOC_TEXT_DECL => "text declaration"
5452 | LOC_VERSION => "version number"
5453 | LOC_XML_DECL => "XML declaration"
5454 fun InLocation2String loc =
5455 case loc
5456 of LOC_NONE => ""
5457 | LOC_AFTER_DTD => "after the DTD"
5458 | LOC_CONTENT => "in content"
5459 | LOC_ATT_DEFAULT pos => "in default value declared at "^Position2String pos
5460 | LOC_DOC_DECL => "in the document type declaration"
5461 | LOC_EPILOG => "after the root element"
5462 | LOC_INT_SUBSET => "in the internal subset"
5463 | LOC_OUT_COND => "outside a conditional section"
5464 | LOC_PROLOG => "in prolog"
5465 | LOC_SUBSET => "in the declaration subset"
5466 | LOC_XML_DECL => "in the XML declaration"
5467 | _ => "in "^prependAnA (Location2String loc)
5468
5469 fun EntityClass2String ent =
5470 case ent
5471 of ENT_GENERAL => "general"
5472 | ENT_PARAMETER => "parameter"
5473 | ENT_UNPARSED => "unparsed"
5474 | ENT_EXTERNAL => "external"
5475
5476 fun Item2String item =
5477 case item
5478 of IT_ATT_NAME => "attribute name"
5479 | IT_CDATA => "CDATA section"
5480 | IT_CHAR c => "character "^quoteErrorChar c
5481 | IT_CHAR_REF => "character reference"
5482 | IT_COND => "conditional section"
5483 | IT_DATA cs => if null cs then "character data" else quoteErrorData cs
5484 | IT_DECL => "declaration"
5485 | IT_DTD => "document type declaration"
5486 | IT_ELEM => "element type"
5487 | IT_ENT_NAME => "entity name"
5488 | IT_ETAG => "end-tag"
5489 | IT_GEN_ENT => "general entity"
5490 | IT_ID_NAME => "ID name"
5491 | IT_LANG_ID => "language identifier"
5492 | IT_NAME => "name"
5493 | IT_NMTOKEN => "name token"
5494 | IT_NOT_NAME => "notation name"
5495 | IT_NOTATION => "notation"
5496 | IT_PAR_ENT => "parameter entity"
5497 | IT_PAR_REF => "parameter entity reference"
5498 | IT_REF => "reference"
5499 | IT_STAG => "start-tag"
5500 | IT_TARGET => "target name"
5501
5502 fun AnItem2String item =
5503 case item
5504 of IT_CHAR c => Item2String item
5505 | IT_DATA cs => Item2String item
5506 | _ => prependAnA (Item2String item)
5507 end
5508
5509 (* stop of ../../Parser/Error/errorString.sml *)
5510 (* start of ../../Parser/Error/errorMessage.sml *)
5511
5512
5513
5514
5515
5516
5517
5518
5519 signature ErrorMessage =
5520 sig
5521 val errorMessage : ErrorData.Error -> string list
5522 val warningMessage : ErrorData.Warning -> string list
5523 end
5524
5525 structure ErrorMessage : ErrorMessage =
5526 struct
5527 open
5528 Decode
5529 UtilString
5530 ErrorData ErrorString
5531
5532 val quoteChar0 = quoteErrorChar0
5533 val quoteChar = quoteErrorChar
5534 val quoteData = quoteErrorData
5535 val quoteString = quoteErrorString
5536 val quoteVector = quoteErrorVector
5537
5538 fun errorMessage err =
5539 case err
5540 (* syntax errors *)
5541 of ERR_EMPTY loc => ["Empty",Location2String loc]
5542 | ERR_ENDED_BY_EE loc => [toUpperFirst (Location2String loc),"ended by entity end"]
5543 | ERR_EXPECTED (exp,found) =>
5544 ["Expected",Expected2String exp,"but found",Found2String found]
5545 | ERR_MISSING_WHITE => ["Missing white space"]
5546 | ERR_NON_XML_CHAR c => ["Non-XML character",quoteChar0 c]
5547 | ERR_NON_XML_CHARREF c => ["Reference to non-XML character",quoteChar0 c]
5548
5549 (* other well-formedness errors *)
5550 | ERR_CANT_PARSE loc => ["Cannot parse",Location2String loc]
5551 | ERR_ELEM_ENT_NESTING elem =>
5552 ["The first and last character of element",quoteData elem,
5553 "are in different entities"]
5554 | ERR_ELEM_TYPE_MATCH (elem,other) =>
5555 ["Element",quoteData elem,"was ended by an end-tag for",quoteData other]
5556 | ERR_IGNORED_END_TAG(elem,other) =>
5557 ["An end-tag for element type",quoteData other,"is not allowed in the",
5558 "content of element",quoteData elem]
5559 | ERR_OMITTED_END_TAG elem =>
5560 ["Element",quoteData elem,"has no end-tag"]
5561 | ERR_ENDED_IN_PROLOG => ["Document entity ended in prolog"]
5562 | ERR_FORBIDDEN_HERE(what,loc) =>
5563 [AnItem2String what,"is not allowed",InLocation2String loc]
5564 | ERR_ILLEGAL_ENTITY(what,ent,loc) =>
5565 ["Reference to",EntityClass2String what,"entity",quoteData ent,InLocation2String loc]
5566 | ERR_MULTIPLE_DTD => ["Repeated document type declaration"]
5567 | ERR_MULT_ATT_SPEC att =>
5568 ["A value for attribute",quoteData att,"was already specified in this tag"]
5569 | ERR_RECURSIVE_ENTITY(what,ent) =>
5570 ["Reference to",EntityClass2String what,"entity",quoteData ent,
5571 "that is already open"]
5572 | ERR_UNDEC_ENTITY(what,ent) =>
5573 ["Reference to undeclared",EntityClass2String what,"entity",quoteData ent]
5574
5575 (* validity errors concerning attributes *)
5576 | ERR_AT_LEAST_ONE what => ["At least one",Item2String what,"must be specified"]
5577 | ERR_AT_MOST_ONE what => ["Only one",Item2String what,"may be specified"]
5578 | ERR_ATT_IS_NOT(cs,what) => [quoteData cs,"is not",AnItem2String what]
5579 | ERR_EXACTLY_ONE what => [toUpperFirst (AnItem2String what),"must be specified"]
5580 | ERR_FIXED_VALUE(att,value,fixed) =>
5581 ["Attribute",quoteData att,"has the value",quoteVector value,
5582 "but was declared with a fixed default value of",quoteVector fixed]
5583 | ERR_ID_DEFAULT =>
5584 ["An ID attribute must have a default value of #IMPLIED or #REQUIRED"]
5585 | ERR_MISSING_ATT att =>
5586 ["No value was specified for required attribute",quoteData att]
5587 | ERR_MULT_ID_ELEM elem =>
5588 ["Element type",quoteData elem,"already has an ID attribute"]
5589 | ERR_MUST_BE_AMONG (what,x,ys) =>
5590 [toUpperFirst (Item2String what),quoteData x,"is none of",
5591 List2xString ("",",","") quoteData ys]
5592 | ERR_MUST_BE_UNPARSED (name,loc) =>
5593 [quoteData name,InLocation2String loc,"is not the name of an unparsed entity"]
5594 | ERR_REPEATED_ID name =>
5595 ["ID name",quoteData name,"already occurred as an attribute value"]
5596 | ERR_UNDECL_ATT(att,elem) =>
5597 ["Attribute",quoteData att,"was not declared for element type",quoteData elem]
5598 | ERR_UNDECL_ID(name,refs) =>
5599 (if null refs then ["Reference to non-existent ID",quoteData name]
5600 else ["Reference to non-existent ID",quoteData name,
5601 "(also referenced at",List2xString ("",", ",")") Position2String refs])
5602
5603 (* validity errors concerning elements *)
5604 | ERR_BAD_ELEM (curr,elem) =>
5605 ["Element type",quoteData elem,"not allowed at this point",
5606 "in the content of element",quoteData curr]
5607 | ERR_ELEM_CONTENT what =>
5608 [toUpperFirst (AnItem2String what),"is not allowed in element content"]
5609 | ERR_EMPTY_TAG elem =>
5610 ["Empty-element tag for element type",quoteData elem,
5611 "whose content model requires non-empty content"]
5612 | ERR_ENDED_EARLY elem =>
5613 ["Element",quoteData elem,"ended before its content was completed"]
5614 | ERR_MULT_MIXED elem =>
5615 ["Element type",quoteData elem,"already occurred in this mixed-content declaration"]
5616 | ERR_NONEMPTY elem =>
5617 ["The end-tag for element",quoteData elem,"with declared EMPTY content",
5618 "must follow immediately after its start-tag"]
5619 | ERR_REDEC_ELEM elem => ["Element type",quoteData elem,"was already declared"]
5620 | ERR_ROOT_ELEM (dec,root) =>
5621 ["Document element",quoteData root,"does not match the name",
5622 quoteData dec,"in the document type declaration"]
5623
5624 (* other validity errors *)
5625 | ERR_DECL_ENT_NESTING loc =>
5626 ["The first and last character of this",Location2String loc,
5627 "are not in the same entity replacement text"]
5628 | ERR_EE_INT_SUBSET =>
5629 ["An entity end is not allowed in a declaration in the internal subset"]
5630 | ERR_GROUP_ENT_NESTING loc =>
5631 ["The opening and closing parentheses of this",Location2String loc,
5632 "are not in the same entity replacement text"]
5633 | ERR_NO_DTD =>
5634 ["There is no document type declaration. Switching to semi-validating mode",
5635 "(will not check for declaredness of entities, elements, etc.)"]
5636 | ERR_STANDALONE_DEF att =>
5637 ["Externally declared attribute",quoteData att,"was defaulted,",
5638 "although the standalone declaration is",quoteString "yes"]
5639 | ERR_STANDALONE_ELEM elem =>
5640 ["White space occurred in the content of externally declared",
5641 "element",quoteData elem,"with declared element content",
5642 "although the standalone declaration is",quoteString "yes"]
5643 | ERR_STANDALONE_ENT(what,ent) =>
5644 ["Reference to externally declared",EntityClass2String what,"entity",
5645 quoteData ent^",","although the standalone declaration is",quoteString "yes"]
5646 | ERR_STANDALONE_NORM att =>
5647 ["The value for externally declared attribute",
5648 quoteData att,"was changed as a result of normalization,",
5649 "although the standalone declaration is",quoteString "yes"]
5650 | ERR_UNDECLARED (what,x,loc) =>
5651 ["Undeclared",Item2String what,quoteData x,InLocation2String loc]
5652
5653 (* miscellaneous errors *)
5654 | ERR_DECL_PREDEF(ent,def) =>
5655 ["General entity",quoteData ent,"must be declared as internal entity",
5656 "with replacement text",quoteVector def]
5657 | ERR_NO_SUCH_FILE(f,msg) => ["Could not open file",quoteString f,"("^msg^")"]
5658 | ERR_RESERVED(name,what) =>
5659 [quoteData name,"is reserved for standardization and therefore not allowed as",
5660 AnItem2String what]
5661 | ERR_VERSION version =>
5662 ["XML version",quoteString version,"is not supported"]
5663 | ERR_XML_SPACE =>
5664 ["Attribute",quoteString "xml:space","must be given an enumeration type",
5665 "with values",quoteString "default","and",quoteString "preserve","only"]
5666
5667 (* compatibility errors *)
5668 | ERR_AMBIGUOUS(a,n1,n2) =>
5669 ["Content model is ambiguous: conflict between the",numberNth n1,
5670 "and the",numberNth n2,"occurrence of element",quoteData a^".",
5671 "Using an approximation instead"]
5672 | ERR_MUST_ESCAPE c => ["Character",quoteChar c,"must be escaped for compatibility"]
5673
5674 (* interoperability errors *)
5675 | ERR_EMPTY_TAG_INTER elem =>
5676 ["Empty-element tag for element",quoteData elem,"with non-EMPTY declared content"]
5677 | ERR_MUST_BE_EMPTY elem =>
5678 ["An empty-element tag must be used for element type",
5679 quoteData elem,"with EMPTY declared content"]
5680
5681 (* decoding errors *)
5682 | ERR_DECODE_ERROR err => "Decoding error:"::Decode.Error.decodeMessage err
5683
5684 fun warningMessage warn =
5685 case warn
5686 of WARN_NO_XML_DECL => ["Document entity has no XML declaration"]
5687
5688 | WARN_MULT_DECL(what,name) =>
5689 ["Repeated declaration for",Item2String what,quoteData name]
5690 | WARN_SHOULD_DECLARE(ents) =>
5691 let val (one,more) = (hd ents,tl ents)
5692 in case more
5693 of nil => ["The predefined entity",quoteData one,"should have been declared"]
5694 | _ => ["The predefined entities",List2xString ("",", ","") quoteData more,
5695 "and",quoteData one,"should have been declared"]
5696 end
5697
5698 | WARN_ATT_UNDEC_ELEM elem =>
5699 ["Attribute-list declaration for undeclared element type",quoteData elem]
5700 | WARN_MULT_ATT_DECL elem =>
5701 ["Repeated attribute-list declaration for element type",quoteData elem]
5702 | WARN_MULT_ATT_DEF(elem,att) =>
5703 ["Repeated definition of attribute",quoteData att,"for element type",quoteData elem]
5704 | WARN_ENUM_ATTS(elem,names) =>
5705 ["The following name tokens occur more than once in the enumerated attribute",
5706 "types of element",quoteData elem^":",List2xString ("",", ","") quoteData names]
5707
5708 | WARN_DFA_TOO_LARGE (elem,max) =>
5709 ["The finite state machine for the content model of element type",
5710 quoteData elem,"would have more than the maximal allowed number of",
5711 Int2String max,"states. Using an approximation instead"]
5712
5713 | WARN_NON_ASCII_URI c =>
5714 ["System identifier contains non-ASCII character",quoteChar c]
5715
5716 end
5717 (* stop of ../../Parser/Error/errorMessage.sml *)
5718 (* start of ../../Parser/Error/errorUtil.sml *)
5719
5720
5721 signature ErrorUtil =
5722 sig
5723 val isFatalError : ErrorData.Error -> bool
5724 val isDecodeError : ErrorData.Error -> bool
5725 val isSyntaxError : ErrorData.Error -> bool
5726 val isValidityError : ErrorData.Error -> bool
5727 val isWellFormedError : ErrorData.Error -> bool
5728 end
5729
5730 structure ErrorUtil : ErrorUtil =
5731 struct
5732 open ErrorData
5733
5734 fun isDecodeError err =
5735 case err
5736 of ERR_DECODE_ERROR _ => true
5737 | _ => false
5738
5739 fun isSyntaxError err =
5740 case err
5741 of ERR_EMPTY _ => true
5742 | ERR_ENDED_BY_EE _ => true
5743 | ERR_EXPECTED _ => true
5744 | ERR_MISSING_WHITE => true
5745 | ERR_NON_XML_CHAR _ => true
5746 | ERR_NON_XML_CHARREF _ => true
5747 | _ => false
5748
5749 fun isWellFormedError err =
5750 case err
5751 of ERR_CANT_PARSE _ => true
5752 | ERR_ELEM_ENT_NESTING _ => true
5753 | ERR_ELEM_TYPE_MATCH _ => true
5754 | ERR_OMITTED_END_TAG _ => true
5755 | ERR_IGNORED_END_TAG _ => true
5756 | ERR_ENDED_IN_PROLOG => true
5757 | ERR_FORBIDDEN_HERE _ => true
5758 | ERR_ILLEGAL_ENTITY _ => true
5759 | ERR_MULTIPLE_DTD => true
5760 | ERR_MULT_ATT_SPEC _ => true
5761 | ERR_RECURSIVE_ENTITY _ => true
5762 | ERR_UNDEC_ENTITY _ => true
5763 | _ => isSyntaxError err
5764
5765 fun isFatalError err =
5766 case err
5767 of ERR_NO_SUCH_FILE _ => true
5768 | _ => isWellFormedError err
5769
5770 fun isValidityError err =
5771 case err
5772 of ERR_AT_LEAST_ONE _ => true
5773 | ERR_AT_MOST_ONE _ => true
5774 | ERR_ATT_IS_NOT _ => true
5775 | ERR_EXACTLY_ONE _ => true
5776 | ERR_FIXED_VALUE _ => true
5777 | ERR_ID_DEFAULT => true
5778 | ERR_MISSING_ATT _ => true
5779 | ERR_MULT_ID_ELEM _ => true
5780 | ERR_MUST_BE_AMONG _ => true
5781 | ERR_MUST_BE_UNPARSED _ => true
5782 | ERR_REPEATED_ID _ => true
5783 | ERR_UNDECL_ATT _ => true
5784 | ERR_UNDECL_ID _ => true
5785 | ERR_BAD_ELEM _ => true
5786 | ERR_ELEM_CONTENT _ => true
5787 | ERR_EMPTY_TAG _ => true
5788 | ERR_ENDED_EARLY _ => true
5789 | ERR_MULT_MIXED _ => true
5790 | ERR_NONEMPTY _ => true
5791 | ERR_REDEC_ELEM _ => true
5792 | ERR_ROOT_ELEM _ => true
5793 | ERR_DECL_ENT_NESTING _ => true
5794 | ERR_EE_INT_SUBSET => true
5795 | ERR_GROUP_ENT_NESTING _ => true
5796 | ERR_NO_DTD => true
5797 | ERR_STANDALONE_DEF _ => true
5798 | ERR_STANDALONE_ELEM _ => true
5799 | ERR_STANDALONE_ENT _ => true
5800 | ERR_STANDALONE_NORM _ => true
5801 | ERR_UNDECLARED _ => true
5802 | _ => false
5803 end
5804 (* stop of ../../Parser/Error/errorUtil.sml *)
5805 (* start of ../../Parser/Error/expected.sml *)
5806
5807
5808
5809
5810 structure Expected =
5811 struct
5812 local
5813 open UniChar ErrorData
5814 in
5815 val expAnElemName = [EXP_STRING "an element name"]
5816 val expAnEntName = [EXP_STRING "an entity name"]
5817 val expAName = [EXP_STRING "a name"]
5818 val expANameToken = [EXP_STRING "a name token"]
5819 val expANotName = [EXP_STRING "a notation name"]
5820 val expATarget = [EXP_STRING "a target name"]
5821 val expAttDefKey = [EXP_DATA (String2Data "REQUIRED"),EXP_DATA (String2Data "IMPLIED"),
5822 EXP_DATA (String2Data "FIXED")]
5823 val expAttNameGt = [EXP_STRING "an attribute name",EXP_CHAR 0wx3E]
5824 val expAttSTagEnd = [EXP_STRING "an attribute name",EXP_CHAR 0wx3E,
5825 EXP_DATA(String2Data "/>")]
5826 val expAttType = [EXP_CHAR 0wx28,EXP_DATA (String2Data "CDATA"),
5827 EXP_DATA (String2Data "ID"),EXP_DATA (String2Data "IDREF"),
5828 EXP_DATA (String2Data "IDREFS"),EXP_DATA (String2Data "ENTITY"),
5829 EXP_DATA (String2Data "ENTITIES"),EXP_DATA (String2Data "NMTOKEN"),
5830 EXP_DATA (String2Data "NMTOKENS"),EXP_DATA (String2Data "NOTATION")]
5831 val expBarRpar = [EXP_CHAR 0wx29,EXP_CHAR 0wx7C]
5832 val expCdata = [EXP_DATA (String2Data "CDATA")]
5833 fun expConCRpar c = [EXP_CHAR 0wx29,EXP_CHAR c]
5834 val expConRpar = [EXP_CHAR 0wx29,EXP_CHAR 0wx2C,EXP_CHAR 0wx7C]
5835 val expCondStatus = [EXP_DATA (String2Data "IGNORE"),EXP_DATA (String2Data "INCLUDE")]
5836 val expContSpec = [EXP_CHAR 0wx28,EXP_DATA (String2Data "ANY"),
5837 EXP_DATA (String2Data "EMPTY")]
5838 val expElemLpar = [EXP_STRING "an element name",EXP_CHAR 0wx28]
5839 val expEncStand = [EXP_DATA (String2Data "encoding"),
5840 EXP_DATA (String2Data "standalone")]
5841 val expDash = [EXP_CHAR 0wx2D]
5842 val expDashDocLbrk = [EXP_CHAR 0wx2D,EXP_CHAR 0wx5B,EXP_DATA (String2Data "DOCTYPE")]
5843 val expDashLbrack = [EXP_CHAR 0wx2D,EXP_CHAR 0wx5B]
5844 val expDigitX = [EXP_STRING "a digit",EXP_CHAR 0wx78]
5845 val expEncoding = [EXP_DATA (String2Data "encoding")]
5846 val expEncVers = [EXP_DATA (String2Data "encoding"),EXP_DATA (String2Data "version")]
5847 val expEntNamePero = [EXP_STRING "an entity name",EXP_CHAR 0wx25]
5848 val expEq = [EXP_CHAR 0wx3D]
5849 val expExclQuest = [EXP_CHAR 0wx21,EXP_CHAR 0wx3F]
5850 val expExtId = [EXP_DATA (String2Data "PUBLIC"),EXP_DATA (String2Data "SYSTEM")]
5851 val expGt = [EXP_CHAR 0wx3E]
5852 val expGtNdata = [EXP_CHAR 0wx3E,EXP_DATA (String2Data "NDATA")]
5853 val expHexDigit = [EXP_STRING "a hexadecimal digit"]
5854 val expInSubset = [EXP_CHAR 0wx3C,EXP_CHAR 0wx5D,EXP_CHAR 0wx25,
5855 EXP_STRING "white space"]
5856 val expLbrack = [EXP_CHAR 0wx5B]
5857 val expLitQuote = [EXP_CHAR 0wx22,EXP_CHAR 0wx27]
5858 val expLitQuotExt = [EXP_CHAR 0wx22,EXP_CHAR 0wx27,
5859 EXP_DATA (String2Data "PUBLIC"),EXP_DATA (String2Data "SYSTEM")]
5860 val expLpar = [EXP_CHAR 0wx28]
5861 val expNoYes = [EXP_DATA (String2Data "no"),EXP_DATA (String2Data "yes")]
5862 val expPcdata = [EXP_DATA (String2Data "PCDATA")]
5863 val expProcEnd = [EXP_DATA (String2Data "?>")]
5864 val expQuoteRni = [EXP_CHAR 0wx22,EXP_CHAR 0wx27,EXP_CHAR 0wx23]
5865 val expRbrack = [EXP_CHAR 0wx5D]
5866 val expRep = [EXP_CHAR 0wx2A]
5867 val expSemi = [EXP_CHAR 0wx3B]
5868 val expStandOpt = [EXP_DATA (String2Data "standalone"),EXP_DATA (String2Data "?>")]
5869 val expStartEnc = [EXP_STRING "a letter"]
5870 val expStartMarkup = [EXP_DATA (String2Data "--"),EXP_DATA (String2Data "ATTLIST"),
5871 EXP_DATA (String2Data "ELEMENT"),EXP_DATA (String2Data "ENTITY"),
5872 EXP_DATA (String2Data "NOTATION")]
5873 val expVersion = [EXP_DATA (String2Data "version")]
5874 end
5875 end
5876 (* stop of ../../Parser/Error/expected.sml *)
5877 (* start of ../../Parser/Error/errors.sml *)
5878 structure Errors =
5879 struct
5880 open
5881 UtilError
5882 ErrorData ErrorMessage ErrorString ErrorUtil Expected
5883 end
5884 (* stop of ../../Parser/Error/errors.sml *)
5885 (* start of ../../Parser/Base/baseData.sml *)
5886 (*--------------------------------------------------------------------------*)
5887 (* Structure: BaseData *)
5888 (*--------------------------------------------------------------------------*)
5889
5890 structure BaseData =
5891 struct
5892 open DfaData
5893
5894 (*--- external ids may have a public id and must have a system id ---*)
5895 (*--- for notations, however, also the system id can be optional ----*)
5896 datatype ExternalId =
5897 EXTID of (string * UniChar.Char) option * (Uri.Uri * Uri.Uri * UniChar.Char) option
5898
5899 (*--- external ids may have a public id and must have a system id ---*)
5900 type NotationInfo = ExternalId option
5901
5902 (*--- replacement of a general entity ---*)
5903 datatype GenEntity =
5904 GE_NULL
5905 | GE_INTERN of UniChar.Vector * UniChar.Vector
5906 | GE_EXTERN of ExternalId
5907 | GE_UNPARSED of ExternalId * int * Errors.Position
5908 type GenEntInfo = GenEntity * bool
5909
5910 fun isExtGen (GE_EXTERN _) = true
5911 | isExtGen _ = false
5912
5913 (*--- replacement of a parameter entity ---*)
5914 datatype ParEntity =
5915 PE_NULL
5916 | PE_INTERN of UniChar.Vector * UniChar.Vector
5917 | PE_EXTERN of ExternalId
5918 type ParEntInfo = ParEntity * bool
5919
5920 fun isExtPar (PE_EXTERN _) = true
5921 | isExtPar _ = false
5922
5923 (*--- declared type of an attribute ---*)
5924 datatype AttType =
5925 AT_CDATA
5926 | AT_NMTOKEN
5927 | AT_NMTOKENS
5928 | AT_ID
5929 | AT_IDREF
5930 | AT_IDREFS
5931 | AT_ENTITY
5932 | AT_ENTITIES
5933 | AT_GROUP of int list
5934 | AT_NOTATION of int list
5935
5936 (*--- typed attribute value ---*)
5937 datatype AttValue =
5938 AV_CDATA of UniChar.Vector
5939 | AV_NMTOKEN of UniChar.Data
5940 | AV_NMTOKENS of UniChar.Data list
5941 | AV_ID of int
5942 | AV_IDREF of int
5943 | AV_IDREFS of int list
5944 | AV_ENTITY of int
5945 | AV_ENTITIES of int list
5946 | AV_GROUP of int list * int
5947 | AV_NOTATION of int list * int
5948
5949 fun isIdType at = at=AT_ID
5950
5951 (*--- default values of attributes ---*)
5952 datatype AttDefault =
5953 AD_IMPLIED
5954 | AD_REQUIRED
5955 | AD_DEFAULT of (UniChar.Vector * UniChar.Vector * AttValue option)
5956 * (Errors.Position * bool ref)
5957 | AD_FIXED of (UniChar.Vector * UniChar.Vector * AttValue option)
5958 * (Errors.Position * bool ref)
5959
5960 (*--- attribute definition (list) ---*)
5961 (*--- the boolean says whether it was externally declared ---*)
5962 type AttDef = int * AttType * AttDefault * bool
5963 type AttDefList = AttDef list
5964
5965 (*--- content specification ---*)
5966 fun defaultAttDef idx = (idx,AT_CDATA,AD_IMPLIED,false)
5967
5968 (*--- content specification ---*)
5969 datatype ContentSpec =
5970 CT_ANY
5971 | CT_EMPTY
5972 | CT_MIXED of int list
5973 | CT_ELEMENT of DfaData.ContentModel * DfaData.Dfa
5974
5975 fun isMixed ct =
5976 case ct
5977 of CT_ANY => true
5978 | CT_MIXED _ => true
5979 | _ => false
5980
5981 type ElemInfo = {decl : (ContentSpec * bool) option,
5982 atts : (AttDefList * bool) option,
5983 errAtts : int list}
5984
5985 val nullElemInfo : ElemInfo = {decl=NONE,
5986 atts=NONE,
5987 errAtts=nil}
5988
5989 (*--------------------------------------------------------------------*)
5990 (* the id info tells whether an id value has occurred for a name and *)
5991 (* the list of all positions where it occurred as an idref value. *)
5992 (*--------------------------------------------------------------------*)
5993 type IdInfo = bool * Errors.Position list
5994 val nullIdInfo : IdInfo = (false,nil)
5995 end
5996 (* stop of ../../Parser/Base/baseData.sml *)
5997 (* start of ../../Parser/Dfa/dfaString.sml *)
5998
5999
6000
6001
6002
6003 (*--------------------------------------------------------------------------*)
6004 (* Structure: DfaString *)
6005 (* *)
6006 (* Notes: *)
6007 (* This structure is needed for debugging of content models and tables. *)
6008 (* *)
6009 (* Depends on: *)
6010 (* DfaData *)
6011 (* UtilString *)
6012 (* *)
6013 (* Exceptions raised by functions in this structure: *)
6014 (* Table2String : none *)
6015 (* ContentModel2String : none *)
6016 (*--------------------------------------------------------------------------*)
6017 signature DfaString =
6018 sig
6019 val ContentModel2String : (int -> string) -> DfaData.ContentModel -> string
6020 val Dfa2String : (int -> string) -> DfaData.Dfa -> string
6021 end
6022
6023 structure DfaString : DfaString =
6024 struct
6025 open DfaBase UtilString
6026
6027 fun State2String q = if q=dfaError then "Error" else Int2String q
6028
6029 fun Info2String Elem2String (q,mt,fst) = String.concat
6030 (State2String q::Bool2xString ("[empty]","") mt
6031 ::map (fn (q,a) => " "^Elem2String a^"->"^State2String q) fst)
6032
6033 fun ContentModel2String Elem2String cm =
6034 case cm
6035 of CM_ELEM i => Elem2String i
6036 | CM_OPT cm => ContentModel2String Elem2String cm^"?"
6037 | CM_REP cm => ContentModel2String Elem2String cm^"*"
6038 | CM_PLUS cm => ContentModel2String Elem2String cm^"+"
6039 | CM_ALT cms => List2xString ("(","|",")") (ContentModel2String Elem2String) cms
6040 | CM_SEQ cms => List2xString ("(",",",")") (ContentModel2String Elem2String) cms
6041
6042 fun CM2String Elem2String =
6043 let fun cm2s indent cm =
6044 case cm
6045 of (ELEM a,info) => String.concat
6046 [indent,Elem2String a," ",Info2String Elem2String info,"\n"]
6047 | (OPT cm',info) => String.concat
6048 [indent,"? ",Info2String Elem2String info,"\n",cm2s (indent^" ") cm']
6049 | (REP cm',info) => String.concat
6050 [indent,"* ",Info2String Elem2String info,"\n",cm2s (indent^" ") cm']
6051 | (PLUS cm',info) => String.concat
6052 [indent,"+ ",Info2String Elem2String info,"\n",cm2s (indent^" ") cm']
6053 | (ALT cms,info) => String.concat
6054 (indent^"| "::Info2String Elem2String info::"\n"
6055 ::map (cm2s (indent^" ")) cms)
6056 | (SEQ cms,info) => String.concat
6057 (indent^", "::Info2String Elem2String info::"\n"
6058 ::map (cm2s (indent^" ")) cms)
6059 in cm2s ""
6060 end
6061
6062 fun Row2String Elem2String (lo,hi,tab,fin) =
6063 String.concat
6064 (Vector.foldri
6065 (fn (i,q,yet) => if q<0 then yet
6066 else " "::Elem2String (i+lo)::"->"::State2String q::yet)
6067 (if fin then [" [Final]"] else nil)
6068 (tab,0,NONE))
6069
6070 fun Dfa2String Elem2String tab =
6071 String.concat
6072 (Vector.foldri
6073 (fn (q,row,yet) => State2String q::":"::Row2String Elem2String row::yet)
6074 nil (tab,0,NONE))
6075 end
6076 (* stop of ../../Parser/Dfa/dfaString.sml *)
6077 (* start of ../../Parser/Base/baseString.sml *)
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090 (*--------------------------------------------------------------------------*)
6091 (* Structure: BaseString *)
6092 (* *)
6093 (* Depends on: *)
6094 (* UniChar *)
6095 (* Dfa *)
6096 (* BaseData *)
6097 (* UtilString *)
6098 (* *)
6099 (* Exceptions raised by functions in this structure: *)
6100 (* ElemInfo2xString : InternalError *)
6101 (* ExternalId2String : none *)
6102 (* GenEntity2xString : none *)
6103 (* Notation2String : none *)
6104 (* IdInfo2String : none *)
6105 (* ParEntity2String : none *)
6106 (*--------------------------------------------------------------------------*)
6107 signature BaseString =
6108 sig
6109 val ExternalId2String : BaseData.ExternalId -> string
6110 val NotationInfo2String : BaseData.NotationInfo -> string
6111
6112 val GenEntity2xString : (int -> string) -> BaseData.GenEntity -> string
6113 val ParEntity2String : BaseData.ParEntity -> string
6114
6115 val ElemInfo2xString : (int -> string) * (int -> string) * (int -> string)
6116 * (int -> string) * (int -> string) -> BaseData.ElemInfo -> string
6117
6118 val IdInfo2String : BaseData.IdInfo -> string
6119 end
6120
6121 structure BaseString : BaseString =
6122 struct
6123 open
6124 UtilString Uri
6125 Errors UniChar DfaString
6126 BaseData
6127
6128 val THIS_MODULE = "BaseString"
6129
6130 fun ExternalId2String (EXTID id) =
6131 case id
6132 of (SOME(p,pq),SOME(rel,s,sq)) => String.concat
6133 ["PUBLIC ",quoteUni pq p,
6134 " ",quoteUni sq (Uri2String rel),
6135 " @ ",quoteUni sq (Uri2String s)]
6136 | (SOME(p,pq),NONE) => String.concat
6137 ["PUBLIC ",quoteUni pq p]
6138 | (NONE,SOME(rel,s,sq)) => String.concat
6139 ["SYSTEM ",quoteUni sq (Uri2String rel),
6140 " @ ",quoteUni sq (Uri2String s)]
6141 | (NONE,NONE) => "<none>"
6142 fun NotationInfo2String not =
6143 case not
6144 of NONE => "undeclared"
6145 | SOME extId => ExternalId2String extId
6146
6147 fun GenEntity2xString NotIdx2String ge =
6148 case ge
6149 of GE_NULL => "NULL"
6150 | GE_INTERN(lit,cv) => let val quote = Vector.sub(lit,0)
6151 in String.concat ["INTERN ",Vector2String lit,
6152 " - ",quoteVector quote cv]
6153 end
6154 | GE_EXTERN id => "EXTERN "^ExternalId2String id
6155 | GE_UNPARSED(id,not,_) => "UNPARSED "^ExternalId2String id^" "^NotIdx2String not
6156
6157 fun ParEntity2String pe =
6158 case pe
6159 of PE_NULL => "NULL"
6160 | PE_INTERN(lit,cv) => let val quote = Vector.sub(lit,0)
6161 in String.concat ["INTERN ",Vector2String lit,
6162 " - ",quoteVector quote cv]
6163 end
6164 | PE_EXTERN id => "EXTERN "^ExternalId2String id
6165
6166 fun ContentSpec2String Elem2String cs =
6167 case cs
6168 of CT_ANY => "ANY"
6169 | CT_EMPTY => "EMPTY"
6170 | CT_MIXED is => List2xString ("MIXED (","|",")") Elem2String is
6171 | CT_ELEMENT(cm,_) => "ELEMENT "^ContentModel2String Elem2String cm
6172
6173 fun AttValue2xString (Att2String,Ent2String,Id2String,Not2String) quote av =
6174 quoteUni quote (case av
6175 of AV_CDATA buf => Vector2String buf
6176 | AV_NMTOKEN cs => Data2String cs
6177 | AV_NMTOKENS css => List2xString (""," ","") Data2String css
6178 | AV_ID idx => Id2String idx
6179 | AV_IDREF idx => Id2String idx
6180 | AV_IDREFS idxs => List2xString (""," ","") Id2String idxs
6181 | AV_ENTITY idx => Ent2String idx
6182 | AV_ENTITIES idxs => List2xString (""," ","") Ent2String idxs
6183 | AV_GROUP(_,idx) => Att2String idx
6184 | AV_NOTATION(_,idx) => Not2String idx)
6185
6186 fun AttDefault2xString funs ad =
6187 case ad
6188 of AD_DEFAULT ((lit,cv,av),_) =>
6189 let val quote = Vector.sub(lit,0)
6190 in String.concat [quoteVector quote cv," ",
6191 Option2String0 (AttValue2xString funs quote) av]
6192 end
6193 | AD_FIXED ((lit,cv,av),_) =>
6194 let val quote = Vector.sub(lit,0)
6195 in String.concat ["#FIXED ",quoteVector quote cv," ",
6196 Option2String0 (AttValue2xString funs quote) av]
6197 end
6198 | AD_IMPLIED => "#IMPLIED"
6199 | AD_REQUIRED => "#REQUIRED"
6200
6201 fun AttType2xString (Att2String,Not2String) at =
6202 case at
6203 of AT_CDATA => "CDATA"
6204 | AT_NMTOKEN => "NMTOKEN"
6205 | AT_NMTOKENS => "NMTOKENS"
6206 | AT_ID => "ID"
6207 | AT_IDREF => "IDREF"
6208 | AT_IDREFS => "IDREFS"
6209 | AT_ENTITY => "ENTITY"
6210 | AT_ENTITIES => "ENTITIES"
6211 | AT_GROUP idxs => List2xString ("(","|",")") Att2String idxs
6212 | AT_NOTATION idxs => List2xString ("NOTATION(","|",")") Not2String idxs
6213
6214 fun AttDef2xString (funs as (Att2String,_,_,Not2String)) (idx,attType,default,ext) =
6215 String.concat [Att2String idx," ",
6216 AttType2xString (Att2String,Not2String) attType," ",
6217 AttDefault2xString funs default,
6218 Bool2xString ("[external]","") ext]
6219
6220 fun AttDefList2xString funs adl = List2xString ("",",","") (AttDef2xString funs) adl
6221
6222 fun ElemInfo2xString (Att2String,Elem2String,Ent2String,Id2String,Not2String)
6223 ({decl,atts,...}:ElemInfo) =
6224 let val dec = case decl
6225 of NONE => "elem undeclared"
6226 | SOME(cont,ext) => String.concat
6227 ["elem declared ",if ext then "ex" else "in","ternally: ",
6228 ContentSpec2String Elem2String cont]
6229 val att = case atts
6230 of NONE => "no atts declared"
6231 | SOME(defs,hadId) => String.concat
6232 ["atts were declared",if hadId then "(has id attribute): " else ": ",
6233 AttDefList2xString (Att2String,Ent2String,Id2String,Not2String) defs]
6234 in dec^att
6235 end
6236
6237 fun IdInfo2String (decl,refs) =
6238 Bool2xString ("declared","undeclared") decl^"/"^
6239 (if null refs then "no references"
6240 else List2xString ("references: ",", ","") Position2String refs)
6241 end
6242
6243 (* stop of ../../Parser/Base/baseString.sml *)
6244 (* start of ../../Parser/Base/base.sml *)
6245
6246
6247
6248 structure Base =
6249 struct
6250 open
6251 BaseData
6252 BaseString
6253 end
6254 (* stop of ../../Parser/Base/base.sml *)
6255 (* start of ../../Parser/Params/dtd.sml *)
6256 (*--------------------------------------------------------------------------*)
6257 (* Structure: Dtd *)
6258 (* *)
6259 (* Exceptions raised by functions in this structure: *)
6260 (* AttNot2Index : none *)
6261 (* Element2Index : none *)
6262 (* GenEnt2Index : none *)
6263 (* Id2Index : none *)
6264 (* Index2AttNot : NoSuchIndex *)
6265 (* Index2Element : NoSuchIndex *)
6266 (* Index2GenEnt : NoSuchIndex *)
6267 (* Index2Id : NoSuchIndex *)
6268 (* Index2ParEnt : NoSuchIndex *)
6269 (* ParEnt2Index : none *)
6270 (* entitiesWellformed : none *)
6271 (* getElement : NoSuchIndex *)
6272 (* getGenEnt : NoSuchIndex *)
6273 (* getId : NoSuchIndex *)
6274 (* getNotation : NoSuchIndex *)
6275 (* getParEnt : NoSuchIndex *)
6276 (* hasNotation : NoSuchIndex *)
6277 (* initDtdTables : none *)
6278 (* maxUsedElem : none *)
6279 (* maxUsedId : none *)
6280 (* printAttNotTable : none *)
6281 (* printIdTable : none *)
6282 (* printParEntTable : none *)
6283 (* printxElementTable : none *)
6284 (* printxGenEntTable : none *)
6285 (* setElement : NoSuchIndex *)
6286 (* setGenEnt : NoSuchIndex *)
6287 (* setId : NoSuchIndex *)
6288 (* setNotation : NoSuchIndex *)
6289 (* setParEnt : NoSuchIndex *)
6290 (*--------------------------------------------------------------------------*)
6291 signature Dtd =
6292 sig
6293 type Dtd
6294
6295 val hasDtd : Dtd -> bool
6296 val hasExternal : Dtd -> bool
6297 val standsAlone : Dtd -> bool
6298
6299 val setHasDtd : Dtd -> unit
6300 val setExternal : Dtd -> unit
6301 val setStandAlone : Dtd -> bool -> unit
6302
6303 val entitiesWellformed : Dtd -> bool
6304
6305 val validPredef : int -> UniChar.Vector
6306 val isRedefined : Dtd -> int -> bool
6307 val setRedefined : Dtd -> int -> unit
6308 val notRedefined : Dtd -> UniChar.Data list
6309
6310 val AttNot2Index : Dtd -> UniChar.Data -> int
6311 val Element2Index : Dtd -> UniChar.Data -> int
6312 val Id2Index : Dtd -> UniChar.Data -> int
6313 val GenEnt2Index : Dtd -> UniChar.Data -> int
6314 val ParEnt2Index : Dtd -> UniChar.Data -> int
6315
6316 val Index2Element : Dtd -> int -> UniChar.Data
6317 val Index2Id : Dtd -> int -> UniChar.Data
6318 val Index2GenEnt : Dtd -> int -> UniChar.Data
6319 val Index2AttNot : Dtd -> int -> UniChar.Data
6320 val Index2ParEnt : Dtd -> int -> UniChar.Data
6321
6322 val getId : Dtd -> int -> Base.IdInfo
6323 val getElement : Dtd -> int -> Base.ElemInfo
6324 val getGenEnt : Dtd -> int -> Base.GenEntInfo
6325 val getNotation : Dtd -> int -> Base.NotationInfo
6326 val getParEnt : Dtd -> int -> Base.ParEntInfo
6327
6328 val hasNotation : Dtd -> int -> bool
6329
6330 val setId : Dtd -> int * Base.IdInfo -> unit
6331 val setElement : Dtd -> int * Base.ElemInfo -> unit
6332 val setGenEnt : Dtd -> int * Base.GenEntInfo -> unit
6333 val setNotation : Dtd -> int * Base.ExternalId -> unit
6334 val setParEnt : Dtd -> int * Base.ParEntInfo -> unit
6335
6336 val maxUsedId : Dtd -> int
6337 val maxUsedElem : Dtd -> int
6338 val maxUsedGen : Dtd -> int
6339
6340 val initDtdTables : unit -> Dtd
6341 val printDtdTables : Dtd -> unit
6342
6343 val printAttNotTable : Dtd -> unit
6344 val printIdTable : Dtd -> unit
6345 val printElementTable : Dtd -> unit
6346 val printGenEntTable : Dtd -> unit
6347 val printParEntTable : Dtd -> unit
6348
6349 val defaultIdx : int
6350 val preserveIdx : int
6351 val xmlLangIdx : int
6352 val xmlSpaceIdx : int
6353 val xmlSpaceType : Base.AttType
6354 end
6355
6356 structure Dtd :> Dtd =
6357 struct
6358 open
6359 UtilInt
6360 Base UniChar
6361 DataDict DataSymTab
6362
6363 val O_TS_ELEM = ref 6 (* Initial size of element table *)
6364 val O_TS_GEN_ENT = ref 6 (* Initial size of general entity table *)
6365 val O_TS_ID = ref 6 (* Initial size of id attribute table *)
6366 val O_TS_ATT_NOT = ref 6 (* Initial size of notation table *)
6367 val O_TS_PAR_ENT = ref 6 (* Initial size of parameter entity table *)
6368
6369 (*--------------------------------------------------------------------*)
6370 (* this is how the predefined entities must be declared. *)
6371 (*--------------------------------------------------------------------*)
6372 val predefined = Vector.fromList
6373 (map (fn (x,y,z) => (String2Data x,String2Vector y,String2Vector z))
6374 [("","",""),
6375 ("amp" ,"'&#38;'","&#38;"),
6376 ("lt" ,"'&#60;'","&#60;"),
6377 ("gt" ,"'&#62;'","&#62;"),
6378 ("apos","\"'\"" ,"'" ),
6379 ("quot","'\"'" ,"\"" )])
6380 fun validPredef i = #3(Vector.sub(predefined,i))
6381
6382 (*--------------------------------------------------------------------*)
6383 (* this type holds all information relevent to the DTD. *)
6384 (*--------------------------------------------------------------------*)
6385 type Dtd = {hasDtdFlag : bool ref,
6386 standAloneFlag : bool ref,
6387 externalFlag : bool ref,
6388 elDict : ElemInfo DataDict.Dict,
6389 genDict : GenEntInfo DataDict.Dict,
6390 idDict : IdInfo DataDict.Dict,
6391 notDict : NotationInfo DataDict.Dict,
6392 parDict : ParEntInfo DataDict.Dict,
6393 preRedef : bool array
6394 }
6395
6396 fun newDtd() = {hasDtdFlag = ref false,
6397 standAloneFlag = ref false,
6398 externalFlag = ref false,
6399 elDict = nullDict ("element",nullElemInfo),
6400 idDict = nullDict ("ID name",nullIdInfo),
6401 genDict = nullDict ("general entity",(GE_NULL,false)),
6402 notDict = nullDict ("attribute and notation",NONE:NotationInfo),
6403 parDict = nullDict ("parameter entity",(PE_NULL,false)),
6404 preRedef = Array.array(6,false)
6405 } : Dtd
6406
6407 val default = String2Data "default"
6408 val preserve = String2Data "preserve"
6409 val xmlLang = String2Data "xml:lang"
6410 val xmlSpace = String2Data "xml:space"
6411
6412 (*--------------------------------------------------------------------*)
6413 (* standalone status, existance of a DTD and of external declarations *)
6414 (* externalFlag is true if there is an external subset or a (not nece-*)
6415 (* ssarily external) parameter entity reference in the DTD. (cf. 4.1) *)
6416 (*--------------------------------------------------------------------*)
6417 fun standsAlone (dtd:Dtd) = !(#standAloneFlag dtd)
6418 fun hasExternal (dtd:Dtd) = !(#externalFlag dtd)
6419 fun hasDtd (dtd:Dtd) = !(#hasDtdFlag dtd)
6420
6421 fun setHasDtd (dtd:Dtd) = #hasDtdFlag dtd := true
6422 fun setExternal (dtd:Dtd) = #externalFlag dtd := true
6423 fun setStandAlone (dtd:Dtd) x = #standAloneFlag dtd := x
6424
6425
6426 (*--------------------------------------------------------------------*)
6427 (* 4.1: *)
6428 (* Well-Formedness Constraint: Entity Declared *)
6429 (* In a document without any DTD, a document with only an internal *)
6430 (* DTD subset which contains no parameter entity references, or a *)
6431 (* document with "standalone='yes'", the Name given in the entity *)
6432 (* reference must match that in an entity declaration ... Note that *)
6433 (* if entities are declared in the external subset or in external *)
6434 (* parameter entities, a non-validating processor is not obligated *)
6435 (* to read and process their declarations; for such documents, the *)
6436 (* rule that an entity must be declared is a well-formedness *)
6437 (* constraint only if standalone='yes'. *)
6438 (* *)
6439 (* Thus a reference to an undeclared entity is a well-formedness *)
6440 (* error if either #hasDtdFlag or #externalFlag is false, or if *)
6441 (* #standaloneFlag is true *)
6442 (*--------------------------------------------------------------------*)
6443 (* bug fixed 080600: changed !hasDtdFlag to not(!hasDtdFlag) *)
6444 (*--------------------------------------------------------------------*)
6445 fun entitiesWellformed ({hasDtdFlag,standAloneFlag,externalFlag,...}:Dtd) =
6446 not (!hasDtdFlag andalso !externalFlag) orelse !standAloneFlag
6447
6448 fun initStandalone ({hasDtdFlag,standAloneFlag,externalFlag,...}:Dtd) =
6449 (hasDtdFlag := false; standAloneFlag := false; externalFlag := false)
6450
6451 (*--------------------------------------------------------------------*)
6452 (* this array tells whether the predefined entities (index 1-5) have *)
6453 (* been declared in the dtd. *)
6454 (*--------------------------------------------------------------------*)
6455 fun isRedefined (dtd:Dtd) i = Array.sub(#preRedef dtd,i)
6456 fun setRedefined (dtd:Dtd) i = Array.update(#preRedef dtd,i,true)
6457 fun notRedefined dtd = List.mapPartial
6458 (fn i => if isRedefined dtd i then NONE else SOME(#1(Vector.sub(predefined,i))))
6459 [1,2,3,4,5]
6460
6461 fun AttNot2Index (dtd:Dtd) name = getIndex(#notDict dtd,name)
6462 fun Element2Index (dtd:Dtd) name = getIndex(#elDict dtd,name)
6463 fun GenEnt2Index (dtd:Dtd) name = getIndex(#genDict dtd,name)
6464 fun Id2Index (dtd:Dtd) name = getIndex(#idDict dtd,name)
6465 fun ParEnt2Index (dtd:Dtd) name = getIndex(#parDict dtd,name)
6466
6467 fun Index2AttNot (dtd:Dtd) idx = getKey(#notDict dtd,idx)
6468 fun Index2Element (dtd:Dtd) idx = getKey(#elDict dtd,idx)
6469 fun Index2GenEnt (dtd:Dtd) idx = getKey(#genDict dtd,idx)
6470 fun Index2Id (dtd:Dtd) idx = getKey(#idDict dtd,idx)
6471 fun Index2ParEnt (dtd:Dtd) idx = getKey(#parDict dtd,idx)
6472
6473 fun getElement (dtd:Dtd) idx = getByIndex(#elDict dtd,idx)
6474 fun getGenEnt (dtd:Dtd) idx = getByIndex(#genDict dtd,idx)
6475 fun getId (dtd:Dtd) idx = getByIndex(#idDict dtd,idx)
6476 fun getNotation (dtd:Dtd) idx = getByIndex(#notDict dtd,idx)
6477 fun getParEnt (dtd:Dtd) idx = getByIndex(#parDict dtd,idx)
6478
6479 fun hasNotation (dtd:Dtd) idx = isSome(getByIndex(#notDict dtd,idx))
6480
6481 fun setElement (dtd:Dtd) (idx,el) = setByIndex(#elDict dtd,idx,el)
6482 fun setGenEnt (dtd:Dtd) (idx,ge) = setByIndex(#genDict dtd,idx,ge)
6483 fun setId (dtd:Dtd) (idx,a) = setByIndex(#idDict dtd,idx,a)
6484 fun setNotation (dtd:Dtd) (idx,nt) = setByIndex(#notDict dtd,idx,SOME nt)
6485 fun setParEnt (dtd:Dtd) (idx,pe) = setByIndex(#parDict dtd,idx,pe)
6486
6487 fun maxUsedElem (dtd:Dtd) = usedIndices(#elDict dtd)-1
6488 fun maxUsedGen (dtd:Dtd) = usedIndices(#genDict dtd)-1
6489 fun maxUsedId (dtd:Dtd) = usedIndices(#idDict dtd)-1
6490
6491 (*--------------------------------------------------------------------*)
6492 (* initialize the attribute tables. Make sure that indices 0...3 are *)
6493 (* assigned to "default", "preserve", "xml:lang" and "xml:space". *)
6494 (*--------------------------------------------------------------------*)
6495 fun initAttNotTable (dtd as {idDict,notDict,...}:Dtd) =
6496 let
6497 val _ = clearDict(notDict,SOME(!O_TS_ATT_NOT))
6498 val _ = clearDict(idDict,SOME(!O_TS_ID))
6499 val _ = AttNot2Index dtd default
6500 val _ = AttNot2Index dtd preserve
6501 val _ = AttNot2Index dtd xmlLang
6502 val _ = AttNot2Index dtd xmlSpace
6503 in ()
6504 end
6505 fun initElementTable (dtd:Dtd) = clearDict(#elDict dtd,SOME(!O_TS_ELEM))
6506 (*--------------------------------------------------------------------*)
6507 (* reserve 0 for gen entity -, i.e., the document entity. *)
6508 (* reserve 1 for gen entity amp, i.e., "&#38;#38;" *)
6509 (* reserve 2 for gen entity lt, i.e., "&#38;#60;" *)
6510 (* reserve 3 for gen entity gt, i.e., "&#62;" *)
6511 (* reserve 4 for gen entity apos, i.e., "&#39;" *)
6512 (* reserve 5 for gen entity quot, i.e., "&#34;" *)
6513 (* reserve 0 for par entity -, i.e., the external dtd subset. *)
6514 (* *)
6515 (* Cf. 4.1: *)
6516 (* *)
6517 (* ... except that well-formed documents need not declare any of *)
6518 (* the following entities: amp, lt, gt, apos, quot. *)
6519 (* *)
6520 (* and 4.6: *)
6521 (* *)
6522 (* <!ENTITY lt "&#38;#60;"> *)
6523 (* <!ENTITY gt "&#62;"> *)
6524 (* <!ENTITY amp "&#38;#38;"> *)
6525 (* <!ENTITY apos "&#39;"> *)
6526 (* <!ENTITY quot "&#34;"> *)
6527 (*--------------------------------------------------------------------*)
6528 fun initEntityTables (dtd as {genDict,parDict,preRedef,...}:Dtd) =
6529 let
6530 val _ = clearDict(genDict,SOME(!O_TS_GEN_ENT))
6531 val _ = clearDict(parDict,SOME(!O_TS_PAR_ENT))
6532 val _ = map (fn i => Array.update(preRedef,i,false)) [1,2,3,4,5]
6533 val _ = GenEnt2Index dtd [0wx2D] (* "-" *)
6534 val _ = ParEnt2Index dtd [0wx2D] (* "-" *)
6535 val _ = Vector.appi
6536 (fn (_,(name,lit,cs))
6537 => (setGenEnt dtd (GenEnt2Index dtd name,(GE_INTERN(lit,cs),false))))
6538 (predefined,1,NONE)
6539 in ()
6540 end
6541
6542 fun initDtdTables() =
6543 let
6544 val dtd = newDtd()
6545 val _ = initAttNotTable dtd
6546 val _ = initElementTable dtd
6547 val _ = initEntityTables dtd
6548 val _ = initStandalone dtd
6549 in dtd
6550 end
6551
6552 local
6553 val dtd = initDtdTables()
6554 in
6555 val defaultIdx = AttNot2Index dtd default
6556 val preserveIdx = AttNot2Index dtd preserve
6557 val xmlLangIdx = AttNot2Index dtd xmlLang
6558 val xmlSpaceIdx = AttNot2Index dtd xmlSpace
6559 val xmlSpaceType = AT_GROUP (IntLists.addIntList (preserveIdx,[defaultIdx]))
6560 end
6561
6562 fun printAttNotTable (dtd:Dtd) =
6563 printDict NotationInfo2String (#notDict dtd)
6564 fun printElementTable dtd =
6565 printDict (ElemInfo2xString (UniChar.Data2String o (Index2AttNot dtd),
6566 UniChar.Data2String o (Index2Element dtd),
6567 UniChar.Data2String o (Index2GenEnt dtd),
6568 UniChar.Data2String o (Index2Id dtd),
6569 UniChar.Data2String o (Index2AttNot dtd))) (#elDict dtd)
6570 fun printGenEntTable dtd =
6571 printDict (fn (ent,ext) => GenEntity2xString (Data2String o (Index2AttNot dtd)) ent
6572 ^(if ext then "[external]" else "")) (#genDict dtd)
6573 fun printIdTable (dtd:Dtd) = printDict (IdInfo2String) (#idDict dtd)
6574 fun printParEntTable (dtd:Dtd) =
6575 printDict (fn (ent,ext) => ParEntity2String ent
6576 ^(if ext then "[external]" else "")) (#parDict dtd)
6577
6578 fun printDtdTables dtd = (printAttNotTable dtd;
6579 printElementTable dtd;
6580 printGenEntTable dtd;
6581 printIdTable dtd;
6582 printParEntTable dtd)
6583 end
6584 (* stop of ../../Parser/Params/dtd.sml *)
6585 (* start of ../../Parser/Params/hookData.sml *)
6586 structure HookData =
6587 struct
6588 type StartEnd = Errors.Position * Errors.Position
6589 (*--------------------------------------------------------------------*)
6590 (* a text declaration consists of a version info and an encoding decl.*)
6591 (* an xml declaration has an additional standalone decl. *)
6592 (*--------------------------------------------------------------------*)
6593 type TextDecl = string option * string option
6594 type XmlDecl = string option * string option * bool option
6595
6596 type XmlInfo = Uri.Uri * Encoding.Encoding * XmlDecl option
6597 type ExtSubsetInfo = Uri.Uri * Encoding.Encoding * TextDecl option
6598 type SubsetInfo = Errors.Position
6599 type EndDtdInfo = Errors.Position
6600
6601 type ErrorInfo = Errors.Position * Errors.Error
6602 type WarningInfo = Errors.Position * Errors.Warning
6603 type NoFileInfo = string * string
6604
6605 type CommentInfo = StartEnd * UniChar.Vector
6606 type ProcInstInfo = StartEnd * UniChar.Data * Errors.Position * UniChar.Vector
6607
6608 type DtdInfo = int * Base.ExternalId option
6609
6610 datatype AttPresent =
6611 AP_IMPLIED
6612 | AP_MISSING
6613 | AP_DEFAULT of UniChar.Vector * UniChar.Vector * Base.AttValue option
6614 | AP_PRESENT of UniChar.Vector * UniChar.Vector * Base.AttValue option
6615
6616 type AttSpec = int * AttPresent * (UniChar.Data * UniChar.Data) option
6617 type AttSpecList = AttSpec list
6618
6619 type EndTagInfo = StartEnd * int * (int * UniChar.Data) option
6620 type StartTagInfo = StartEnd * int * AttSpecList * UniChar.Data * bool
6621 type WhiteInfo = UniChar.Vector
6622 type CDataInfo = StartEnd * UniChar.Vector
6623 type DataInfo = StartEnd * UniChar.Vector * bool
6624
6625 type CharRefInfo = StartEnd * UniChar.Char * UniChar.Vector
6626 type GenRefInfo = StartEnd * int * Base.GenEntity * bool
6627 type ParRefInfo = StartEnd * int * Base.ParEntity * bool
6628 type EntEndInfo = Errors.Position
6629
6630 datatype MarkupDecl =
6631 DEC_ATTLIST of int * (int * Base.AttType * Base.AttDefault) list * bool
6632 | DEC_ELEMENT of int * Base.ContentSpec * bool
6633 | DEC_GEN_ENT of int * Base.GenEntity * bool
6634 | DEC_PAR_ENT of int * Base.ParEntity * bool
6635 | DEC_NOTATION of int * Base.ExternalId * bool
6636 type DeclInfo = StartEnd * MarkupDecl
6637
6638 fun isExtDecl decl =
6639 case decl
6640 of DEC_ATTLIST(_,_,ext) => ext
6641 | DEC_ELEMENT(_,_,ext) => ext
6642 | DEC_GEN_ENT(_,_,ext) => ext
6643 | DEC_PAR_ENT(_,_,ext) => ext
6644 | DEC_NOTATION(_,_,ext) => ext
6645 end
6646 (* stop of ../../Parser/Params/hookData.sml *)
6647 (* start of ../../Parser/Params/hooks.sml *)
6648
6649
6650
6651 signature Hooks =
6652 sig
6653 type AppData
6654 type AppFinal
6655
6656 val hookXml : AppData * HookData.XmlInfo -> AppData
6657 val hookFinish : AppData -> AppFinal
6658
6659 val hookError : AppData * HookData.ErrorInfo -> AppData
6660 val hookWarning : AppData * HookData.WarningInfo -> AppData
6661
6662 val hookProcInst : AppData * HookData.ProcInstInfo -> AppData
6663 val hookComment : AppData * HookData.CommentInfo -> AppData
6664 val hookWhite : AppData * HookData.WhiteInfo -> AppData
6665 val hookDecl : AppData * HookData.DeclInfo -> AppData
6666
6667 val hookStartTag : AppData * HookData.StartTagInfo -> AppData
6668 val hookEndTag : AppData * HookData.EndTagInfo -> AppData
6669 val hookCData : AppData * HookData.CDataInfo -> AppData
6670 val hookData : AppData * HookData.DataInfo -> AppData
6671
6672 val hookCharRef : AppData * HookData.CharRefInfo -> AppData
6673 val hookGenRef : AppData * HookData.GenRefInfo -> AppData
6674 val hookParRef : AppData * HookData.ParRefInfo -> AppData
6675 val hookEntEnd : AppData * HookData.EntEndInfo -> AppData
6676
6677 val hookDocType : AppData * HookData.DtdInfo -> AppData
6678 val hookSubset : AppData * HookData.SubsetInfo -> AppData
6679 val hookExtSubset : AppData * HookData.ExtSubsetInfo -> AppData
6680 val hookEndDtd : AppData * HookData.EndDtdInfo -> AppData
6681 end
6682 (* stop of ../../Parser/Params/hooks.sml *)
6683 (* start of ../../Parser/Params/resolve.sml *)
6684
6685
6686
6687
6688 signature Resolve =
6689 sig
6690 val resolveExtId : Base.ExternalId -> Uri.Uri
6691 end
6692
6693 structure ResolveNull : Resolve =
6694 struct
6695 open Base Errors Uri
6696
6697 fun resolveExtId (EXTID(_,sys)) =
6698 case sys
6699 of NONE => raise NoSuchFile ("","Could not generate system identifier")
6700 | SOME (base,file,_) => uriJoin(base,file)
6701 end
6702 (* stop of ../../Parser/Params/resolve.sml *)
6703 (* start of ../../Parser/Dfa/dfaUtil.sml *)
6704
6705
6706
6707
6708
6709
6710 (*--------------------------------------------------------------------------*)
6711 (* Structure: DfaUtil *)
6712 (* *)
6713 (* Depends on: *)
6714 (* DfaData *)
6715 (* UtilInt *)
6716 (* *)
6717 (* Exceptions raised by functions in this structure: *)
6718 (* boundsFollow : none *)
6719 (* cmSymbols : none *)
6720 (* makeRow : none *)
6721 (* mergeFirst : ConflictFirst *)
6722 (* mergeFollow : ConflictFollow *)
6723 (*--------------------------------------------------------------------------*)
6724 signature DfaUtil =
6725 sig
6726 val mergeFirst : bool -> DfaBase.First * DfaBase.First -> DfaBase.First
6727 val mergeFollow : bool -> DfaBase.Follow * DfaBase.Follow -> DfaBase.Follow
6728 val boundsFollow : DfaBase.Follow -> DfaBase.Sigma * DfaBase.Sigma
6729 val cmSymbols : DfaBase.ContentModel -> DfaBase.Sigma list
6730 val makeRow : DfaBase.Follow * bool -> DfaBase.Row
6731 end
6732
6733 structure DfaUtil : DfaUtil =
6734 struct
6735 open UtilInt DfaBase
6736
6737 (*--------------------------------------------------------------------*)
6738 (* merge two First sets, raise ConflictFirst at conflict: there may *)
6739 (* not be two entries (q1,a) and (q2,a) in the same First set, if *)
6740 (* nondet is false. *)
6741 (*--------------------------------------------------------------------*)
6742 fun mergeFirst nondet ll =
6743 let
6744 fun go_det (nil,l) = l
6745 | go_det (l,nil) = l
6746 | go_det (l1 as (x1 as (q1,a1))::r1,l2 as (x2 as (q2,a2))::r2) =
6747 case Int.compare(a1,a2)
6748 of LESS => x1::go_det(r1,l2)
6749 | GREATER => x2::go_det(l1,r2)
6750 | EQUAL => raise ConflictFirst(a1,q1,q2)
6751
6752 fun go_nondet (nil,l) = l
6753 | go_nondet (l,nil) = l
6754 | go_nondet (l1 as (x1 as (q1,a1))::r1,l2 as (x2 as (q2,a2))::r2) =
6755 case Int.compare(a1,a2)
6756 of LESS => x1::go_nondet(r1,l2)
6757 | GREATER => x2::go_nondet(l1,r2)
6758 | EQUAL => case Int.compare(q1,q2)
6759 of LESS => x1::go_nondet(r1,l2)
6760 | GREATER => x2::go_nondet(l1,r2)
6761 | EQUAL => go_nondet(l1,r2)
6762 in
6763 if nondet then go_nondet ll else go_det ll
6764 end
6765
6766 (*--------------------------------------------------------------------*)
6767 (* merge two Follow sets, raise ConflictFollow at conflict. there may *)
6768 (* not be two entries (q1,a) and (q2,a) with q1<>q2 in the same Follow*)
6769 (* set, if nondet is false. Note that, e.g. for (a+)+, Follow(a) = *)
6770 (* Follow(a+) U First(a+), so duplicate occurrences of the same (q,a) *)
6771 (* are possible (as opposed to First). *)
6772 (*--------------------------------------------------------------------*)
6773 fun mergeFollow nondet ll =
6774 let
6775 fun go_det (nil,l) = l
6776 | go_det (l,nil) = l
6777 | go_det (l1 as (x1 as (q1,a1))::r1,l2 as (x2 as (q2,a2))::r2) =
6778 case Int.compare(a1,a2)
6779 of LESS => x1::go_det(r1,l2)
6780 | GREATER => x2::go_det(l1,r2)
6781 | EQUAL => if q1=q2 then go_det(l1,r2)
6782 else raise ConflictFollow(a1,q1,q2)
6783
6784 fun go_nondet (nil,l) = l
6785 | go_nondet (l,nil) = l
6786 | go_nondet (l1 as (x1 as (q1,a1))::r1,l2 as (x2 as (q2,a2))::r2) =
6787 case Int.compare(a1,a2)
6788 of LESS => x1::go_nondet(r1,l2)
6789 | GREATER => x2::go_nondet(l1,r2)
6790 | EQUAL => case Int.compare(q1,q2)
6791 of LESS => x1::go_nondet(r1,l2)
6792 | GREATER => x2::go_nondet(l1,r2)
6793 | EQUAL => go_nondet(l1,r2)
6794 in
6795 if nondet then go_nondet ll else go_det ll
6796 end
6797
6798 (*--------------------------------------------------------------------*)
6799 (* what are the least and largest symbol occurring in a Follow set? *)
6800 (*--------------------------------------------------------------------*)
6801 fun boundsFollow (nil:Follow) = (1,0)
6802 | boundsFollow [(q,a)] = (a,a)
6803 | boundsFollow ((q,a)::xs) = (a,#2(List.last xs))
6804
6805 (*--------------------------------------------------------------------*)
6806 (* return the list of all symbols occurring in a content model. *)
6807 (*--------------------------------------------------------------------*)
6808 fun cmSymbols cm =
6809 let
6810 fun do_cm(cm,yet) =
6811 case cm
6812 of CM_ELEM a => insertInt(a,yet)
6813 | CM_OPT cm => do_cm(cm,yet)
6814 | CM_REP cm => do_cm(cm,yet)
6815 | CM_PLUS cm => do_cm(cm,yet)
6816 | CM_ALT cms => foldr do_cm yet cms
6817 | CM_SEQ cms => foldr do_cm yet cms
6818 in do_cm(cm,nil)
6819 end
6820
6821 (*--------------------------------------------------------------------*)
6822 (* given the follow set and the final flag, make a row in the dfa. *)
6823 (*--------------------------------------------------------------------*)
6824 fun makeRow (flw,fin) =
6825 let
6826 val (lo,hi) = boundsFollow flw
6827 val tab = Array.array(hi-lo+1,dfaError)
6828 val _ = app (fn (q,a) => Array.update (tab,a-lo,q)) flw
6829 in
6830 (lo,hi,Array.extract (tab,0,NONE),fin)
6831 end
6832
6833 end
6834 (* stop of ../../Parser/Dfa/dfaUtil.sml *)
6835 (* start of ../../Util/intSets.sml *)
6836
6837
6838
6839
6840
6841
6842
6843 signature IntSets =
6844 sig
6845 eqtype IntSet
6846
6847 val emptyIntSet : IntSet
6848 val singleIntSet : int -> IntSet
6849 val fullIntSet : int -> IntSet
6850
6851 val isEmptyIntSet : IntSet -> bool
6852 val inIntSet : int * IntSet -> bool
6853
6854 val compareIntSets: IntSet * IntSet -> order
6855 val hashIntSet : IntSet -> word
6856
6857 val addIntSet : int * IntSet -> IntSet
6858 val delIntSet : int * IntSet -> IntSet
6859
6860 val cupIntSets : IntSet * IntSet -> IntSet
6861 val capIntSets : IntSet * IntSet -> IntSet
6862 val diffIntSets : IntSet * IntSet -> IntSet
6863
6864 val IntSet2List : IntSet -> int list
6865 val IntList2Set : int list -> IntSet
6866 end
6867
6868 structure IntSets : IntSets =
6869 struct
6870 structure W = Word32
6871 val wordSize = W.wordSize
6872
6873 type IntSet = W.word vector
6874
6875 infix 7 << >>
6876 infix 6 &&
6877 infix 5 ||
6878
6879 val op >> = W.>>
6880 val op << = W.<<
6881 val op && = W.andb
6882 val op || = W.orb
6883 val !! = W.notb
6884
6885 fun normalize (vec:IntSet) =
6886 let val max = Vector.foldli
6887 (fn (i,w,max) => if w=0wx0 then i else max) 0 (vec,0,NONE)
6888 in Vector.extract (vec,0,SOME max)
6889 end
6890
6891 val emptyIntSet = Vector.fromList nil : IntSet
6892
6893 fun fullIntSet n = let val size = (n+wordSize-1) div wordSize
6894 val full = 0w0-0w1:W.word
6895 val bits = (n-1) mod wordSize+1
6896 val last = full >> (Word.fromInt (wordSize-bits))
6897 in Vector.tabulate(n div wordSize+1,
6898 fn i => if i<size-1 then full else last):IntSet
6899 end
6900
6901 fun singleIntSet n =
6902 let
6903 val idx = n div wordSize
6904 val mask = 0w1 << (Word.fromInt (n mod wordSize))
6905 in
6906 Vector.tabulate(idx+1,fn i => if i=idx then mask else 0w0):IntSet
6907 end
6908
6909 fun isEmptyIntSet vec = Vector.length vec=0
6910
6911 fun inIntSet(n,vec) =
6912 let val idx = n div wordSize
6913 in if idx>=Vector.length vec then false
6914 else let val mask = 0w1 << (Word.fromInt (n mod wordSize))
6915 in Vector.sub(vec,idx) && mask <> 0w0
6916 end
6917 end
6918
6919 fun addIntSet(n,vec) =
6920 let
6921 val idx = n div wordSize
6922 val mask = 0w1 << (Word.fromInt (n mod wordSize))
6923 val size = Vector.length vec
6924 in
6925 if size>idx
6926 then Vector.mapi (fn (i,x) => if i=idx then x||mask else x) (vec,0,NONE)
6927 else Vector.tabulate
6928 (idx+1,fn i => if i<size then Vector.sub(vec,i) else if i=idx then mask else 0w0)
6929 end
6930
6931 fun delIntSet(n,vec) =
6932 let
6933 val idx = n div wordSize
6934 val size = Vector.length vec
6935 val vec1 = if size<=idx then vec
6936 else let val mask = !! (0w1 << (Word.fromInt (n mod wordSize)))
6937 in Vector.mapi
6938 (fn (i,x) => if i=idx then x && mask else x) (vec,0,NONE)
6939 end
6940 in normalize vec1
6941 end
6942
6943 fun capIntSets(vec1,vec2) =
6944 let
6945 val l12 = Int.min(Vector.length vec1,Vector.length vec2)
6946 val v12 = Vector.tabulate(l12,fn i => Vector.sub(vec1,i) && Vector.sub(vec2,i))
6947 in
6948 normalize v12
6949 end
6950
6951 fun cupIntSets(vec1,vec2) =
6952 let
6953 val (l1,l2) = (Vector.length vec1,Vector.length vec2)
6954 val (shorter,longer,v) = if l1<=l2 then (l1,l2,vec2) else (l2,l1,vec1)
6955 in
6956 Vector.tabulate (longer,fn i => if i>=shorter then Vector.sub(v,i)
6957 else Vector.sub(vec1,i) || Vector.sub(vec2,i))
6958 end
6959
6960 fun diffIntSets(vec1,vec2) =
6961 let
6962 val (l1,l2) = (Vector.length vec1,Vector.length vec2)
6963 val vec1 = Vector.tabulate
6964 (l1,fn i => if i>=l2 then Vector.sub(vec1,i)
6965 else Vector.sub(vec1,i) && !!(Vector.sub(vec2,i)))
6966 in normalize vec1
6967 end
6968
6969 fun IntList2Set l = List.foldl addIntSet emptyIntSet l
6970
6971 fun IntSet2List vec =
6972 let
6973 val size = Vector.length vec
6974 fun doOne (w,off,yet) =
6975 let fun doit (i,mask) =
6976 if i=wordSize then yet
6977 else if w&&mask=0w0 then doit(i+1,mask<<0wx1)
6978 else (off+i)::doit(i+1,mask<<0wx1)
6979 in doit(0,0wx1)
6980 end
6981 fun doAll i = if i>=size then nil
6982 else doOne(Vector.sub(vec,i),wordSize*i,(doAll (i+1)))
6983 in doAll 0
6984 end
6985
6986 fun compareIntSets (vec1,vec2:IntSet) =
6987 let
6988 val (l1,l2) = (Vector.length vec1,Vector.length vec2)
6989 val (l12,ifEq) = case Int.compare(l1,l2)
6990 of LESS => (l1,LESS)
6991 | order => (l2,order)
6992 fun doit i = if i>=l12 then ifEq
6993 else case W.compare(Vector.sub(vec1,i),Vector.sub(vec2,i))
6994 of EQUAL => doit (i+1)
6995 | order => order
6996 in doit 0
6997 end
6998
6999 val intShift = case Int.precision
7000 of NONE => 0w0
7001 | SOME x => Word.fromInt(Int.max(wordSize-x+1,0))
7002
7003 fun hashIntSet vec =
7004 case Vector.length vec
7005 of 0 => 0w0
7006 | 1 => Word.fromInt(W.toInt(W.>>(Vector.sub(vec,0),intShift)))
7007 | l => Word.fromInt(W.toInt(W.>>(Vector.sub(vec,0)+Vector.sub(vec,l-1),intShift)))
7008 end
7009 (* stop of ../../Util/intSets.sml *)
7010 (* start of ../../Util/SymDict/intSetDict.sml *)
7011
7012
7013
7014
7015
7016
7017
7018 structure KeyIntSet : Key =
7019 struct
7020 open IntSets UtilString
7021
7022 type Key = IntSet
7023
7024 val null = emptyIntSet
7025 val hash = hashIntSet
7026 val compare = compareIntSets
7027 val toString = (List2xString ("{",",","}") Int2String) o IntSet2List
7028 end
7029
7030 structure IntSetDict = Dict (structure Key = KeyIntSet)
7031 structure IntSetSymTab = SymTable (structure Key = KeyIntSet)
7032
7033
7034 (* stop of ../../Util/SymDict/intSetDict.sml *)
7035 (* start of ../../Parser/Dfa/dfaPassThree.sml *)
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047 (*--------------------------------------------------------------------------*)
7048 (* Structure: DfaPassThree *)
7049 (* *)
7050 (* Depends on: *)
7051 (* DfaData *)
7052 (* DfaUtil *)
7053 (* IntSets *)
7054 (* IntSetDict *)
7055 (* ParseOptions *)
7056 (* *)
7057 (* Exceptions raised by functions in this structure: *)
7058 (* passThree : TooLarge *)
7059 (*--------------------------------------------------------------------------*)
7060 signature DfaPassThree =
7061 sig
7062 val passThree: bool -> (DfaBase.Follow * bool) vector -> DfaBase.Dfa
7063 end
7064
7065 functor DfaPassThree (structure DfaOptions : DfaOptions) : DfaPassThree =
7066 struct
7067 open
7068 IntSets IntSetDict DfaBase DfaOptions DfaUtil
7069
7070 (*--------------------------------------------------------------------*)
7071 (* do the subset construction. *)
7072 (*--------------------------------------------------------------------*)
7073 (* given an automaton (Q,q0,F,delta), the subset automaton is *)
7074 (* (Q',q0',F',delta') with: *)
7075 (* - Q' = 2^Q *)
7076 (* - q0'= {q0} *)
7077 (* - F' = {S | S cap F <> empty} *)
7078 (* - delta'(S,a) = {p | (q,a,p) in delta, q in S} *)
7079 (*--------------------------------------------------------------------*)
7080 fun makeDet tab =
7081 let
7082 (* the new start state is the singleton of the old start state *)
7083 val sNull = singleIntSet 0
7084
7085 (* create a dictionary for the subsets, make sNull get index 0 *)
7086 val tau = makeDict("",!O_DFA_INITIAL_WIDTH,(nil:Follow,false))
7087 val pInitial = getIndex(tau,sNull)
7088
7089 (* enter a new set state. raise DfaTooLarge if the new state *)
7090 (* would have a too large index *)
7091 fun makeState s =
7092 let val (max,i) = (!O_DFA_MAX_STATES,getIndex(tau,s))
7093 in if max>i then i else raise DfaTooLarge max
7094 end
7095
7096 (* compute the follow set for a set state from the follow sets *)
7097 (* of its members *)
7098 fun makeFollow NONE nil = nil
7099 | makeFollow (SOME(s,a)) nil = [(makeState s,a)]
7100 | makeFollow NONE ((q,a)::qas) = makeFollow (SOME(singleIntSet q,a)) qas
7101 | makeFollow (SOME(s,a)) ((q,b)::qas) =
7102 if a=b then makeFollow (SOME(addIntSet(q,s),a)) qas
7103 else (makeState s,a)::makeFollow (SOME(singleIntSet q,b)) qas
7104
7105 (* continue until all entries in the state dictionary are done -*)
7106 fun doit i =
7107 if i>=usedIndices tau then i
7108 else let val sI = getKey(tau,i)
7109 val lI = IntSet2List sI
7110 val ffs = map (fn j => Vector.sub(tab,j)) lI
7111 val (followJs,finI) = foldl
7112 (fn ((flwJ,finJ),(flw,fin)) => (mergeFollow true (flwJ,flw),
7113 finJ orelse fin))
7114 (nil,false) ffs
7115 val followI = makeFollow NONE followJs
7116 val _ = setByIndex(tau,i,(followI,finI))
7117 in doit (i+1)
7118 end
7119
7120 val size = doit 0
7121 in (* finally create a vector holding the new follow/fin pairs *)
7122 Vector.tabulate (size,fn i => getByIndex(tau,i))
7123 end
7124
7125 (*--------------------------------------------------------------------*)
7126 (* given a vector of Follow and boolean final condition, make a dfa *)
7127 (* out of it. if the first arg is true, then the content model was *)
7128 (* ambiguous; in this case we must first apply a subset construction *)
7129 (* in order to obtain a deterministic finite machine. *)
7130 (*--------------------------------------------------------------------*)
7131 fun passThree nondet tab =
7132 let
7133 val det = if nondet then makeDet tab else tab
7134 in Vector.map makeRow det
7135 end
7136 end
7137 (* stop of ../../Parser/Dfa/dfaPassThree.sml *)
7138 (* start of ../../Parser/Dfa/dfaError.sml *)
7139
7140
7141 (*--------------------------------------------------------------------------*)
7142 (* Structure: DfaError *)
7143 (* *)
7144 (* Note: *)
7145 (* The function in this structure is for producing good error messages *)
7146 (* for ambiguous content models. It numbers the nodes of a cm exactly *)
7147 (* like passOne does, but counts the occurrences of symbol a in order to *)
7148 (* indicate which are in conflict. It is only executed in case of error. *)
7149 (* *)
7150 (* Depends on: *)
7151 (* DfaData *)
7152 (* *)
7153 (* Exceptions raised by functions in this structure: *)
7154 (* countOccs : none *)
7155 (*--------------------------------------------------------------------------*)
7156 signature DfaError =
7157 sig
7158 val countOccs : DfaBase.Sigma * DfaBase.State * DfaBase.State
7159 -> DfaBase.ContentModel -> DfaBase.Sigma * int * int
7160 end
7161
7162 structure DfaError : DfaError =
7163 struct
7164 open DfaBase
7165
7166 fun countOccs (a,q1,q2) cm =
7167 let
7168 val (q1,q2) = if q1>q2 then (q2,q1) else (q1,q2)
7169
7170 fun next a nil = (1,[(a,2)])
7171 | next a ((b,n)::rest) =
7172 if a=b then (n,(b,n+1)::rest)
7173 else if a<b then (1,(a,2)::(b,n)::rest)
7174 else let val (m,new) = next a rest
7175 in (m,(b,n)::new)
7176 end
7177
7178 fun insert a (q,yet,n1,n2) =
7179 let val (n,new) = next a yet
7180 in (q+1,new,if q=q1 then n else n1,if q=q2 then n else n2)
7181 end
7182
7183 fun doit (cm,yet) =
7184 case cm
7185 of CM_ELEM a => insert a yet
7186 | CM_OPT cmi => doit (cmi,yet)
7187 | CM_REP cmi => doit (cmi,yet)
7188 | CM_PLUS cmi => doit (cmi,yet)
7189 | CM_ALT cmis => foldl doit yet cmis
7190 | CM_SEQ cmis => foldl doit yet cmis
7191
7192 val (_,_,n1,n2) = doit (cm,(1,nil,0,0))
7193 in
7194 (a,n1,n2)
7195 end
7196 end
7197 (* stop of ../../Parser/Dfa/dfaError.sml *)
7198 (* start of ../../Parser/Dfa/dfaPassOne.sml *)
7199
7200
7201
7202 (*--------------------------------------------------------------------------*)
7203 (* Structure: DfaPassOne *)
7204 (* *)
7205 (* Depends on: *)
7206 (* DfaData *)
7207 (* DfaUtil *)
7208 (* *)
7209 (* Exceptions raised by functions in this structure: *)
7210 (* passOne : ConflictFirst *)
7211 (*--------------------------------------------------------------------------*)
7212 signature DfaPassOne =
7213 sig
7214 val passOne : bool -> DfaBase.ContentModel -> DfaBase.CM
7215 end
7216
7217 structure DfaPassOne : DfaPassOne =
7218 struct
7219 open DfaBase DfaUtil
7220
7221 (*--------------------------------------------------------------------*)
7222 (* Given a content model, number the leafs, compute Empty and First *)
7223 (* for each node, and construct a corresponding CM annotated with *)
7224 (* these informations. *)
7225 (* *)
7226 (* Numbering: *)
7227 (* The leafs are numbered in left-to-right, depth-first order, *)
7228 (* starting with 1 (0 will be the start state of the DFA). *)
7229 (* *)
7230 (* Empty a = false *)
7231 (* Empty e? = Empty e* = true *)
7232 (* Empty e+ = Empty e *)
7233 (* Empty e1|...|eN = Empty e1 \/ ... \/ Empty eN *)
7234 (* Empty e1,...,eN = Empty e1 /\ ... /\ Empty eN *)
7235 (* *)
7236 (* First a = {q,a}, where q is the number of this leaf. *)
7237 (* First e? = First e* = First e+ = First e *)
7238 (* First e1|...|eN = First e1 ++ ... ++ First eN *)
7239 (* First e1,...,eN = ++{First eI | Empty eJ=False forall j<i} *)
7240 (* *)
7241 (* F1++F2 = F1 U F2, if a2<>a1 forall (q1,a1) in F1, (q1,a1) in F1} *)
7242 (* error, if exist (q1,a) in F1, (q2,a) in F2 *)
7243 (* then raise ConflictFirst(a,q1,q2) *)
7244 (*--------------------------------------------------------------------*)
7245 fun passOne nondet cm =
7246 let
7247 fun und(a,b) = a andalso b
7248 fun oder(a,b) = a orelse b
7249
7250 fun op_fst_seq (fst,fsts,mt) = if mt then mergeFirst nondet (fst,fsts) else fst
7251 fun op_fst_or (fst,fsts,_) = mergeFirst nondet (fst,fsts)
7252
7253 fun do_cm cm q =
7254 case cm
7255 of CM_ELEM a => (ELEM a,(q+1,false,[(q+1,a)]))
7256 | CM_OPT cm => let val cmi as (_,(q1,_,fst)) = do_cm cm q
7257 in (OPT cmi,(q1,true,fst))
7258 end
7259 | CM_REP cm => let val cmi as (_,(q1,_,fst)) = do_cm cm q
7260 in (REP cmi,(q1,true,fst))
7261 end
7262 | CM_PLUS cm => let val cmi as (_,info1) = do_cm cm q
7263 in (PLUS cmi,info1)
7264 end
7265 | CM_ALT cms => do_cms (ALT,false,oder,op_fst_or) cms q
7266 | CM_SEQ cms => do_cms (SEQ,true,und,op_fst_seq) cms q
7267
7268 and do_cms(con,null_mt,op_mt,op_fst) cms q =
7269 let
7270 fun doit [] q = ([],(q,null_mt,[]))
7271 | doit (cm::cms) q =
7272 let
7273 val cmi as (_,(q1,mt1,fst1)) = do_cm cm q
7274 val (cmis,(q2,mt2,fst2)) = doit cms q1
7275 in (cmi::cmis,(q2,op_mt(mt1,mt2),op_fst(fst1,fst2,mt1)))
7276 end
7277 val (cmis,info1) = doit cms q
7278 in (con cmis,info1)
7279 end
7280
7281 in do_cm cm 0
7282 end
7283 end
7284 (* stop of ../../Parser/Dfa/dfaPassOne.sml *)
7285 (* start of ../../Parser/Dfa/dfaPassTwo.sml *)
7286
7287
7288
7289
7290
7291 (*--------------------------------------------------------------------------*)
7292 (* Structure: DfaPassTwo *)
7293 (* *)
7294 (* Depends on: *)
7295 (* DfaData *)
7296 (* DfaUtil *)
7297 (* *)
7298 (* Exceptions raised by functions in this structure: *)
7299 (* passTwo : ConflictFollow *)
7300 (*--------------------------------------------------------------------------*)
7301 signature DfaPassTwo =
7302 sig
7303 val passTwo: bool -> DfaBase.CM -> (DfaBase.Follow * bool) vector
7304 end
7305
7306 structure DfaPassTwo : DfaPassTwo =
7307 struct
7308 open DfaBase DfaUtil
7309
7310 (*--------------------------------------------------------------------*)
7311 (* Given a CM annotated with leaf numbers (states), Empty and First, *)
7312 (* compute Follow and Fin foreach node, and generate the transition *)
7313 (* row if node is a leaf. Follow and Fin are computed top-down: *)
7314 (* *)
7315 (* (Top-Level): *)
7316 (* Follow e = {}, Fin e = true *)
7317 (* *)
7318 (* (e=e1?): *)
7319 (* Follow e1 = Follow e, Fin e1 = Fin e *)
7320 (* *)
7321 (* (e=e1*, e=e1+) *)
7322 (* Follow e1 = Follow e1 ++ First e1, Fin e1 = Fin e *)
7323 (* *)
7324 (* (e=e1|...|eN) = *)
7325 (* Follow eI = Follow e, Fin eI = Fin e for i=0...n *)
7326 (* *)
7327 (* (e=e1,...,eN) = *)
7328 (* Follow eN = Follow e, Fin eN = Fin e *)
7329 (* Follow eI = First eI+1, if Empty eI+1 = false, i<n *)
7330 (* First eI+1 ++ Follow eI+1, if Empty eI+1 = true, i<n *)
7331 (* Fin eI = false, if Empty eI+1 = false, i<n *)
7332 (* Fin eI+1, if Empty eI+1 = true, i<n *)
7333 (* *)
7334 (* F1++F2 = F1 U F2, if a2<>a1 forall (q1,a1) in F1, (q1,a1) in F1} *)
7335 (* error, if exist (q1,a) in F1, (q2,a) in F2 *)
7336 (* then raise ConflictFirst(a,q1,q2) *)
7337 (*--------------------------------------------------------------------*)
7338 fun passTwo nondet (cmi as (_,(n,mt,fst))) =
7339 let
7340 val table = Array.array(n+1,(nil,false))
7341
7342 val _ = Array.update(table,0,(fst,mt))
7343
7344 fun do_cm (ff as (flw,fin)) (cm,(q,mt,fst)) =
7345 case cm
7346 of ELEM a => Array.update(table,q,ff)
7347 | OPT cmi => do_cm ff cmi
7348 | REP cmi => do_cm (mergeFollow nondet (fst,flw),fin) cmi
7349 | PLUS cmi => do_cm (mergeFollow nondet (fst,flw),fin) cmi
7350 | ALT cmis => app (do_cm ff) cmis
7351 | SEQ cmis => ignore (do_seq ff cmis)
7352 and do_seq ff cmis = foldr
7353 (fn (cmi as (_,(_,mt,fst)),ff as (flw,fin))
7354 => (do_cm ff cmi;
7355 if mt then (mergeFollow nondet (fst,flw),fin) else (fst,false)))
7356 ff cmis
7357
7358 val _ = do_cm (nil,true) cmi
7359
7360 in Array.extract (table,0,NONE)
7361 end
7362 end
7363 (* stop of ../../Parser/Dfa/dfaPassTwo.sml *)
7364 (* start of ../../Parser/Dfa/dfa.sml *)
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375 (*--------------------------------------------------------------------------*)
7376 (* Structure: Dfa *)
7377 (* *)
7378 (* Depends on: *)
7379 (* DfaData *)
7380 (* DfaError *)
7381 (* DfaPassOne *)
7382 (* DfaPassTwo *)
7383 (* DfaString *)
7384 (* DfaUtil *)
7385 (* *)
7386 (* Exceptions raised by functions in this structure: *)
7387 (* ContentModel2String : none *)
7388 (* dfaFinal : none *)
7389 (* dfaTrans : none *)
7390 (* makeAmbiguous : DfaTooLarge *)
7391 (* makeChoiceDfa : none *)
7392 (* makeDfa : Ambiguous *)
7393 (* Dfa2String : none *)
7394 (*--------------------------------------------------------------------------*)
7395 signature Dfa =
7396 sig
7397 eqtype DfaState
7398
7399 val dfaError : DfaState
7400 val dfaInitial : DfaState
7401
7402 exception DfaTooLarge of int
7403 exception Ambiguous of int * int * int
7404
7405 val emptyDfa : DfaData.Dfa
7406
7407 val makeDfa : DfaData.ContentModel -> DfaData.Dfa
7408 val makeAmbiguous : DfaData.ContentModel -> DfaData.Dfa
7409 val makeChoiceDfa : DfaData.ContentModel -> DfaData.Dfa
7410
7411 val dfaFinal : DfaData.Dfa * DfaState -> bool
7412 val dfaTrans : DfaData.Dfa * DfaState * int -> DfaState
7413 end
7414
7415 functor Dfa (structure DfaOptions : DfaOptions) : Dfa =
7416 struct
7417 structure DfaPassThree = DfaPassThree (structure DfaOptions = DfaOptions)
7418
7419 open
7420 DfaBase DfaError DfaPassOne DfaPassTwo DfaString DfaUtil
7421
7422 type DfaState = State
7423
7424 (*--------------------------------------------------------------------*)
7425 (* Create a dfa for the content model (a1|...|aN)*, where a1,...,aN *)
7426 (* are the symbols occurring in the input dfa. *)
7427 (*--------------------------------------------------------------------*)
7428 fun makeChoiceDfa cm =
7429 let
7430 val syms = cmSymbols cm
7431 val flw = map (fn a => (dfaInitial,a)) syms
7432 in
7433 Vector.fromList [makeRow(flw,true)]
7434 end
7435
7436 (*--------------------------------------------------------------------*)
7437 (* create a dfa for an ambiguous content model. Raise DfaTooLarge if *)
7438 (* the subset construction yields too many states. *)
7439 (*--------------------------------------------------------------------*)
7440 fun makeAmbiguous cm =
7441 let
7442 val cmi = DfaPassOne.passOne true cm
7443 val tab = DfaPassTwo.passTwo true cmi
7444 val dfa = DfaPassThree.passThree true tab
7445 in dfa
7446 end
7447
7448 (*--------------------------------------------------------------------*)
7449 (* generate a dfa for a content model. Raise Ambiguous if the content *)
7450 (* model is ambiguous. *)
7451 (*--------------------------------------------------------------------*)
7452 fun makeDfa cm =
7453 let
7454 val cmi = DfaPassOne.passOne false cm
7455 val tab = DfaPassTwo.passTwo false cmi
7456 val dfa = DfaPassThree.passThree false tab
7457 in dfa
7458 end
7459 handle ConflictFirst aqq => raise Ambiguous (countOccs aqq cm)
7460 | ConflictFollow aqq => raise Ambiguous (countOccs aqq cm)
7461
7462 (*--------------------------------------------------------------------*)
7463 (* make one transitions in the dfa. *)
7464 (*--------------------------------------------------------------------*)
7465 fun dfaTrans(tab,q,a) =
7466 if q<0 then dfaDontCare
7467 else let val (lo,hi,tab,_) = Vector.sub(tab,q)
7468 in if a>=lo andalso a<=hi then Vector.sub(tab,a-lo) else dfaError
7469 end
7470
7471 (*--------------------------------------------------------------------*)
7472 (* check whether a dfa's state is an accepting state. *)
7473 (*--------------------------------------------------------------------*)
7474 fun dfaFinal (tab,q) =
7475 q<0 orelse #4(Vector.sub(tab,q):Row)
7476 end
7477 (* stop of ../../Parser/Dfa/dfa.sml *)
7478 (* start of ../../Parser/entities.sml *)
7479 (*--------------------------------------------------------------------------*)
7480 (* Structure: Entities *)
7481 (* *)
7482 (* Exceptions raised by functions in this structure: *)
7483 (* closeAll : none *)
7484 (* getChar : none *)
7485 (* getEntId : none *)
7486 (* getPos : none *)
7487 (* inInternalSubset : none *)
7488 (* isOpenEntity : none *)
7489 (* isSpecialEnd : none *)
7490 (* Position2String : none *)
7491 (* pushDummy : none *)
7492 (* pushExtern : NoSuchFile *)
7493 (* pushIntern : none *)
7494 (* pushSpecial : NoSuchFile *)
7495 (* statePos : none *)
7496 (*--------------------------------------------------------------------------*)
7497 (* This module maintains the entity stack. For each open entity it holds a *)
7498 (* buffer to read characters from. When the buffer is exceeded, it gets re- *)
7499 (* filled with new characters, depending on the entity's encoding. *)
7500 (* *)
7501 (* End-of-line handling as specified in 2.11 is performed: *)
7502 (* *)
7503 (* ... To simplify the tasks of applications, wherever an external parsed *)
7504 (* entity or the literal entity value of an internal parsed entity *)
7505 (* contains either the literal two-character sequence "#xD#xA" or a *)
7506 (* standalone literal #xD, an XML processor must pass to the application *)
7507 (* the single character #xA. *)
7508 (* (This behavior can conveniently be produced by normalizing all line *)
7509 (* breaks to #xA on input, before parsing.) *)
7510 (* *)
7511 (* It also checks for illegal characters, cf. 2.2: *)
7512 (* *)
7513 (* [2] Char ::= #x9 | #xA | #xD /* any Unicode character, *)
7514 (* | [#x20-#xD7FF] excluding the surrogate *)
7515 (* | [#xE000-#xFFFD] blocks, FFFE, and FFFF. */ *)
7516 (* | [#x10000-#x10FFFF] *)
7517 (* *)
7518 (* More precisely, it assumes that all decoded characters are valid Unicode *)
7519 (* characters. It thus only checks for control characters other than #x9, *)
7520 (* #xA or #xD. *)
7521 (*--------------------------------------------------------------------------*)
7522 signature Entities =
7523 sig
7524 include Hooks
7525
7526 type State
7527 eqtype EntId
7528 datatype Special = DOC_ENTITY | EXT_SUBSET
7529
7530 exception CantOpenFile of (string * string) * AppData
7531
7532 val pushIntern : State * int * bool * UniChar.Vector -> State
7533 val pushExtern : State * int * bool * Uri.Uri -> State * Encoding.Encoding
7534 val pushSpecial : Special * Uri.Uri option -> State * Encoding.Encoding
7535
7536 val closeAll : State -> unit
7537
7538 val commitAuto : AppData * State -> AppData * State
7539 val changeAuto : AppData * State * string -> AppData * State * Encoding.Encoding
7540
7541 val getEntId : State -> EntId
7542 val getPos : State -> Errors.Position
7543 val getUri : State -> Uri.Uri
7544
7545 val getChar : AppData * State -> UniChar.Char * AppData * State
7546 val ungetChars : State * UniChar.Data -> State
7547
7548 val isOpen : int * bool * State -> bool
7549 val isSpecial : State -> bool
7550 val inDocEntity : State -> bool
7551 end
7552
7553 functor Entities (structure Hooks : Hooks) : Entities =
7554 struct
7555 open
7556 UniChar Decode Decode.Error Errors Hooks Uri UtilError
7557
7558 val THIS_MODULE = "Entities"
7559 val BUFSIZE = 1024
7560 type CharBuffer = UniChar.Char array
7561
7562 (*--------------------------------------------------------------------*)
7563 (* A special entity can not be popped from the stack by getChar, so *)
7564 (* it must be popped explicitly. This is for the document entity and *)
7565 (* the external subset. *)
7566 (*--------------------------------------------------------------------*)
7567 datatype Special = DOC_ENTITY | EXT_SUBSET
7568 (*--------------------------------------------------------------------*)
7569 (* In order to distinguish a general entity from a paramter entity, *)
7570 (* entity idxs are marked with this datatype. *)
7571 (*--------------------------------------------------------------------*)
7572 datatype EntId = GENERAL of int | PARAMETER of int
7573
7574 (*--------------------------------------------------------------------*)
7575 (* Make an EntId from the entity's index. *)
7576 (*--------------------------------------------------------------------*)
7577 fun makeEntId(idx,isParam) =
7578 if isParam then PARAMETER idx else GENERAL idx
7579
7580 (*--------------------------------------------------------------------*)
7581 (* A non-empty stack is: *)
7582 (* *)
7583 (* an internal entity INT(buf,size,idx,(id,other)): *)
7584 (* - (vec,idx,size) is a buffer,current index and its size; *)
7585 (* - id is the index of the entity's name in the entity table. *)
7586 (* - other contains the underlying entities (the rest of the stack). *)
7587 (* The components are nested according to access frequency. *)
7588 (* *)
7589 (* an external entity has three forms: *)
7590 (* EXT2(buf,size,idx,line,col,break,(dec,err,typ)) *)
7591 (* - (buf,size,idx) is a buffer, its size and current index; *)
7592 (* - (line,col) are the line and column; *)
7593 (* - break is a boolean indicating whether the last character was a *)
7594 (* carriage return (0xD) (then a succeeding line feed (0xA) must be *)
7595 (* supressed); *)
7596 (* - err is an option: if it is SOME(f,ee,err) then it indicates that *)
7597 (* the array was finished by a decoding error err, with the basic *)
7598 (* file f; f was at end of file if ee is true. Otherwise there was *)
7599 (* no error when loading the array. *)
7600 (* - dec describies the encoding of the entity and thus, how more *)
7601 (* data can be loaded; *)
7602 (* - typ is either of the form SPECIAL spec indicating a special *)
7603 (* entity; then this is the only entity on the stack. Otherwise it *)
7604 (* is NORMAL(id,other) for a normal external entity, with: *)
7605 (* + id is the index of the entity's name in the DTD; *)
7606 (* + other is the underlying stack. *)
7607 (* The components are nested according to access frequency. *)
7608 (* *)
7609 (* The second form of an external entity is *)
7610 (* EXT1(dec,line,col,break,typ). This is an unbuffered *)
7611 (* entity whose encoding declaration is being read. We may not load *)
7612 (* an array of characters as a whole because the encoding might still *)
7613 (* change. The components have the same meaning as for EXT2. *)
7614 (* *)
7615 (* A closed entity remains on the stack until the next getChar, for *)
7616 (* purposes of error printing. A closed external entity has the form *)
7617 (* CLOSED(dec,l,col,typ); components have the same meaning *)
7618 (* as for open external entities. A closed internal entity has the *)
7619 (* form ENDED(id,other) with components as above. *)
7620 (* *)
7621 (* Sometimes (for parsing xml/decl declarations) we need a lookahead. *)
7622 (* LOOKING(cs,q) is a state remembering all chars cs looked ahead up *)
7623 (* to state q, in reverse order. LOOKED(cs,q) is an undone lookahead, *)
7624 (* the looked-ahead chars now in the right order. *)
7625 (*--------------------------------------------------------------------*)
7626 datatype ExtType = SPECIAL of Special | NORMAL of EntId * State
7627 and State =
7628 LOOKED of Data * State
7629 | ENDED of EntId * State
7630 | CLOSED of DecFile * int * int * ExtType
7631 | INT of Vector * int * int * (EntId * State)
7632 | EXT1 of DecFile * int * int * bool * ExtType
7633 | EXT2 of CharBuffer * int * int * int * int * bool
7634 * (DecFile * DecodeError option * ExtType)
7635
7636 exception CantOpenFile of (string * string) * AppData
7637
7638 (*--------------------------------------------------------------------*)
7639 (* Extract the unique number from a state. *)
7640 (*--------------------------------------------------------------------*)
7641 fun getExtEntId extType =
7642 case extType
7643 of SPECIAL DOC_ENTITY => GENERAL 0
7644 | SPECIAL EXT_SUBSET => PARAMETER 0
7645 | NORMAL(id,_) => id
7646 fun getEntId q =
7647 case q
7648 of LOOKED (_,q) => getEntId q
7649 | ENDED(id,_) => id
7650 | CLOSED(_,_,_,extType) => getExtEntId extType
7651 | INT(_,_,_,(id,_)) => id
7652 | EXT1(_,_,_,_,extType) => getExtEntId extType
7653 | EXT2(_,_,_,_,_,_,(_,_,extType)) => getExtEntId extType
7654
7655 (*--------------------------------------------------------------------*)
7656 (* Find the nearest enclosing external entity, and return its *)
7657 (* filename, line and column number. *)
7658 (*--------------------------------------------------------------------*)
7659 fun getPos q =
7660 case q
7661 of ENDED(_,other) => getPos other
7662 | INT(_,_,_,(_,other)) => getPos other
7663 | CLOSED(dec,l,col,_) => (decName dec,l,col)
7664 | EXT1(dec,l,col,_,_) => (decName dec,l,col)
7665 | EXT2(_,_,_,l,col,_,(dec,_,_)) => (decName dec,l,col)
7666 | LOOKED (cs,q) => let val (f,l,c) = getPos q
7667 val k = length cs
7668 in if c>=k then (f,l,c-k) else (f,l,0)
7669 end
7670
7671 (*--------------------------------------------------------------------*)
7672 (* get the path of the nearest enclosing external entity. *)
7673 (*--------------------------------------------------------------------*)
7674 fun getUri q =
7675 case q
7676 of LOOKED (_,q) => getUri q
7677 | ENDED(_,other) => getUri other
7678 | INT(_,_,_,(_,other)) => getUri other
7679 | CLOSED(dec,l,col,_) => decUri dec
7680 | EXT1(dec,l,col,_,_) => decUri dec
7681 | EXT2(_,_,_,l,col,_,(dec,_,_)) => decUri dec
7682
7683 (*--------------------------------------------------------------------*)
7684 (* close all files, return nothing. *)
7685 (*--------------------------------------------------------------------*)
7686 fun closeAll q =
7687 case q
7688 of LOOKED(_,other) => closeAll other
7689 | ENDED(_,other) => closeAll other
7690 | CLOSED(_,_,_,SPECIAL _) => ()
7691 | CLOSED(_,_,_,NORMAL(_,other)) => closeAll other
7692 | INT(_,_,_,(_,other)) => closeAll other
7693 | EXT1(dec,_,_,_,SPECIAL _) => ignore(decClose dec)
7694 | EXT1(dec,_,_,_,NORMAL(_,other)) => (decClose dec; closeAll other)
7695 | EXT2(_,_,_,_,_,_,(dec,_,SPECIAL _)) => ignore(decClose dec)
7696 | EXT2(_,_,_,_,_,_,(dec,_,NORMAL(_,other))) => (decClose dec; closeAll other)
7697
7698 (*--------------------------------------------------------------------*)
7699 (* is this entity already on the stack? *)
7700 (*--------------------------------------------------------------------*)
7701 fun isOpen (idx,isParam,q) =
7702 let val id = makeEntId(idx,isParam)
7703 fun doit q =
7704 case q
7705 of LOOKED (_,other) => doit other
7706 | ENDED(id',other) => id=id' orelse doit other
7707 | CLOSED(_,_,_,SPECIAL _) => false
7708 | CLOSED(_,_,_,NORMAL(id',other)) => id=id' orelse doit other
7709 | INT(_,_,_,(id',other)) => id=id' orelse doit other
7710 | EXT1(_,_,_,_,SPECIAL _) => false
7711 | EXT1(_,_,_,_,NORMAL(id',other)) => id=id' orelse doit other
7712 | EXT2(_,_,_,_,_,_,(_,_,SPECIAL _)) => false
7713 | EXT2(_,_,_,_,_,_,(_,_,NORMAL(id',other))) => id=id' orelse doit other
7714 in doit q
7715 end
7716
7717 (*--------------------------------------------------------------------*)
7718 (* are we in the internal subset, i.e., in the document entity? *)
7719 (* The internal subset can only be in the document entity, since no *)
7720 (* parameter entities are declared prior to it. The document entity *)
7721 (* is then the only entity on the stack. *)
7722 (*--------------------------------------------------------------------*)
7723 fun inDocEntity q =
7724 case q
7725 of LOOKED (_,q) => inDocEntity q
7726 | ENDED(_,other) => inDocEntity other
7727 | INT(_,_,_,(_,other)) => inDocEntity other
7728 | CLOSED(_,_,_,NORMAL _) => false
7729 | CLOSED(_,_,_,SPECIAL what) => what=DOC_ENTITY
7730 | EXT1(_,_,_,_,NORMAL _) => false
7731 | EXT1(_,_,_,_,SPECIAL what) => what=DOC_ENTITY
7732 | EXT2(_,_,_,_,_,_,(_,_,NORMAL _)) => false
7733 | EXT2(_,_,_,_,_,_,(_,_,SPECIAL what)) => what=DOC_ENTITY
7734
7735 (*--------------------------------------------------------------------*)
7736 (* is this state the document end, i.e., are all entities closed? *)
7737 (*--------------------------------------------------------------------*)
7738 fun isSpecial q =
7739 case q
7740 of LOOKED (_,q) => isSpecial q
7741 | CLOSED(_,_,_,SPECIAL _) => true
7742 | EXT1(_,_,_,_,SPECIAL _) => true
7743 | EXT2(_,_,_,_,_,_,(_,_,SPECIAL _)) => true
7744 | _ => false
7745
7746 (*--------------------------------------------------------------------*)
7747 (* Initialize and load a new buffer when opening an external entity. *)
7748 (*--------------------------------------------------------------------*)
7749 fun initArray dec =
7750 let
7751 val arr = Array.array(BUFSIZE,0wx0)
7752 val (n,dec1,err) = decGetArray dec arr
7753 in (arr,n,dec1,err)
7754 end
7755
7756 (*--------------------------------------------------------------------*)
7757 (* Open an external/internal entity. *)
7758 (*--------------------------------------------------------------------*)
7759 fun pushIntern(q,id,isParam,vec) =
7760 INT(vec,Vector.length vec,0,(makeEntId(id,isParam),q))
7761 fun pushExtern(q,id,isParam,uri) =
7762 let
7763 val dec = decOpenXml (SOME uri)
7764 val auto = decEncoding dec
7765 val q1 = EXT1(dec,1,0,false,NORMAL(makeEntId(id,isParam),q))
7766 in (q1,auto)
7767 end
7768 fun pushSpecial(what,uri) =
7769 let
7770 val dec = decOpenXml uri
7771 val auto = decEncoding dec
7772 val q = EXT1(dec,1,0,false,SPECIAL what)
7773 in (q,auto)
7774 end
7775
7776 (*--------------------------------------------------------------------*)
7777 (* confirm the autodetected encoding of an external entity. *)
7778 (*--------------------------------------------------------------------*)
7779 fun commitAuto(a,q) =
7780 case q
7781 of EXT1(dec,l,col,brk,typ) =>
7782 let
7783 val a1 = a before decCommit dec
7784 handle DecError(_,_,err)
7785 => hookError(a,(getPos q,ERR_DECODE_ERROR err))
7786 val (arr,n,dec1,err) = initArray dec
7787 in (a1,EXT2(arr,n,0,l,col,brk,(dec1,err,typ)))
7788 end
7789 (*
7790 in (a1,EXT1(dec,l,col,brk,typ))
7791 end
7792 *)
7793 | LOOKED(cs,q1) => let val (a1,q2) = commitAuto (a,q1)
7794 in (a1,LOOKED(cs,q2))
7795 end
7796 | CLOSED _ => (a,q)
7797 | _ => raise InternalError(THIS_MODULE,"commitAuto",
7798 "entity is neither EXT1 nor CLOSED nor LOOKED")
7799
7800 (*--------------------------------------------------------------------*)
7801 (* change from the autodetected encoding to the declared one. *)
7802 (*--------------------------------------------------------------------*)
7803 fun changeAuto (a,q,decl) =
7804 case q
7805 of EXT1(dec,l,col,brk,typ) =>
7806 let
7807 val dec1 = decSwitch(dec,decl)
7808 handle DecError(dec,_,err)
7809 => let val a1 = hookError(a,(getPos q,ERR_DECODE_ERROR err))
7810 val _ = decClose dec
7811 val uri = decName dec
7812 val msg = case err
7813 of ERR_UNSUPPORTED_ENC _ => "Unsupported encoding"
7814 | _ => "Declared encoding incompatible"
7815 ^"with auto-detected encoding"
7816 in raise CantOpenFile ((uri,msg),a1)
7817 end
7818 val newEnc = decEncoding dec1
7819 val (arr,n,dec2,err) = initArray dec1
7820 in (a,EXT2(arr,n,0,l,col,brk,(dec2,err,typ)),newEnc)
7821 end
7822 (*
7823 in (a,EXT1(dec1,l,col,brk,typ),newEnc)
7824 end
7825 *)
7826
7827 | LOOKED(cs,q1) => let val (a2,q2,enc2) = changeAuto(a,q1,decl)
7828 in (a2,LOOKED(cs,q2),enc2)
7829 end
7830 | CLOSED(dec,_,_,_) => (a,q,decEncoding dec)
7831 | _ => raise InternalError(THIS_MODULE,"changeAuto",
7832 "entity is neither EXT1 nor CLOSED nor LOOKED")
7833
7834 (*--------------------------------------------------------------------*)
7835 (* Get one character from the current entity. Possibly reload buffer. *)
7836 (* Return 0wx0 at entity end. Otherwise check whether the character *)
7837 (* is valid (cf. 2.2). If the last character was a carriage return *)
7838 (* (0xD) supress a line feed (0xA). *)
7839 (*--------------------------------------------------------------------*)
7840 fun getChar (a,q) =
7841 case q
7842 of ENDED(_,other) => getChar(a,other)
7843 | CLOSED(_,_,_,typ) =>
7844 (case typ
7845 of SPECIAL _ => raise InternalError (THIS_MODULE,"getChar",
7846 "attempt to read beyond special entity end")
7847 | NORMAL(_,other) => getChar(a,other))
7848 | INT(vec,s,i,io) =>
7849 if i>=s then (0wx0,a,ENDED io)
7850 else (Vector.sub(vec,i),a,INT(vec,s,i+1,io))
7851 | EXT1(dec,l,col,br,typ) =>
7852 (let
7853 val (c,dec1) = decGetChar dec
7854 in
7855 if (* c>=0wx20 orelse c=0wx09 *)
7856 c>=0wx0020
7857 andalso (c<=0wxD7FF
7858 orelse c>=0wxE000 andalso (c<=0wxFFFD
7859 orelse c>=0wx10000))
7860 orelse c=0wx9
7861 then (c,a,EXT1(dec1,l,col+1,false,typ))
7862 else if c=0wxA
7863 then if br then getChar(a,EXT1(dec1,l,col,false,typ))
7864 else (c,a,EXT1(dec1,l+1,0,false,typ))
7865 else (if c=0wxD then (0wxA,a,EXT1(dec1,l+1,0,true,typ))
7866 else let val a1 = hookError(a,(getPos q,ERR_NON_XML_CHAR c))
7867 in getChar(a1,EXT1(dec1,l,col+1,false,typ))
7868 end)
7869 end
7870 handle DecEof dec => (0wx0,a,CLOSED(dec,l,col,typ))
7871 | DecError(dec,eof,err) =>
7872 let val err = ERR_DECODE_ERROR err
7873 val a1 = hookError(a,(getPos q,err))
7874 in if eof then (0wx0,a,CLOSED(dec,l,col,typ))
7875 else getChar(a1,EXT1(dec,col,l,br,typ))
7876 end)
7877 | EXT2(arr,s,i,l,col,br,det) =>
7878 if i<s
7879 then let val c = Array.sub(arr,i)
7880 in if (* c>=0wx20 orelse c=0wx09 *)
7881 (* c>=0wx0020 andalso c<=0wxD7FF orelse c=0wx9 orelse *)
7882 (* c>=0wxE000 andalso c<=0wxFFFD orelse c>=0wx10000 *)
7883 c>=0wx0020
7884 andalso (c<=0wxD7FF
7885 orelse c>=0wxE000 andalso (c<=0wxFFFD
7886 orelse c>=0wx10000))
7887 orelse c=0wx9
7888 then (c,a,EXT2(arr,s,i+1,l,col+1,false,det))
7889 else if c=0wxA
7890 then if br then getChar(a,EXT2(arr,s,i+1,l,col,false,det))
7891 else (c,a,EXT2(arr,s,i+1,l+1,0,false,det))
7892 else (if c=0wxD then (0wxA,a,EXT2(arr,s,i+1,l+1,0,true,det))
7893 else let val a1 = hookError(a,(getPos q,ERR_NON_XML_CHAR c))
7894 in getChar(a1,EXT2(arr,s,i+1,l,col+1,false,det))
7895 end)
7896 end
7897 else let val (dec,err,typ) = det
7898 val (a1,(n,dec1,err1)) =
7899 case err
7900 of NONE => if s=BUFSIZE then (a,decGetArray dec arr)
7901 else (a,(0,dec,NONE))
7902 | SOME err => (hookError(a,(getPos q,ERR_DECODE_ERROR err)),
7903 decGetArray dec arr)
7904 in if n=0 andalso not (isSome err1)
7905 then (0wx0,a1,CLOSED(dec1,l,col,typ))
7906 else getChar(a1,EXT2(arr,n,0,l,col,br,(dec1,err1,typ)))
7907 end
7908 | LOOKED(nil,q) => getChar(a,q)
7909 | LOOKED(c::cs,q) => (c,a,LOOKED(cs,q))
7910
7911 (*--------------------------------------------------------------------*)
7912 (* unget a list of characters. *)
7913 (*--------------------------------------------------------------------*)
7914 fun ungetChars (q,cs) = LOOKED(cs,q)
7915 end
7916 (* stop of ../../Parser/entities.sml *)
7917 (* start of ../../Parser/Dtd/dtdDeclare.sml *)
7918 (*--------------------------------------------------------------------------*)
7919 (* Structure: DtdDeclare *)
7920 (* *)
7921 (*--------------------------------------------------------------------------*)
7922 (* Functor: DtdDeclare *)
7923 (*--------------------------------------------------------------------------*)
7924 (* This module provides functions for adding declarations to the DTD tables *)
7925 (* and for doing checks on components of declarations. *)
7926 (*--------------------------------------------------------------------------*)
7927 functor DtdDeclare (structure Dtd : Dtd
7928 structure Entities : Entities
7929 structure ParserOptions : ParserOptions) =
7930 struct
7931 open
7932 UtilInt UtilList
7933 Base Dtd Errors Entities ParserOptions UniChar UniClasses
7934
7935 (*--------------------------------------------------------------------*)
7936 (* check whether a sequence a chars is the b-adic representation of a *)
7937 (* character's code, terminated by ";". base will be 10 or 16, isBase *)
7938 (* will check for a character being a decimal/hexadecimal number. *)
7939 (*--------------------------------------------------------------------*)
7940 fun checkBasimal (base,baseValue) (ch:Char,cs) =
7941 let fun doit _ (nil:Data) = false
7942 | doit yet [0wx3B] = yet=ch
7943 | doit yet (c::cs) = case baseValue c
7944 of NONE => false
7945 | SOME v => doit (base*yet+v) cs
7946 in doit 0w0 cs
7947 end
7948 val checkDecimal = checkBasimal (0w10,decValue)
7949 val checkHeximal = checkBasimal (0wx10,hexValue)
7950
7951 (*--------------------------------------------------------------------*)
7952 (* check a character reference for identifying a character. *)
7953 (*--------------------------------------------------------------------*)
7954 fun checkRef (ch,0wx26::0wx23::0wx78::cs) (* "&#x..." *) = checkHeximal(ch,cs)
7955 | checkRef (ch,0wx26::0wx23::cs) (* "&#..." *) = checkDecimal(ch,cs)
7956 | checkRef _ = false
7957
7958 (*--------------------------------------------------------------------*)
7959 (* check for a single character ch. *)
7960 (*--------------------------------------------------------------------*)
7961 fun checkSingle (ch,[c]) = c=ch
7962 | checkSingle _ = false
7963
7964 (*--------------------------------------------------------------------*)
7965 (* check a predefined entity for being well defined. Note that both *)
7966 (* a single char and a char ref representation are allowed, except *)
7967 (* for 'amp' which must be escaped. *)
7968 (*--------------------------------------------------------------------*)
7969 fun checkPredef (idx,cs) =
7970 case idx
7971 of 1 => checkRef(0wx26,cs)
7972 | 2 => checkSingle(0wx3C,cs) orelse checkRef(0wx3C,cs)
7973 | 3 => checkSingle(0wx3E,cs) orelse checkRef(0wx3E,cs)
7974 | 4 => checkSingle(0wx27,cs) orelse checkRef(0wx27,cs)
7975 | 5 => checkSingle(0wx22,cs) orelse checkRef(0wx22,cs)
7976 | _ => true
7977
7978 (*--------------------------------------------------------------------*)
7979 (* Given the declaration of an entity check whether it is predefined. *)
7980 (* If no return false. If yes, check whether is was already declared *)
7981 (* and whether it is correctly declared. See 4.6: *)
7982 (* *)
7983 (* All XML processors must recognize these entities whether they *)
7984 (* are declared or not. For interoperability, valid XML documents *)
7985 (* should declare these entities, like any others, before using *)
7986 (* them. If the entities in question are declared, they must be *)
7987 (* declared as internal entities whose replacement text is the *)
7988 (* single character being escaped or a character reference to that *)
7989 (* character, as shown below. *)
7990 (* *)
7991 (* <!ENTITY lt "&#38;#60;"> *)
7992 (* <!ENTITY gt "&#62;"> *)
7993 (* <!ENTITY amp "&#38;#38;"> *)
7994 (* <!ENTITY apos "&#39;"> *)
7995 (* <!ENTITY quot "&#34;"> *)
7996 (* *)
7997 (* Note that the < and & characters in the declarations of "lt" and *)
7998 (* "amp" are doubly escaped to meet the requirement that entity *)
7999 (* replacement be well-formed. *)
8000 (* *)
8001 (* print an error if the entity was already declared. *)
8002 (* print an error if the declaration is not correct. *)
8003 (*--------------------------------------------------------------------*)
8004 fun checkPredefined dtd (a,q) (idx,ent) =
8005 if !O_VALIDATE andalso idx>=1 andalso idx<=5 then
8006 let
8007 val a1 = if !O_WARN_MULT_ENT_DECL andalso isRedefined dtd idx
8008 then let val warn = WARN_MULT_DECL(IT_GEN_ENT,Index2GenEnt dtd idx)
8009 in hookWarning(a,(getPos q,warn))
8010 end
8011 else a before setRedefined dtd idx
8012 val a2 =
8013 if !O_CHECK_PREDEFINED then
8014 let val correct =
8015 case ent
8016 of GE_INTERN(_,rep) => checkPredef (idx,Vector2Data rep)
8017 | _ => false
8018 in if correct then a1
8019 else let val err = ERR_DECL_PREDEF(Index2GenEnt dtd idx,validPredef idx)
8020 in hookError(a1,(getPos q,err))
8021 end
8022 end
8023 else a1
8024 in (true,a2)
8025 end
8026 else (false,a)
8027
8028 (*--------------------------------------------------------------------*)
8029 (* add an entity declaration to the DTD tables. 4.2 *)
8030 (* *)
8031 (* ... If the same entity is declared more than once, the first *)
8032 (* declaration encountered is binding; at user option, an XML *)
8033 (* processor may issue a warning if entities are declared multiple *)
8034 (* times. *)
8035 (* *)
8036 (* For general entities, check whether it is a predefined entity and *)
8037 (* if so, whether it is declared correctly. *)
8038 (*--------------------------------------------------------------------*)
8039 (* print a warning and ignore the declaration if the notation was *)
8040 (* declared previously. *)
8041 (*--------------------------------------------------------------------*)
8042 fun addGenEnt dtd (a,q) (idx,ent,ext) =
8043 case getGenEnt dtd idx
8044 of (GE_NULL,_) => a before setGenEnt dtd (idx,(ent,ext))
8045 | _ => let val (pre,a1) = checkPredefined dtd (a,q) (idx,ent)
8046 in if pre orelse not (!O_WARN_MULT_ENT_DECL) then a1
8047 else hookWarning(a1,(getPos q,WARN_MULT_DECL
8048 (IT_GEN_ENT,Index2GenEnt dtd idx)))
8049 end
8050
8051 fun addParEnt dtd (a,q) (idx,ent,ext) =
8052 case getParEnt dtd idx
8053 of (PE_NULL,_) => a before setParEnt dtd (idx,(ent,ext))
8054 | _ => if !O_WARN_MULT_ENT_DECL
8055 then hookWarning(a,(getPos q,WARN_MULT_DECL
8056 (IT_PAR_ENT,Index2ParEnt dtd idx)))
8057 else a
8058
8059 (*--------------------------------------------------------------------*)
8060 (* at option print a warning if not all predefined entities have been *)
8061 (* declared. Cf. 4.1: *)
8062 (* *)
8063 (* For interoperability, valid documents should declare the *)
8064 (* entities amp, lt, gt, apos, quot, in the form specified in *)
8065 (* "4.6 Predefined Entities". *)
8066 (*--------------------------------------------------------------------*)
8067 fun checkPreDefined dtd (a,q) =
8068 if !O_VALIDATE andalso !O_INTEROPERABILITY andalso
8069 !O_WARN_SHOULD_DECLARE andalso hasDtd dtd
8070 then case notRedefined dtd
8071 of nil => a
8072 | ents => hookWarning(a,(getPos q,WARN_SHOULD_DECLARE ents))
8073 else a
8074
8075 (*--------------------------------------------------------------------*)
8076 (* add a notation declaration to the DTD tables. *)
8077 (* *)
8078 (* though the rec. says nothing about repeated notation declarations, *)
8079 (* I assume that the intention is to treat them like entities, i.e. *)
8080 (* ignore repeated declarations with an optional warning. *)
8081 (* *)
8082 (* print a warning and ignore the declaration if the notation was *)
8083 (* declared previously. *)
8084 (*--------------------------------------------------------------------*)
8085 fun addNotation dtd (a,q) (idx,nt) =
8086 if hasNotation dtd idx
8087 then if !O_WARN_MULT_NOT_DECL
8088 then hookWarning(a,(getPos q,WARN_MULT_DECL
8089 (IT_NOTATION,Index2AttNot dtd idx)))
8090 else a
8091 else a before setNotation dtd (idx,nt)
8092
8093 (*--------------------------------------------------------------------*)
8094 (* add an element declaration to the element table. Only the content *)
8095 (* part of the element info is updated. 3.2: *)
8096 (* *)
8097 (* Validity Constraint: Unique Element Type Declaration *)
8098 (* No element type may be declared more than once. *)
8099 (* *)
8100 (* print an error and ignore the declaration if the element was *)
8101 (* declared previously. *)
8102 (*--------------------------------------------------------------------*)
8103 fun addElement dtd (a,q) (idx,cont,ext) =
8104 let val {decl,atts,errAtts,...} = getElement dtd idx
8105 in case decl
8106 of NONE => a before setElement dtd (idx,{decl = SOME(cont,ext),
8107 atts = atts,
8108 errAtts = errAtts})
8109 | SOME _ => if !O_VALIDATE
8110 then hookError(a,(getPos q,ERR_REDEC_ELEM(Index2Element dtd idx)))
8111 else a
8112 end
8113
8114 (*--------------------------------------------------------------------*)
8115 (* at option, pretend an element is declared by adding a default *)
8116 (* declaration. Only the decl flag of the element info is updated. *)
8117 (*--------------------------------------------------------------------*)
8118 fun handleUndeclElement dtd idx =
8119 let
8120 val {atts,errAtts,...} = getElement dtd idx
8121 val newInfo = {decl = SOME(CT_ANY,false),
8122 atts = atts,
8123 errAtts = errAtts}
8124 in newInfo before setElement dtd (idx,newInfo)
8125 end
8126
8127 (*--------------------------------------------------------------------*)
8128 (* check whether an element is declared and whether it already had an *)
8129 (* attribute list declaration. Cf. 3.3: *)
8130 (* *)
8131 (* At user option, an XML processor may issue a warning if *)
8132 (* attributes are declared for an element type not itself declared, *)
8133 (* but this is not an error. *)
8134 (* *)
8135 (* ... an XML processor may at user option issue a warning when *)
8136 (* more than one attribute-list declaration is provided for a given *)
8137 (* element type, ... *)
8138 (* *)
8139 (* print a warning if the element is not declared or already had an *)
8140 (* attribute list declaration. *)
8141 (*--------------------------------------------------------------------*)
8142 fun enterAttList dtd (a,q) idx =
8143 let
8144 val {decl,atts,errAtts,...} = getElement dtd idx
8145 val a1 = if isSome decl orelse not (!O_WARN_ATT_NO_ELEM) then a
8146 else hookWarning(a,(getPos q,WARN_ATT_UNDEC_ELEM(Index2Element dtd idx)))
8147 in
8148 case atts
8149 of NONE => a1 before
8150 setElement dtd (idx,{decl=decl,atts=SOME(nil,false),errAtts=errAtts})
8151 | _ => if !O_INTEROPERABILITY andalso !O_WARN_MULT_ATT_DECL
8152 then hookWarning(a1,(getPos q,WARN_MULT_ATT_DECL(Index2Element dtd idx)))
8153 else a1
8154 end
8155
8156 (*--------------------------------------------------------------------*)
8157 (* check whether attribute "xml:space" is declared correctly. 2.10: *)
8158 (* *)
8159 (* A special attribute named xml:space may be attached ... In valid *)
8160 (* documents, this attribute, like any other, must be declared if *)
8161 (* it is used. When declared, it must be given as an enumerated *)
8162 (* type whose only possible values are "default" and "preserve". *)
8163 (*--------------------------------------------------------------------*)
8164 fun checkAttDef (a,q) (aidx,attType,_,_) =
8165 if aidx<>xmlSpaceIdx orelse attType=xmlSpaceType then a
8166 else hookError(a,(getPos q,ERR_XML_SPACE))
8167
8168 (*--------------------------------------------------------------------*)
8169 (* enter a definition of a single attribute to the element table. *)
8170 (* ignore the definition if the attribute is already defined for that *)
8171 (* element. Cf. 3.3: *)
8172 (* *)
8173 (* When more than one AttlistDecl is provided for a given element *)
8174 (* type, the contents of all those provided are merged. When more *)
8175 (* than one definition is provided for the same attribute of a *)
8176 (* given element type, the first declaration is binding and later *)
8177 (* declarations are ignored. For interoperability, an XML processor *)
8178 (* may at user option issue a warning when ... more than one *)
8179 (* attribute definition is provided for a given attribute, but this *)
8180 (* is not an error. *)
8181 (* *)
8182 (* If the attribute type is ID, check whether an element already has *)
8183 (* an attribute of that type. 3.3.1: *)
8184 (* *)
8185 (* Validity Constraint: One ID per Element Type *)
8186 (* No element type may have more than one ID attribute specified. *)
8187 (*--------------------------------------------------------------------*)
8188 (* print an error if the element already has an ID attribute. *)
8189 (* print a warning if the attr. is already defined for this element. *)
8190 (*--------------------------------------------------------------------*)
8191 (* return the new application data. *)
8192 (*--------------------------------------------------------------------*)
8193 fun addAttribute dtd (a,q) (eidx,attDef as (att,attType,attDefault,_)) =
8194 let
8195 val a1 = checkAttDef (a,q) attDef
8196
8197 fun doit nil = (false,[attDef],a)
8198 | doit (atts as (ad as (aidx,_,_,_))::rest) =
8199 if aidx=att
8200 then let val a1 = if !O_INTEROPERABILITY andalso !O_WARN_MULT_ATT_DEF
8201 then let val warn = WARN_MULT_ATT_DEF
8202 (Index2Element dtd eidx,Index2AttNot dtd att)
8203 in hookWarning(a,(getPos q,warn))
8204 end
8205 else a
8206 in (true,atts,a1)
8207 end
8208 else (if aidx<att then (false,attDef::atts,a)
8209 else let val (redefined,atts1,a1) = doit rest
8210 in (redefined,ad::atts1,a1)
8211 end)
8212
8213 val {decl,atts,errAtts,...} = getElement dtd eidx
8214 val (defs,hadId) = getOpt(atts,(nil,false))
8215 val (redefined,defs1,a1) = doit defs
8216 val (newId,a1) = if isIdType attType
8217 then let val a1 = if hadId andalso (not redefined) andalso !O_VALIDATE
8218 then hookError(a,(getPos q,ERR_MULT_ID_ELEM
8219 (Index2Element dtd eidx)))
8220 else a
8221 in (true,a1)
8222 end
8223 else (hadId,a)
8224 val (_,defs1,a1) = doit defs
8225 val _ = setElement dtd (eidx,{decl = decl,
8226 atts = SOME(defs1,newId),
8227 errAtts = errAtts})
8228 in a1
8229 end
8230
8231 (*--------------------------------------------------------------------*)
8232 (* check whether a name starts with (a case variant of) "xml" and if *)
8233 (* yes, whether it is an allowed name from the spec. Cf. 3: *)
8234 (* *)
8235 (* This specification does not constrain ... names of the element *)
8236 (* types and attributes, except that names beginning with a match *)
8237 (* to (('X'|'x')('M'|'m')('L'|'l')) are reserved for standardization*)
8238 (* in this or future versions of this specification. *)
8239 (* *)
8240 (* and 2.10, 2.12: *)
8241 (* *)
8242 (* ... a special attribute named xml:space may be attached ... *)
8243 (* ... A special attribute named xml:lang may be inserted ... *)
8244 (* *)
8245 (* print an error if the name is reserved and not standardized. *)
8246 (*--------------------------------------------------------------------*)
8247 fun startsWithXml name =
8248 case name
8249 of c1::c2::c3::cs => (c1=0wx58 orelse c1=0wx78) andalso
8250 (c2=0wx4D orelse c2=0wx6D) andalso (c3=0wx4C orelse c3=0wx6C)
8251 | _ => false
8252 fun checkAttName (a,q) name =
8253 if !O_CHECK_RESERVED andalso startsWithXml name then
8254 case name
8255 of [0wx78,0wx6d,0wx6c,0wx3a,0wx6c,0wx61,0wx6e,0wx67] (* ":lang" *) => a
8256 | [0wx78,0wx6d,0wx6c,0wx3a,0wx73,0wx70,0wx61,0wx63,0wx65] (* ":space" *) => a
8257 | _ => hookError(a,(getPos q,ERR_RESERVED(name,IT_ATT_NAME)))
8258 else a
8259 fun checkElemName (a,q) name =
8260 if !O_CHECK_RESERVED andalso startsWithXml name
8261 then hookError(a,(getPos q,ERR_RESERVED(name,IT_ELEM)))
8262 else a
8263
8264 (*--------------------------------------------------------------------*)
8265 (* check for each element in the dtd, whether a name token occurs *)
8266 (* more than once in its enumerated attribute types. *)
8267 (* *)
8268 (* print a warning for each element where this is true. *)
8269 (* *)
8270 (* return nothing. *)
8271 (*--------------------------------------------------------------------*)
8272 fun checkMultEnum dtd (a,q) =
8273 if !O_INTEROPERABILITY andalso !O_WARN_MULT_ENUM then
8274 let
8275 fun doElem a idx =
8276 let
8277 (*-----------------------------------------------------*)
8278 (* for each i, add i to yet if it not in that list. *)
8279 (* otherwise add it to dup. *)
8280 (*-----------------------------------------------------*)
8281 fun do_list yd nil = yd
8282 | do_list (yet,dup) (i::is) =
8283 let val yd' = case insertNewInt (i,yet)
8284 of NONE => (yet,insertInt (i,dup))
8285 | SOME new => (new,dup)
8286 in do_list yd' is
8287 end
8288 (*-----------------------------------------------------*)
8289 (* For each enumerated attribute type call the appro- *)
8290 (* priate function. *)
8291 (*-----------------------------------------------------*)
8292 fun doit (yet,dup) nil = dup
8293 | doit (yet,dup) ((_,attType,_,_)::rest) =
8294 case attType
8295 of AT_GROUP is => doit (do_list (yet,dup) is) rest
8296 | AT_NOTATION is => doit (do_list (yet,dup) is) rest
8297 | _ => doit (yet,dup) rest
8298
8299 val defs = case #atts(getElement dtd idx)
8300 of NONE => nil
8301 | SOME(defs,_) => defs
8302 val dup = doit (nil,nil) defs
8303 in
8304 if null dup then a
8305 else hookWarning(a,(getPos q,WARN_ENUM_ATTS
8306 (Index2Element dtd idx,map (Index2AttNot dtd) dup)))
8307 end
8308 (*-----------------------------------------------------------*)
8309 (* the highest used index is usedIndices-1. *)
8310 (*-----------------------------------------------------------*)
8311 val maxIdx = maxUsedElem dtd
8312
8313 fun doit a i = if i>maxIdx then a else doit (doElem a i) (i+1)
8314 in
8315 doit a 0
8316 end
8317 else a
8318
8319 (*--------------------------------------------------------------------*)
8320 (* check for all id names refereneced by some IDREF attribute whether *)
8321 (* it was also declared by an ID attribute. *)
8322 (* *)
8323 (* print an error if a referenced ID name was not defined. *)
8324 (* *)
8325 (* return nothing. *)
8326 (*--------------------------------------------------------------------*)
8327 fun checkDefinedIds dtd (a,q) =
8328 if !O_VALIDATE then
8329 let
8330 val maxId = maxUsedId dtd
8331
8332 fun doOne a i = let val (decl,refs) = getId dtd i
8333 in if decl orelse null refs then a
8334 else hookError(a,(hd refs,ERR_UNDECL_ID(Index2Id dtd i,tl refs)))
8335 end
8336 fun doAll a i = if i>maxId then a else doAll (doOne a i) (i+1)
8337 in
8338 doAll a 0
8339 end
8340 else a
8341
8342 (*--------------------------------------------------------------------*)
8343 (* check for all declared unparsed entities, whether their notations *)
8344 (* have been declared. *)
8345 (* *)
8346 (* print an error if a notation was not declared. *)
8347 (* *)
8348 (* return nothing. *)
8349 (*--------------------------------------------------------------------*)
8350 fun checkUnparsed dtd a =
8351 if !O_VALIDATE then
8352 let
8353 val maxGen = maxUsedGen dtd
8354
8355 fun doOne a i =
8356 case getGenEnt dtd i
8357 of (GE_UNPARSED(_,nidx,pos),_) =>
8358 if hasNotation dtd nidx then a
8359 else hookError(a,(pos,ERR_UNDECLARED
8360 (IT_NOTATION,Index2AttNot dtd nidx,LOC_NONE)))
8361 | _ => a
8362 fun doAll a i = if i>maxGen then a else doAll (doOne a i) (i+1)
8363 in
8364 doAll a 0
8365 end
8366 else a
8367 end
8368 (* stop of ../../Parser/Dtd/dtdDeclare.sml *)
8369 (* start of ../../Parser/Dtd/dtdAttributes.sml *)
8370 (*--------------------------------------------------------------------------*)
8371 (* Structure: DtdAttributes *)
8372 (* *)
8373 (* Exceptions raised by functions in this structure: *)
8374 (* checkAttValue : AttValue InternalError *)
8375 (* checkDefinedIds : none *)
8376 (* genMissingAtts : none *)
8377 (* makeAttValue : AttValue InternalError *)
8378 (*--------------------------------------------------------------------------*)
8379 functor DtdAttributes (structure Dtd : Dtd
8380 structure Entities : Entities
8381 structure ParserOptions : ParserOptions) =
8382 struct
8383 structure DtdDeclare = DtdDeclare (structure Dtd = Dtd
8384 structure Entities = Entities
8385 structure ParserOptions = ParserOptions)
8386 open
8387 UniChar UniClasses UtilList
8388 Base Dtd DtdDeclare Errors Entities HookData ParserOptions
8389
8390 val THIS_MODULE = "DtdAttributes"
8391
8392 exception AttValue of AppData
8393
8394 (*--------------------------------------------------------------------*)
8395 (* this is the list of language codes in ISO 639. *)
8396 (*--------------------------------------------------------------------*)
8397 val iso639codes =
8398 Vector.fromList
8399 ["AA","AB","AF","AM","AR","AS","AY","AZ",
8400 "BA","BE","BG","BH","BI","BN","BO","BR",
8401 "CA","CO","CS","CY",
8402 "DA","DE","DZ",
8403 "EL","EN","EO","ES","ET","EU",
8404 "FA","FI","FJ","FO","FR","FY",
8405 "GA","GD","GL","GN","GU",
8406 "HA","HE","HI","HR","HU","HY",
8407 "IA","ID","IE","IK","IN","IS","IT","IU","IW",
8408 "JA","JI","JW",
8409 "KA","KK","KL","KM","KN","KO","KS","KU","KY",
8410 "LA","LN","LO","LT","LV",
8411 "MG","MI","MK","ML","MN","MO","MR","MS","MT","MY",
8412 "NA","NE","NL","NO",
8413 "OC","OM","OR",
8414 "PA","PL","PS","PT",
8415 "QU",
8416 "RM","RN","RO","RU","RW",
8417 "SA","SD","SG","SH","SI","SK","SL","SM","SN","SO","SQ","SR","SS","ST","SU","SV","SW",
8418 "TA","TE","TG","TH","TI","TK","TL","TN","TO","TR","TS","TT","TW",
8419 "UG","UK","UR","UZ",
8420 "VI","VO",
8421 "WO",
8422 "XH",
8423 "YI","YO",
8424 "ZA","ZH","ZU"]
8425
8426 (*--------------------------------------------------------------------*)
8427 (* a two-dimensional field [0..25][0..25] of booleans for ISO 639. *)
8428 (*--------------------------------------------------------------------*)
8429 val iso639field =
8430 let
8431 val arr = Array.tabulate(26,fn _ => Array.array(26,false))
8432 val _ = Vector.map
8433 (fn s => Array.update(Array.sub(arr,ord(String.sub(s,0))-65),
8434 ord(String.sub(s,1))-65,
8435 true))
8436 iso639codes
8437 in Vector.tabulate(26,fn i => Array.extract (Array.sub(arr,i),0,NONE))
8438 end
8439
8440 (*--------------------------------------------------------------------*)
8441 (* for a letter, compute ord(toUpper c)-ord(#"A"), for subscripting. *)
8442 (*--------------------------------------------------------------------*)
8443 val toUpperMask = Chars.notb(0wx20)
8444 fun cIndex c = Chars.toInt(Chars.andb(c,toUpperMask)-0wx41)
8445
8446 (*--------------------------------------------------------------------*)
8447 (* are these two letters an ISO 639 code? *)
8448 (*--------------------------------------------------------------------*)
8449 fun isIso639 (c1,c2) =
8450 if !O_CHECK_ISO639 then
8451 Vector.sub(Vector.sub(iso639field,cIndex c1),cIndex c2)
8452 handle Subscript => false
8453 else isAsciiLetter c1 andalso isAsciiLetter c2
8454
8455 (*--------------------------------------------------------------------*)
8456 (* does this match Subcode ('-' Subcode)* ? *)
8457 (* is this a sequence of ('-' Subcode) ? *)
8458 (* Iana codes and user codes also end on ([a-z] | [A-Z])+ *)
8459 (*--------------------------------------------------------------------*)
8460 fun isSubcode' nil = false
8461 | isSubcode' (c::cs) =
8462 let fun doit nil = true
8463 | doit (c::cs) = if c=0wx2D then isSubcode' cs
8464 else isAsciiLetter c andalso doit cs
8465 in isAsciiLetter c andalso doit cs
8466 end
8467 fun isSubcode nil = true
8468 | isSubcode (c::cs) = c=0wx2D andalso isSubcode' cs
8469 val isIanaUser = isSubcode'
8470
8471 (*--------------------------------------------------------------------*)
8472 (* Check whether a "xml:lang" attribute matches the LanguageID *)
8473 (* production. 2.12: *)
8474 (* *)
8475 (* [33] LanguageID ::= Langcode ('-' Subcode)* *)
8476 (* [34] Langcode ::= ISO639Code | IanaCode | UserCode *)
8477 (* [35] ISO639Code ::= ([a-z] | [A-Z]) ([a-z] | [A-Z]) *)
8478 (* [36] IanaCode ::= ('i' | 'I') '-' ([a-z] | [A-Z])+ *)
8479 (* [37] UserCode ::= ('x' | 'X') '-' ([a-z] | [A-Z])+ *)
8480 (* [38] Subcode ::= ([a-z] | [A-Z])+ *)
8481 (* *)
8482 (* print an error and raise AttValue if the "xml:lang" attribute does *)
8483 (* not have a valid value. *)
8484 (*--------------------------------------------------------------------*)
8485 fun checkAttSpec (a,q) (aidx,cs) =
8486 if !O_CHECK_LANGID andalso aidx=xmlLangIdx
8487 then let val valid = case cs
8488 of c::0wx2D::cs' => (c=0wx49 orelse
8489 c=0wx69 orelse
8490 c=0wx58 orelse
8491 c=0wx78) andalso isIanaUser cs'
8492 | c1::c2::cs' => isIso639 (c1,c2) andalso isSubcode cs'
8493 | _ => false
8494 in
8495 if valid then a
8496 else raise AttValue(hookError(a,(getPos q,ERR_ATT_IS_NOT(cs,IT_LANG_ID))))
8497 end
8498 else a
8499
8500 (*--------------------------------------------------------------------*)
8501 (* Normalize an attribute value of type other than CDATA, and split *)
8502 (* it into tokens at space characters. Cf. 3.3.3: *)
8503 (* *)
8504 (* ... If the declared value is not CDATA, then the XML processor *)
8505 (* must further process the normalized attribute value by dis- *)
8506 (* carding any leading and trailing space (#x20) characters, and by *)
8507 (* replacing sequences of space (#x20) characters by a single space *)
8508 (* (#x20) character. *)
8509 (* *)
8510 (* replacement of references is already done when parsing the literal,*)
8511 (* thus we need only do whitespace normalization. we don't need to *)
8512 (* take care of the 3rd rule since replacement of sequences of #x20 *)
8513 (* and then splitting subsumes its effect. *)
8514 (* *)
8515 (* return the list of tokens as character lists and the normalized *)
8516 (* value as a char vector. *)
8517 (*--------------------------------------------------------------------*)
8518 fun splitAttValue av =
8519 let
8520 fun doOne nil = (nil,nil,nil)
8521 | doOne (c::cs) = if c=0wx20 then let val (toks,ys) = doAll true cs
8522 in (nil,toks,ys)
8523 end
8524 else let val (tok,toks,ys) = doOne cs
8525 in ((c::tok),toks,c::ys)
8526 end
8527 and doAll addS nil = (nil,nil)
8528 | doAll addS (c::cs) = if c=0wx20 then doAll addS cs
8529 else let val (tok,toks,ys) = doOne cs
8530 in ((c::tok)::toks,
8531 if addS then 0wx20::c::ys else c::ys)
8532 end
8533
8534 val (tokens,normed) = doAll false av
8535 in (Data2Vector normed,tokens)
8536 end
8537 (*--------------------------------------------------------------------*)
8538 (* normalize an attribute value other than CDATA according to 3.3.3. *)
8539 (* *)
8540 (* return the normalized att value as a Vector. *)
8541 (*--------------------------------------------------------------------*)
8542 fun normAttValue av =
8543 let fun doOne nil = nil
8544 | doOne (c::cs) = if c=0wx20 then doAll true cs
8545 else c::doOne cs
8546 and doAll addS nil = nil
8547 | doAll addS (c::cs) = if c=0wx20 then doAll addS cs
8548 else let val ys = doOne cs
8549 in if addS then 0wx20::c::ys else c::ys
8550 end
8551 val normed = doAll false av
8552 in Data2Vector normed
8553 end
8554
8555 (*--------------------------------------------------------------------*)
8556 (* Check whether a sequence of chars forms a name (token). *)
8557 (*--------------------------------------------------------------------*)
8558 fun isNmToken cs = List.all isName cs
8559 fun isaName nil = false
8560 | isaName (c::cs) = isNms c andalso List.all isName cs
8561
8562 (*--------------------------------------------------------------------*)
8563 (* Check whether a list of tokens is a single what fulfilling isWhat. *)
8564 (* print an error and raise AttValue if it is not. *)
8565 (*--------------------------------------------------------------------*)
8566 fun checkOne (isWhat,what,detail) (a,q) toks =
8567 case toks
8568 of nil => raise AttValue (hookError(a,(getPos q,ERR_EXACTLY_ONE detail)))
8569 | [one] => if isWhat one then one
8570 else raise AttValue(hookError(a,(getPos q,ERR_ATT_IS_NOT(one,what))))
8571 | more => raise AttValue(hookError(a,(getPos q,ERR_AT_MOST_ONE detail)))
8572 (*--------------------------------------------------------------------*)
8573 (* Check whether a list of tokens is non-empty and all elements ful- *)
8574 (* fil isWhat. *)
8575 (* print an error and raise AttValue if not. *)
8576 (*--------------------------------------------------------------------*)
8577 fun checkList (isWhat,what,detail) (a,q) toks =
8578 case toks
8579 of nil => raise AttValue (hookError(a,(getPos q,ERR_AT_LEAST_ONE detail)))
8580 | _ => app (fn one => if isWhat one then ()
8581 else let val err = ERR_ATT_IS_NOT(one,what)
8582 in raise AttValue(hookError(a,(getPos q,err)))
8583 end) toks
8584 (*--------------------------------------------------------------------*)
8585 (* Convert a list of tokens into an ID att value. 3.3.1: *)
8586 (* *)
8587 (* Validity Constraint: ID *)
8588 (* Values of type ID must match the Name production. *)
8589 (* *)
8590 (* Validity Constraint: ID *)
8591 (* ... A name must not appear more than once in an XML document as *)
8592 (* a value of this type; i.e., ID values must uniquely identify the *)
8593 (* elements which bear them. *)
8594 (* *)
8595 (* mark the value as used, print an error and raise AttValue if it *)
8596 (* was already used. *)
8597 (* print an error and raise AttValue if it is not a name. *)
8598 (*--------------------------------------------------------------------*)
8599 fun takeId (dtd,inDtd) (a,q) toks =
8600 let val one = checkOne (isaName,IT_NAME,IT_ID_NAME) (a,q) toks
8601 val idx = Id2Index dtd one
8602 val _ = if inDtd then ()
8603 else let val (decl,refs) = getId dtd idx
8604 in if decl then let val err = ERR_REPEATED_ID one
8605 in raise AttValue (hookError(a,(getPos q,err)))
8606 end
8607 else setId dtd (idx,(true,refs))
8608 end
8609 in (SOME(AV_ID idx),a)
8610 end
8611
8612 (*--------------------------------------------------------------------*)
8613 (* Convert a list of tokens into an IDREF/IDREFS att value. 3.3.1: *)
8614 (* *)
8615 (* Validity Constraint: IDREF *)
8616 (* Values of type IDREF must match the Name production. *)
8617 (* *)
8618 (* print an error an raise AttValue if it is not a (list of) name(s). *)
8619 (*--------------------------------------------------------------------*)
8620 fun setIdRef (dtd,q) idx =
8621 let val (decl,refs) = getId dtd idx
8622 in setId dtd (idx,(decl,getPos q::refs))
8623 end
8624 fun takeIdref (dtd,_) (a,q) toks =
8625 let val one = checkOne (isaName,IT_NAME,IT_ID_NAME) (a,q) toks
8626 val idx=Id2Index dtd one
8627 val _ = setIdRef (dtd,q) idx
8628 in (SOME(AV_IDREF idx),a)
8629 end
8630 fun takeIdrefs (dtd,_) (a,q) toks =
8631 let val _ = checkList (isaName,IT_NAME,IT_ID_NAME) (a,q) toks
8632 val idxs = map (Id2Index dtd) toks
8633 val _ = app (setIdRef (dtd,q)) idxs
8634 in (SOME(AV_IDREFS idxs),a)
8635 end
8636
8637 (*--------------------------------------------------------------------*)
8638 (* Convert a list of tokens into an ENTITY/IES att value. 3.3.1: *)
8639 (* *)
8640 (* Validity Constraint: Entity Name *)
8641 (* Values of type ENTITY must match the Name production... *)
8642 (* must match the name of an unparsed entity declared in the DTD. *)
8643 (* *)
8644 (* print an error and raise AttValue if a token is not a name. *)
8645 (* print an error and raise AttValue if an entity is undeclared or a *)
8646 (* parsed entity. *)
8647 (*--------------------------------------------------------------------*)
8648 fun checkEntity (dtd,inDtd) (a,q) name =
8649 let val idx = GenEnt2Index dtd name
8650 val (ent,_) = getGenEnt dtd idx
8651 val _ = if inDtd then ()
8652 else case ent
8653 of GE_UNPARSED _ => ()
8654 | GE_NULL => let val err = ERR_UNDECLARED(IT_GEN_ENT,name,LOC_NONE)
8655 in raise AttValue (hookError(a,(getPos q,err)))
8656 end
8657 | _ => let val err = ERR_MUST_BE_UNPARSED(name,LOC_NONE)
8658 in raise AttValue (hookError(a,(getPos q,err)))
8659 end
8660 in idx
8661 end
8662 fun takeEntity (dtd,inDtd) (aq as (a,_)) toks =
8663 let val one = checkOne (isaName,IT_NAME,IT_ENT_NAME) aq toks
8664 val idx = checkEntity (dtd,inDtd) aq one
8665 in (SOME(AV_ENTITY idx),a)
8666 end
8667 fun takeEntities (dtd,inDtd) (aq as (a,_)) toks =
8668 let val _ = checkList (isaName,IT_NAME,IT_ENT_NAME) aq toks
8669 val idxs = map (checkEntity (dtd,inDtd) aq) toks
8670 in (SOME(AV_ENTITIES idxs),a)
8671 end
8672
8673 (*--------------------------------------------------------------------*)
8674 (* Convert a list of tokens into a NOTATION att value. 3.3.1: *)
8675 (* *)
8676 (* Validity Constraint: Notation Attributes *)
8677 (* Values of this type must match one of the notation names *)
8678 (* included in the declaration. *)
8679 (* *)
8680 (* print an error and raise AttValue if it is not a single name. *)
8681 (* print an error and raise AttValue if the notation's index is not *)
8682 (* in the list given as 1st arg. *)
8683 (*--------------------------------------------------------------------*)
8684 fun takeNotation is (dtd,inDtd) (aq as (a,q)) toks =
8685 let val one = checkOne (isaName,IT_NAME,IT_NOT_NAME) aq toks
8686 val idx = AttNot2Index dtd one
8687 val _ = if member idx is then ()
8688 else let val nots = map (Index2AttNot dtd) is
8689 val err = ERR_MUST_BE_AMONG(IT_NOT_NAME,one,nots)
8690 in raise AttValue (hookError(a,(getPos q,err)))
8691 end
8692 in (SOME(AV_NOTATION(is,idx)),a)
8693 end
8694
8695 (*--------------------------------------------------------------------*)
8696 (* Convert a list of tokens into an enumerated att value. 3.3.1: *)
8697 (* *)
8698 (* Validity Constraint: Enumeration *)
8699 (* Values of this type must match one of the Nmtoken tokens in *)
8700 (* the declaration. *)
8701 (* *)
8702 (* print an error and raise AttValue if it is not a single name token.*)
8703 (* print an error and raise AttValue if the token's index is not *)
8704 (* in the list given as 1st arg. *)
8705 (*--------------------------------------------------------------------*)
8706 fun takeGroup is (dtd,_) (aq as (a,q)) toks =
8707 let val one = checkOne (isNmToken,IT_NMTOKEN,IT_NMTOKEN) aq toks
8708 val idx = AttNot2Index dtd one
8709 val _ = if member idx is then ()
8710 else let val toks = map (Index2AttNot dtd) is
8711 val err = ERR_MUST_BE_AMONG(IT_NMTOKEN,one,toks)
8712 in raise AttValue (hookError(a,(getPos q,err)))
8713 end
8714 in (SOME(AV_GROUP(is,idx)),a)
8715 end
8716
8717 (*--------------------------------------------------------------------*)
8718 (* Given an attribute type and a list of characters, construct the *)
8719 (* corresponding AttValue. *)
8720 (* *)
8721 (* print an error (and possibly raise AttValue) if the attribute *)
8722 (* is ill-formed. *)
8723 (*--------------------------------------------------------------------*)
8724 fun makeAttValue dtd (a,q) (aidx,attType,ext,inDtd,cs) =
8725 if attType=AT_CDATA
8726 then let val cv = Data2Vector cs
8727 in if !O_VALIDATE andalso hasDtd dtd
8728 then (cv,(SOME(AV_CDATA cv),checkAttSpec (a,q) (aidx,cs)))
8729 else (cv,(NONE,a))
8730 end
8731 else
8732 if !O_VALIDATE andalso hasDtd dtd then
8733 let
8734 val a1 = checkAttSpec (a,q) (aidx,cs)
8735 val (cv,toks) = splitAttValue cs
8736 val a2 =
8737 if ext andalso standsAlone dtd
8738 then let val cdata = Data2Vector cs
8739 in if cdata=cv then a1
8740 else let val err = ERR_STANDALONE_NORM(Index2AttNot dtd aidx)
8741 val _ = setStandAlone dtd (not (!O_ERROR_MINIMIZE))
8742 in hookError(a1,(getPos q,err))
8743 end
8744 end
8745 else a1
8746 in case attType
8747 of AT_NMTOKEN => (cv,(SOME(AV_NMTOKEN(checkOne(isNmToken,IT_NMTOKEN,
8748 IT_NMTOKEN) (a2,q) toks)),a2))
8749 | AT_NMTOKENS => (cv,(SOME(AV_NMTOKENS toks),a2)) before
8750 checkList(isNmToken,IT_NMTOKEN,IT_NMTOKEN) (a2,q) toks
8751 | AT_ID => (cv,takeId (dtd,inDtd) (a2,q) toks)
8752 | AT_IDREF => (cv,takeIdref (dtd,inDtd) (a2,q) toks)
8753 | AT_IDREFS => (cv,takeIdrefs (dtd,inDtd) (a2,q) toks)
8754 | AT_ENTITY => (cv,takeEntity (dtd,inDtd) (a2,q) toks)
8755 | AT_ENTITIES => (cv,takeEntities (dtd,inDtd) (a2,q) toks)
8756 | AT_GROUP is => (cv,takeGroup is (dtd,inDtd) (a2,q) toks)
8757 | AT_NOTATION is => (cv,takeNotation is (dtd,inDtd) (a2,q) toks)
8758 | AT_CDATA => raise InternalError(THIS_MODULE,"makeAttValue",
8759 "AT_CDATA in the innermost case")
8760 end
8761 else (normAttValue cs,(NONE,a))
8762
8763 (*--------------------------------------------------------------------*)
8764 (* given an attribute value literal and the attribute type, generate *)
8765 (* the AttValue, and check whether it complies with its default value.*)
8766 (* If yes, make an AttPresent value out of it. *)
8767 (* See 3.3.2: *)
8768 (* *)
8769 (* Validity Constraint: Fixed Attribute Default *)
8770 (* If an attribute has a default value declared with the #FIXED *)
8771 (* keyword, instances of that attribute must match the default *)
8772 (* value. *)
8773 (* *)
8774 (* print an error and raise AttValue if the attribute value doesn't *)
8775 (* comply. *)
8776 (* *)
8777 (* return the value as a AttPresent value. *)
8778 (*--------------------------------------------------------------------*)
8779 fun checkAttValue dtd (a,q) ((aidx,attType,defVal,ext),literal,cs) =
8780 let val (cv,(av,a1)) = makeAttValue dtd (a,q) (aidx,attType,ext,false,cs)
8781 in if !O_VALIDATE andalso hasDtd dtd then
8782 case defVal
8783 of AD_FIXED((def,cv',_),_) =>
8784 if cv=cv' then (AP_PRESENT(literal,cv,av),a1)
8785 else raise AttValue
8786 (hookError(a1,(getPos q,ERR_FIXED_VALUE(Index2AttNot dtd aidx,cv,cv'))))
8787 | _ => (AP_PRESENT(literal,cv,av),a1)
8788 else (AP_PRESENT(literal,cv,av),a1)
8789 end
8790
8791 (*--------------------------------------------------------------------*)
8792 (* check a defaulted attribute value for validity. *)
8793 (* *)
8794 (* since the lexical constraints are checked when the default is *)
8795 (* declared we only need to check whether notations are declared and *)
8796 (* entities are declared and unparsed. An ID attribute cannot be *)
8797 (* defaulted, so no need to check for duplicate ID attributes. *)
8798 (*--------------------------------------------------------------------*)
8799 fun checkDefaultValue dtd (a,q,pos) av =
8800 let
8801 fun checkEntity (idx,a) =
8802 let val (ent,_) = getGenEnt dtd idx
8803 in case ent
8804 of GE_UNPARSED _ => a
8805 | GE_NULL => hookError(a,(getPos q,ERR_UNDECLARED
8806 (IT_GEN_ENT,Index2GenEnt dtd idx,
8807 LOC_ATT_DEFAULT pos)))
8808 | _ => hookError(a,(getPos q,ERR_MUST_BE_UNPARSED
8809 (Index2GenEnt dtd idx,LOC_ATT_DEFAULT pos)))
8810 end
8811
8812 fun checkNotation (idx,a) =
8813 if hasNotation dtd idx then a
8814 else hookError(a,(getPos q,ERR_UNDECLARED
8815 (IT_NOTATION,Index2AttNot dtd idx,LOC_ATT_DEFAULT pos)))
8816 in
8817 case av
8818 of SOME(AV_ENTITY i) => checkEntity (i,a)
8819 | SOME(AV_ENTITIES is) => foldl checkEntity a is
8820 | SOME(AV_NOTATION(_,i)) => checkNotation(i,a)
8821 | _ => a
8822 end
8823
8824 (*--------------------------------------------------------------------*)
8825 (* Generate the attributes not specified in a start-tag, the defs of *)
8826 (* these atts and the specified atts given as argument. 3.3.2: *)
8827 (* *)
8828 (* If the declaration is neither #REQUIRED nor #IMPLIED, then the *)
8829 (* AttValue value contains the declared default value; ... If a *)
8830 (* default value is declared, when an XML processor encounters an *)
8831 (* omitted attribute, it is to behave as though the attribute were *)
8832 (* present with the declared default value. *)
8833 (* *)
8834 (* Validity Constraint: Required Attribute *)
8835 (* If the default declaration is the keyword #REQUIRED, then the *)
8836 (* attribute must be specified for all elements of the type in the *)
8837 (* attribute-list declaration. *)
8838 (* *)
8839 (* print an error if a required attribute was omitted. *)
8840 (* *)
8841 (* return the AttSpecList of all attributes for this tag. *)
8842 (*--------------------------------------------------------------------*)
8843 fun genMissingAtts dtd (a,q) (defs,specd) =
8844 let
8845 fun default a (idx,(v as (_,_,av),(pos,checked)),ext) =
8846 let val a1 = if ext andalso !O_VALIDATE andalso standsAlone dtd
8847 then let val err = ERR_STANDALONE_DEF(Index2AttNot dtd idx)
8848 val _ = setStandAlone dtd (not (!O_ERROR_MINIMIZE))
8849 in hookError(a,(getPos q,err))
8850 end
8851 else a
8852 val a2 = if !O_VALIDATE andalso not (!checked andalso !O_ERROR_MINIMIZE)
8853 then checkDefaultValue dtd (a1,q,pos) av before checked := true
8854 else a1
8855 in (AP_DEFAULT v,a1)
8856 end
8857 fun doit a nil = (specd,a)
8858 | doit a ((idx,_,dv,ext)::rest) =
8859 let val (value,a1) =
8860 case dv
8861 of AD_DEFAULT v => default a (idx,v,ext)
8862 | AD_FIXED v => default a (idx,v,ext)
8863 | AD_IMPLIED => (AP_IMPLIED,a)
8864 | AD_REQUIRED =>
8865 let val a1 = if not (!O_VALIDATE) then a
8866 else hookError(a,(getPos q,
8867 ERR_MISSING_ATT(Index2AttNot dtd idx)))
8868 in (AP_MISSING,a1)
8869 end
8870 val (other,a2) = doit a1 rest
8871 in ((idx,value,NONE)::other,a2)
8872 end
8873 in doit a defs
8874 end
8875
8876 (*--------------------------------------------------------------------*)
8877 (* process an undeclared attribute in a start-tag. *)
8878 (* At option, an error message is generated only once for the same *)
8879 (* attribute and element. *)
8880 (* *)
8881 (* possibly print an error. *)
8882 (* *)
8883 (* return nothing. *)
8884 (*--------------------------------------------------------------------*)
8885 fun handleUndeclAtt dtd (a,q) (aidx,att,eidx,elem) =
8886 if !O_ERROR_MINIMIZE then
8887 let val {decl,atts,errAtts} = getElement dtd eidx
8888 in if member aidx errAtts then a
8889 else let val a1 = if !O_VALIDATE andalso hasDtd dtd
8890 then let val err = ERR_UNDECL_ATT(att,elem)
8891 in hookError(a,(getPos q,err))
8892 end
8893 else a
8894 val a2 = checkAttName (a1,q) att
8895 val _ = setElement dtd (eidx,{decl = decl,
8896 atts = atts,
8897 errAtts = aidx::errAtts})
8898 in a2
8899 end
8900 end
8901 else let val a1 = if !O_VALIDATE andalso hasDtd dtd
8902 then hookError(a,(getPos q,ERR_UNDECL_ATT(att,elem)))
8903 else a
8904 in checkAttName (a1,q) att
8905 end
8906
8907 end
8908 (* stop of ../../Parser/Dtd/dtdAttributes.sml *)
8909 (* start of ../../Parser/Dtd/dtdManager.sml *)
8910 (*--------------------------------------------------------------------------*)
8911 (* Structure: Dtd *)
8912 (* *)
8913 (* Depends on: *)
8914 (* UniChar *)
8915 (* DtdAttributes *)
8916 (* DtdElements *)
8917 (* DtdEntities *)
8918 (* DtdNotations *)
8919 (* DtdStandalone *)
8920 (* *)
8921 (* Exceptions raised by functions in this structure: *)
8922 (* initDtdTables : none *)
8923 (* AttIdx2String : NoSuchSymbol *)
8924 (* ElemIdx2String : NoSuchIndex *)
8925 (* GenEntIdx2String : NoSuchIndex *)
8926 (* IdIdx2String : NoSuchIndex *)
8927 (* NotIdx2String : NoSuchIndex *)
8928 (* GenEntity2String : NoSuchIndex *)
8929 (* ElemInfo2String : NoSuchIndex NoSuchSymbol *)
8930 (* printGenEntTable : NoSuchIndex *)
8931 (* printElementTable : NoSuchIndex NoSuchSymbol *)
8932 (* printDtdTables : NoSuchIndex NoSuchSymbol *)
8933 (*--------------------------------------------------------------------------*)
8934 signature DtdManager =
8935 sig
8936 include Entities
8937 include Dtd
8938
8939 exception AttValue of AppData
8940
8941 val makeAttValue : Dtd -> AppData * State
8942 -> int * Base.AttType * bool * bool * UniChar.Data
8943 -> UniChar.Vector * (Base.AttValue option * AppData)
8944 val checkAttValue : Dtd -> AppData * State
8945 -> Base.AttDef * UniChar.Vector * UniChar.Data
8946 -> HookData.AttPresent * AppData
8947 val genMissingAtts : Dtd -> AppData * State
8948 -> Base.AttDefList * HookData.AttSpecList -> HookData.AttSpecList * AppData
8949 val handleUndeclAtt : Dtd -> AppData * State
8950 -> int * UniChar.Data * int * UniChar.Data -> AppData
8951 val handleUndeclElement : Dtd -> int -> Base.ElemInfo
8952
8953 val checkAttName : AppData * State -> UniChar.Data -> AppData
8954 val checkElemName : AppData * State -> UniChar.Data -> AppData
8955 val checkDefinedIds : Dtd -> AppData * State -> AppData
8956 val checkMultEnum : Dtd -> AppData * State -> AppData
8957 val checkPreDefined : Dtd -> AppData * State -> AppData
8958 val checkUnparsed : Dtd -> AppData -> AppData
8959
8960 val enterAttList : Dtd -> AppData * State -> int -> AppData
8961
8962 val addAttribute : Dtd -> AppData * State -> int * Base.AttDef -> AppData
8963 val addElement : Dtd -> AppData * State -> int * Base.ContentSpec * bool -> AppData
8964 val addGenEnt : Dtd -> AppData * State -> int * Base.GenEntity * bool -> AppData
8965 val addNotation : Dtd -> AppData * State -> int * Base.ExternalId -> AppData
8966 val addParEnt : Dtd -> AppData * State -> int * Base.ParEntity * bool -> AppData
8967 end
8968
8969 functor DtdManager (structure Dtd : Dtd
8970 structure Hooks : Hooks
8971 structure ParserOptions : ParserOptions) : DtdManager =
8972 struct
8973 structure Entities = Entities (structure Hooks = Hooks)
8974 structure DtdAttributes = DtdAttributes (structure Dtd = Dtd
8975 structure Entities = Entities
8976 structure ParserOptions = ParserOptions)
8977 open
8978 Dtd
8979 DtdAttributes
8980 end
8981 (* stop of ../../Parser/Dtd/dtdManager.sml *)
8982 (* start of ../../Parser/Parse/parseBase.sml *)
8983 signature ParseBase =
8984 sig
8985 include Dfa DtdManager Resolve DfaOptions ParserOptions
8986
8987 exception NoSuchChar of AppData * State
8988 exception NoSuchEntity of AppData * State
8989 exception NotFound of UniChar.Char * AppData * State
8990 exception SyntaxError of UniChar.Char * AppData * State
8991
8992 val expectedOrEnded : Errors.Expected * Errors.Location -> UniChar.Char -> Errors.Error
8993
8994 val recoverXml : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
8995 val recoverETag : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
8996 val recoverSTag : UniChar.Char * AppData * State -> bool * (UniChar.Char * AppData * State)
8997 val recoverDecl : bool -> UniChar.Char * AppData * State -> (UniChar.Char * AppData * State)
8998
8999 val useParamEnts : unit -> bool
9000 end
9001
9002 (*--------------------------------------------------------------------------*)
9003 (* Structure: ParseBase *)
9004 (*--------------------------------------------------------------------------*)
9005 (* This structure provides exceptions for the Parse functions, and strings *)
9006 (* for error generation (these strings don't really need to reside in their *)
9007 (* own structure, but like this the code is more easier to read). *)
9008 (*--------------------------------------------------------------------------*)
9009 functor ParseBase (structure Dtd : Dtd
9010 structure Hooks : Hooks
9011 structure Resolve : Resolve
9012 structure ParserOptions : ParserOptions) : ParseBase =
9013 struct
9014 structure DfaOptions = ParserOptions.DfaOptions
9015 structure Dfa = Dfa (structure DfaOptions = DfaOptions)
9016 structure DtdManager = DtdManager (structure Dtd = Dtd
9017 structure Hooks = Hooks
9018 structure ParserOptions = ParserOptions)
9019 open
9020 Base DtdManager DfaOptions Dfa Errors ParserOptions Resolve UniChar
9021
9022 exception NoSuchChar of AppData * State
9023 exception NoSuchEntity of AppData * State
9024 exception NotFound of UniChar.Char * AppData * State
9025 exception SyntaxError of UniChar.Char * AppData * State
9026
9027 fun expectedOrEnded (exp,ended) c =
9028 if c=0wx00 then ERR_ENDED_BY_EE ended
9029 else ERR_EXPECTED(exp,[c])
9030
9031 (*--------------------------------------------------------------------*)
9032 (* Besides "?>" also recognize ">" as end delimiter, because the typo *)
9033 (* might be an omitted "?". Also stop on "<"; then the entire "?>" *)
9034 (* was omitted; the "<" may not be consumed then. *)
9035 (* Within literals dont recognize ">" and "<", but only "?>"; then *)
9036 (* the typo is an omitted quote character. *)
9037 (*--------------------------------------------------------------------*)
9038 fun recoverXml caq =
9039 let
9040 fun do_lit ch (c,a,q) =
9041 case c
9042 of 0wx00 => (c,a,q)
9043 | 0wx3F (* #"?" *) =>
9044 let val (c1,a1,q1) = getChar (a,q)
9045 in if c1=0wx3E (* #">" *) then (c1,a1,q1)
9046 else do_lit ch (c1,a1,q1)
9047 end
9048 | _ => if c=ch then (getChar (a,q))
9049 else do_lit ch (getChar (a,q))
9050 fun doit (c,a,q) =
9051 case c
9052 of 0wx00 => (c,a,q)
9053 | 0wx22 (* #""""*) => doit (do_lit c (getChar (a,q)))
9054 | 0wx25 (* #"%" *) => (c,a,q)
9055 | 0wx26 (* #"&" *) => (c,a,q)
9056 | 0wx27 (* #"'" *) => doit (do_lit c (getChar (a,q)))
9057 | 0wx3C (* #"<" *) => (c,a,q)
9058 | 0wx3E (* #">" *) => (getChar (a,q))
9059 | _ => doit (getChar (a,q))
9060 in
9061 doit caq
9062 end
9063
9064 fun recoverETag caq =
9065 let
9066 fun do_lit ch (c,a,q) =
9067 case c
9068 of 0wx00 => (c,a,q)
9069 | _ => if c=ch then (getChar (a,q))
9070 else do_lit ch (getChar (a,q))
9071 fun doit (c,a,q) =
9072 case c
9073 of 0wx00 => (c,a,q)
9074 | 0wx22 (* #""""*) => doit (do_lit c (getChar (a,q)))
9075 | 0wx26 (* #"&" *) => (c,a,q)
9076 | 0wx27 (* #"'" *) => doit (do_lit c (getChar (a,q)))
9077 | 0wx3E (* #">" *) => (getChar (a,q))
9078 | 0wx3C (* #"<" *) => (c,a,q)
9079 | _ => doit (getChar (a,q))
9080 in
9081 doit caq
9082 end
9083
9084 fun recoverSTag caq =
9085 let
9086 fun do_lit ch (c,a,q) =
9087 case c
9088 of 0wx00 => (c,a,q)
9089 | _ => if c=ch then (getChar (a,q))
9090 else do_lit ch (getChar (a,q))
9091 fun doit (c,a,q) =
9092 case c
9093 of 0wx00 => (false,(c,a,q))
9094 | 0wx22 (* #""""*) => doit (do_lit c (getChar (a,q)))
9095 | 0wx26 (* #"&" *) => (false,(c,a,q))
9096 | 0wx27 (* #"'" *) => doit (do_lit c (getChar (a,q)))
9097 | 0wx2F (* #"/" *) => let val (c1,a1,q1) = getChar (a,q)
9098 in if c1=0wx3E (* #">" *) then (true,(c1,a1,q1))
9099 else doit (c1,a1,q1)
9100 end
9101 | 0wx3E (* #">" *) => (false,getChar (a,q))
9102 | 0wx3C (* #"<" *) => (false,(c,a,q))
9103 | _ => doit (getChar (a,q))
9104 in
9105 doit caq
9106 end
9107
9108 fun recoverDecl hasSubset caq =
9109 let
9110 fun do_lit ch (c,a,q) =
9111 if c=0wx00 then (c,a,q)
9112 else if c=ch then getChar (a,q)
9113 else do_lit ch (getChar(a,q))
9114 fun do_decl (c,a,q) =
9115 case c
9116 of 0wx00 => (c,a,q)
9117 | 0wx22 (* #"\""*) => do_decl (do_lit c (getChar (a,q)))
9118 | 0wx27 (* #"'" *) => do_decl (do_lit c (getChar (a,q)))
9119 | 0wx3E (* #">" *) => getChar (a,q)
9120 | _ => do_decl (getChar (a,q))
9121 fun do_subset (c,a,q) =
9122 case c
9123 of 0wx00 => (c,a,q)
9124 | 0wx3C (* #"<" *) => do_subset (do_decl (getChar (a,q)))
9125 | 0wx5D (* #"]" *) => getChar (a,q)
9126 | _ => do_subset (getChar (a,q))
9127 fun doit (c,a,q) =
9128 case c
9129 of 0wx00 => if isSpecial q then (c,a,q) else doit (getChar (a,q))
9130 | 0wx22 (* #"\""*) => doit (do_lit c (getChar (a,q)))
9131 | 0wx25 (* #"%" *) => if hasSubset then (c,a,q) else doit (getChar (a,q))
9132 | 0wx27 (* #"'" *) => doit (do_lit c (getChar (a,q)))
9133 | 0wx3C (* #"<" *) => (c,a,q)
9134 | 0wx3E (* #">" *) => getChar (a,q)
9135 | 0wx5B (* #"[" *) => if hasSubset then doit (do_subset (getChar (a,q)))
9136 else doit (getChar (a,q))
9137 | _ => doit (getChar (a,q))
9138 in doit caq
9139 end
9140
9141 fun useParamEnts() = !O_VALIDATE orelse !O_INCLUDE_PARAM_ENTS
9142 end
9143 (* stop of ../../Parser/Parse/parseBase.sml *)
9144 (* start of ../../Parser/Parse/parseNames.sml *)
9145
9146
9147
9148
9149
9150
9151
9152 signature ParseNames =
9153 sig
9154 include ParseBase
9155
9156 val parseName : UniChar.Char * AppData * State
9157 -> UniChar.Data * (UniChar.Char * AppData * State)
9158 val parseNmtoken : UniChar.Char * AppData * State
9159 -> UniChar.Data * (UniChar.Char * AppData * State)
9160
9161 val parseNameLit : UniChar.Data -> UniChar.Char * AppData * State
9162 -> UniChar.Data * UniChar.Data * (UniChar.Char * AppData * State)
9163 val parseEntName : UniChar.Data * UniChar.Data -> UniChar.Char * AppData * State
9164 -> bool * UniChar.Data * UniChar.Data * (UniChar.Char * AppData * State)
9165 end
9166
9167 (*--------------------------------------------------------------------------*)
9168 (* Structure: ParseNames *)
9169 (* *)
9170 (* Exceptions raised by functions in this structure: *)
9171 (* parseEntName : none *)
9172 (* parseName : NotFound *)
9173 (* parseNmtoken : NotFound *)
9174 (*--------------------------------------------------------------------------*)
9175 functor ParseNames (structure ParseBase : ParseBase)
9176 : ParseNames =
9177 struct
9178 open
9179 Errors ParseBase UniClasses
9180
9181 (*--------------------------------------------------------------------*)
9182 (* parse (the remainder of) a name or nmtoken. *)
9183 (* *)
9184 (* [5] Name ::= (Letter | '_' | ':') (NameChar)* *)
9185 (* *)
9186 (* raise NotFound if no name/name start character comes first. *)
9187 (* *)
9188 (* return the name as a list of characters, together with the next *)
9189 (* character and the remaining state. *)
9190 (*--------------------------------------------------------------------*)
9191 fun parseName' (c,a,q) =
9192 if isName c
9193 then let val (cs,caq1) = parseName'(getChar(a,q))
9194 in (c::cs,caq1)
9195 end
9196 else (nil,(c,a,q))
9197 fun parseName (c,a,q) =
9198 if isNms c
9199 then let val (cs,caq1) = parseName'(getChar(a,q))
9200 in (c::cs,caq1)
9201 end
9202 else raise NotFound(c,a,q)
9203 fun parseNmtoken (c,a,q) =
9204 if isName c
9205 then let val (cs,caq1) = parseName'(getChar(a,q))
9206 in (c::cs,caq1)
9207 end
9208 else raise NotFound(c,a,q)
9209
9210 (*--------------------------------------------------------------------*)
9211 (* parse a name, additionally accumulating its characters in reverse *)
9212 (* order to the first argument. *)
9213 (* *)
9214 (* raise NotFound if no name/name start character comes first. *)
9215 (*--------------------------------------------------------------------*)
9216 fun parseNameLit cs (c,a,q) =
9217 let fun doit (cs,ns) (c,a,q) =
9218 if isName c then doit (c::cs,c::ns) (getChar(a,q))
9219 else (cs,rev ns,(c,a,q))
9220 in
9221 if isNms c then doit (c::cs,[c]) (getChar(a,q))
9222 else raise NotFound(c,a,q)
9223 end
9224 (*--------------------------------------------------------------------*)
9225 (* parse a name, accumulating its reverse in the first arg text. This *)
9226 (* is useful for parsing of entity values, where entity references *)
9227 (* are parsed but bypassed, and must thus be accumulated together *)
9228 (* the other literal text. *)
9229 (* *)
9230 (* print an error if no name/name start character comes first. *)
9231 (* *)
9232 (* return a boolean indicating whether a name was found, the reverse *)
9233 (* name as a list of characters, concatenated with the text in the *)
9234 (* first arg, together with the next character and remaining state. *)
9235 (*--------------------------------------------------------------------*)
9236 fun parseEntName (lit,text) (c,a,q) =
9237 let
9238 fun doit (lit,text) (c,a,q) =
9239 if isName c then doit (c::lit,c::text) (getChar (a,q))
9240 else (true,lit,text,(c,a,q))
9241 in
9242 if isNms c then doit (c::lit,c::text) (getChar (a,q))
9243 else let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expAnEntName,[c])))
9244 in (false,lit,text,(c,a1,q))
9245 end
9246 end
9247
9248 end
9249
9250 (* stop of ../../Parser/Parse/parseNames.sml *)
9251 (* start of ../../Parser/Parse/parseMisc.sml *)
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262 signature ParseMisc =
9263 sig
9264 (*----------------------------------------------------------------------
9265 include ParseBase
9266
9267 val parseName : UniChar.Char * AppData * State
9268 -> UniChar.Data * (UniChar.Char * AppData * State)
9269 val parseNmtoken : UniChar.Char * AppData * State
9270 -> UniChar.Data * (UniChar.Char * AppData * State)
9271 val parseNameLit : UniChar.Data -> UniChar.Char * AppData * State
9272 -> UniChar.Data * UniChar.Data * (UniChar.Char * AppData * State)
9273 val parseEntName : UniChar.Data * UniChar.Data -> UniChar.Char * AppData * State
9274 -> bool * UniChar.Data * UniChar.Data * (UniChar.Char * AppData * State)
9275 ----------------------------------------------------------------------*)
9276 include ParseNames
9277
9278 val skipS : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
9279 val skipSopt : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
9280 val skipSmay : UniChar.Char * AppData * State -> bool * (UniChar.Char * AppData * State)
9281
9282 val parseSopt : UniChar.Data -> UniChar.Char * AppData * State
9283 -> UniChar.Data * (UniChar.Char * AppData * State)
9284 val parseSmay : UniChar.Data -> UniChar.Char * AppData * State
9285 -> bool * (UniChar.Data * (UniChar.Char * AppData * State))
9286
9287 val skipEq : UniChar.Char * AppData * State
9288 -> UniChar.Char * AppData * State
9289 val parseEq : UniChar.Char * AppData * State
9290 -> UniChar.Data * (UniChar.Char * AppData * State)
9291
9292 val parseComment : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
9293 val parseProcInstr : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
9294 end
9295
9296 (*--------------------------------------------------------------------------*)
9297 (* Structure: ParseMisc *)
9298 (* *)
9299 (* Exceptions raised by functions in this structure: *)
9300 (* skipS : none *)
9301 (* skipSopt : none *)
9302 (* skipSmay : none *)
9303 (* skipEq : SyntaxError *)
9304 (* skipComment : none *)
9305 (* parseComment : none *)
9306 (* parseProcInstr : none *)
9307 (*--------------------------------------------------------------------------*)
9308 functor ParseMisc (structure ParseBase : ParseBase)
9309 : ParseMisc =
9310 struct
9311 structure ParseNames = ParseNames (structure ParseBase = ParseBase)
9312
9313 open
9314 UniChar Errors ParseNames
9315
9316 (*--------------------------------------------------------------------*)
9317 (* parse a sequence of white space. 2.3: *)
9318 (* *)
9319 (* [3] S ::= (#x20 | #x9 | #xD | #xA)+ *)
9320 (*--------------------------------------------------------------------*)
9321 (* parse optional white space. *)
9322 (*--------------------------------------------------------------------*)
9323 (* Return type: Char * AppData * State *)
9324 (*--------------------------------------------------------------------*)
9325 fun skipSopt (c,a,q) =
9326 case c
9327 of 0wx09 => skipSopt (getChar (a,q))
9328 | 0wx0A => skipSopt (getChar (a,q))
9329 | 0wx20 => skipSopt (getChar (a,q))
9330 | _ => (c,a,q)
9331 fun parseSopt cs (c,a,q) =
9332 case c
9333 of 0wx09 => parseSopt (c::cs) (getChar (a,q))
9334 | 0wx0A => parseSopt (c::cs) (getChar (a,q))
9335 | 0wx20 => parseSopt (c::cs) (getChar (a,q))
9336 | _ => (cs,(c,a,q))
9337 (*--------------------------------------------------------------------*)
9338 (* parse optional white space. *)
9339 (*--------------------------------------------------------------------*)
9340 (* Return type: bool * (Char * AppData * State) *)
9341 (* the bool indicates whether white space was found or not. *)
9342 (*--------------------------------------------------------------------*)
9343 fun skipSmay (c,a,q) =
9344 case c
9345 of 0wx09 => (true,skipSopt (getChar (a,q)))
9346 | 0wx0A => (true,skipSopt (getChar (a,q)))
9347 | 0wx20 => (true,skipSopt (getChar (a,q)))
9348 | _ => (false,(c,a,q))
9349 fun parseSmay cs (c,a,q) =
9350 case c
9351 of 0wx09 => (true,parseSopt (c::cs) (getChar (a,q)))
9352 | 0wx0A => (true,parseSopt (c::cs) (getChar (a,q)))
9353 | 0wx20 => (true,parseSopt (c::cs) (getChar (a,q)))
9354 | _ => (false,(cs,(c,a,q)))
9355 (*--------------------------------------------------------------------*)
9356 (* parse required white space. *)
9357 (*--------------------------------------------------------------------*)
9358 (* print an error if no white space character is found. *)
9359 (*--------------------------------------------------------------------*)
9360 (* Return type: Char * AppData * State *)
9361 (*--------------------------------------------------------------------*)
9362 fun skipS (c,a,q) =
9363 case c
9364 of 0wx09 => skipSopt (getChar (a,q))
9365 | 0wx0A => skipSopt (getChar (a,q))
9366 | 0wx20 => skipSopt (getChar (a,q))
9367 | _ => (c,hookError(a,(getPos q,ERR_MISSING_WHITE)),q)
9368
9369 (*--------------------------------------------------------------------*)
9370 (* parse a "=" together with surrounding white space. Cf. 28: *)
9371 (* *)
9372 (* [25] Eq ::= S? '=' S? *)
9373 (*--------------------------------------------------------------------*)
9374 (* Raises: *)
9375 (* SyntaxError if no "=" is found. *)
9376 (*--------------------------------------------------------------------*)
9377 (* Return type: Char * AppData * State *)
9378 (*--------------------------------------------------------------------*)
9379 fun skipEq caq =
9380 let val (c1,a1,q1) = skipSopt caq
9381 in if c1=0wx3D then skipSopt (getChar (a1,q1))
9382 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expEq,[c1])))
9383 in raise SyntaxError(c1,a2,q1)
9384 end
9385 end
9386 fun parseEq caq =
9387 let val (cs1,(c1,a1,q1)) = parseSopt nil caq
9388 in if c1=0wx3D
9389 then let val (cs2,caq2)= parseSopt (c1::cs1) (getChar (a1,q1))
9390 in (rev cs2,caq2)
9391 end
9392 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expEq,[c1])))
9393 in raise SyntaxError(c1,a2,q1)
9394 end
9395 end
9396
9397 (*--------------------------------------------------------------------*)
9398 (* parse a comment, the initial "<--" already consumed. cf. 2.5: *)
9399 (* *)
9400 (* They are not part of the document's character data; an XML *)
9401 (* processor may, but need not, make it possible for an application *)
9402 (* to retrieve the text of comments. For compatibility, the string *)
9403 (* "--" (double-hyphen) must not occur within comments. *)
9404 (* *)
9405 (* [15] Comment ::= '<!--' ( (Char - '-') *)
9406 (* | ('-' (Char - '-')))* '-->' *)
9407 (*--------------------------------------------------------------------*)
9408 (* print an error and end the comment if an entity end is found. *)
9409 (* print an error if the comment contains "--". *)
9410 (*--------------------------------------------------------------------*)
9411 (* add the comment to the user data. *)
9412 (*--------------------------------------------------------------------*)
9413 (* Return type: Char * AppData * State *)
9414 (*--------------------------------------------------------------------*)
9415 fun parseComment startPos aq =
9416 let
9417 fun check_end yet (a0,q0) =
9418 let val (c,a,q) = getChar (a0,q0)
9419 in if c=0wx2D (* #"-" *)
9420 then let val (c1,a1,q1) = getChar (a,q)
9421 in if c1=0wx3E (* #">" *)
9422 then let val cs = Data2Vector(rev yet)
9423 val a2 = hookComment(a1,((startPos,getPos q1),cs))
9424 in getChar(a2,q1)
9425 end
9426 else let val a2 = if not (!O_COMPATIBILITY) then a1
9427 else hookError(a1,(getPos q0,ERR_FORBIDDEN_HERE
9428 (IT_DATA [c,c],LOC_COMMENT)))
9429 in doit (c::c::yet) (c1,a2,q1)
9430 end
9431 end
9432 else doit (0wx2D::yet) (c,a,q)
9433 end
9434 and doit yet (c,a,q) =
9435 if c=0wx2D (* #"-" *) then check_end yet (a,q)
9436 else if c<>0wx00 then doit (c::yet) (getChar (a,q))
9437 else let val err = ERR_ENDED_BY_EE LOC_COMMENT
9438 val a1 = hookError(a,(getPos q,err))
9439 val cs = Data2Vector(rev yet)
9440 val a2 = hookComment(a1,((startPos,getPos q),cs))
9441 in (c,a2,q)
9442 end
9443 in doit nil (getChar aq)
9444 end
9445
9446 (*--------------------------------------------------------------------*)
9447 (* check whether a name matches "xml", disregarding case, cf. 2.6: *)
9448 (* *)
9449 (* [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) *)
9450 (* *)
9451 (* The target names "XML", "xml", and so on are reserved for *)
9452 (* standardization in this or future versions of this specification.*)
9453 (*--------------------------------------------------------------------*)
9454 (* print an error if it does match. *)
9455 (*--------------------------------------------------------------------*)
9456 (* Return type: AppData *)
9457 (*--------------------------------------------------------------------*)
9458 fun checkPiTarget (a,q) name =
9459 case name
9460 of [c1,c2,c3] => if ((c1=0wx58 orelse c1=0wx78) andalso
9461 (c2=0wx4D orelse c2=0wx6D) andalso
9462 (c3=0wx4C orelse c3=0wx6C))
9463 then hookError(a,(getPos q,ERR_RESERVED(name,IT_TARGET)))
9464 else a
9465 | _ => a
9466 (*--------------------------------------------------------------------*)
9467 (* parse a processing instruction, the initial "<?" and target *)
9468 (* already consumed. cf. 2.5: *)
9469 (* *)
9470 (* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char* )))? '?>'*)
9471 (* *)
9472 (* The first arg consists of the target and the (reversed) list of *)
9473 (* leading characters of the text that have been looked ahead. *)
9474 (*--------------------------------------------------------------------*)
9475 (* print an error and end the proc. instr. if an entity end is found. *)
9476 (*--------------------------------------------------------------------*)
9477 (* add the processing instruction to the user data. *)
9478 (*--------------------------------------------------------------------*)
9479 (* Return type: Char * AppData * State *)
9480 (*--------------------------------------------------------------------*)
9481 fun parseProcInstr' (startPos,target,txtPos,yetText) caq =
9482 let
9483 fun doit text (c1,a1,q1) =
9484 case c1
9485 of 0wx00 => let val a2 = hookError(a1,(getPos q1,ERR_ENDED_BY_EE LOC_PROC))
9486 in (text,getPos q1,(c1,a2,q1))
9487 end
9488 | 0wx3F => (* #"?" *)
9489 let val (c2,a2,q2) = getChar (a1,q1)
9490 in case c2
9491 of 0wx3E => (* #">" *) (text,getPos q2,getChar(a2,q2))
9492 | _ => doit (c1::text) (c2,a2,q2)
9493 end
9494 | _ => doit (c1::text) (getChar (a1,q1))
9495
9496 val (cs,endPos,(c2,a2,q2)) = doit yetText caq
9497 val text = Data2Vector(rev cs)
9498 val a3 = hookProcInst(a2,((startPos,endPos),target,txtPos,text))
9499 in
9500 (c2,a3,q2)
9501 end
9502 (*--------------------------------------------------------------------*)
9503 (* parse a processing instruction, the initial "<?" already read. *)
9504 (* *)
9505 (* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char* )))? '?>'*)
9506 (*--------------------------------------------------------------------*)
9507 (* print an error and end the proc. instr. if an entity end is found. *)
9508 (* print an error if no target name is found. *)
9509 (* print an error if no whitespace follows the target. *)
9510 (*--------------------------------------------------------------------*)
9511 (* add the processing instruction to the user data. *)
9512 (*--------------------------------------------------------------------*)
9513 (* Return type: Char * AppData * State *)
9514 (*--------------------------------------------------------------------*)
9515 fun parseProcInstr startPos (a,q) =
9516 let
9517 (* NotFound is handled after the 'in .. end' *)
9518 val (target,(c1,a1,q1)) = parseName (getChar(a,q))
9519 val a1 = checkPiTarget (a1,q) target
9520 in
9521 case c1
9522 of 0wx00 =>
9523 let
9524 val a2 = hookError(a1,(getPos q1,ERR_ENDED_BY_EE LOC_PROC))
9525 val a3 = hookProcInst(a2,((startPos,getPos q1),target,getPos q1,nullVector))
9526 in (c1,a3,q1)
9527 end
9528 | 0wx3F => (* #"?" *)
9529 let val (c2,a2,q2) = getChar (a1,q1)
9530 in case c2
9531 of 0wx3E => (* #">" *)
9532 let val a3 = hookProcInst(a2,((startPos,getPos q2),target,
9533 getPos q1,nullVector))
9534 in getChar (a3,q2)
9535 end
9536 | _ => let val a3 = hookError(a2,(getPos q1,ERR_MISSING_WHITE))
9537 in parseProcInstr' (startPos,target,getPos q1,[c1]) (c2,a3,q2)
9538 end
9539 end
9540 | _ => let val (hadS,(c2,a2,q2)) = skipSmay (c1,a1,q1)
9541 val a3 = if hadS then a2
9542 else hookError(a2,(getPos q2,ERR_MISSING_WHITE))
9543 in parseProcInstr' (startPos,target,getPos q2,nil) (c2,a3,q2)
9544 end
9545 end
9546 handle NotFound(c,a,q) =>
9547 let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expATarget,[c])))
9548 in parseProcInstr' (startPos,nullData,getPos q,nil) (c,a1,q)
9549 end
9550 end
9551 (* stop of ../../Parser/Parse/parseMisc.sml *)
9552 (* start of ../../Parser/Parse/parseXml.sml *)
9553 signature ParseXml =
9554 sig
9555 (*----------------------------------------------------------------------
9556 include ParseBase
9557
9558 val parseName : UniChar.Char * AppData * State
9559 -> UniChar.Data * (UniChar.Char * AppData * State)
9560 val parseNmtoken : UniChar.Char * AppData * State
9561 -> UniChar.Data * (UniChar.Char * AppData * State)
9562 val parseNameLit : UniChar.Data -> UniChar.Char * AppData * State
9563 -> UniChar.Data * UniChar.Data * (UniChar.Char * AppData * State)
9564 val parseEntName : UniChar.Data * UniChar.Data -> UniChar.Char * AppData * State
9565 -> bool * UniChar.Data * UniChar.Data * (UniChar.Char * AppData * State)
9566
9567 val parseComment : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
9568 val parseProcInstr : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
9569 val skipS : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
9570 val skipSopt : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
9571 val skipSmay : UniChar.Char * AppData * State -> bool * (UniChar.Char * AppData * State)
9572 val parseSopt : UniChar.Data -> UniChar.Char * AppData * State
9573 -> UniChar.Data * (UniChar.Char * AppData * State)
9574 val parseSmay : UniChar.Data -> UniChar.Char * AppData * State
9575 -> bool * (UniChar.Data * (UniChar.Char * AppData * State))
9576 val parseEq : UniChar.Char * AppData * State
9577 -> UniChar.Data * (UniChar.Char * AppData * State)
9578 ----------------------------------------------------------------------*)
9579 include ParseMisc
9580
9581 val openDocument : Uri.Uri option -> AppData
9582 -> Encoding.Encoding * HookData.XmlDecl option * (UniChar.Char * AppData * State)
9583 val openSubset : Uri.Uri -> AppData
9584 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
9585 val openExtern : int * bool * Uri.Uri -> AppData * State
9586 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
9587 end
9588
9589 (*--------------------------------------------------------------------------*)
9590 (* Structure: ParseXml *)
9591 (* *)
9592 (* Exceptions raised by functions in this structure: *)
9593 (* openDocument : NoSuchFile *)
9594 (* openExtern : none *)
9595 (* openSubset : NoSuchFile *)
9596 (*--------------------------------------------------------------------------*)
9597 functor ParseXml (structure ParseBase : ParseBase)
9598 : ParseXml =
9599 struct
9600 structure ParseMisc = ParseMisc (structure ParseBase = ParseBase)
9601
9602 open
9603 Errors UniChar UniClasses UtilString
9604 ParseMisc
9605
9606 fun checkVersionNum (a,q) version =
9607 if not (!O_CHECK_VERSION) orelse version="1.0" then a
9608 else hookError(a,(getPos q,ERR_VERSION version))
9609
9610 (*--------------------------------------------------------------------*)
9611 (* parse a version number, the quote character ("'" or '"') passed as *)
9612 (* first argument. cf. 2.8: *)
9613 (* *)
9614 (* [24] VersionInfo ::= S 'version' Eq (' VersionNum ' *)
9615 (* | " VersionNum ") *)
9616 (* [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')+ *)
9617 (* *)
9618 (* print an error and end the literal if an entity end is found. *)
9619 (* print an error if a disallowed character is found. *)
9620 (* *)
9621 (* return the version number as a string option, together with the *)
9622 (* next character and state. *)
9623 (*--------------------------------------------------------------------*)
9624 (* might raise: none *)
9625 (*--------------------------------------------------------------------*)
9626 fun parseVersionNum quote aq =
9627 let
9628 fun doit text (c,a,q) =
9629 if c=quote then (text,getChar (a,q))
9630 else if isVers c then doit (c::text) (getChar (a,q))
9631 else if c=0wx0
9632 then let val a1 = hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_VERSION))
9633 in (text,(c,a1,q))
9634 end
9635 else let val err = ERR_FORBIDDEN_HERE(IT_CHAR c,LOC_VERSION)
9636 val a1 = hookError(a,(getPos q,err))
9637 in doit text (getChar (a1,q))
9638 end
9639
9640 val (c1,a1,q1) = getChar aq
9641
9642 val (text,(c2,a2,q2)) =
9643 if isVers c1 then doit [c1] (getChar (a1,q1))
9644 else if c1=quote
9645 then let val a2 = hookError(a1,(getPos q1,ERR_EMPTY LOC_VERSION))
9646 in (nil,getChar (a2,q1))
9647 end
9648 else if c1=0wx00
9649 then let val a2 = hookError(a1,(getPos q1,ERR_ENDED_BY_EE LOC_VERSION))
9650 val a3 = hookError(a2,(getPos q1,ERR_EMPTY LOC_VERSION))
9651 in (nil,(c1,a3,q1))
9652 end
9653 else let val err = ERR_FORBIDDEN_HERE(IT_CHAR c1,LOC_VERSION)
9654 val a2 = hookError(a1,(getPos q1,err))
9655 in doit nil (getChar (a2,q1))
9656 end
9657 val version = Latin2String (rev text)
9658 val a3 = checkVersionNum (a2,q1) version
9659 in
9660 (SOME version,(c2,a3,q2))
9661 end
9662 (*--------------------------------------------------------------------*)
9663 (* parse a version info starting after 'version'. Cf. 2.8: *)
9664 (* *)
9665 (* [24] VersionInfo ::= S 'version' Eq (' VersionNum ' *)
9666 (* | " VersionNum ") *)
9667 (* *)
9668 (* print an error and raise SyntaxState if no '=' is found. *)
9669 (* print an error and raise SyntaxState if no quote sign is found. *)
9670 (* *)
9671 (* return the version number as a string option, together with the *)
9672 (* next char and the remaining state. *)
9673 (*--------------------------------------------------------------------*)
9674 (* might raise: SyntaxState *)
9675 (*--------------------------------------------------------------------*)
9676 fun parseVersionInfo caq =
9677 let val (c1,a1,q1) = skipEq caq
9678 in case c1
9679 of 0wx22 (* '""' *) => parseVersionNum c1 (a1,q1)
9680 | 0wx27 (* "'" *) => parseVersionNum c1 (a1,q1)
9681 | _ => let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expLitQuote,[c1])))
9682 in raise SyntaxError(c1,a2,q1)
9683 end
9684 end
9685
9686 (*--------------------------------------------------------------------*)
9687 (* parse an encoding name, the quote character ("'" or '"') passed as *)
9688 (* first argument. cf. 4.3.3: *)
9689 (* *)
9690 (* [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' *)
9691 (* |"'" EncName "'") *)
9692 (* *)
9693 (* [81] EncName ::= [A-Za-z] /* Encoding name *)
9694 (* ([A-Za-z0-9._] | '-')* contains only Latin *)
9695 (* characters */ *)
9696 (* *)
9697 (* print an error and end the literal if an entity end is found. *)
9698 (* print an error if a disallowed character is found. *)
9699 (* *)
9700 (* return the encoding name as a string option, together with the *)
9701 (* next character and state. *)
9702 (*--------------------------------------------------------------------*)
9703 (* might raise: none *)
9704 (*--------------------------------------------------------------------*)
9705 fun parseEncName quote aq =
9706 let
9707 fun doit text (c,a,q) =
9708 if c=quote then (text,getChar (a,q))
9709 else if isEnc c then doit (c::text) (getChar (a,q))
9710 else if c=0wx00
9711 then let val a1 = hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_ENCODING))
9712 in (text,(c,a1,q))
9713 end
9714 else let val err = ERR_FORBIDDEN_HERE(IT_CHAR c,LOC_ENCODING)
9715 val a1 = hookError(a,(getPos q,err))
9716 in doit text (getChar (a,q))
9717 end
9718
9719 val (c1,a1,q1) = getChar aq
9720
9721 val (text,caq2) =
9722 if isEncS c1 then doit [c1] (getChar (a1,q1))
9723 else if c1=quote
9724 then let val a2 = hookError(a1,(getPos q1,ERR_EMPTY LOC_ENCODING))
9725 in (nil,getChar (a2,q1))
9726 end
9727 else if c1=0wx00
9728 then let val a2 = hookError(a1,(getPos q1,ERR_ENDED_BY_EE LOC_ENCODING))
9729 val a3 = hookError(a2,(getPos q1,ERR_EMPTY LOC_ENCODING))
9730 in (nil,(c1,a3,q1))
9731 end
9732 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expStartEnc,[c1])))
9733 in doit nil (getChar (a2,q1))
9734 end
9735
9736 val enc = toUpperString (Latin2String (rev text))
9737 in
9738 (enc,caq2)
9739 end
9740 (*--------------------------------------------------------------------*)
9741 (* parse an encoding decl starting after 'encoding'. Cf. 4.3.3: *)
9742 (* *)
9743 (* *)
9744 (* [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' *)
9745 (* |"'" EncName "'") *)
9746 (* *)
9747 (* print an error and raise SyntaxState if no '=' is found. *)
9748 (* print an error and raise SyntaxState if no quote sign is found. *)
9749 (* *)
9750 (* return the encoding name as a string option, together with the *)
9751 (* next char and the remaining state. *)
9752 (*--------------------------------------------------------------------*)
9753 (* might raise: SyntaxState *)
9754 (*--------------------------------------------------------------------*)
9755 fun parseEncodingDecl caq =
9756 let val (c1,a1,q1) = skipEq caq
9757 in case c1
9758 of 0wx22 (* '""' *) => parseEncName c1 (a1,q1)
9759 | 0wx27 (* "'" *) => parseEncName c1 (a1,q1)
9760 | _ => let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expLitQuote,[c1])))
9761 in raise SyntaxError(c1,a2,q1)
9762 end
9763 end
9764
9765 (*--------------------------------------------------------------------*)
9766 (* parse a standalone declaration starting after 'standalone'. *)
9767 (* Cf. 2.9: *)
9768 (* *)
9769 (* [32] SDDecl ::= S 'standalone' Eq [ VC: Standalone *)
9770 (* ( ("'" ('yes' | 'no') "'") Document *)
9771 (* | ('"' ('yes' | 'no') '"')) Declaration ] *)
9772 (* *)
9773 (* print an error and raise SyntaxState if no '=' is found. *)
9774 (* print an error and raise SyntaxState if no literal is found. *)
9775 (* print an error and end the literal if an entity end is found. *)
9776 (* print an error if the literal is neither 'yes' nor 'no'. *)
9777 (* *)
9778 (* return the standalone status as a boolean option, together with *)
9779 (* the next character and the remaining state. *)
9780 (*--------------------------------------------------------------------*)
9781 (* might raise: SyntaxState *)
9782 (*--------------------------------------------------------------------*)
9783 fun parseStandaloneDecl caq0 =
9784 let
9785 val (quote,a,q) = skipEq caq0
9786
9787 fun doit text (c,a,q) =
9788 if c=quote then (text,getChar (a,q))
9789 else if c<>0wx0 then doit (c::text) (getChar (a,q))
9790 else let val a1 = hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_LITERAL))
9791 in (text,(c,a1,q))
9792 end
9793
9794 val caq1 as (_,_,q1) =
9795 case quote
9796 of 0wx22 (* '""' *) => (getChar (a,q))
9797 | 0wx27 (* "'" *) => (getChar (a,q))
9798 | _ => let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expLitQuote,[quote])))
9799 in raise SyntaxError(quote,a1,q)
9800 end
9801 val (text,caq2) = doit nil caq1
9802 in
9803 case text
9804 of [0wx73,0wx65,0wx79] (* reversed "yes" *) => (SOME true,caq2)
9805 | [0wx6f,0wx6e] (* reversed "no" *) => (SOME false,caq2)
9806 | revd => let val (c2,a2,q2) = caq2
9807 val a3 = hookError(a2,(getPos q1,ERR_EXPECTED(expNoYes,revd)))
9808 in (NONE,(c2,a3,q2))
9809 end
9810 end
9811
9812 (*--------------------------------------------------------------------*)
9813 (* parse an xml declaration starting after 'xml ' (i.e. the first *)
9814 (* white space character is already consumed). Cf. 2.8: *)
9815 (* *)
9816 (* [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S?'?>'*)
9817 (* *)
9818 (* [24] VersionInfo ::= S 'version' Eq (' VersionNum ' *)
9819 (* | " VersionNum ") *)
9820 (* *)
9821 (* [32] SDDecl ::= S 'standalone' Eq [ VC: Standalone *)
9822 (* ( ("'" ('yes' | 'no') "'") Document *)
9823 (* | ('"' ('yes' | 'no') '"')) Declaration ] *)
9824 (* *)
9825 (* [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' *)
9826 (* |"'" EncName "'") *)
9827 (* *)
9828 (* default version, encoding and standalone status to NONE. *)
9829 (* *)
9830 (* print an error if no leading white space is found. *)
9831 (* print an error whenever a wrong name is encountered. *)
9832 (* print an Error if no VersionInfo is found. *)
9833 (* print an Error if no '?>' is found at the end. *)
9834 (* print an error and raise SyntaxState if no '=' or no literal is *)
9835 (* found in VersionInfo, EncodingDecl or SDDecl. *)
9836 (* print an error if a literal does not have a correct value. *)
9837 (* *)
9838 (* return the corresponding XmlDecl option and the next char & state. *)
9839 (*--------------------------------------------------------------------*)
9840 (* might raise: SyntaxState *)
9841 (*--------------------------------------------------------------------*)
9842 fun parseXmlDecl auto caq =
9843 let
9844 (*-----------------------------------------------------------------*)
9845 (* skip the '?>' at the end of the xml declaration. *)
9846 (* *)
9847 (* print an error and raise SyntaxState if no '?>' is found. *)
9848 (* *)
9849 (* return the info passed as first arg, and the next char & state. *)
9850 (*-----------------------------------------------------------------*)
9851 (* might raise: SyntaxState *)
9852 (*-----------------------------------------------------------------*)
9853 fun skipXmlDeclEnd enc res (c,a,q) =
9854 if c=0wx3F (* "#?" *)
9855 then let val (c1,a1,q1) = getChar (a,q)
9856 in if c1=0wx3E (* #">" *) then (enc,SOME res,getChar (a1,q1))
9857 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expGt,[c1])))
9858 in raise SyntaxError (c1,a2,q1)
9859 end
9860 end
9861 else let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expProcEnd,[c])))
9862 in raise SyntaxError (c,a1,q)
9863 end
9864 (*-----------------------------------------------------------------*)
9865 (* parse the remainder after the keyword 'standalone', the version *)
9866 (* and encoding already parsed and given in the first arg. *)
9867 (* *)
9868 (* pass the version,encoding and sd status to skipXmlDeclEnd *)
9869 (*-----------------------------------------------------------------*)
9870 (* might raise: SyntaxState *)
9871 (*-----------------------------------------------------------------*)
9872 fun parseXmlDeclAfterS enc (v,e) caq =
9873 let
9874 val (alone,caq1) = parseStandaloneDecl caq
9875 val caq2 = skipSopt caq1
9876 in skipXmlDeclEnd enc (v,e,alone) caq2
9877 end
9878 (*-----------------------------------------------------------------*)
9879 (* parse the remainder after the encoding declaration, the version *)
9880 (* and encoding already parsed and given in the first arg. *)
9881 (* *)
9882 (* print an error if a name other than 'standalone' is found. *)
9883 (* *)
9884 (* pass the version and encoding to parseXmlDeclAfterS. *)
9885 (*-----------------------------------------------------------------*)
9886 (* might raise: SyntaxState *)
9887 (*-----------------------------------------------------------------*)
9888 fun parseXmlDeclBeforeS enc (v,e) caq =
9889 let
9890 val (hadS,caq1 as (_,_,q1)) = skipSmay caq
9891 val (name,(c2,a2,q2)) = parseName caq1 (* NotFound handled below *)
9892 val a3 = if hadS then a2
9893 else hookError(a2,(getPos q1,ERR_MISSING_WHITE))
9894 in case name
9895 of [0wx73,0wx74,0wx61,0wx6e,0wx64,0wx61,0wx6c,0wx6f,0wx6e,0wx65] =>
9896 (* "standalone" *) parseXmlDeclAfterS enc (v,e) (c2,a3,q2)
9897 | _ => let val a4 = hookError(a3,(getPos q1,ERR_EXPECTED(expStandOpt,name)))
9898 in parseXmlDeclAfterS enc (v,e) (c2,a4,q2)
9899 end
9900 end
9901 handle NotFound caq => (* exception raised by parseName *)
9902 skipXmlDeclEnd enc (v,e,NONE) caq
9903 (*-----------------------------------------------------------------*)
9904 (* parse the remainder after the keyword 'encoding', the version *)
9905 (* already parsed and given in the first arg. *)
9906 (* *)
9907 (* pass the version and encoding and to parseXmlDeclBeforeS *)
9908 (*-----------------------------------------------------------------*)
9909 (* might raise: SyntaxState *)
9910 (*-----------------------------------------------------------------*)
9911 fun parseXmlDeclAfterE ver caq =
9912 let
9913 val (enc,(c1,a1,q1)) = parseEncodingDecl caq
9914 val (a2,q2,enc1) = changeAuto(a1,q1,enc)
9915 in
9916 parseXmlDeclBeforeS enc1 (ver,SOME enc) (c1,a2,q2)
9917 end
9918 (*-----------------------------------------------------------------*)
9919 (* parse the remainder after the version info, the version already *)
9920 (* parsed and given in the first arg. *)
9921 (* *)
9922 (* print an error if a name other than 'encoding' or 'standalone' *)
9923 (* is found. *)
9924 (* *)
9925 (* pass obtained/default values to parseXmlDeclAfter[E|S] or to *)
9926 (* skipXmlDeclEnd. *)
9927 (*-----------------------------------------------------------------*)
9928 (* might raise: SyntaxState *)
9929 (*-----------------------------------------------------------------*)
9930 fun parseXmlDeclBeforeE ver caq =
9931 let
9932 val (hadS,caq1 as (_,_,q1)) = skipSmay caq
9933 val (name,(c2,a2,q2)) = parseName caq1 (* NotFound handled below *)
9934 val a3 = if hadS then a2
9935 else hookError(a2,(getPos q1,ERR_MISSING_WHITE))
9936 in
9937 case name
9938 of [0wx65,0wx6e,0wx63,0wx6f,0wx64,0wx69,0wx6e,0wx67] =>
9939 (* "encoding" *) parseXmlDeclAfterE ver (c2,a3,q2)
9940 | [0wx73,0wx74,0wx61,0wx6e,0wx64,0wx61,0wx6c,0wx6f,0wx6e,0wx65] =>
9941 (* "standalone" *) parseXmlDeclAfterS auto (ver,NONE) (c2,a3,q2)
9942 | _ => let val a4 = hookError(a3,(getPos q1,ERR_EXPECTED(expEncStand,name)))
9943 in parseXmlDeclAfterE ver (c2,a4,q2)
9944 end
9945 end
9946 handle NotFound caq => (* exception raised by parseName *)
9947 skipXmlDeclEnd auto (ver,NONE,NONE) caq
9948
9949 (*-----------------------------------------------------------------*)
9950 (* do the main work. if the first name is not 'version' then it *)
9951 (* might be 'encoding' or 'standalone'. Then take the default *)
9952 (* NONE for version and - if needed - encoding and call the *)
9953 (* appropriate function. otherwise assume a typo and parse the *)
9954 (* version number, then call parseXmlDeclBeforeE. if no name is *)
9955 (* found at all, proceed with skipXmlDeclEnd. *)
9956 (* *)
9957 (* print an error and raise SyntaxState if an entity end is found. *)
9958 (* print an error and raise SyntaxState if appropriate. *)
9959 (* print an error if a name other than 'version' is found. *)
9960 (*-----------------------------------------------------------------*)
9961 (* might raise: SyntaxState *)
9962 (*-----------------------------------------------------------------*)
9963 val caq1 as (_,_,q1) = skipSopt caq
9964 val (name,(caq2 as (c2,a2,q2))) = parseName caq1
9965 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expVersion,[c])
9966 val a1 = hookError(a,(getPos q,err))
9967 in raise SyntaxError (c,a1,q)
9968 end
9969 in
9970 if name=[0wx76,0wx65,0wx72,0wx73,0wx69,0wx6f,0wx6e] (* "version" *)
9971 then let val (ver,caq3) = parseVersionInfo caq2
9972 in parseXmlDeclBeforeE ver caq3
9973 end
9974 else let val a3 = hookError(a2,(getPos q1,ERR_EXPECTED(expVersion,name)))
9975 in case name
9976 of [0wx65,0wx6e,0wx63,0wx6f,0wx64,0wx69,0wx6e,0wx67] =>
9977 (* "encoding" *) parseXmlDeclAfterE NONE (c2,a3,q2)
9978 | [0wx73,0wx74,0wx61,0wx6e,0wx64,0wx61,0wx6c,0wx6f,0wx6e,0wx65] =>
9979 (* "standalone" *) parseXmlDeclAfterS auto (NONE,NONE) (c2,a3,q2)
9980 | _ => let val (ver,caq3) = parseVersionInfo (c2,a3,q2)
9981 in parseXmlDeclBeforeE ver caq3
9982 end
9983 end
9984 end
9985 (*----------------------------------------------------------------*)
9986 (* catch entity end exceptions raised by subfunctions, print an *)
9987 (* error and re-raise the exception. *)
9988 (*----------------------------------------------------------------*)
9989 handle SyntaxError(c,a,q) =>
9990 let val err = if c=0wx0 then ERR_ENDED_BY_EE LOC_XML_DECL
9991 else ERR_CANT_PARSE LOC_XML_DECL
9992 val a1 = hookError(a,(getPos q,err))
9993 in (auto,NONE,recoverXml(c,a1,q))
9994 end
9995
9996 (*--------------------------------------------------------------------*)
9997 (* parse a text declaration starting after 'xml ' (i.e. the first *)
9998 (* white space character is already consumed). Cf. 2.8: *)
9999 (* *)
10000 (* [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' *)
10001 (* *)
10002 (* [24] VersionInfo ::= S 'version' Eq (' VersionNum ' *)
10003 (* | " VersionNum ") *)
10004 (* *)
10005 (* [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' *)
10006 (* |"'" EncName "'") *)
10007 (* *)
10008 (* default version and encoding to NONE. *)
10009 (* *)
10010 (* print an error if no leading white space is found. *)
10011 (* print an error whenever a wrong name is encountered. *)
10012 (* print an Error if no EncodingDecl is found. *)
10013 (* print an Error if '?>' is found at the end. *)
10014 (* print an error and raise SyntaxState if no '=' or no literal is *)
10015 (* found in VersionInfo or EncodingDecl. *)
10016 (* print an error if a literal does not have a correct value. *)
10017 (* *)
10018 (* return the corresponding TextDecl option and the next char & state.*)
10019 (*--------------------------------------------------------------------*)
10020 (* might raise: SyntaxState *)
10021 (*--------------------------------------------------------------------*)
10022 fun parseTextDecl auto caq =
10023 let
10024 (*-----------------------------------------------------------------*)
10025 (* skip the '?>' at the end of the text declaration. *)
10026 (* *)
10027 (* print an error and raise SyntaxState if no '?>' is found. *)
10028 (* *)
10029 (* return the info passed as first arg, and the next char & state. *)
10030 (*-----------------------------------------------------------------*)
10031 (* might raise: SyntaxState *)
10032 (*-----------------------------------------------------------------*)
10033 fun skipTextDeclEnd enc res (c,a,q) =
10034 if c=0wx3F (* "#?" *)
10035 then let val (c1,a1,q1) = getChar (a,q)
10036 in if c1=0wx3E (* #">" *) then (enc,SOME res,getChar (a1,q1))
10037 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expGt,[c1])))
10038 in raise SyntaxError(c1,a2,q1)
10039 end
10040 end
10041 else let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expProcEnd,[c])))
10042 in raise SyntaxError(c,a1,q)
10043 end
10044 (*-----------------------------------------------------------------*)
10045 (* parse the remainder after the keyword 'encoding', the version *)
10046 (* already parsed and given in the first arg. *)
10047 (* *)
10048 (* pass the version and encoding and to skipTextDeclEnd. *)
10049 (*-----------------------------------------------------------------*)
10050 (* might raise: SyntaxState *)
10051 (*-----------------------------------------------------------------*)
10052 fun parseTextDeclAfterE ver caq =
10053 let
10054 val (enc,(c1,a1,q1)) = parseEncodingDecl caq
10055 val (a2,q2,enc1) = changeAuto(a1,q1,enc)
10056 val caq3 = skipSopt (c1,a2,q2)
10057 in skipTextDeclEnd enc1 (ver,SOME enc) caq3
10058 end
10059 (*-----------------------------------------------------------------*)
10060 (* parse the remainder after the version info, the version given *)
10061 (* as first argument. *)
10062 (* *)
10063 (* print an error and raise SyntaxState is no name is found. *)
10064 (* print an error if a name other than 'encoding' is found. *)
10065 (* *)
10066 (* pass obtained/default values to parseTextDeclAfterE. *)
10067 (*-----------------------------------------------------------------*)
10068 (* might raise: SyntaxState *)
10069 (*-----------------------------------------------------------------*)
10070 fun parseTextDeclBeforeE ver caq =
10071 let
10072 val caq1 as (_,_,q1) = skipS caq
10073 val (name,caq2) = parseName caq1
10074 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expEncoding,[c])
10075 val a1 = hookError(a,(getPos q,err))
10076 in raise SyntaxError (c,a1,q)
10077 end
10078 in
10079 if name=[0wx65,0wx6e,0wx63,0wx6f,0wx64,0wx69,0wx6e,0wx67] (* "encoding" *)
10080 then parseTextDeclAfterE ver caq2
10081 else let val (c2,a2,q2) = caq2
10082 val a3 = hookError(a2,(getPos q1,ERR_EXPECTED(expEncoding,name)))
10083 in parseTextDeclAfterE ver (c2,a3,q2)
10084 end
10085 end
10086 (*-----------------------------------------------------------------*)
10087 (* do the main work. if the first name is neither 'version' nor *)
10088 (* 'encoding' then assume typo of 'version'. Then parse the *)
10089 (* version number, call parseTextDeclBeforeE. if no name is found *)
10090 (* at all, proceed with skipTextDeclEnd. *)
10091 (* *)
10092 (* print an error and raise SyntaxState if appropriate. *)
10093 (* print an error if a name other than 'version' or 'encoding' is *)
10094 (* found. *)
10095 (*-----------------------------------------------------------------*)
10096 (* might raise: SyntaxState *)
10097 (*-----------------------------------------------------------------*)
10098 val caq1 as (_,_,q1) = skipSopt caq
10099 val (name,caq2) = parseName caq1
10100 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expEncVers,[c])
10101 val a1 = hookError(a,(getPos q,err))
10102 in raise SyntaxError(c,a1,q)
10103 end
10104 in case name
10105 of [0wx76,0wx65,0wx72,0wx73,0wx69,0wx6f,0wx6e] => (* "version" *)
10106 let val (ver,caq3) = parseVersionInfo caq2
10107 in parseTextDeclBeforeE ver caq3
10108 end
10109 | [0wx65,0wx6e,0wx63,0wx6f,0wx64,0wx69,0wx6e,0wx67] => (* "encoding" *)
10110 parseTextDeclAfterE NONE caq2
10111 | _ => let val (c2,a2,q2) = caq2
10112 val a3 = hookError(a2,(getPos q1,ERR_EXPECTED(expEncVers,name)))
10113 val (ver,caq3) = parseVersionInfo (c2,a3,q2)
10114 in parseTextDeclBeforeE ver caq3
10115 end
10116 end
10117 (*----------------------------------------------------------------*)
10118 (* catch entity end exceptions raised by subfunctions, print an *)
10119 (* error and re-raise the exception. *)
10120 (*----------------------------------------------------------------*)
10121 handle SyntaxError(c,a,q) =>
10122 let val err = if c=0wx0 then ERR_ENDED_BY_EE LOC_TEXT_DECL
10123 else ERR_CANT_PARSE LOC_TEXT_DECL
10124 val a1 = hookError(a,(getPos q,err))
10125 in (auto,NONE,recoverXml(c,a1,q))
10126 end
10127
10128 (*--------------------------------------------------------------------*)
10129 (* check for the string "<?xml" followed by a white space. The first *)
10130 (* paramter seen is a prefix of that string already consued. If the *)
10131 (* complete string is not found, unget all characters seen, including *)
10132 (* those from parameter seen. *)
10133 (* *)
10134 (* return a boolean indicating wheher the string was found, together *)
10135 (* with the remaining app data and state. *)
10136 (*--------------------------------------------------------------------*)
10137 (* might raise: none *)
10138 (*--------------------------------------------------------------------*)
10139 fun checkForXml aq =
10140 let
10141 val unseen = [0wx3c,0wx3f,0wx78,0wx6d,0wx6c]
10142 fun doit (seen,unseen) (a,q) =
10143 let val (c1,a1,q1) = getChar (a,q)
10144 in case unseen
10145 of nil => if isS c1 then (true,(a1,q1))
10146 else (false,(a1,ungetChars(q1,rev(c1::seen))))
10147 | c::cs => if c1=c then doit (c1::seen,cs) (a1,q1)
10148 else (false,(a1,ungetChars(q1,rev(c1::seen))))
10149 end
10150 in doit (nil,unseen) aq
10151 end
10152
10153 (*--------------------------------------------------------------------*)
10154 (* consume the text/xml declaration. The first parameter is a pair of *)
10155 (* the function that parses the declaration and a boolean indicating *)
10156 (* whether a warning should we produced if the declaration is missing.*)
10157 (* The second parameter is a pair (seen,auto), where auto is the *)
10158 (* auto-detected encoding, and seen is SOME cs, if auto-detection *)
10159 (* found some initial characters cs of the string "<?xml", otherwise *)
10160 (* NONE. A text/xml declaration can only be present if seen is SOME. *)
10161 (* *)
10162 (* Check whether the declaration is present with checkForXml. If yes, *)
10163 (* parse it, if no, possibly print a warning. *)
10164 (* *)
10165 (* Return the encoding of the entity, the optional declaration and *)
10166 (* the next char, app data and state. *)
10167 (*--------------------------------------------------------------------*)
10168 fun findTextDecl (parseDecl,warn) auto aq =
10169 let val (hasXml,aq1) = checkForXml aq
10170 in if hasXml then parseDecl auto (getChar aq1)
10171 else let val (a1,q1) = aq1
10172 val (a2,q2) = commitAuto(a1,q1)
10173 val a3 = if warn then hookWarning(a2,(getPos q2,WARN_NO_XML_DECL)) else a2
10174 in (auto,NONE,getChar(a3,q2))
10175 end
10176 end
10177
10178 (*--------------------------------------------------------------------*)
10179 (* open an external entity; consume its text declaration if present. *)
10180 (* See 4.3.2: *)
10181 (* *)
10182 (* [78] extParsedEnt ::= TextDecl? content *)
10183 (* [79] extPE ::= TextDecl? extSubsetDecl *)
10184 (* *)
10185 (* handle NoSuchFile by printing an error and opening an empty dummy *)
10186 (* entity (some functions might rely on the entity's entity end). *)
10187 (* *)
10188 (* return the optional text declaration and the resulting first char *)
10189 (* together with the new state. *)
10190 (*--------------------------------------------------------------------*)
10191 (* might raise: none *)
10192 (*--------------------------------------------------------------------*)
10193 fun openExtern (id,isParam,uri) (a,q) =
10194 let val (q1,auto) = pushExtern (q,id,isParam,uri)
10195 in findTextDecl (parseTextDecl,false) auto (a,q1)
10196 end
10197 handle NoSuchFile fmsg => raise CantOpenFile(fmsg,a)
10198
10199 (*--------------------------------------------------------------------*)
10200 (* open the external subset; consume its text declaration if present. *)
10201 (* See 2.8: *)
10202 (* *)
10203 (* [30] extSubset ::= TextDecl? extSubsetDecl *)
10204 (* *)
10205 (* return the optional text declaration and the first char and state. *)
10206 (*--------------------------------------------------------------------*)
10207 (* might raise: NoSuchFile *)
10208 (*--------------------------------------------------------------------*)
10209 fun openSubset uri a =
10210 let val (q,auto) = pushSpecial (EXT_SUBSET,SOME uri)
10211 in findTextDecl (parseTextDecl,false) auto (a,q)
10212 end
10213 handle NoSuchFile fmsg => raise CantOpenFile(fmsg,a)
10214
10215 (*--------------------------------------------------------------------*)
10216 (* open the document entity; consume its xml declaration if present. *)
10217 (* See 2.8: *)
10218 (* *)
10219 (* [1] document ::= prolog element Misc* *)
10220 (* [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc* )? *)
10221 (* *)
10222 (* return the optional xml declaration and the first char and state. *)
10223 (*--------------------------------------------------------------------*)
10224 (* might raise: NoSuchFile *)
10225 (*--------------------------------------------------------------------*)
10226 fun openDocument uri a =
10227 let val (q,auto) = pushSpecial (DOC_ENTITY,uri)
10228 in findTextDecl (parseXmlDecl,!O_WARN_XML_DECL) auto (a,q)
10229 end
10230 handle NoSuchFile fmsg => raise CantOpenFile(fmsg,a)
10231 end
10232 (* stop of ../../Parser/Parse/parseXml.sml *)
10233 (* start of ../../Parser/Parse/parseRefs.sml *)
10234 signature ParseRefs =
10235 sig
10236 (*----------------------------------------------------------------------
10237 include ParseBase
10238
10239 val parseName : UniChar.Char * AppData * State
10240 -> UniChar.Data * (UniChar.Char * AppData * State)
10241 val parseNmtoken : UniChar.Char * AppData * State
10242 -> UniChar.Data * (UniChar.Char * AppData * State)
10243 val parseEntName : UniChar.Data * UniChar.Data -> UniChar.Char * AppData * State
10244 -> bool * UniChar.Data * UniChar.Data * (UniChar.Char * AppData * State)
10245
10246 val parseComment : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
10247 val parseProcInstr : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
10248 val skipS : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
10249 val skipSopt : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
10250 val skipSmay : UniChar.Char * AppData * State -> bool * (UniChar.Char * AppData * State)
10251 val parseSopt : UniChar.Data -> UniChar.Char * AppData * State
10252 -> UniChar.Data * (UniChar.Char * AppData * State)
10253 val parseSmay : UniChar.Data -> UniChar.Char * AppData * State
10254 -> bool * (UniChar.Data * (UniChar.Char * AppData * State))
10255 val parseEq : UniChar.Char * AppData * State
10256 -> UniChar.Data * (UniChar.Char * AppData * State)
10257
10258 val openExtern : int * Uri.Uri -> AppData * State
10259 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
10260 val openDocument : Uri.Uri option -> AppData
10261 -> Encoding.Encoding * HookData.XmlDecl option * (UniChar.Char * AppData * State)
10262 val openSubset : Uri.Uri -> AppData
10263 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
10264 ----------------------------------------------------------------------*)
10265 include ParseXml
10266
10267 val parseCharRef : AppData * State -> UniChar.Char * AppData * State
10268 val parseGenRef : Dtd -> UniChar.Char * AppData * State
10269 -> (int * Base.GenEntity) * (AppData * State)
10270 val parseParRef : Dtd -> UniChar.Char * AppData * State
10271 -> (int * Base.ParEntity) * (AppData * State)
10272
10273 val parseCharRefLit : UniChar.Data -> AppData * State
10274 -> UniChar.Data * (UniChar.Char * AppData * State)
10275 val parseGenRefLit : Dtd -> UniChar.Data -> UniChar.Char * AppData * State
10276 -> UniChar.Data * ((int * Base.GenEntity) * (AppData * State))
10277 val parseParRefLit : Dtd -> UniChar.Data -> UniChar.Char * AppData * State
10278 -> UniChar.Data * ((int * Base.ParEntity) * (AppData * State))
10279
10280 val skipCharRef : AppData * State -> (UniChar.Char * AppData * State)
10281 val skipReference : UniChar.Char * AppData * State -> (UniChar.Char * AppData * State)
10282
10283 val skipPS : Dtd -> UniChar.Char * AppData * State
10284 -> UniChar.Char * AppData * State
10285 val skipPSopt : Dtd -> UniChar.Char * AppData * State
10286 -> UniChar.Char * AppData * State
10287 val skipPSmay : Dtd -> UniChar.Char * AppData * State
10288 -> bool * (UniChar.Char * AppData * State)
10289 val skipPSdec : Dtd -> UniChar.Char * AppData * State
10290 -> bool * (UniChar.Char * AppData * State)
10291 end
10292
10293 (*--------------------------------------------------------------------------*)
10294 (* Structure: ParseRefs *)
10295 (* *)
10296 (* Exceptions raised by functions in this structure: *)
10297 (* parseCharRef : NoSuchChar SyntaxError *)
10298 (* parseGenRef : NoSuchEntity SyntaxState *)
10299 (* parseParRef : NoSuchEntity SyntaxState *)
10300 (* skipCharRef : none *)
10301 (* skipPS : none *)
10302 (* skipPSdec : none *)
10303 (* skipPSmay : none *)
10304 (* skipPSopt : none *)
10305 (* skipReference : none *)
10306 (*--------------------------------------------------------------------------*)
10307 functor ParseRefs (structure ParseBase : ParseBase)
10308 : ParseRefs =
10309 struct
10310 structure ParseXml = ParseXml (structure ParseBase = ParseBase)
10311
10312 open
10313 Base Errors UniClasses
10314 ParseXml
10315
10316 (*--------------------------------------------------------------------*)
10317 (* parse a character reference, the "&#" already read. See 4.1: *)
10318 (* *)
10319 (* [66] CharRef ::= '&#' [0-9]+ ';' *)
10320 (* | '&#x' [0-9a-fA-F]+ ';' [ WFC: Legal Character ] *)
10321 (* *)
10322 (* Well-Formedness Constraint: Legal Character *)
10323 (* Characters referred to using character references must match the *)
10324 (* production for Char. *)
10325 (* *)
10326 (* If the character reference begins with "&#x", the digits and *)
10327 (* letters up to the terminating ; provide a hexadecimal *)
10328 (* representation of the character's code point in ISO/IEC 10646. *)
10329 (* If it begins just with "&#", the digits up to the terminating ; *)
10330 (* provide a decimal representation of the character's code point. *)
10331 (* *)
10332 (* raise SyntaxError if no number or x plus hexnum is found, or if no *)
10333 (* semicolon follows it. *)
10334 (* raise NoSuchChar if the reference is to a non-XML character. *)
10335 (* *)
10336 (* return the character referred to, and the remaining state. *)
10337 (*--------------------------------------------------------------------*)
10338 fun parseCharRef aq =
10339 let
10340 (*--------------------------------------------------------------*)
10341 (* parse a (hexa)decimal number, accumulating the value in the *)
10342 (* first parameter. *)
10343 (* *)
10344 (* return the numbers value as a Char. *)
10345 (*--------------------------------------------------------------*)
10346 fun do_hex_n yet (c,a,q) =
10347 case hexValue c
10348 of NONE => (yet,(c,a,q))
10349 | SOME v => do_hex_n (0wx10*yet+v) (getChar (a,q))
10350 fun do_dec_n yet (c,a,q) =
10351 case decValue c
10352 of NONE => (yet,(c,a,q))
10353 | SOME v => do_dec_n (0wx0A*yet+v) (getChar (a,q))
10354 (*--------------------------------------------------------------*)
10355 (* Parse a (hexa)decimal number of at least one digit. *)
10356 (* *)
10357 (* raise SyntaxError if no hexdigit is found first. *)
10358 (* *)
10359 (* return the numbers value as a Char. *)
10360 (*--------------------------------------------------------------*)
10361 fun do_hex_1 (c,a,q) =
10362 case hexValue c
10363 of SOME v => do_hex_n v (getChar (a,q))
10364 | NONE => let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expHexDigit,[c])))
10365 in raise SyntaxError(c,a1,q)
10366 end
10367 (*--------------------------------------------------------------*)
10368 (* Parse a decimal number of at least one digit, or a hexnumber *)
10369 (* if the first character is 'x'. *)
10370 (* *)
10371 (* raise SyntaxError if neither 'x' nor digit is found first. *)
10372 (* *)
10373 (* return the number's value as a Char. *)
10374 (*--------------------------------------------------------------*)
10375 fun do_dec_1 (c,a,q) =
10376 case decValue c
10377 of SOME v => do_dec_n v (getChar (a,q))
10378 | NONE => if c=0wx78 (* #"x" *)
10379 then do_hex_1 (getChar (a,q))
10380 else let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expDigitX,[c])))
10381 in raise SyntaxError(c,a1,q)
10382 end
10383
10384 val (ch,(c1,a1,q1)) = do_dec_1 (getChar aq)
10385
10386 val _ = if c1=0wx3B then ()
10387 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expSemi,[c1])))
10388 in raise SyntaxError(c1,a2,q1)
10389 end
10390
10391 val _ = if isXml ch then ()
10392 else let val a2 = hookError(a1,(getPos q1,ERR_NON_XML_CHARREF ch))
10393 in raise NoSuchChar (a2,q1)
10394 end
10395 in (ch,a1,q1)
10396 end
10397 fun parseCharRefLit cs aq =
10398 let
10399 (*--------------------------------------------------------------*)
10400 (* parse a (hexa)decimal number, accumulating the value in the *)
10401 (* first parameter. *)
10402 (* *)
10403 (* return the numbers value as a Char. *)
10404 (*--------------------------------------------------------------*)
10405 fun do_hex_n (cs,yet) (c,a,q) =
10406 case hexValue c
10407 of NONE => (cs,yet,(c,a,q))
10408 | SOME v => do_hex_n (c::cs,0wx10*yet+v) (getChar (a,q))
10409 fun do_dec_n (cs,yet) (c,a,q) =
10410 case decValue c
10411 of NONE => (cs,yet,(c,a,q))
10412 | SOME v => do_dec_n (c::cs,0wx0A*yet+v) (getChar (a,q))
10413 (*--------------------------------------------------------------*)
10414 (* Parse a (hexa)decimal number of at least one digit. *)
10415 (* *)
10416 (* raise SyntaxError if no hexdigit is found first. *)
10417 (* *)
10418 (* return the numbers value as a Char. *)
10419 (*--------------------------------------------------------------*)
10420 fun do_hex_1 cs (c,a,q) =
10421 case hexValue c
10422 of SOME v => do_hex_n (c::cs,v) (getChar (a,q))
10423 | NONE => let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expHexDigit,[c])))
10424 in raise SyntaxError(c,a1,q)
10425 end
10426 (*--------------------------------------------------------------*)
10427 (* Parse a decimal number of at least one digit, or a hexnumber *)
10428 (* if the first character is 'x'. *)
10429 (* *)
10430 (* raise SyntaxError if neither 'x' nor digit is found first. *)
10431 (* *)
10432 (* return the number's value as a Char. *)
10433 (*--------------------------------------------------------------*)
10434 fun do_dec_1 cs (c,a,q) =
10435 case decValue c
10436 of SOME v => do_dec_n (c::cs,v) (getChar (a,q))
10437 | NONE => if c=0wx78 (* #"x" *)
10438 then do_hex_1 (c::cs) (getChar (a,q))
10439 else let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expDigitX,[c])))
10440 in raise SyntaxError(c,a1,q)
10441 end
10442
10443 val (cs1,ch,(c1,a1,q1)) = do_dec_1 cs (getChar aq)
10444
10445 val _ = if c1=0wx3B then ()
10446 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expSemi,[c1])))
10447 in raise SyntaxError(c1,a2,q1)
10448 end
10449
10450 val _ = if isXml ch then ()
10451 else let val a2 = hookError(a1,(getPos q1,ERR_NON_XML_CHARREF ch))
10452 in raise NoSuchChar (a2,q1)
10453 end
10454 in (c1::cs1,(ch,a1,q1))
10455 end
10456
10457 (*--------------------------------------------------------------------*)
10458 (* parse a general entity reference, the "&" already read. See 4.1: *)
10459 (* *)
10460 (* [68] EntityRef ::= '&' Name ';' [ WFC: Entity Declared ] *)
10461 (* [ VC: Entity Declared ] *)
10462 (* [ WFC: Parsed Entity ] *)
10463 (* [ WFC: No Recursion ] *)
10464 (* *)
10465 (* Well-Formedness Constraint: Entity Declared *)
10466 (* In a document without any DTD, a document with only an internal *)
10467 (* DTD subset which contains no parameter entity references, or a *)
10468 (* document with "standalone='yes'", the Name given in the entity *)
10469 (* reference must match that in an entity declaration, ... *)
10470 (* ... the declaration of a general entity must precede any *)
10471 (* reference to it which appears in a default value in an *)
10472 (* attribute-list declaration. *)
10473 (* *)
10474 (* Validity Constraint: Entity Declared *)
10475 (* In a document with an external subset or external parameter *)
10476 (* entities with "standalone='no'", the Name given in the entity *)
10477 (* reference must match that in an entity declaration. ... *)
10478 (* ... the declaration of a general entity must precede any *)
10479 (* reference to it which appears in a default value in an *)
10480 (* attribute-list declaration. *)
10481 (* *)
10482 (* Thus: in both cases it is an error if the entity is not declared. *)
10483 (* The only difference is the impact on well-formednes/validity. *)
10484 (* *)
10485 (* There are three contexts in which a general entity reference can *)
10486 (* appear: in content, in attribute value, in entity value. This *)
10487 (* passage states that it need not be declared prior to a reference *)
10488 (* in an entity value. But in this context, it is bypassed and not *)
10489 (* included, i.e., it need not be recognized. *)
10490 (* *)
10491 (* Well-Formedness Constraint: Parsed Entity *)
10492 (* An entity reference must not contain the name of an unparsed *)
10493 (* entity. Unparsed entities may be referred to only in attribute *)
10494 (* values ... *)
10495 (* *)
10496 (* Well-Formedness Constraint: No Recursion *)
10497 (* A parsed entity must not contain a recursive reference to *)
10498 (* itself, either directly or indirectly. *)
10499 (* *)
10500 (* print an error and raise SyntaxState if no name is found, or if no *)
10501 (* semicolon follows it. *)
10502 (* print an error and return GE_NULL if the reference is to an *)
10503 (* undeclared, unparsed or open entity. *)
10504 (* *)
10505 (* return the entity referred to, and the remaining state. *)
10506 (*--------------------------------------------------------------------*)
10507 fun parseGenRef dtd (caq as (_,_,q)) =
10508 let
10509 val (name,(c1,a1,q1)) = parseName caq
10510 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expAnEntName,[c])
10511 val a1 = hookError(a,(getPos q,err))
10512 in raise SyntaxError(c,a1,q)
10513 end
10514 val _ = if c1=0wx3B then ()
10515 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expSemi,[c1])))
10516 in raise SyntaxError(c1,a2,q1)
10517 end
10518
10519 val idx = GenEnt2Index dtd name
10520 val (ent,ext) = getGenEnt dtd idx
10521
10522 val _ = (* check whether entity is undeclared/unparsed/open *)
10523 case ent
10524 of GE_NULL =>
10525 if entitiesWellformed dtd
10526 then let val err = ERR_UNDEC_ENTITY(ENT_GENERAL,name)
10527 val a2 = hookError(a1,(getPos q,err))
10528 in raise NoSuchEntity (a2,q1)
10529 end
10530 else if useParamEnts()
10531 then let val err = ERR_UNDECLARED(IT_GEN_ENT,name,LOC_NONE)
10532 val a2 = hookError(a1,(getPos q,err))
10533 in raise NoSuchEntity (a2,q1)
10534 end
10535 else ()
10536 | GE_UNPARSED _ => let val err = ERR_ILLEGAL_ENTITY(ENT_UNPARSED,name,LOC_NONE)
10537 val a2 = hookError(a1,(getPos q,err))
10538 in raise NoSuchEntity (a2,q1)
10539 end
10540 | _ => if isOpen(idx,false,q1)
10541 then let val err = ERR_RECURSIVE_ENTITY(ENT_GENERAL,name)
10542 val a2 = hookError(a1,(getPos q,err))
10543 in raise NoSuchEntity (a2,q1)
10544 end
10545 else ()
10546
10547 val a2 =
10548 if ext andalso !O_VALIDATE andalso standsAlone dtd andalso inDocEntity q1
10549 then let val _ = if !O_ERROR_MINIMIZE then setStandAlone dtd false else ()
10550 in hookError(a1,(getPos q,ERR_STANDALONE_ENT(ENT_GENERAL,name)))
10551 end
10552 else a1
10553
10554 in ((idx,ent),(a2,q1))
10555 end
10556 fun parseGenRefLit dtd cs (caq as (_,_,q)) =
10557 let
10558 val (cs1,name,(c1,a1,q1)) = parseNameLit cs caq
10559 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expAnEntName,[c])
10560 val a1 = hookError(a,(getPos q,err))
10561 in raise SyntaxError(c,a1,q)
10562 end
10563 val _ = if c1=0wx3B then ()
10564 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expSemi,[c1])))
10565 in raise SyntaxError(c1,a2,q1)
10566 end
10567
10568 val idx = GenEnt2Index dtd name
10569 val (ent,ext) = getGenEnt dtd idx
10570
10571 val _ = (* check whether entity is undeclared/unparsed/open *)
10572 case ent
10573 of GE_NULL =>
10574 if entitiesWellformed dtd
10575 then let val err = ERR_UNDEC_ENTITY(ENT_GENERAL,name)
10576 val a2 = hookError(a1,(getPos q,err))
10577 in raise NoSuchEntity (a2,q1)
10578 end
10579 else if useParamEnts()
10580 then let val err = ERR_UNDECLARED(IT_GEN_ENT,name,LOC_NONE)
10581 val a2 = hookError(a1,(getPos q,err))
10582 in raise NoSuchEntity (a2,q1)
10583 end
10584 else ()
10585 | GE_UNPARSED _ => let val err = ERR_ILLEGAL_ENTITY(ENT_UNPARSED,name,LOC_NONE)
10586 val a2 = hookError(a1,(getPos q,err))
10587 in raise NoSuchEntity (a2,q1)
10588 end
10589 | _ => if isOpen(idx,false,q1)
10590 then let val err = ERR_RECURSIVE_ENTITY(ENT_GENERAL,name)
10591 val a2 = hookError(a1,(getPos q,err))
10592 in raise NoSuchEntity (a2,q1)
10593 end
10594 else ()
10595
10596 val a2 =
10597 if ext andalso !O_VALIDATE andalso standsAlone dtd andalso inDocEntity q1
10598 then let val _ = if !O_ERROR_MINIMIZE then setStandAlone dtd false else ()
10599 in hookError(a1,(getPos q,ERR_STANDALONE_ENT(ENT_GENERAL,name)))
10600 end
10601 else a1
10602
10603 in (c1::cs1,((idx,ent),(a2,q1)))
10604 end
10605
10606 (*--------------------------------------------------------------------*)
10607 (* parse a parameter entity reference, the "%" already read. See 4.1: *)
10608 (* *)
10609 (* [69] PEReference ::= '%' Name ';' [ VC: Entity Declared ] *)
10610 (* [ WFC: No Recursion ] *)
10611 (* [ WFC: In DTD ] *)
10612 (* *)
10613 (* Well-Formedness Constraint: Entity Declared *)
10614 (* In a document without any DTD, a document with only an internal *)
10615 (* DTD subset which contains no parameter entity references, or a *)
10616 (* document with "standalone='yes'", the Name given in the entity *)
10617 (* reference must match that in an entity declaration, ... *)
10618 (* The declaration of a parameter entity must precede any reference *)
10619 (* to it... *)
10620 (* *)
10621 (* Validity Constraint: Entity Declared *)
10622 (* In a document with an external subset or external parameter *)
10623 (* entities with "standalone='no'", the Name given in the entity *)
10624 (* reference must match that in an entity declaration. ... *)
10625 (* The declaration of a parameter entity must precede any reference *)
10626 (* to it... *)
10627 (* *)
10628 (* Thus: in both cases it is an error if the entity is not declared. *)
10629 (* The only difference is the impact on well-formednes/validity. *)
10630 (* Because the thing to be parsed is a parameter entity reference, *)
10631 (* this DTD has references, and thus an undeclared entity is probably *)
10632 (* a validity and not a well-formedness error. Thus setExternal must *)
10633 (* be called before determining a possible error! *)
10634 (* *)
10635 (* Well-Formedness Constraint: No Recursion *)
10636 (* A parsed entity must not contain a recursive reference to *)
10637 (* itself, either directly or indirectly. *)
10638 (* *)
10639 (* print an error and raise SyntaxError if no name is found, or if no *)
10640 (* semicolon follows it. *)
10641 (* print an error and return PE_NULL if the reference is to an *)
10642 (* undeclared or open entity. *)
10643 (* *)
10644 (* return the entity referred to, and the remaining state. *)
10645 (*--------------------------------------------------------------------*)
10646 fun parseParRef dtd (caq as (_,_,q)) =
10647 let
10648 val (name,(c1,a1,q1)) = parseName caq
10649 handle NotFound(c,a,q) => let val err = ERR_EXPECTED(expAnEntName,[c])
10650 val a1 = hookError(a,(getPos q,err))
10651 in raise SyntaxError(c,a1,q)
10652 end
10653
10654 val _ = if c1=0wx3B then ()
10655 else let val err = ERR_EXPECTED(expSemi,[c1])
10656 val a2 = hookError(a1,(getPos q1,err))
10657 in raise SyntaxError(c1,a2,q1)
10658 end
10659
10660 val _ = setExternal dtd;
10661 val idx = ParEnt2Index dtd name
10662 val (ent,ext) = getParEnt dtd idx
10663
10664 val _ = (* check whether entity is declared *)
10665 case ent
10666 of PE_NULL =>
10667 if entitiesWellformed dtd
10668 then let val err = ERR_UNDEC_ENTITY(ENT_PARAMETER,name)
10669 val a2 = hookError(a1,(getPos q,err))
10670 in raise NoSuchEntity (a2,q1)
10671 end
10672 else if useParamEnts()
10673 then let val err = ERR_UNDECLARED(IT_PAR_ENT,name,LOC_NONE)
10674 val a2 = hookError(a1,(getPos q,err))
10675 in raise NoSuchEntity (a2,q1)
10676 end
10677 else ()
10678 (* check whether the entity is already open *)
10679 | _ => if isOpen(idx,true,q1)
10680 then let val err = ERR_RECURSIVE_ENTITY(ENT_PARAMETER,name)
10681 val a2 = hookError(a1,(getPos q,err))
10682 in raise NoSuchEntity (a2,q1)
10683 end
10684 else ()
10685 in ((idx,ent),(a1,q1))
10686 end
10687 fun parseParRefLit dtd cs (caq as (_,_,q)) =
10688 let
10689 val (cs1,name,(c1,a1,q1)) = parseNameLit cs caq
10690 handle NotFound(c,a,q) => let val err = ERR_EXPECTED(expAnEntName,[c])
10691 val a1 = hookError(a,(getPos q,err))
10692 in raise SyntaxError(c,a1,q)
10693 end
10694
10695 val _ = if c1=0wx3B then ()
10696 else let val err = ERR_EXPECTED(expSemi,[c1])
10697 val a2 = hookError(a1,(getPos q1,err))
10698 in raise SyntaxError(c1,a2,q1)
10699 end
10700
10701 val _ = setExternal dtd;
10702 val idx = ParEnt2Index dtd name
10703 val (ent,ext) = getParEnt dtd idx
10704
10705 val _ = (* check whether entity is declared *)
10706 case ent
10707 of PE_NULL =>
10708 if entitiesWellformed dtd
10709 then let val err = ERR_UNDEC_ENTITY(ENT_PARAMETER,name)
10710 val a2 = hookError(a1,(getPos q,err))
10711 in raise NoSuchEntity (a2,q1)
10712 end
10713 else if useParamEnts()
10714 then let val err = ERR_UNDECLARED(IT_PAR_ENT,name,LOC_NONE)
10715 val a2 = hookError(a1,(getPos q,err))
10716 in raise NoSuchEntity (a2,q1)
10717 end
10718 else ()
10719 (* check whether the entity is already open *)
10720 | _ => if isOpen(idx,true,q1)
10721 then let val err = ERR_RECURSIVE_ENTITY(ENT_PARAMETER,name)
10722 val a2 = hookError(a1,(getPos q,err))
10723 in raise NoSuchEntity (a2,q1)
10724 end
10725 else ()
10726 in (c1::cs1,((idx,ent),(a1,q1)))
10727 end
10728
10729 (*--------------------------------------------------------------------*)
10730 (* skip a general/parameter entity reference, the "&/%" already read. *)
10731 (* *)
10732 (* print an error if no name is found, or if no semicolon follows it. *)
10733 (* *)
10734 (* handle any SyntaxState by returning its char and state. *)
10735 (* *)
10736 (* return the remaining state. *)
10737 (*--------------------------------------------------------------------*)
10738 fun skipReference caq =
10739 let val (_,(c1,a1,q1)) = parseName caq
10740 in if c1=0wx3B then getChar (a1,q1)
10741 else let val err = ERR_EXPECTED(expSemi,[c1])
10742 val a2 = hookError(a1,(getPos q1,err))
10743 in (c1,a2,q1)
10744 end
10745 end
10746 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expAnEntName,[c])
10747 val a1 = hookError(a,(getPos q,err))
10748 in (c,a1,q)
10749 end
10750
10751 (*--------------------------------------------------------------------*)
10752 (* skip a character reference, the "&#" already read. See 4.1: *)
10753 (* *)
10754 (* print an error if no number or x plus hexnum is found, or if no *)
10755 (* semicolon follows it. *)
10756 (* *)
10757 (* handle any SyntaxState by returning its char and state. *)
10758 (* *)
10759 (* return the remaining char and state. *)
10760 (*--------------------------------------------------------------------*)
10761 fun skipCharRef aq =
10762 let
10763 (*--------------------------------------------------------------*)
10764 (* skip a (hexa)decimal number. *)
10765 (*--------------------------------------------------------------*)
10766 fun skip_ximal isX (c,a,q) =
10767 if isX c then skip_ximal isX (getChar (a,q)) else (c,a,q)
10768
10769 val (c1,a1,q1) = getChar aq
10770 val (c2,a2,q2) =
10771 if isDec c1 then skip_ximal isDec (getChar (a1,q1))
10772 else if c1=0wx78 (* #"x" *)
10773 then let val (c2,a2,q2) = getChar (a1,q1)
10774 in if isHex c2 then skip_ximal isHex (getChar (a2,q2))
10775 else let val err = ERR_EXPECTED(expHexDigit,[c2])
10776 val a3 = hookError(a2,(getPos q2,err))
10777 in raise SyntaxError(c2,a3,q2)
10778 end
10779 end
10780 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expDigitX,[c1])))
10781 in raise SyntaxError (c1,a2,q1)
10782 end
10783
10784 in if c2=0wx3B then getChar (a2,q2)
10785 else (c2,hookError(a2,(getPos q2,ERR_EXPECTED(expSemi,[c2]))),q2)
10786 end
10787 handle SyntaxError caq => caq
10788
10789 (*--------------------------------------------------------------------*)
10790 (* parse a sequence of white space in markup declarations. Cf. 2.3: *)
10791 (* *)
10792 (* [3] S ::= (#x20 | #x9 | #xD | #xA)+ *)
10793 (* *)
10794 (* and 2.8 states: *)
10795 (* *)
10796 (* The markup declarations may be made up in whole or in part of *)
10797 (* the replacement text of parameter entities. The productions *)
10798 (* later in this specification for individual nonterminals *)
10799 (* (elementdecl, AttlistDecl, and so on) describe the declarations *)
10800 (* after all the parameter entities have been included. *)
10801 (* *)
10802 (* in markup declarations, we thus have to include entity references *)
10803 (* and skip entity ends, except for the document end. *)
10804 (* *)
10805 (* Well-Formedness Constraint: PEs in Internal Subset *)
10806 (* In the internal DTD subset, parameter-entity references can *)
10807 (* occur only where markup declarations can occur, not within *)
10808 (* markup declarations. (This does not apply to references that *)
10809 (* occur in external parameter entities or to the external subset.) *)
10810 (* *)
10811 (* we therefore always check whether we are in the internal subset *)
10812 (* before including a parameter entity. *)
10813 (*--------------------------------------------------------------------*)
10814 (* handle a parameter entity reference *)
10815 (*--------------------------------------------------------------------*)
10816 fun doParRef dtd (caq as (c,a,q)) =
10817 if inDocEntity q
10818 then let val err = ERR_FORBIDDEN_HERE(IT_PAR_REF,LOC_INT_DECL)
10819 val a1 = hookError(a,(getPos q,err))
10820 in skipReference (c,a1,q)
10821 end
10822 else let val ((id,ent),(a1,q1)) = parseParRef dtd caq
10823 in case ent
10824 of PE_NULL => getChar (a1,q1)
10825 | PE_INTERN (_,rep) => getChar(a1,(pushIntern(q1,id,true,rep)))
10826 | PE_EXTERN extId => #3(openExtern(id,true,resolveExtId extId) (a1,q1))
10827 handle CantOpenFile(fmsg,a)
10828 => let val err = ERR_NO_SUCH_FILE fmsg
10829 val a1 = hookError(a,(getPos q1,err))
10830 in (getChar(a1,q1))
10831 end
10832 end
10833 handle SyntaxError caq => caq
10834 | NoSuchEntity aq => getChar aq
10835 (*--------------------------------------------------------------------*)
10836 (* parse optional white space. *)
10837 (* *)
10838 (* catch SyntaxState exceptions from parameter refs. *)
10839 (* *)
10840 (* print an error if a parameter entity reference or an entity end is *)
10841 (* found inside the internal subset. *)
10842 (* *)
10843 (* return the following character and the remaining state. *)
10844 (*--------------------------------------------------------------------*)
10845 fun skipPSopt dtd caq =
10846 let fun doit (c,a,q) =
10847 case c
10848 of 0wx00 =>
10849 if isSpecial q then (c,a,q)
10850 else let val a1 = if !O_VALIDATE andalso inDocEntity q
10851 then hookError(a,(getPos q,ERR_EE_INT_SUBSET))
10852 else a
10853 in doit (getChar (a1,q))
10854 end
10855 | 0wx09 => doit (getChar (a,q))
10856 | 0wx0A => doit (getChar (a,q))
10857 | 0wx20 => doit (getChar (a,q))
10858 | 0wx25 (* #"%" *) => doit (doParRef dtd (getChar (a,q)))
10859 | _ => (c,a,q)
10860 in doit caq
10861 end
10862 (*--------------------------------------------------------------------*)
10863 (* parse optional white space. *)
10864 (* *)
10865 (* catch SyntaxState exceptions from parameter refs. *)
10866 (* *)
10867 (* print an error if a parameter entity reference or an entity end is *)
10868 (* found inside the internal subset. *)
10869 (* *)
10870 (* return a boolean whether white space was actually found, and the *)
10871 (* following character with the remaining state. *)
10872 (*--------------------------------------------------------------------*)
10873 fun skipPSmay dtd (c,a,q) =
10874 case c
10875 of 0wx00 =>
10876 if isSpecial q then (false,(c,a,q))
10877 else let val a1 = if !O_VALIDATE andalso inDocEntity q
10878 then hookError(a,(getPos q,ERR_EE_INT_SUBSET))
10879 else a
10880 in (true,skipPSopt dtd (getChar (a1,q)))
10881 end
10882 | 0wx09 => (true,skipPSopt dtd (getChar (a,q)))
10883 | 0wx0A => (true,skipPSopt dtd (getChar (a,q)))
10884 | 0wx20 => (true,skipPSopt dtd (getChar (a,q)))
10885 | 0wx25 (* #"%" *) => (true,skipPSopt dtd (doParRef dtd (getChar (a,q))))
10886 | _ => (false,(c,a,q))
10887 (*--------------------------------------------------------------------*)
10888 (* parse required white space. *)
10889 (* *)
10890 (* catch SyntaxState exceptions from parameter refs. *)
10891 (* *)
10892 (* print an error and return if no white space character is found. *)
10893 (* print an error if a parameter entity reference or an entity end is *)
10894 (* found inside the internal subset. *)
10895 (* *)
10896 (* return the following character and the remaining state. *)
10897 (*--------------------------------------------------------------------*)
10898 fun skipPS dtd (c,a,q) =
10899 case c
10900 of 0wx00 =>
10901 if isSpecial q then (c,hookError(a,(getPos q,ERR_MISSING_WHITE)),q)
10902 else let val a1 = if !O_VALIDATE andalso inDocEntity q
10903 then hookError(a,(getPos q,ERR_EE_INT_SUBSET))
10904 else a
10905 in skipPSopt dtd (getChar (a1,q))
10906 end
10907 | 0wx09 => skipPSopt dtd (getChar (a,q))
10908 | 0wx0A => skipPSopt dtd (getChar (a,q))
10909 | 0wx20 => skipPSopt dtd (getChar (a,q))
10910 | 0wx25 (* #"%" *) => skipPSopt dtd (doParRef dtd (getChar (a,q)))
10911 | _ => (c,hookError(a,(getPos q,ERR_MISSING_WHITE)),q)
10912 (*--------------------------------------------------------------------*)
10913 (* parse required white space, taking care of a single '%' character. *)
10914 (* this is only needed before the entity name in an entity decl. *)
10915 (* *)
10916 (* catch SyntaxState exceptions from parameter refs. *)
10917 (* *)
10918 (* print an error if no white space character is found. *)
10919 (* print an error if a parameter entity reference or an entity end is *)
10920 (* found inside the internal subset. *)
10921 (* *)
10922 (* return a boolean whether a '%' was found, the following character *)
10923 (* and the remaining state. *)
10924 (*--------------------------------------------------------------------*)
10925 fun skipPSdec dtd caq =
10926 let fun doit req (c,a,q) =
10927 case c
10928 of 0wx00 =>
10929 if isSpecial q then (false,(c,a,q))
10930 else let val a1 = if !O_VALIDATE andalso inDocEntity q
10931 then hookError(a,(getPos q,ERR_EE_INT_SUBSET))
10932 else a
10933 in doit false (getChar (a1,q))
10934 end
10935 | 0wx09 => doit false (getChar (a,q))
10936 | 0wx0A => doit false (getChar (a,q))
10937 | 0wx20 => doit false (getChar (a,q))
10938 | 0wx25 => (* #"%" *)
10939 let val (c1,a1,q1) = getChar (a,q)
10940 in if isNms c1 then doit false (doParRef dtd (c1,a1,q1))
10941 else let val a2 = if req then hookError(a1,(getPos q,ERR_MISSING_WHITE))
10942 else a1
10943 in (true,(c1,a2,q1))
10944 end
10945 end
10946 | _ => let val a1 = if req then hookError(a,(getPos q,ERR_MISSING_WHITE))
10947 else a
10948 in (false,(c,a1,q))
10949 end
10950 in
10951 doit true caq
10952 end
10953 end
10954 (* stop of ../../Parser/Parse/parseRefs.sml *)
10955 (* start of ../../Parser/Parse/parseLiterals.sml *)
10956 signature ParseLiterals =
10957 sig
10958 (*----------------------------------------------------------------------
10959 include ParseBase
10960
10961 val parseName : UniChar.Char * AppData * State
10962 -> UniChar.Data * (UniChar.Char * AppData * State)
10963 val parseNmtoken : UniChar.Char * AppData * State
10964 -> UniChar.Data * (UniChar.Char * AppData * State)
10965
10966 val parseComment : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
10967 val parseProcInstr : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
10968 val skipS : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
10969 val skipSopt : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
10970 val skipSmay : UniChar.Char * AppData * State -> bool * (UniChar.Char * AppData * State)
10971 val parseSopt : UniChar.Data -> UniChar.Char * AppData * State
10972 -> UniChar.Data * (UniChar.Char * AppData * State)
10973 val parseSmay : UniChar.Data -> UniChar.Char * AppData * State
10974 -> bool * (UniChar.Data * (UniChar.Char * AppData * State))
10975 val parseEq : UniChar.Char * AppData * State
10976 -> UniChar.Data * (UniChar.Char * AppData * State)
10977
10978 val openExtern : int * Uri.Uri -> AppData * State
10979 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
10980 val openDocument : Uri.Uri option -> AppData
10981 -> Encoding.Encoding * HookData.XmlDecl option * (UniChar.Char * AppData * State)
10982 val openSubset : Uri.Uri -> AppData
10983 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
10984
10985 val skipCharRef : AppData * State -> (UniChar.Char * AppData * State)
10986 val skipReference : UniChar.Char * AppData * State -> (UniChar.Char * AppData * State)
10987 val parseGenRef : Dtd -> UniChar.Char * AppData * State
10988 -> (int * Base.GenEntity) * (AppData * State)
10989 val parseParRef : Dtd -> UniChar.Char * AppData * State
10990 -> (int * Base.ParEntity) * (AppData * State)
10991 val parseCharRefLit : UniChar.Data -> AppData * State
10992 -> UniChar.Data * (UniChar.Char * AppData * State)
10993 val skipPS : Dtd -> UniChar.Char * AppData * State
10994 -> UniChar.Char * AppData * State
10995 val skipPSopt : Dtd -> UniChar.Char * AppData * State
10996 -> UniChar.Char * AppData * State
10997 val skipPSmay : Dtd -> UniChar.Char * AppData * State
10998 -> bool * (UniChar.Char * AppData * State)
10999 val skipPSdec : Dtd -> UniChar.Char * AppData * State
11000 -> bool * (UniChar.Char * AppData * State)
11001 ----------------------------------------------------------------------*)
11002 include ParseRefs
11003
11004 val parseSystemLiteral : UniChar.Char * AppData * State
11005 -> Uri.Uri * UniChar.Char * (UniChar.Char * AppData * State)
11006 val parsePubidLiteral : UniChar.Char * AppData * State
11007 -> string * UniChar.Char * (UniChar.Char * AppData * State)
11008
11009 val parseAttValue : Dtd -> UniChar.Char * AppData * State
11010 -> UniChar.Vector * UniChar.Data * (UniChar.Char * AppData * State)
11011 val parseEntityValue : Dtd -> (UniChar.Vector * UniChar.Vector -> 'a)
11012 -> UniChar.Char * AppData * State
11013 -> 'a * (UniChar.Char * AppData * State)
11014 end
11015
11016 (*--------------------------------------------------------------------------*)
11017 (* Structure: ParseLiterals *)
11018 (* *)
11019 (* Exceptions raised by functions in this structure: *)
11020 (* parseSystemLiteral : NotFound *)
11021 (* parsePubidLiteral : NotFound *)
11022 (* parseAttValue : NotFound *)
11023 (* parseEntityValue : NotFound *)
11024 (*--------------------------------------------------------------------------*)
11025 functor ParseLiterals (structure ParseBase : ParseBase)
11026 : ParseLiterals =
11027 struct
11028 structure ParseRefs = ParseRefs (structure ParseBase = ParseBase)
11029
11030 open
11031 Base UniChar Errors UniClasses Uri
11032 ParseRefs
11033
11034 val THIS_MODULE = "ParseLiterals"
11035
11036 (*--------------------------------------------------------------------*)
11037 (* parse a system literal, the quote character ("'" or '"') already --*)
11038 (* read and passed as first argument. cf. 2.3: *)
11039 (* *)
11040 (* ... Note that a SystemLiteral can be parsed without scanning *)
11041 (* for markup. *)
11042 (* *)
11043 (* [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") *)
11044 (* *)
11045 (* print an error and end the literal if an entity end is found. *)
11046 (* *)
11047 (* return the literal as a string together with the next character *)
11048 (* and remaining state. *)
11049 (*--------------------------------------------------------------------*)
11050 (* might raise: none *)
11051 (*--------------------------------------------------------------------*)
11052 fun parseSystemLiteral' quote aq =
11053 let
11054 fun doit text (c,a,q) =
11055 if c=quote then (text,getChar (a,q))
11056 else if c=0wx0
11057 then let val a1 = hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_SYS_LIT))
11058 in (text,(c,a1,q))
11059 end
11060 else if c>0wx7F andalso !O_WARN_NON_ASCII_URI
11061 then let val a1 = hookWarning(a,(getPos q,WARN_NON_ASCII_URI c))
11062 in doit (c::text) (getChar(a1,q))
11063 end
11064 else doit (c::text) (getChar(a,q))
11065
11066 val (text,caq1) = doit nil (getChar aq)
11067 in
11068 (Data2Uri(rev text),quote,caq1)
11069 end
11070 (*--------------------------------------------------------------------*)
11071 (* parse a system literal. *)
11072 (* *)
11073 (* [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") *)
11074 (* *)
11075 (* raise NotFound if neither '"' nor "'" comes first. *)
11076 (* *)
11077 (* return the literal as a string together with the next character *)
11078 (* and remaining state. *)
11079 (*--------------------------------------------------------------------*)
11080 (* might raise: NotFound *)
11081 (*--------------------------------------------------------------------*)
11082 fun parseSystemLiteral (c,a,q) =
11083 if c=0wx22 (* "'" *) orelse
11084 c=0wx27 (* '"' *)
11085 then parseSystemLiteral' c (a,q)
11086 else raise NotFound (c,a,q)
11087
11088 (*--------------------------------------------------------------------*)
11089 (* parse a pubid literal, the quote character ("'" or '"') already ---*)
11090 (* read and passed as first argument. cf. 2.3: *)
11091 (* *)
11092 (* [12] PubidLiteral ::= '"' PubidChar* '"' *)
11093 (* | "'" (PubidChar - "'")* "'" *)
11094 (* *)
11095 (* print an error and end the literal if an entity end is found. *)
11096 (* print an error if a non-pubid character is found. *)
11097 (* *)
11098 (* return the literal as a string together with the next character *)
11099 (* and remaining state. *)
11100 (*--------------------------------------------------------------------*)
11101 (* might raise: none *)
11102 (*--------------------------------------------------------------------*)
11103 fun parsePubidLiteral' quote aq =
11104 let
11105 fun doit (hadSpace,atStart,text) aq =
11106 let val (c1,a1,q1) = getChar aq
11107 in case c1
11108 of 0wx00 => let val a2 = hookError(a1,(getPos q1,ERR_ENDED_BY_EE LOC_PUB_LIT))
11109 in (text,(c1,a2,q1))
11110 end
11111 | 0wx0A => doit (true,atStart,text) (a1,q1)
11112 | 0wx20 => doit (true,atStart,text) (a1,q1)
11113 | _ =>
11114 if c1=quote then (text,getChar (a1,q1))
11115 else if not (isPubid c1)
11116 then let val err = ERR_FORBIDDEN_HERE(IT_CHAR c1,LOC_PUB_LIT)
11117 val a2 = hookError(a1,(getPos q1,err))
11118 in doit (hadSpace,atStart,text) (a2,q1)
11119 end
11120 else if hadSpace andalso not atStart
11121 then doit (false,false,c1::0wx20::text) (a1,q1)
11122 else doit (false,false,c1::text) (a1,q1)
11123 end
11124 val (text,caq1) = doit (false,true,nil) aq
11125 in
11126 (Latin2String(rev text),quote,caq1)
11127 end
11128 (*--------------------------------------------------------------------*)
11129 (* parse a pubid literal. *)
11130 (* *)
11131 (* [12] PubidLiteral ::= '"' PubidChar* '"' *)
11132 (* | "'" (PubidChar - "'")* "'" *)
11133 (* *)
11134 (* raise NotFound if neither '"' nor "'" comes first. *)
11135 (* *)
11136 (* return the literal as a string together with the next character *)
11137 (* and remaining state. *)
11138 (*--------------------------------------------------------------------*)
11139 (* might raise: NotFound *)
11140 (*--------------------------------------------------------------------*)
11141 fun parsePubidLiteral (c,a,q) =
11142 if c=0wx22 (* "'" *) orelse
11143 c=0wx27 (* '"' *)
11144 then parsePubidLiteral' c (a,q)
11145 else raise NotFound (c,a,q)
11146
11147 (*--------------------------------------------------------------------*)
11148 (* parse an entity value and the quote character ("'" or '"') passed *)
11149 (* as first argument. Cf. 2.3: *)
11150 (* *)
11151 (* [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'*)
11152 (* | "'" ([^%&'] | PEReference | Reference)* "'"*)
11153 (* See also 4.4.5: *)
11154 (* *)
11155 (* When ... a parameter entity reference appears in a literal *)
11156 (* entity value, its replacement text is processed in place of the *)
11157 (* reference itself as though it were part of the document at the *)
11158 (* location the reference was recognized, except that a single or *)
11159 (* double quote character in the replacement text is always treated *)
11160 (* as a normal data character and will not terminate the literal. *)
11161 (* *)
11162 (* and 4.4.7: *)
11163 (* *)
11164 (* When a general entity reference appears in the EntityValue in an *)
11165 (* entity declaration, it is bypassed and left as is. *)
11166 (* *)
11167 (* A bypassed entity ref must, however, be checked for syntactic *)
11168 (* validity, as opposed to SGML, where it is not even recognized. *)
11169 (* *)
11170 (* print an error and end the literal if an entity end is found at *)
11171 (* the toplevel. *)
11172 (* print an error if a general entity reference is ill-formed. *)
11173 (* *)
11174 (* handle any errors in references by ignoring them syntactically. *)
11175 (* *)
11176 (* return argument con applied to the entity value as a char buffer, *)
11177 (* and the remaining char and state. *)
11178 (*--------------------------------------------------------------------*)
11179 (* might raise: none *)
11180 (*--------------------------------------------------------------------*)
11181 fun parseEntityValue' dtd (quote,con) aq =
11182 let fun doit (level,hadCr,lit,text) (c1,a1,q1) =
11183 case c1
11184 of 0wx00 => if level=0 then let val err = ERR_ENDED_BY_EE LOC_ENT_VALUE
11185 val a2 = hookError(a1,(getPos q1,err))
11186 in (lit,text,(c1,a2,q1))
11187 end
11188 else doit (level-1,false,lit,text) (getChar (a1,q1))
11189 | 0wx25 => (* #"%" *)
11190 let val (level1,lit1,caq2) =
11191 if inDocEntity q1
11192 then let val err = ERR_FORBIDDEN_HERE(IT_PAR_REF,LOC_INT_DECL)
11193 val a2 = hookError(a1,(getPos q1,err))
11194 in (level,lit,skipReference (getChar(a2,q1)))
11195 end
11196 else
11197 let val (lit1,((id,ent),(a2,q2))) =
11198 if level=0 then parseParRefLit dtd (c1::lit) (getChar(a1,q1))
11199 else (lit,parseParRef dtd (getChar(a1,q1)))
11200 in case ent
11201 of PE_NULL => (level,lit1,getChar(a2,q2))
11202 | PE_INTERN(_,rep) =>
11203 let val q3 = pushIntern(q2,id,true,rep)
11204 in (level+1,lit1,getChar(a2,q3))
11205 end
11206 | PE_EXTERN extId =>
11207 let
11208 val fname = resolveExtId extId
11209 val caq3 = #3(openExtern (id,true,fname) (a2,q2))
11210 in (level+1,lit1,caq3)
11211 end handle CantOpenFile(fmsg,a)
11212 => let val err = ERR_NO_SUCH_FILE fmsg
11213 val a1 = hookError(a,(getPos q1,err))
11214 in (level,lit1,getChar(a1,q1))
11215 end
11216 end (* ignore syntax errors in references *)
11217 handle SyntaxError caq => (level,lit,caq)
11218 | NoSuchEntity aq => (level,lit,getChar aq)
11219 in doit (level1,false,lit1,text) caq2
11220 end
11221 | 0wx26 => (* #"&" *)
11222 let val (c2,a2,q2) = getChar (a1,q1)
11223 in (if c2=0wx23 (* #"#" *)
11224 (*--------------------------------------------------*)
11225 (* it's a character reference. *)
11226 (*--------------------------------------------------*)
11227 then (if level=0
11228 then
11229 let val (lit3,(ch,a3,q3)) =
11230 parseCharRefLit (c2::c1::lit) (a2,q2)
11231 in doit (level,false,lit3,ch::text) (getChar(a3,q3))
11232 end
11233 else let val (ch,a3,q3) = parseCharRef (a2,q2)
11234 in doit (level,false,lit,ch::text) (getChar(a3,q3))
11235 end)
11236 (* ignore errors in char references *)
11237 handle SyntaxError caq => doit (level,false,lit,text) caq
11238 | NoSuchChar aq => doit (level,false,lit,text) (getChar aq)
11239 (*-----------------------------------------------------*)
11240 (* it's a general entity reference. *)
11241 (*-----------------------------------------------------*)
11242 else let
11243 val (fnd,lit3,text3,(c3,a3,q3)) =
11244 parseEntName (c1::lit,c1::text) (c2,a2,q2)
11245 val (lit4,text4,caq4) =
11246 if not fnd then (lit,text,(c3,a3,q3))
11247 else if c3=0wx3B (* #";" *)
11248 then (c3::lit3,c3::text3,(getChar(a3,q3)))
11249 else let val err = ERR_EXPECTED(expSemi,[c3])
11250 val a4 = hookError(a3,(getPos q3,err))
11251 in (lit,text,(c3,a4,q3))
11252 end
11253 in doit (level,false,lit4,text4) caq4
11254 end
11255 )
11256 end
11257 | 0wx0A => doit (level,false,if level=0 then c1::lit else lit,
11258 if hadCr then text else c1::text) (getChar (a1,q1))
11259 | 0wx0D => doit (level,true,if level=0 then c1::lit else lit,0wx0A::text)
11260 (getChar (a1,q1))
11261 | _ => if c1=quote andalso level=0 then (lit,text,getChar(a1,q1))
11262 else doit (level,false,if level=0 then c1::lit else lit,c1::text)
11263 (getChar (a1,q1))
11264
11265 val (lit,text,caq1) = doit (0,false,nil,nil) (getChar aq)
11266 val literal = Data2Vector(quote::rev(quote::lit))
11267 val repText = Data2Vector(rev text)
11268 in
11269 (con(literal,repText),caq1)
11270 end
11271 (*--------------------------------------------------------------------*)
11272 (* parse an entity value. *)
11273 (* *)
11274 (* [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'*)
11275 (* | "'" ([^%&'] | PEReference | Reference)* "'"*)
11276 (* *)
11277 (* raise NotFound if neither '"' nor "'" comes first. *)
11278 (* *)
11279 (* return the entity value as a char buffer, and the remaining char *)
11280 (* and state. *)
11281 (*--------------------------------------------------------------------*)
11282 (* might raise: NotFound *)
11283 (*--------------------------------------------------------------------*)
11284 fun parseEntityValue dtd con (c,a,q) =
11285 if c=0wx22 (* "'" *) orelse
11286 c=0wx27 (* '"' *)
11287 then parseEntityValue' dtd (c,con) (a,q)
11288 else raise NotFound (c,a,q)
11289
11290 (*--------------------------------------------------------------------*)
11291 (* parse and normalize an attribute value, consume the final quote *)
11292 (* character ("'" or '""') passed in the argument. Cf. 2.3: *)
11293 (* *)
11294 (* [10] AttValue ::= '"' ([^<&""] | Reference)* '"' *)
11295 (* | "'" ([^<&'] | Reference)* "'" *)
11296 (* See also 4.4.5: *)
11297 (* *)
11298 (* When an entity reference appears in an attribute value ..., *)
11299 (* its replacement text is processed in place of the reference *)
11300 (* itself as though it were part of the document at the location *)
11301 (* the reference was recognized, except that a single or double *)
11302 (* quote character in the replacement text is always treated as a *)
11303 (* normal data character and will not terminate the literal. *)
11304 (* *)
11305 (* and 3.3.3: *)
11306 (* *)
11307 (* Before the value of an attribute is passed to the application *)
11308 (* or checked for validity, the XML processor must normalize it as *)
11309 (* follows: *)
11310 (* *)
11311 (* * a character reference is processed by appending the referenced *)
11312 (* character to the attribute value *)
11313 (* * an entity reference is processed by recursively processing the *)
11314 (* replacement text of the entity *)
11315 (* * a whitespace character (#x20, #xD, #xA, #x9) is processed by *)
11316 (* appending #x20 to the normalized value, except that only a *)
11317 (* single #x20 is appended for a "#xD#xA" sequence that is part *)
11318 (* of an external parsed entity or the literal entity value of *)
11319 (* an internal parsed entity *)
11320 (* * other characters are processed by appending them to the *)
11321 (* normalized value *)
11322 (* *)
11323 (* since #xD#xA are normalized by the parseEntityValue (internal) and *)
11324 (* getChar (external entities), we don't need to care about that. *)
11325 (*--------------------------------------------------------------------*)
11326 (* print an error and end the literal if an entity end is found. *)
11327 (* print an error if a general entity reference is ill-formed. *)
11328 (* print an error if a reference to an external or unparsed entity is *)
11329 (* found. *)
11330 (* print an error if character '<' appears literally. *)
11331 (* *)
11332 (* handle any errors in references by ignoring them syntactically. *)
11333 (* raise NotFound if neither '"' nor "'" comes first. *)
11334 (* *)
11335 (* return the list of chars in the value, and the next char and state *)
11336 (*--------------------------------------------------------------------*)
11337 (* might raise: NotFound *)
11338 (*--------------------------------------------------------------------*)
11339 fun parseAttValue dtd (quote,a,q) =
11340 let fun doit (lhlt as (level,lit,text)) (c1,a1,q1) =
11341 case c1
11342 of 0wx00 => if level=0 then let val err = ERR_ENDED_BY_EE LOC_ATT_VALUE
11343 val a2 = hookError(a1,(getPos q1,err))
11344 in (lit,text,(c1,a2,q1))
11345 end
11346 else doit (level-1,lit,text) (getChar (a1,q1))
11347 | 0wx26 => (* #"&" *)
11348 let
11349 val (c2,a2,q2) = getChar (a1,q1)
11350 val ((level1,lit1,text1),caq3) =
11351 (if c2=0wx23 (* #"#" *)
11352 (*--------------------------------------------------*)
11353 (* it's a character reference. *)
11354 (*--------------------------------------------------*)
11355 then if level=0
11356 then
11357 let val (lit3,(ch,a3,q3)) =
11358 parseCharRefLit (c2::c1::lit) (a2,q2)
11359 in ((level,lit3,ch::text),getChar(a3,q3))
11360 end
11361 else let val (ch,a3,q3) = parseCharRef (a2,q2)
11362 in ((level,lit,ch::text),getChar (a3,q3))
11363 end
11364 (*-----------------------------------------------------*)
11365 (* it's a general entity reference. *)
11366 (*-----------------------------------------------------*)
11367 else
11368 let val (lit3,((id,ent),(a3,q3))) =
11369 if level=0 then parseGenRefLit dtd (c1::lit) (c2,a2,q2)
11370 else (nil,parseGenRef dtd (c2,a2,q2))
11371 in case ent
11372 of GE_NULL => ((level,lit3,text),getChar(a3,q3))
11373 | GE_INTERN(_,rep) =>
11374 let val q4 = pushIntern(q3,id,false,rep)
11375 in ((level+1,lit3,text),getChar (a3,q4))
11376 end
11377 | GE_EXTERN _ =>
11378 let val err = ERR_ILLEGAL_ENTITY
11379 (ENT_EXTERNAL,Index2GenEnt dtd id,LOC_ATT_VALUE)
11380 val a4 = hookError(a3,(getPos q2,err))
11381 in ((level,lit,text),getChar (a4,q3))
11382 end
11383 | GE_UNPARSED _ => raise InternalError
11384 (THIS_MODULE,"parseAttValue'",
11385 "parseGenRef returned GE_UNPARSED")
11386 end)
11387 (*------------------------------------------------------*)
11388 (* handle any errors in references by ignoring them. *)
11389 (*------------------------------------------------------*)
11390 handle SyntaxError caq => ((level,lit,text),caq)
11391 | NoSuchEntity aq => ((level,lit,text),getChar aq)
11392 | NoSuchChar aq => ((level,lit,text),getChar aq)
11393 in doit (level1,lit1,text1) caq3
11394 end
11395 | 0wx3C => let val err = ERR_FORBIDDEN_HERE(IT_CHAR c1,LOC_ATT_VALUE)
11396 val a2 = hookError(a1,(getPos q1,err))
11397 val lit1 = if level=0 then c1::lit else lit
11398 in doit (level,lit1,c1::text) (getChar (a2,q1))
11399 end
11400 | _ => if isS c1 then doit (level,if level=0 then c1::lit else lit,0wx20::text)
11401 (getChar (a1,q1))
11402 else (if c1=quote andalso level=0 then (lit,text,getChar (a1,q1))
11403 else doit (level,if level=0 then c1::lit else lit,c1::text)
11404 (getChar (a1,q1)))
11405
11406
11407 val _ = if quote=0wx22 orelse quote=0wx27 (* "'",'"' *) then ()
11408 else raise NotFound (quote,a,q)
11409 val (lit,text,caq1) = doit (0,nil,nil) (getChar(a,q))
11410 in
11411 (Data2Vector(quote::rev(quote::lit)),rev text,caq1)
11412 end
11413 end
11414 (* stop of ../../Parser/Parse/parseLiterals.sml *)
11415 (* start of ../../Parser/Parse/parseTags.sml *)
11416 signature ParseTags =
11417 sig
11418 (*----------------------------------------------------------------------
11419 include ParseBase
11420
11421 val parseName : UniChar.Char * AppData * State
11422 -> UniChar.Data * (UniChar.Char * AppData * State)
11423 val parseNmtoken : UniChar.Char * AppData * State
11424 -> UniChar.Data * (UniChar.Char * AppData * State)
11425
11426 val parseComment : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
11427 val parseProcInstr : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
11428 val skipS : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
11429 val skipSopt : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
11430 val skipSmay : UniChar.Char * AppData * State -> bool * (UniChar.Char * AppData * State)
11431
11432 val openExtern : int * Uri.Uri -> AppData * State
11433 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
11434 val openDocument : Uri.Uri option -> AppData
11435 -> Encoding.Encoding * HookData.XmlDecl option * (UniChar.Char * AppData * State)
11436 val openSubset : Uri.Uri -> AppData
11437 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
11438
11439 val skipCharRef : AppData * State -> (UniChar.Char * AppData * State)
11440 val skipReference : UniChar.Char * AppData * State -> (UniChar.Char * AppData * State)
11441 val parseGenRef : Dtd -> UniChar.Char * AppData * State
11442 -> (int * Base.GenEntity) * (AppData * State)
11443 val parseParRef : Dtd -> UniChar.Char * AppData * State
11444 -> (int * Base.ParEntity) * (AppData * State)
11445 val parseCharRefLit : UniChar.Data -> AppData * State
11446 -> UniChar.Data * (UniChar.Char * AppData * State)
11447 val skipPS : Dtd -> UniChar.Char * AppData * State
11448 -> UniChar.Char * AppData * State
11449 val skipPSopt : Dtd -> UniChar.Char * AppData * State
11450 -> UniChar.Char * AppData * State
11451 val skipPSmay : Dtd -> UniChar.Char * AppData * State
11452 -> bool * (UniChar.Char * AppData * State)
11453 val skipPSdec : Dtd -> UniChar.Char * AppData * State
11454 -> bool * (UniChar.Char * AppData * State)
11455
11456 val parseSystemLiteral : UniChar.Char * AppData * State
11457 -> Uri.Uri * UniChar.Char * (UniChar.Char * AppData * State)
11458 val parsePubidLiteral : UniChar.Char * AppData * State
11459 -> string * UniChar.Char * (UniChar.Char * AppData * State)
11460 val parseAttValue : Dtd -> UniChar.Char * AppData * State
11461 -> UniChar.Vector * UniChar.Data * (UniChar.Char * AppData * State)
11462 val parseEntityValue : Dtd -> (UniChar.Vector * UniChar.Vector -> 'a)
11463 -> UniChar.Char * AppData * State
11464 -> 'a * (UniChar.Char * AppData * State)
11465 ----------------------------------------------------------------------*)
11466 include ParseLiterals
11467
11468 val skipTag : Errors.Location -> AppData * State -> (UniChar.Char * AppData * State)
11469
11470 val parseETag : Dtd -> AppData * State
11471 -> int * UniChar.Data * Errors.Position * (UniChar.Char * AppData * State)
11472 val parseSTag : Dtd -> Errors.Position -> UniChar.Char * AppData * State
11473 -> (HookData.StartTagInfo * Base.ElemInfo) * (UniChar.Char * AppData * State)
11474 end
11475
11476 (*--------------------------------------------------------------------------*)
11477 (* Structure: ParseTags *)
11478 (* *)
11479 (* Exceptions raised by functions in this structure: *)
11480 (* skipTag : none *)
11481 (* parseETag : SyntaxState *)
11482 (* parseSTag : SyntaxState *)
11483 (*--------------------------------------------------------------------------*)
11484 functor ParseTags (structure ParseBase : ParseBase)
11485 : ParseTags =
11486 struct
11487 structure ParseLiterals = ParseLiterals (structure ParseBase = ParseBase)
11488
11489 open
11490 UtilList
11491 Base Errors UniClasses
11492 ParseLiterals
11493
11494 (*--------------------------------------------------------------------*)
11495 (* parse an end-tag, the "</" already read. 3.1: *)
11496 (* *)
11497 (* [42] ETag ::= '</' Name S? '>' *)
11498 (* *)
11499 (* and 3. states: *)
11500 (* *)
11501 (* Validity Constraint: Element Valid *)
11502 (* An element is valid if there is a declaration matching elementdecl *)
11503 (* where the Name matches the element type, and ... *)
11504 (* *)
11505 (* print an error, recover and raise SyntaxState if no name is found. *)
11506 (* print an error and recover if no ">" is found. *)
11507 (* print an error if the element is not declared. *)
11508 (* *)
11509 (* return the index of the element, and the next char and state. *)
11510 (*--------------------------------------------------------------------*)
11511 (* might raise: SyntaxState *)
11512 (*--------------------------------------------------------------------*)
11513 fun parseETag dtd aq =
11514 let
11515 val caq0 as (_,_,q0) = getChar aq
11516 val (elem,(c1,a1,q1)) = parseName caq0
11517 handle NotFound (c,a,q) => let val err = expectedOrEnded (expAName,LOC_ETAG) c
11518 val a1 = hookError(a,(getPos q,err))
11519 val caq1 = recoverETag (c,a1,q)
11520 in raise SyntaxError caq1
11521 end
11522 val idx = Element2Index dtd elem
11523 val elemInfo as {decl,...} = getElement dtd idx
11524 val a1' = if isSome decl then a1
11525 else let val a2 = if not (!O_VALIDATE andalso hasDtd dtd) then a1
11526 else let val err = ERR_UNDECLARED(IT_ELEM,elem,LOC_ETAG)
11527 val a1' = hookError(a1,(getPos q0,err))
11528 val _ = if not (!O_ERROR_MINIMIZE) then ()
11529 else ignore (handleUndeclElement dtd idx)
11530 in a1'
11531 end
11532 in checkElemName (a2,q0) elem
11533 end
11534
11535 val (cs,(c2,a2,q2)) = parseSopt nil (c1,a1',q1)
11536 val space = rev cs
11537 in
11538 if c2=0wx3E (* #">" *) then (idx,space,getPos q2,getChar(a2,q2))
11539 else let val err = expectedOrEnded (expGt,LOC_ETAG) c2
11540 val a3 = hookError(a2,(getPos q2,err))
11541 val caq3 = recoverETag(c2,a3,q2)
11542 in (idx,space,getPos q2,caq3)
11543 end
11544 end
11545
11546 (*--------------------------------------------------------------------*)
11547 (* parse a start-tag or an empty-element-tag, the "<" already read. *)
11548 (* 3.1: *)
11549 (* *)
11550 (* [40] STag ::= '<' Name (S Attribute)* S? '>' *)
11551 (* [ WFC: Unique Att Spec ] *)
11552 (* [41] Attribute ::= Name Eq AttValue [ VC: Attribute Value Type ] *)
11553 (* *)
11554 (* Well-Formedness Constraint: Unique Att Spec *)
11555 (* No attribute name may appear more than once in the same *)
11556 (* start-tag or empty-element tag. *)
11557 (* *)
11558 (* Validity Constraint: Attribute Value Type *)
11559 (* The attribute must have been declared; the value must be of the *)
11560 (* type declared for it. *)
11561 (* *)
11562 (* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' *)
11563 (* [ WFC: Unique Att Spec ] *)
11564 (* *)
11565 (* and 3. states: *)
11566 (* *)
11567 (* Validity Constraint: Element Valid *)
11568 (* An element is valid if there is a declaration matching elementdecl *)
11569 (* where the Name matches the element type, and ... *)
11570 (* *)
11571 (* catch entity end exceptions in subfunctions by printing an error *)
11572 (* and re-raising the exception. *)
11573 (* *)
11574 (* print an error, recover and raise SyntaxState if no element name *)
11575 (* is found. *)
11576 (* print an error and recover if no ">" or "/>" is found. *)
11577 (* print an error and continue if no "=" is found after an att name. *)
11578 (* print an error and recover if no literal is found after the "=". *)
11579 (* print an error if white space is missing. *)
11580 (* print an error if the element is not declared. *)
11581 (* print an error and ignore the attribute if an attribute is *)
11582 (* specified twice. *)
11583 (* print an error if an attribute is not declared. *)
11584 (* *)
11585 (* return the index of the element, its ElemInfo, the list of *)
11586 (* AttSpecs (specified and omitted atts) and a boolean whether it was *)
11587 (* an empty-element-tag, together with the next char and state. *)
11588 (*--------------------------------------------------------------------*)
11589 (* might raise: SyntaxState *)
11590 (*--------------------------------------------------------------------*)
11591 fun parseSTag dtd startPos (caq as (_,_,q)) =
11592 let
11593 val (elem,(c1,a1,q1)) = parseName caq
11594 handle NotFound (c,a,q) => let val err = expectedOrEnded (expAName,LOC_STAG) c
11595 val a1 = hookError(a,(getPos q,err))
11596 val (_,caq1) = recoverSTag (c,a1,q)
11597 in raise SyntaxError (c,a1,q)
11598 end
11599 val eidx = Element2Index dtd elem
11600 val elemInfo as {atts,decl,...} = getElement dtd eidx
11601 val defs = case atts
11602 of NONE => nil
11603 | SOME (defs,_) => defs
11604 val (a1',elemInfo) =
11605 if isSome decl then (a1,elemInfo)
11606 else
11607 let val (a2,newInfo) =
11608 if not (!O_VALIDATE andalso hasDtd dtd) then (a1,elemInfo)
11609 else let val err = ERR_UNDECLARED(IT_ELEM,elem,LOC_STAG)
11610 val a1' = hookError(a1,(getPos q,err))
11611 val newInfo = if not (!O_ERROR_MINIMIZE) then elemInfo
11612 else handleUndeclElement dtd eidx
11613 in (a1',newInfo)
11614 end
11615 in (checkElemName (a2,q) elem,newInfo)
11616 end
11617
11618 val hscaq2 = parseSmay nil (c1,a1',q1)
11619
11620 (*--------------------------------------------------------------*)
11621 (* yet are the indices of attributes encountered yet, old are *)
11622 (* the valid attributes specified yet, and todo are the defs of *)
11623 (* attributes yet to be specified. hadS indicates whether white *)
11624 (* space preceded. *)
11625 (*--------------------------------------------------------------*)
11626 fun doit (yet,old,todo) (hadS,(sp,(c,a,q))) =
11627 case c
11628 of 0wx3E (* #">" *) => (old,todo,sp,false,q,getChar(a,q))
11629 | 0wx2F (* #"/" *) =>
11630 let val (c1,a1,q1) = getChar(a,q)
11631 in if c1=0wx3E (* #">" *) then (old,todo,sp,true,q1,getChar(a1,q1))
11632 else let val err = expectedOrEnded (expGt,LOC_STAG) c1
11633 val a2 = hookError(a1,(getPos q1,err))
11634 val (mt,caq2) = recoverSTag (c1,a2,q1)
11635 in (old,todo,sp,mt,q,caq2)
11636 end
11637 end
11638 | _ =>
11639 if not (isNms c)
11640 then let val err = expectedOrEnded (expAttSTagEnd,LOC_STAG) c
11641 val a1 = hookError(a,(getPos q,err))
11642 val (mt,caq1) = recoverSTag (c,a1,q)
11643 in (old,todo,sp,mt,q,caq1)
11644 end
11645 else
11646 let(* first parse the name of the attribute *)
11647 val (att,(c1,a1,q1)) = parseName (c,a,q)
11648 val a2 = if hadS then a1
11649 else hookError(a1,(getPos q,ERR_MISSING_WHITE))
11650
11651 (* now get its index, check whether it already *)
11652 (* occurred and get its definition. *)
11653 val aidx = AttNot2Index dtd att
11654 val (hadIt,a3) =
11655 if member aidx yet
11656 then (true,hookError(a2,(getPos q,ERR_MULT_ATT_SPEC att)))
11657 else (false,a2)
11658
11659 val (def,rest) = findAndDelete (fn (i,_,_,_) => i=aidx) todo
11660 val a4 = if isSome def orelse hadIt then a3
11661 else handleUndeclAtt dtd (a3,q) (aidx,att,eidx,elem)
11662
11663 (* consume the " = ", ignore errors *)
11664 val (eq,caq5 as (_,_,q5)) = parseEq (c1,a4,q1)
11665 handle SyntaxError caq => ([0wx3D],caq)
11666
11667 (* now parse the attribute value *)
11668 val (literal,value,(c6,a6,q6)) = parseAttValue dtd caq5
11669
11670 (* possibly make a new AttSpec *)
11671 val space = rev sp
11672 val (new,a7) =
11673 if hadIt then (old,a6)
11674 else case def
11675 of NONE =>
11676 if !O_VALIDATE andalso hasDtd dtd then (old,a6)
11677 else (let val (attVal,a7) = checkAttValue dtd (a6,q5)
11678 (defaultAttDef aidx,literal,value)
11679 in ((aidx,attVal,SOME(space,eq))::old,a7)
11680 end
11681 handle AttValue a => (old,a))
11682 | SOME ad =>
11683 let val (attVal,a7) = checkAttValue dtd (a6,q5)
11684 (ad,literal,value)
11685 in ((aidx,attVal,SOME(space,eq))::old,a7)
11686 end
11687 handle AttValue a => (old,a)
11688 val hscaq8 = parseSmay nil (c6,a7,q6)
11689 in
11690 doit (aidx::yet,new,rest) hscaq8
11691 end
11692 handle NotFound (c,a,q) (* raised by parseAttValue above *)
11693 => let val err = expectedOrEnded (expLitQuote,LOC_STAG) c
11694 val a1 = hookError(a,(getPos q,err))
11695 val (mt,caq1) = recoverSTag (c,a1,q)
11696 in (old,todo,sp,mt,q,caq1)
11697 end
11698
11699 val (specd,todo,sp,empty,qe,(c3,a3,q3)) = doit (nil,nil,defs) hscaq2
11700 val space = rev sp
11701
11702 (* generate the defaults for unspecified attributes *)
11703 val (all,a4) = genMissingAtts dtd (a3,qe) (todo,rev specd)
11704 in
11705 ((((startPos,getPos q3),eidx,all,space,empty),elemInfo),(c3,a4,q3))
11706 end
11707
11708 (*--------------------------------------------------------------------*)
11709 (* skip a tag, the initial "<" or "</" already read, the first arg *)
11710 (* being a string describing the tag. *)
11711 (* don't care about whether it is a start- or end-tag. Ignore ">" and *)
11712 (* "/>" if within a literal. *)
11713 (* *)
11714 (* print an error and finish if an entity end is found. *)
11715 (* *)
11716 (* return the remaining char and state. *)
11717 (*--------------------------------------------------------------------*)
11718 (* might raise: none *)
11719 (*--------------------------------------------------------------------*)
11720 fun skipTag loc aq =
11721 let
11722 fun do_lit ch (c,a,q) =
11723 if c=0wx00 then let val a1 = hookError(a,(getPos q,ERR_ENDED_BY_EE loc))
11724 in (c,a1,q)
11725 end
11726 else if c=ch then doit (getChar(a,q))
11727 else do_lit ch (getChar(a,q))
11728
11729 and doit (c,a,q) =
11730 case c
11731 of 0wx00 => let val a1 = hookError(a,(getPos q,ERR_ENDED_BY_EE loc))
11732 in (c,a1,q)
11733 end
11734 | 0wx22 (* #"\""*) => do_lit c (getChar(a,q))
11735 | 0wx27 (* #"'" *) => do_lit c (getChar(a,q))
11736 | 0wx2F (* #"/" *) => (case getChar(a,q)
11737 of (0wx3E,a1,q1) (* #">" *) => getChar(a1,q1)
11738 | caq1 => doit caq1)
11739 | 0wx3E (* #">" *) => getChar(a,q)
11740 | _ => doit(getChar(a,q))
11741 in doit (getChar aq)
11742 end
11743 end
11744
11745 (* stop of ../../Parser/Parse/parseTags.sml *)
11746 (* start of ../../Parser/Parse/parseDecl.sml *)
11747 signature ParseDecl =
11748 sig
11749 (*----------------------------------------------------------------------
11750 include ParseBase
11751
11752 val parseName : UniChar.Char * AppData * State
11753 -> UniChar.Data * (UniChar.Char * AppData * State)
11754
11755 val parseComment : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
11756 val parseProcInstr : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
11757 val skipS : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
11758 val skipSopt : UniChar.Char * AppData * State -> UniChar.Char * AppData * State
11759 val skipSmay : UniChar.Char * AppData * State -> bool * (UniChar.Char * AppData * State)
11760
11761 val openExtern : int * Uri.Uri -> AppData * State
11762 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
11763 val openDocument : Uri.Uri option -> AppData
11764 -> Encoding.Encoding * HookData.XmlDecl option * (UniChar.Char * AppData * State)
11765 val openSubset : Uri.Uri -> AppData
11766 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
11767
11768 val skipCharRef : AppData * State -> (UniChar.Char * AppData * State)
11769 val skipReference : UniChar.Char * AppData * State -> (UniChar.Char * AppData * State)
11770 val parseGenRef : Dtd -> UniChar.Char * AppData * State
11771 -> (int * Base.GenEntity) * (AppData * State)
11772 val parseParRef : Dtd -> UniChar.Char * AppData * State
11773 -> (int * Base.ParEntity) * (AppData * State)
11774 val parseCharRefLit : UniChar.Data -> AppData * State
11775 -> UniChar.Data * (UniChar.Char * AppData * State)
11776 val skipPSopt : Dtd -> UniChar.Char * AppData * State
11777 -> UniChar.Char * AppData * State
11778
11779 val skipTag : Errors.Location -> AppData * State -> (UniChar.Char * AppData * State)
11780 val parseETag : Dtd -> AppData * State
11781 -> int * UniChar.Data * Errors.Position * (UniChar.Char * AppData * State)
11782 val parseSTag : Dtd -> Errors.Position -> UniChar.Char * AppData * State
11783 -> (HookData.StartTagInfo * Base.ElemInfo) * (UniChar.Char * AppData * State)
11784 ----------------------------------------------------------------------*)
11785 include ParseTags
11786
11787 val skipDecl : bool -> UniChar.Char * AppData * State -> UniChar.Char * AppData * State
11788
11789 val parseExtIdSub : Dtd -> UniChar.Char * AppData * State
11790 -> Base.ExternalId * bool * (UniChar.Char * AppData * State)
11791
11792 val parseEntityDecl : Dtd -> EntId * Errors.Position * bool
11793 -> UniChar.Char * AppData * State -> UniChar.Char * AppData * State
11794 val parseElementDecl : Dtd -> EntId * Errors.Position * bool
11795 -> UniChar.Char * AppData * State -> UniChar.Char * AppData * State
11796 val parseNotationDecl : Dtd -> EntId * Errors.Position * bool
11797 -> UniChar.Char * AppData * State -> UniChar.Char * AppData * State
11798 val parseAttListDecl : Dtd -> EntId * Errors.Position * bool
11799 -> UniChar.Char * AppData * State -> UniChar.Char * AppData * State
11800 end
11801
11802 (*--------------------------------------------------------------------------*)
11803 (* Structure: ParseDecl *)
11804 (* *)
11805 (* Exceptions raised by functions in this structure: *)
11806 (* skipDecl : none *)
11807 (* parseExtIdSub : NotFound SyntaxError *)
11808 (* parseEntityDecl : none *)
11809 (* parseElementDecl : none *)
11810 (* parseNotationDecl : none *)
11811 (* parseAttListDecl : none *)
11812 (*--------------------------------------------------------------------------*)
11813 functor ParseDecl (structure ParseBase : ParseBase)
11814 : ParseDecl =
11815 struct
11816 structure ParseTags = ParseTags (structure ParseBase = ParseBase)
11817
11818 open
11819 UtilInt UtilList
11820 Base Errors HookData
11821 ParseTags
11822
11823 (*--------------------------------------------------------------------*)
11824 (* skip a markup declaration, the initial "<!" and name already read. *)
11825 (* ignore ">" if within a literal. yake care of internal subset if *)
11826 (* the first arg is true. *)
11827 (* *)
11828 (* print an error and finish if an entity end is found. *)
11829 (* *)
11830 (* return the remaining char and state. *)
11831 (*--------------------------------------------------------------------*)
11832 (* might raise: none *)
11833 (*--------------------------------------------------------------------*)
11834 fun skipDecl hasSubset caq =
11835 let
11836 fun do_lit ch (c,a,q) =
11837 if c=0wx00 then (c,a,q)
11838 else if c=ch then getChar (a,q)
11839 else do_lit ch (getChar(a,q))
11840 fun do_decl (c,a,q) =
11841 case c
11842 of 0wx00 => (c,a,q)
11843 | 0wx22 (* #"\""" *) => do_decl (do_lit c (getChar(a,q)))
11844 | 0wx27 (* #"'" *) => do_decl (do_lit c (getChar(a,q)))
11845 | 0wx3E (* #">" *) => getChar(a,q)
11846 | _ => do_decl (getChar(a,q))
11847 fun do_subset (c,a,q) =
11848 case c
11849 of 0wx00 => (c,a,q)
11850 | 0wx3C (* #"<" *) => do_subset (do_decl (getChar(a,q)))
11851 | 0wx5D (* #"]" *) => getChar(a,q)
11852 | _ => do_subset (getChar(a,q))
11853 fun doit (c,a,q) =
11854 case c
11855 of 0wx00 => (c,hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_DECL)),q)
11856 | 0wx22 (* #"\"""*) => doit (do_lit c (getChar(a,q)))
11857 | 0wx27 (* #"'" *) => doit (do_lit c (getChar(a,q)))
11858 | 0wx3E (* #">" *) => getChar(a,q)
11859 | 0wx5B (* #"[" *) => if hasSubset then doit (do_subset (getChar(a,q)))
11860 else doit (getChar(a,q))
11861 | _ => doit (getChar(a,q))
11862 in doit caq
11863 end
11864
11865 (*--------------------------------------------------------------------*)
11866 (* parse an external id, or a public id if the first arg is true. *)
11867 (* Cf. 4.2.2 and 4.7: *)
11868 (* *)
11869 (* [75] ExternalID ::= 'SYSTEM' S SystemLiteral *)
11870 (* | 'PUBLIC' S PubidLiteral S SystemLiteral *)
11871 (* *)
11872 (* [83] PublicID ::= 'PUBLIC' S PubidLiteral *)
11873 (* *)
11874 (* raise NotFound if no name is found first. *)
11875 (* print an error if white space is missing. *)
11876 (* print an error and raise SyntaxState if a wrong name is found. *)
11877 (* print an Error and raise SyntaxState if a required literal is not *)
11878 (* found (depends on optSys). *)
11879 (* *)
11880 (* return the public and system identifiers as string options, *)
11881 (* a boolean, whether whit space followed the external id, *)
11882 (* and the next character and the remaining state. *)
11883 (*--------------------------------------------------------------------*)
11884 (* might raise: NotFound SyntaxState *)
11885 (*--------------------------------------------------------------------*)
11886 fun parseExternalId dtd optSys (caq as (_,_,q))=
11887 let
11888 (* do not handle NotFound: in this case no extId was found *)
11889 val (name,caq1) = parseName caq
11890 val caq2 as (_,_,q2)= skipPS dtd caq1
11891 in
11892 case name
11893 of [0wx50,0wx55,0wx42,0wx4c,0wx49,0wx43] => (* "PUBLIC" *)
11894 let
11895 val (pub,pquote,caq3) = parsePubidLiteral caq2
11896 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expLitQuote,[c])
11897 val a1 = hookError(a,(getPos q,err))
11898 in raise SyntaxError (c,a1,q)
11899 end
11900 val (hadS,caq4 as (_,_,q4)) = skipPSmay dtd caq3
11901 in let
11902 val (sys,squote,(c5,a5,q5)) = parseSystemLiteral caq4
11903 val base = getUri q4
11904 val a6 = if hadS then a5 else hookError(a5,(getPos q4,ERR_MISSING_WHITE))
11905 val (hadS6,caq6) = skipPSmay dtd (c5,a6,q5)
11906 in
11907 (EXTID(SOME(pub,pquote),SOME(base,sys,squote)),hadS6,caq6)
11908 end
11909 handle NotFound (c,a,q) => (* no system id *)
11910 if optSys then (EXTID(SOME(pub,pquote),NONE),hadS,(c,a,q))
11911 else let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expLitQuote,[c])))
11912 in raise SyntaxError (c,a1,q)
11913 end
11914 end
11915
11916 | [0wx53,0wx59,0wx53,0wx54,0wx45,0wx4d] => (* "SYSTEM" *)
11917 let
11918 val (sys,squote,caq3) = parseSystemLiteral caq2
11919 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expLitQuote,[c])
11920 val a1 = hookError(a,(getPos q,err))
11921 in raise SyntaxError (c,a1,q)
11922 end
11923 val base = getUri q2
11924 val (hadS,caq4) = skipPSmay dtd caq3
11925 in
11926 (EXTID(NONE,SOME(base,sys,squote)),hadS,caq4)
11927 end
11928
11929 | _ => let val (c2,a2,q2) = caq2
11930 val a3 = hookError(a2,(getPos q,ERR_EXPECTED(expExtId,name)))
11931 in raise SyntaxError (c2,a3,q2)
11932 end
11933 end
11934 (*--------------------------------------------------------------------*)
11935 (* parse an external id in an entity definition. Cf. 4.2.2: *)
11936 (* *)
11937 (* print an Error and raise SyntaxState if no external id is found. *)
11938 (*--------------------------------------------------------------------*)
11939 (* might raise: SyntaxState *)
11940 (*--------------------------------------------------------------------*)
11941 fun parseExtIdEnt dtd caq = parseExternalId dtd false caq
11942 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expLitQuotExt,[c])
11943 in raise SyntaxError (c,hookError(a,(getPos q,err)),q)
11944 end
11945 (*--------------------------------------------------------------------*)
11946 (* parse an external or public id in a notation declaration. *)
11947 (* *)
11948 (* print an Error and raise SyntaxState if neither external nor *)
11949 (* public id is found. *)
11950 (*--------------------------------------------------------------------*)
11951 (* might raise: SyntaxState *)
11952 (*--------------------------------------------------------------------*)
11953 fun parseExtIdNot dtd caq = parseExternalId dtd true caq
11954 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expExtId,[c])
11955 in raise SyntaxError (c,hookError(a,(getPos q,err)),q)
11956 end
11957 (*--------------------------------------------------------------------*)
11958 (* parse an external id for the external subset. *)
11959 (* *)
11960 (* raise NotFound if no external id is found. *)
11961 (*--------------------------------------------------------------------*)
11962 (* might raise: NotFound SyntaxState *)
11963 (*--------------------------------------------------------------------*)
11964 fun parseExtIdSub dtd caq = parseExternalId dtd false caq
11965
11966 (*--------------------------------------------------------------------*)
11967 (* parse a parameter entity declaration, starting after the '%'. The *)
11968 (* unique entity id of the initial '<' is given as first arg. 4.2: *)
11969 (* *)
11970 (* [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' *)
11971 (* [74] PEDef ::= EntityValue | ExternalID *)
11972 (* *)
11973 (* (see also the comments for ParseDtd.parseMarkupDecl). *)
11974 (* *)
11975 (* print an error if white space is missing. *)
11976 (* print an error and raise SyntaxState if neither entity value nor *)
11977 (* external identifier is found. *)
11978 (* print an error and raise SyntaxState if the closing '>' is missing.*)
11979 (* print an error if the '>' is not in the same entity as the '<!'. *)
11980 (* *)
11981 (* enter the declared entity into the entity table. *)
11982 (* return the remaining char and state. *)
11983 (*--------------------------------------------------------------------*)
11984 (* might raise: SyntaxState *)
11985 (*--------------------------------------------------------------------*)
11986 fun parseParEntDecl dtd (startEnt,startPos,ext) caq =
11987 let
11988 val caq1 as (_,_,q1) = skipPS dtd caq
11989
11990 val (name,caq2) = parseName caq1
11991 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expAnEntName,[c])
11992 in raise SyntaxError (c,hookError(a,(getPos q,err)),q)
11993 end
11994 val idx = ParEnt2Index dtd name
11995 val caq3 = skipPS dtd caq2
11996
11997 val (ent,(c4,a4,q4)) =
11998 let val (ent,caq4) = parseEntityValue dtd PE_INTERN caq3
11999 val caq5 = skipPSopt dtd caq4
12000 in (ent,caq5)
12001 end
12002 handle NotFound caq =>
12003 let val (extId,_,caq1) = parseExtIdEnt dtd caq
12004 in (PE_EXTERN extId,caq1)
12005 end
12006
12007 val a5 = if useParamEnts() orelse not ext then addParEnt dtd (a4,q1) (idx,ent,ext) else a4
12008 val a6 = hookDecl(a5,((startPos,getPos q4),DEC_PAR_ENT(idx,ent,ext)))
12009 in
12010 if c4<>0wx3E (* #">" *)
12011 then let val a7 = hookError(a6,(getPos q4,ERR_EXPECTED(expGt,[c4])))
12012 in raise SyntaxError(c4,a7,q4)
12013 end
12014 else let val a7 = if not (!O_VALIDATE) orelse getEntId q4=startEnt then a6
12015 else hookError(a6,(getPos q4,ERR_DECL_ENT_NESTING LOC_ENT_DECL))
12016 in getChar(a7,q4)
12017 end
12018 end
12019
12020 (*--------------------------------------------------------------------*)
12021 (* parse a general entity declaration, starting with the name. The *)
12022 (* unique entity id of the initial '<' is given as first arg. 4.2: *)
12023 (* *)
12024 (* [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' *)
12025 (* [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) *)
12026 (* *)
12027 (* [76] NDataDecl ::= S 'NDATA' S Name [ VC: Notation *)
12028 (* Declared ] *)
12029 (* *)
12030 (* If the NDataDecl is present, this is a general unparsed entity; *)
12031 (* otherwise it is a parsed entity. *)
12032 (* *)
12033 (* Validity Constraint: Notation Declared *)
12034 (* The Name must match the declared name of a notation. *)
12035 (* *)
12036 (* (see also the comments for ParseDtd.parseMarkupDecl). *)
12037 (* *)
12038 (* print an error if white space is missing. *)
12039 (* print an error and raise SyntaxState if neither entity value nor *)
12040 (* external identifier is found. *)
12041 (* print an error if name other then 'NDATA' is found after ext. id. *)
12042 (* print an error and raise SyntaxState if no name is found after the *)
12043 (* 'NDATA'. *)
12044 (* print an error if the notation is not declared. *)
12045 (* print an error and raise SyntaxState if the closing '>' is missing.*)
12046 (* print an error if the '>' is not in the same entity as the '<!'. *)
12047 (* *)
12048 (* enter the declared entity into the entity table. *)
12049 (* return the remaining char and state. *)
12050 (*--------------------------------------------------------------------*)
12051 (* might raise: SyntaxState *)
12052 (*--------------------------------------------------------------------*)
12053 fun parseGenEntDecl dtd (startEnt,startPos,ext) (caq as (_,_,q)) =
12054 let
12055 val (name,caq1) = parseName caq
12056 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expEntNamePero,[c])
12057 in raise SyntaxError (c,hookError(a,(getPos q,err)),q)
12058 end
12059 val idx = GenEnt2Index dtd name
12060 val caq2 = skipPS dtd caq1
12061
12062 val (ent,expEnd,(c3,a3,q3)) =
12063 (*-----------------------------------------------------------*)
12064 (* Try for an internal entity. Then '>' must follow. *)
12065 (*-----------------------------------------------------------*)
12066 let
12067 val (ent,caq3) = parseEntityValue dtd GE_INTERN caq2
12068 val caq4 = skipPSopt dtd caq3
12069 in
12070 (ent,expGt,caq4)
12071 end
12072 handle NotFound cq => (* raised by parseEntityValue *)
12073 (*-----------------------------------------------------------*)
12074 (* Must be external. First parse the external identifier. *)
12075 (*-----------------------------------------------------------*)
12076 let
12077 val (extId,hadS,caq1 as (_,_,q1)) = parseExtIdEnt dtd caq2
12078 in let
12079 (*-----------------------------------------------------*)
12080 (* Does a name follow? Then is must be 'NDATA' and the *)
12081 (* notation name follows. Thus the entity is unparsed. *)
12082 (* Also, only '>' may come next. *)
12083 (* NotFound is handled at the end of the let. *)
12084 (*-----------------------------------------------------*)
12085 val (key,(c2,a2,q2)) = parseName caq1
12086 val a3 = if hadS then a2 else hookError(a2,(getPos q1,ERR_MISSING_WHITE))
12087 val a4 = if key = [0wx4e,0wx44,0wx41,0wx54,0wx41] (* "NDATA" *) then a3
12088 else hookError(a3,(getPos q1,ERR_EXPECTED(expGtNdata,key)))
12089
12090 val caq5 as (_,_,q5) = skipPS dtd (c2,a4,q2)
12091
12092 val (not,caq6) = parseName caq5
12093 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expANotName,[c])
12094 val a1 = hookError(a,(getPos q,err))
12095 in raise SyntaxError (c,a1,q)
12096 end
12097 val notIdx = AttNot2Index dtd not
12098 val caq7 = skipPSopt dtd caq6
12099 in
12100 (GE_UNPARSED(extId,notIdx,getPos q5),expGt,caq7)
12101 end
12102 handle NotFound caq =>
12103 (*--------------------------------------------------------*)
12104 (* No 'NDATA' present, so it's parsed external entity. *)
12105 (* A 'NDATA' might have followed. *)
12106 (*--------------------------------------------------------*)
12107 (GE_EXTERN extId,expGtNdata,caq)
12108 end
12109
12110 val a4 = if useParamEnts() orelse not ext then addGenEnt dtd (a3,q) (idx,ent,ext) else a3
12111 val a5 = hookDecl(a4,((startPos,getPos q3),DEC_GEN_ENT(idx,ent,ext)))
12112 in
12113 if c3<>0wx3E (* #">" *)
12114 then let val a6 = hookError(a5,(getPos q3,ERR_EXPECTED(expGt,[c3])))
12115 in raise SyntaxError(c3,a6,q3)
12116 end
12117 else let val a6 = if not (!O_VALIDATE) orelse getEntId q3=startEnt then a5
12118 else hookError(a5,(getPos q3,ERR_DECL_ENT_NESTING LOC_ENT_DECL))
12119 in getChar(a6,q3)
12120 end
12121 end
12122
12123 (*--------------------------------------------------------------------*)
12124 (* parse an entity declaration, the initial '<!ENTITY' already read. *)
12125 (* The unique entity id of the initial '<' is given as 1st arg. 4.2: *)
12126 (* *)
12127 (* [70] EntityDecl ::= GEDecl | PEDecl *)
12128 (* [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' *)
12129 (* [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' *)
12130 (* *)
12131 (* (see also the comments for ParseDtd.parseMarkupDecl). *)
12132 (* *)
12133 (* raise SyntaxState in case of a syntax error. *)
12134 (* print an error if white space is missing. *)
12135 (* *)
12136 (* print an error for entity end exceptions in subfunctions. *)
12137 (* catch syntax errors by recovering to the next possible state. *)
12138 (* *)
12139 (* pass control to parseParEntDecl or parseGenEntDecl, depending on *)
12140 (* whether the S is followed by a '%'. *)
12141 (* return the remaining char and state. *)
12142 (*--------------------------------------------------------------------*)
12143 (* might raise: none *)
12144 (*--------------------------------------------------------------------*)
12145 fun parseEntityDecl dtd pars caq =
12146 let
12147 val (hadPero,caq1) = skipPSdec dtd caq
12148 in
12149 if hadPero then parseParEntDecl dtd pars caq1
12150 else parseGenEntDecl dtd pars caq1
12151 end
12152 handle exn as SyntaxError (c,a,q) =>
12153 let val a1 = if c=0wx00 then hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_ENT_DECL))
12154 else a
12155 in recoverDecl false (c,a1,q)
12156 end
12157
12158 (*--------------------------------------------------------------------*)
12159 (* parse a notation declaration, the initial '<!NOTATION' already *)
12160 (* read. The unique entity id of the '<!' is given as first arg. 4.7: *)
12161 (* *)
12162 (* [82] NotationDecl ::= '<!NOTATION' S Name S *)
12163 (* (ExternalID | PublicID) S? '>' *)
12164 (* *)
12165 (* (see also the comments for ParseDtd.parseMarkupDecl). *)
12166 (* *)
12167 (* print an error and raise SyntaxState if no notation name, no *)
12168 (* external/public identifier or no final '>' is found. *)
12169 (* print an error if the '>' is not in the same entity as the '<!'. *)
12170 (* print an error if white space is missing. *)
12171 (* *)
12172 (* print an error for entity end exceptions in subfunctions. *)
12173 (* catch syntax errors by recovering to the next possible state. *)
12174 (* *)
12175 (* enter the declared notation into the notation table. *)
12176 (* return the remaining char and state. *)
12177 (*--------------------------------------------------------------------*)
12178 (* might raise: none *)
12179 (*--------------------------------------------------------------------*)
12180 fun parseNotationDecl dtd (startEnt,startPos,ext) caq =
12181 let
12182 val caq1 as (_,_,q1) = skipPS dtd caq
12183 val (name,caq2) = parseName caq1
12184 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expANotName,[c])
12185 in raise SyntaxError (c,hookError(a,(getPos q,err)),q)
12186 end
12187 val idx = AttNot2Index dtd name
12188 val caq3 = skipPS dtd caq2
12189
12190 val (extId,_,(c4,a4,q4)) = parseExtIdNot dtd caq3
12191
12192 val a5 = if useParamEnts() orelse not ext then addNotation dtd (a4,q1) (idx,extId) else a4
12193 val a6 = hookDecl(a5,((startPos,getPos q4),DEC_NOTATION(idx,extId,ext)))
12194 in
12195 if c4<>0wx3E (* #">" *)
12196 then let val a7 = hookError(a6,(getPos q4,ERR_EXPECTED(expGt,[c4])))
12197 in raise SyntaxError (c4,a7,q4)
12198 end
12199 else let val a7 = if not (!O_VALIDATE) orelse getEntId q4=startEnt then a6
12200 else hookError(a6,(getPos q4,ERR_DECL_ENT_NESTING LOC_NOT_DECL))
12201 in getChar(a7,q4)
12202 end
12203 end
12204 handle exn as SyntaxError(c,a,q) =>
12205 let val a1 = if c=0wx00 then hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_NOT_DECL))
12206 else a
12207 in recoverDecl false (c,a1,q)
12208 end
12209
12210 (*--------------------------------------------------------------------*)
12211 (* parse a mixed-content specification, the initial '(', S? and '#' *)
12212 (* already read. The unique id of the openening paren's entity is *)
12213 (* given as first arg. Cf. 3.2.1/2: *)
12214 (* *)
12215 (* Validity Constraint: Proper Group/PE Nesting *)
12216 (* Parameter-entity replacement text must be properly nested with *)
12217 (* parenthetized groups. That is to say, if either of the opening *)
12218 (* or closing parentheses in a choice, seq, or Mixed construct is *)
12219 (* contained in the replacement text for a parameter entity, both *)
12220 (* must be contained in the same replacement text. *)
12221 (* ... *)
12222 (* [51] Mixed ::= '(' S? '#PCDATA' [ VC: Proper Group/PE *)
12223 (* (S? '|' S? Name)* S? ')*' Nesting ] *)
12224 (* | '(' S? '#PCDATA' S? ')' [ VC: No Duplicate *)
12225 (* Types ] *)
12226 (* *)
12227 (* print an error and raise SyntaxState if no name is found first. *)
12228 (* print an error if a name other than 'PCDATA' is found. *)
12229 (* is found in the first place. *)
12230 (* print an error if element names are specified but no '*' follows. *)
12231 (* print an error if an element name is specified more than once. *)
12232 (* print an error and raise SyntaxState if neither '|' nor ')' is *)
12233 (* found after the 'PCDATA' or after an element name. *)
12234 (* print an error if the closing parenthesis is not in the same *)
12235 (* as the opening one. *)
12236 (* *)
12237 (* return the mixed-content specification, togther with the next *)
12238 (* character and state. *)
12239 (*--------------------------------------------------------------------*)
12240 (* might raise: SyntaxState *)
12241 (*--------------------------------------------------------------------*)
12242 fun parseMixed dtd lparEnt (caq as (_,_,q)) =
12243 let
12244 fun doit is (c,a,q) =
12245 case c
12246 of 0wx29 (* #")" *) =>
12247 let val a1 = if not (!O_VALIDATE) orelse getEntId q=lparEnt then a
12248 else hookError(a,(getPos q,ERR_GROUP_ENT_NESTING LOC_MIXED))
12249 in (rev is,getChar(a1,q))
12250 end
12251 | 0wx7C (* #"|" *) =>
12252 let
12253 val caq1 as (_,_,q1) = skipPSopt dtd (getChar(a,q))
12254
12255 val (name,(c2,a2,q2)) = parseName caq1
12256 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expAName,[c])
12257 val a1 = hookError(a,(getPos q,err))
12258 in raise SyntaxError (c,a1,q)
12259 end
12260 val i = Element2Index dtd name
12261 val (newis,a3) =
12262 if not (member i is) then (i::is,a2)
12263 else let val a3 = if !O_VALIDATE
12264 then hookError(a2,(getPos q1,ERR_MULT_MIXED name))
12265 else a2
12266 in (is,a3)
12267 end
12268 val caq3 = skipPSopt dtd (c2,a3,q2)
12269 in doit newis caq3
12270 end
12271 | _ => let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expBarRpar,[c])))
12272 in raise SyntaxError (c,a1,q)
12273 end
12274
12275 val (name,(c1,a1,q1)) = parseName caq
12276 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expPcdata,[c])
12277 in raise SyntaxError (c,hookError(a,(getPos q,err)),q)
12278 end
12279 val a2 = case name
12280 of [0wx50,0wx43,0wx44,0wx41,0wx54,0wx41] (* "PCDATA" *) => a1
12281 | _ => hookError(a1,(getPos q,ERR_EXPECTED(expPcdata,name)))
12282
12283 val caq2 = skipPSopt dtd (c1,a2,q1)
12284 val (is,(c3,a3,q3)) = doit nil caq2
12285
12286 val caq4 = if c3=0wx2A (* #"*" *) then getChar(a3,q3)
12287 else let val a4 = if null is then a3
12288 else hookError(a3,(getPos q3,ERR_EXPECTED(expRep,[c3])))
12289 in (c3,a4,q3)
12290 end
12291 in
12292 (CT_MIXED is,caq4)
12293 end
12294
12295 (*--------------------------------------------------------------------*)
12296 (* parse an optional occurrence indicator afer a content particle or *)
12297 (* a content model, given as first argument. Cf. 3.2.1: *)
12298 (* *)
12299 (* [47] children ::= (choice | seq) ('?' | '*' | '+')? *)
12300 (* [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? *)
12301 (* *)
12302 (* return the (possibly modified) content particle, together with the *)
12303 (* next char and state. *)
12304 (*--------------------------------------------------------------------*)
12305 (* might raise: none *)
12306 (*--------------------------------------------------------------------*)
12307 fun parseOcc cm (c,a,q) =
12308 case c
12309 of 0wx3F (* #"?" *) => (CM_OPT cm,getChar(a,q))
12310 | 0wx2A (* #"*" *) => (CM_REP cm,getChar(a,q))
12311 | 0wx2B (* #"+" *) => (CM_PLUS cm,getChar(a,q))
12312 | _ => (cm,(c,a,q))
12313
12314 (*--------------------------------------------------------------------*)
12315 (* parse a content particle. Cf. 3.2.1: *)
12316 (* *)
12317 (* Validity Constraint: Proper Group/PE Nesting *)
12318 (* Parameter-entity replacement text must be properly nested with *)
12319 (* parenthetized groups. ... *)
12320 (* *)
12321 (* (see also parseMixed) *)
12322 (* *)
12323 (* [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? *)
12324 (* [49] choice ::= '(' S? cp [ VC: Proper Group/ *)
12325 (* ( S? '|' S? cp )* S? ')' PE Nesting ] *)
12326 (* [50] seq ::= '(' S? cp [ VC: Proper Group/ *)
12327 (* ( S? ',' S? cp )* S? ')' PE Nesting ] *)
12328 (* *)
12329 (* print an error and raise SyntaxState if no element name or "(" is *)
12330 (* found in the first place. *)
12331 (* *)
12332 (* return the content particle together with the next char and state. *)
12333 (*--------------------------------------------------------------------*)
12334 (* might raise: SyntaxState *)
12335 (*--------------------------------------------------------------------*)
12336 fun parseCP dtd (c,a,q) =
12337 case c
12338 of 0wx28 (* #"(" *) =>
12339 let
12340 val lparEnt = getEntId q
12341 val caq1 = skipPSopt dtd (getChar (a,q))
12342 in parseGroup dtd lparEnt caq1
12343 end
12344 | _ => (* must be an element name *)
12345 let
12346 val (name,caq1) = parseName (c,a,q)
12347 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expElemLpar,[c])
12348 val a1 = hookError(a,(getPos q,err))
12349 in raise SyntaxError (c,a1,q)
12350 end
12351 val idx = Element2Index dtd name
12352 in
12353 parseOcc (CM_ELEM idx) caq1
12354 end
12355
12356 (*--------------------------------------------------------------------*)
12357 (* parse a seq/choice, the first content particle and the connector *)
12358 (* already parsed; the connector, the type of group and the entity id *)
12359 (* of the opening parenthesis are given in first arg. Cf. 3.2.1: *)
12360 (* *)
12361 (* Validity Constraint: Proper Group/PE Nesting *)
12362 (* Parameter-entity replacement text must be properly nested with *)
12363 (* parenthetized groups. ... *)
12364 (* *)
12365 (* (see also parseMixed) *)
12366 (* *)
12367 (* [49] choice ::= '(' S? cp [ VC: Proper Group/ *)
12368 (* ( S? '|' S? cp )* S? ')' PE Nesting ] *)
12369 (* [50] seq ::= '(' S? cp [ VC: Proper Group/ *)
12370 (* ( S? ',' S? cp )* S? ')' PE Nesting ] *)
12371 (* *)
12372 (* print an error and raise SyntaxState if something other than the *)
12373 (* connector or ')' is found after a content particle. *)
12374 (* print an error if the closing parenthesis of a group is not in the *)
12375 (* same entity as the opening one. *)
12376 (* *)
12377 (* return the list of content particles parsed, together with the *)
12378 (* remaining character and state. *)
12379 (*--------------------------------------------------------------------*)
12380 (* might raise: SyntaxState *)
12381 (*--------------------------------------------------------------------*)
12382 and parseGroup' dtd (con,loc,lparEnt) caq =
12383 let fun doit caq =
12384 let
12385 val caq1 = skipPSopt dtd caq
12386 val (cp,caq2) = parseCP dtd caq1
12387 val (c3,a3,q3) = skipPSopt dtd caq2
12388 in
12389 if c3=0wx29 (* #")" ( *)
12390 then let val a4 = if not (!O_VALIDATE) orelse getEntId q3=lparEnt then a3
12391 else hookError(a3,(getPos q3,ERR_GROUP_ENT_NESTING loc))
12392 in ([cp],getChar(a4,q3))
12393 end
12394 else (if c3=con then let val (cps,caq4) = doit (getChar(a3,q3))
12395 in (cp::cps,caq4)
12396 end
12397 else let val err = ERR_EXPECTED(expConCRpar con,[c3])
12398 in raise SyntaxError (c3,hookError(a3,(getPos q3,err)),q3)
12399 end)
12400 end
12401 in
12402 doit caq
12403 end
12404
12405 (*--------------------------------------------------------------------*)
12406 (* parse a seq/choice, the first content particle parsed; the entity *)
12407 (* id of the opening parenthesis are given in first arg. Cf. 3.2.1: *)
12408 (* *)
12409 (* (see also parseMixed) *)
12410 (* *)
12411 (* [49] choice ::= '(' S? cp [ VC: Proper Group/ *)
12412 (* ( S? '|' S? cp )* S? ')' PE Nesting ] *)
12413 (* [50] seq ::= '(' S? cp [ VC: Proper Group/ *)
12414 (* ( S? ',' S? cp )* S? ')' PE Nesting ] *)
12415 (* *)
12416 (* print an error and raise SyntaxState if neither '|' nor ',' nor *)
12417 (* ')' follows the first content particle in a seq/choice. *)
12418 (* *)
12419 (* return the list of as a ContentModel, together with the remaining *)
12420 (* character and state. *)
12421 (*--------------------------------------------------------------------*)
12422 (* might raise: SyntaxState *)
12423 (*--------------------------------------------------------------------*)
12424 and parseGroup dtd lparEnt caq =
12425 let
12426 val (cp,caq1) = parseCP dtd caq
12427 val (c2,a2,q2) = skipPSopt dtd caq1
12428 val (group,caq3) =
12429 case c2
12430 of 0wx29 (* #")" *) =>
12431 let val a3 = if not (!O_VALIDATE) orelse getEntId q2=lparEnt then a2
12432 else hookError(a2,(getPos q2,ERR_GROUP_ENT_NESTING LOC_SEQ))
12433 in (CM_SEQ[cp],getChar(a3,q2))
12434 end
12435 | 0wx2C (* #"," *) =>
12436 let val (cps,caq3) = parseGroup' dtd (c2,LOC_SEQ,lparEnt) (getChar(a2,q2))
12437 in (CM_SEQ(cp::cps),caq3)
12438 end
12439 | 0wx7C (* #"|" *) =>
12440 let val (cps,caq3) = parseGroup' dtd (c2,LOC_CHOICE,lparEnt) (getChar(a2,q2))
12441 in (CM_ALT(cp::cps),caq3)
12442 end
12443 | _ => let val a3 = hookError(a2,(getPos q2,ERR_EXPECTED(expConRpar,[c2])))
12444 in raise SyntaxError (c2,a3,q2)
12445 end
12446 in parseOcc group caq3
12447 end
12448
12449 (*--------------------------------------------------------------------*)
12450 (* parse a content specification. Cf. 3.2/3.2.1: *)
12451 (* *)
12452 (* Validity Constraint: Proper Group/PE Nesting *)
12453 (* Parameter-entity replacement text must be properly nested with *)
12454 (* parenthetized groups. That is to say, if either of the opening *)
12455 (* or closing parentheses in a choice, seq, or Mixed construct is *)
12456 (* contained in the replacement text for a parameter entity, both *)
12457 (* must be contained in the same replacement text. *)
12458 (* ... *)
12459 (* [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children *)
12460 (* *)
12461 (* [47] children ::= (choice | seq) ('?' | '*' | '+')? *)
12462 (* *)
12463 (* [49] choice ::= '(' S? cp ( S? '|' S? cp )* S? ')' [ VC:Proper *)
12464 (* [50] seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' Group/PE *)
12465 (* Nesting ]*)
12466 (* *)
12467 (* [51] Mixed ::= '(' S? '#PCDATA' [ VC: Proper Group/PE *)
12468 (* (S? '|' S? Name)* S? ')*' Nesting ] *)
12469 (* | '(' S? '#PCDATA' S? ')' [ VC: No Duplicate *)
12470 (* Types ] *)
12471 (* *)
12472 (* print an error and raise SyntaxState if no children, Mixed, or *)
12473 (* name is found. *)
12474 (* print an error and assume ANY if an ambiguous content model is *)
12475 (* specified. *)
12476 (* print an error and assume ANY if a name other than EMPTY or ANY *)
12477 (* is found. *)
12478 (* print an error if the closing parenthesis of a Mixed is not in the *)
12479 (* same entity as the opening one. *)
12480 (* *)
12481 (* return the parsed content specification, togther with the next *)
12482 (* character and state. *)
12483 (*--------------------------------------------------------------------*)
12484 (* might raise: SyntaxState *)
12485 (*--------------------------------------------------------------------*)
12486 fun parseContentSpec dtd curr (c,a,q) =
12487 case c
12488 of 0wx28 (* #"(" *) =>
12489 let
12490 val (c1,a1,q1) = skipPSopt dtd (getChar(a,q))
12491 val lparEnt = getEntId q
12492 in
12493 if c1=0wx23 (* #"#" *)
12494 then parseMixed dtd lparEnt (getChar(a1,q1))
12495 else let val (cm,(c2,a2,q2)) = parseGroup dtd lparEnt (c1,a1,q1)
12496 val (dfa,a3) = (makeDfa cm,a2) handle Ambiguous(a,n1,n2)
12497 => if !O_COMPATIBILITY
12498 then let val err = ERR_AMBIGUOUS(Index2Element dtd a,n1,n2)
12499 val a3 = hookError(a2,(getPos q,err))
12500 val dfa = makeChoiceDfa cm
12501 in (dfa,a3)
12502 end
12503 else (makeAmbiguous cm,a2) handle DfaTooLarge max
12504 => let val a3 = if !O_DFA_WARN_TOO_LARGE
12505 then hookWarning
12506 (a2,(getPos q,WARN_DFA_TOO_LARGE(curr,max)))
12507 else a2
12508 val dfa = makeChoiceDfa cm
12509 in (dfa,a3)
12510 end
12511 in (CT_ELEMENT(cm,dfa),(c2,a3,q2))
12512 end
12513 end
12514 | _ => (* must be ANY or EMPTY *)
12515 let
12516 val (name,caq1 as (c1,a1,q1)) = parseName (c,a,q)
12517 handle NotFound (c,a,q) =>
12518 let val err = ERR_EXPECTED(expContSpec,[c])
12519 in raise SyntaxError(c,hookError(a,(getPos q,err)),q)
12520 end
12521 in case name
12522 of [0wx41,0wx4e,0wx59] (* "ANY" *) => (CT_ANY,caq1)
12523 | [0wx45,0wx4d,0wx50,0wx54,0wx59] (* "EMPTY" *) => (CT_EMPTY,caq1)
12524 | _ => let val a2 = hookError(a1,(getPos q,ERR_EXPECTED(expContSpec,name)))
12525 in (CT_ANY,(c1,a2,q1))
12526 end
12527 end
12528
12529 (*--------------------------------------------------------------------*)
12530 (* parse an element declaration, the initial '<!ELEMENT' already *)
12531 (* read. The unique entity id of the '<!' is given as first arg. 3.2: *)
12532 (* *)
12533 (* [45] elementdecl ::= '<!ELEMENT' S Name [ VC: Unique *)
12534 (* S contentspec S? '>' Element Type *)
12535 (* Declaration ] *)
12536 (* *)
12537 (* (see also the comments for ParseDtd.parseMarkupDecl). *)
12538 (* *)
12539 (* print an error and raise SyntaxState if no element name, no *)
12540 (* content specification, or no final '>' is found. *)
12541 (* print an error if the '>' is not in the same entity as the '<!'. *)
12542 (* print an error if white space is missing. *)
12543 (* *)
12544 (* print an error for entity end exceptions in subfunctions. *)
12545 (* catch syntax errors by recovering to the next possible state. *)
12546 (* *)
12547 (* enter the declared element into the notation table. *)
12548 (* return the remaining char and state. *)
12549 (*--------------------------------------------------------------------*)
12550 (* might raise: none *)
12551 (*--------------------------------------------------------------------*)
12552 fun parseElementDecl dtd (startEnt,startPos,ext) caq =
12553 let
12554 val (caq1 as (_,_,q1))= skipPS dtd caq
12555 val (name,(c2,a2,q2)) = parseName caq1
12556 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expAnElemName,[c])
12557 in raise SyntaxError(c,hookError(a,(getPos q,err)),q)
12558 end
12559 val a3 = checkElemName (a2,q1) name
12560 val idx = Element2Index dtd name
12561 val caq3 = skipPS dtd (c2,a3,q2)
12562
12563 val (contSpec,(c4,a4,q4)) = parseContentSpec dtd name caq3
12564
12565 val a5 = if useParamEnts() orelse not ext then addElement dtd (a4,q1) (idx,contSpec,ext)
12566 else a4
12567 val a5' = hookDecl(a5,((startPos,getPos q4),DEC_ELEMENT(idx,contSpec,ext)))
12568
12569 val (c6,a6,q6) = skipPSopt dtd (c4,a5',q4)
12570 in
12571 if c6<>0wx3E (* #">" *)
12572 then let val a7 = hookError(a6,(getPos q6,ERR_EXPECTED(expGt,[c6])))
12573 in raise SyntaxError(c6,a7,q6)
12574 end
12575 else let val a7 = if not (!O_VALIDATE) orelse getEntId q6=startEnt then a6
12576 else hookError(a6,(getPos q6,ERR_DECL_ENT_NESTING LOC_ELEM_DECL))
12577 in getChar(a7,q6)
12578 end
12579 end
12580 handle exn as SyntaxError (c,a,q) =>
12581 let val a1 = if c=0wx00 then hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_ELEM_DECL))
12582 else a
12583 in recoverDecl false (c,a1,q)
12584 end
12585
12586 (*--------------------------------------------------------------------*)
12587 (* parse an enumerated attribute type, the '(' already consumed. the *)
12588 (* 1st arg is a string describing the attribute (nmtoken or notation),*)
12589 (* the 2nd arg is a function that parses a single token, the 3rd arg *)
12590 (* a function for converting the token to its index. 3.3.1: *)
12591 (* *)
12592 (* [58] NotationType ::= 'NOTATION' S *)
12593 (* '(' S? Name (S? '|' S? Name)* S? ')' *)
12594 (* [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' *)
12595 (* *)
12596 (* print an error and raise SyntaxState if no token is found after a *)
12597 (* '(' or '|', or if neither '|' nor ')' follows a token. *)
12598 (* *)
12599 (* return the (sorted) list of indices of the parsed tokens. *)
12600 (*--------------------------------------------------------------------*)
12601 (* might raise: SyntaxState *)
12602 (*--------------------------------------------------------------------*)
12603 fun parseEnumerated dtd (expWhat,parseToken,Token2Index) caq =
12604 let fun doit idxs caq =
12605 let
12606 val caq1 as (_,_,q1) = skipPSopt dtd caq
12607 val (nt,(c2,a2,q2)) = parseToken caq1
12608 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expWhat,[c])
12609 in raise SyntaxError(c,hookError(a,(getPos q,err)),q)
12610 end
12611 val (idx,a3) = Token2Index dtd (a2,q1) nt
12612 val (c4,a4,q4) = skipPSopt dtd (c2,a3,q2)
12613 val newIdxs = insertInt(idx,idxs)
12614 in case c4
12615 of 0wx7C (* #"|" *) => doit newIdxs (getChar(a4,q4))
12616 | 0wx29 (* #")" *) => (newIdxs,getChar(a4,q4))
12617 | _ => let val a5 = hookError(a4,(getPos q4,ERR_EXPECTED(expBarRpar,[c4])))
12618 in raise SyntaxError (c4,a5,q4)
12619 end
12620 end
12621 in doit nil caq
12622 end
12623
12624 (*--------------------------------------------------------------------*)
12625 (* Convert a (name) token to its index as an enumerated attribute. *)
12626 (* 3.3.1: *)
12627 (* *)
12628 (* Validity Constraint: Notation Attributes *)
12629 (* ... all notation names in the declaration must be declared. *)
12630 (* *)
12631 (* print an error if a notation is not declared. *)
12632 (*--------------------------------------------------------------------*)
12633 (* might raise: SyntaxState *)
12634 (*--------------------------------------------------------------------*)
12635 fun Token2NmtokenIndex dtd (a,_) token = (AttNot2Index dtd token,a)
12636 fun Token2NotationIndex dtd (a,q) token =
12637 let
12638 val idx = AttNot2Index dtd token
12639 val a1 = if not (!O_VALIDATE) orelse hasNotation dtd idx then a
12640 else hookError(a,(getPos q,ERR_UNDECLARED(IT_NOTATION,token,LOC_NONE)))
12641 in (idx,a1)
12642 end
12643
12644 (*--------------------------------------------------------------------*)
12645 (* parse an attribute type, the 1st arg being the element this decl. *)
12646 (* refers to. 3.3.1: *)
12647 (* *)
12648 (* [54] AttType ::= StringType | TokenizedType | EnumeratedType *)
12649 (* *)
12650 (* [55] StringType ::= 'CDATA' *)
12651 (* [56] TokenizedType ::= 'ID' [VC: One ID per Element Type ] *)
12652 (* | 'IDREF' *)
12653 (* | 'IDREFS' *)
12654 (* | 'ENTITY' *)
12655 (* | 'ENTITIES' *)
12656 (* | 'NMTOKEN' *)
12657 (* | 'NMTOKENS' *)
12658 (* *)
12659 (* Validity Constraint: One ID per Element Type *)
12660 (* No element type may have more than one ID attribute specified. *)
12661 (* *)
12662 (* Enumerated Attribute Types *)
12663 (* *)
12664 (* [57] EnumeratedType ::= NotationType | Enumeration *)
12665 (* [58] NotationType ::= 'NOTATION' S '(' ... *)
12666 (* [59] Enumeration ::= '(' ... *)
12667 (* *)
12668 (* print an error and raise SyntaxState if no '(', or name is found *)
12669 (* in the first place, or the name does not start an attribute type, *)
12670 (* or if no '(' follows a 'NOTATION'. *)
12671 (* print an error and assume NMTOKEN instead of ID if the element *)
12672 (* already has an ID attribute. *)
12673 (* *)
12674 (* return the attribute type together with the next char and state. *)
12675 (*--------------------------------------------------------------------*)
12676 (* might raise: SyntaxState *)
12677 (*--------------------------------------------------------------------*)
12678 fun parseAttType dtd elem (c,a,q) =
12679 if c=0wx28 (* #"(" *) then
12680 let val (idxs,caq1) = parseEnumerated dtd
12681 (expANameToken,parseNmtoken,Token2NmtokenIndex) (getChar(a,q))
12682 in (AT_GROUP idxs,caq1)
12683 end
12684 else let val (name,caq1 as (c1,a1,q1)) = parseName (c,a,q)
12685 handle NotFound cq => let val err = ERR_EXPECTED(expAttType,[c])
12686 in raise SyntaxError (c,hookError(a,(getPos q,err)),q)
12687 end
12688 in case name
12689 of [0wx43,0wx44,0wx41,0wx54,0wx41] (* "CDATA" *) =>
12690 (AT_CDATA,caq1)
12691 | [0wx49,0wx44] (* "ID" *) =>
12692 (AT_ID,caq1)
12693 | [0wx49,0wx44,0wx52,0wx45,0wx46] (* "IDREF" *) =>
12694 (AT_IDREF,caq1)
12695 | [0wx49,0wx44,0wx52,0wx45,0wx46,0wx53] (* "IDREFS" *) =>
12696 (AT_IDREFS,caq1)
12697 | [0wx45,0wx4e,0wx54,0wx49,0wx54,0wx59] (* "ENTITY" *) =>
12698 (AT_ENTITY,caq1)
12699 | [0wx45,0wx4e,0wx54,0wx49,0wx54,0wx49,0wx45,0wx53] (* "ENTITIES" *) =>
12700 (AT_ENTITIES,caq1)
12701 | [0wx4e,0wx4d,0wx54,0wx4f,0wx4b,0wx45,0wx4e] (* "NMTOKEN" *) =>
12702 (AT_NMTOKEN,caq1)
12703 | [0wx4e,0wx4d,0wx54,0wx4f,0wx4b,0wx45,0wx4e,0wx53] (* "NMTOKEN" *) =>
12704 (AT_NMTOKENS,caq1)
12705 | [0wx4e,0wx4f,0wx54,0wx41,0wx54,0wx49,0wx4f,0wx4e] (* "NOTATION" *) =>
12706 let val (c2,a2,q2) = skipPSopt dtd caq1
12707 in case c2
12708 of 0wx28 (* #"(" *) =>
12709 let val (idxs,caq3) = parseEnumerated dtd
12710 (expANotName,parseName,Token2NotationIndex) (getChar(a2,q2))
12711 in (AT_NOTATION idxs,caq3)
12712 end
12713 | _ => let val err = ERR_EXPECTED(expLpar,[c2])
12714 in raise SyntaxError(c2,hookError(a2,(getPos q2,err)),q2)
12715 end
12716 end
12717 | _ => let val a2 = hookError(a1,(getPos q,ERR_EXPECTED(expAttType,name)))
12718 in raise SyntaxError (c1,a2,q1)
12719 end
12720 end
12721
12722 (*--------------------------------------------------------------------*)
12723 (* parse an attribute default, for an attribute whose type is given *)
12724 (* the 1st argument. Cf. 3.3.2: *)
12725 (* *)
12726 (* [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' *)
12727 (* | (('#FIXED' S)? AttValue) *)
12728 (* *)
12729 (* Validity Constraint: Attribute Default Legal *)
12730 (* The declared default value must meet the lexical constraints of *)
12731 (* the declared attribute type. *)
12732 (* *)
12733 (* and 3.3.1: *)
12734 (* *)
12735 (* Validity Constraint: ID Attribute Default *)
12736 (* An ID attribute must have a declared default of #IMPLIED or *)
12737 (* #REQUIRED. *)
12738 (* *)
12739 (* print an error and raise SyntaxState if no '#' or literal is found *)
12740 (* in the first place, or no name or a wrong name is found after the *)
12741 (* '#', or if no literal follows the 'FIXED'. *)
12742 (* print an error if white space is missing. *)
12743 (* print an error and assume IMPLIED if the default for an ID attrib. *)
12744 (* is not IMPLIED or REQUIRED. *)
12745 (* *)
12746 (* return the default together with the remaining char and state. *)
12747 (*--------------------------------------------------------------------*)
12748 (* might raise: SyntaxState *)
12749 (*--------------------------------------------------------------------*)
12750 fun parseDefaultDecl dtd (aidx,attType) (c,a,q) =
12751 if c=0wx23 (* #"#" *) then
12752 let
12753 val caq0 as (_,_,q0) = (getChar(a,q))
12754 val (name,caq1) = parseName caq0
12755 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expAttDefKey,[c])
12756 in raise SyntaxError(c,hookError(a,(getPos q,err)),q)
12757 end
12758 in case name
12759 of [0wx46,0wx49,0wx58,0wx45,0wx44] (* "FIXED" *) =>
12760 let
12761 val caq2 as (_,_,q2) = skipPS dtd caq1
12762 val (lit,text,(c3,a3,q3)) = parseAttValue dtd caq2
12763 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expLitQuote,[c])
12764 val a1 = hookError(a,(getPos q,err))
12765 in raise SyntaxError (c,a1,q)
12766 end
12767 in
12768 if !O_VALIDATE andalso isIdType attType
12769 then let val a4 = hookError(a3,(getPos q,ERR_ID_DEFAULT))
12770 in (AD_IMPLIED,(c3,a4,q3))
12771 end
12772 else
12773 let val (cv,(av,a4)) = makeAttValue dtd (a3,q2)
12774 (aidx,attType,false,true,text)
12775 in (AD_FIXED((lit,cv,av),(getPos q2,ref false)),(c3,a4,q3))
12776 end
12777 handle AttValue a => (AD_IMPLIED,(c3,a,q3))
12778 end
12779
12780 | [0wx49,0wx4d,0wx50,0wx4c,0wx49,0wx45,0wx44] (* "IMPLIED" *) =>
12781 (AD_IMPLIED,caq1)
12782 | [0wx52,0wx45,0wx51,0wx55,0wx49,0wx52,0wx45,0wx44] (* "REQUIRED" *) =>
12783 (AD_REQUIRED,caq1)
12784 | _ => let val (c1,a1,q1) = caq1
12785 val a2 = hookError(a1,(getPos q0,ERR_EXPECTED(expAttDefKey,name)))
12786 in raise SyntaxError (c1,a2,q1)
12787 end
12788 end
12789 else let
12790 val (lit,text,(c1,a1,q1)) = parseAttValue dtd (c,a,q)
12791 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expQuoteRni,[c])
12792 val a1 = hookError(a,(getPos q,err))
12793 in raise SyntaxError(c,a1,q)
12794 end
12795 in
12796 if !O_VALIDATE andalso isIdType attType
12797 then let val a2 = hookError(a1,(getPos q,ERR_ID_DEFAULT))
12798 in (AD_IMPLIED,(c1,a2,q1))
12799 end
12800 else let val (cv,(av,a2)) = makeAttValue dtd (a1,q) (aidx,attType,false,true,text)
12801 in (AD_DEFAULT((lit,cv,av),(getPos q,ref false)),(c1,a2,q1))
12802 end
12803 handle AttValue a => (AD_IMPLIED,(c1,a,q1))
12804 end
12805
12806 (*--------------------------------------------------------------------*)
12807 (* parse an attribute definition, the referred element given as 1st *)
12808 (* argument. 3.3: *)
12809 (* *)
12810 (* [53] AttDef ::= S Name S AttType S DefaultDecl *)
12811 (* *)
12812 (* raise NotFound if no name is found (and thus no attribute def.) *)
12813 (* print an error if white space is missing. *)
12814 (* *)
12815 (* enter the attribute definition into the element table. *)
12816 (* return the next character and the remaining state. *)
12817 (*--------------------------------------------------------------------*)
12818 (* might raise: NotFound SyntaxState *)
12819 (*--------------------------------------------------------------------*)
12820 fun parseAttDef dtd (elem,ext) caq =
12821 let
12822 val (hadS,caq1 as (_,_,q1)) = skipPSmay dtd caq
12823
12824 val (name,(c2,a2,q2)) = parseName caq1 (* NotFound falls through to the next level *)
12825 val a3 = if hadS then a2 else hookError(a2,(getPos q1,ERR_MISSING_WHITE))
12826 val a4 = checkAttName (a3,q1) name
12827 val idx = AttNot2Index dtd name
12828
12829 val caq5 = skipPS dtd (c2,a4,q2)
12830 val (attType,caq6) = parseAttType dtd elem caq5
12831 val caq7 = skipPS dtd caq6
12832
12833 val (attDef,(c8,a8,q8)) = parseDefaultDecl dtd (idx,attType) caq7
12834
12835 val a9 = if useParamEnts() orelse not ext
12836 then addAttribute dtd (a8,q1) (elem,(idx,attType,attDef,ext)) else a8
12837 in
12838 ((idx,attType,attDef),(c8,a9,q8))
12839 end
12840
12841 (*--------------------------------------------------------------------*)
12842 (* parse an attribute-list declaration, the initial '<!ATTLIST' *)
12843 (* already read. The unique entity id of the '<!' is given as first *)
12844 (* arg. Cf. 3.3: *)
12845 (* *)
12846 (* [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' *)
12847 (* *)
12848 (* (see also the comments for ParseDtd.parseMarkupDecl). *)
12849 (* *)
12850 (* check whether the element already had an attlist declaration. (cf. *)
12851 (* DtdElements.enterAttDecl) *)
12852 (* *)
12853 (* print an error and raise SyntaxState if no element name, or no *)
12854 (* final '>' is found. *)
12855 (* print an error if the '>' is not in the same entity as the '<!'. *)
12856 (* print an error if white space is missing. *)
12857 (* *)
12858 (* print an error for entity end exceptions in subfunctions. *)
12859 (* catch syntax errors by recovering to the next possible state. *)
12860 (* *)
12861 (* enter the declared attributes into the element table. *)
12862 (* return the remaining char and state. *)
12863 (*--------------------------------------------------------------------*)
12864 (* might raise: none *)
12865 (*--------------------------------------------------------------------*)
12866 fun parseAttListDecl dtd (startEnt,startPos,ext) caq =
12867 let
12868 val caq1 as (_,_,q1) = skipPS dtd caq
12869 val (name,(c2,a2,q2)) = parseName caq1
12870 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expAnElemName,[c])
12871 in raise SyntaxError (c,hookError(a,(getPos q,err)),q)
12872 end
12873 val a3 = checkElemName (a2,q1) name
12874 val idx = Element2Index dtd name
12875
12876 val a4 = if !O_VALIDATE orelse not ext then enterAttList dtd (a3,q1) idx else a3
12877
12878 fun doit attDefs caq =
12879 let val (attDef,caq1) = parseAttDef dtd (idx,ext) caq
12880 handle NotFound (c,a,q) => raise NotFound
12881 (c,hookDecl(a,((startPos,getPos q),DEC_ATTLIST(idx,rev attDefs,ext))),q)
12882 | SyntaxError (c,a,q) => raise SyntaxError
12883 (c,hookDecl(a,((startPos,getPos q),DEC_ATTLIST(idx,rev attDefs,ext))),q)
12884 in doit (attDef::attDefs) caq1
12885 end
12886
12887 val (c5,a5,q5) = doit nil (c2,a4,q2) handle NotFound caq => caq
12888 in
12889 if c5 <> 0wx3E (* #">" *)
12890 then let val a6 = hookError(a5,(getPos q5,ERR_EXPECTED(expAttNameGt,[c5])))
12891 in raise SyntaxError (c5,a6,q5)
12892 end
12893 else let val a6 = if not (!O_VALIDATE) orelse getEntId q5=startEnt then a5
12894 else hookError(a5,(getPos q5,ERR_DECL_ENT_NESTING LOC_ATT_DECL))
12895 in getChar(a6,q5)
12896 end
12897 end
12898 handle exn as SyntaxError (c,a,q) =>
12899 let val a1 = if c=0wx00 then hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_ATT_DECL))
12900 else a
12901 in recoverDecl false (c,a,q)
12902 end
12903 end
12904 (* stop of ../../Parser/Parse/parseDecl.sml *)
12905 (* start of ../../Parser/Parse/parseDtd.sml *)
12906 signature ParseDtd =
12907 sig
12908 (*----------------------------------------------------------------------
12909 include ParseBase
12910
12911 val parseName : UniChar.Char * AppData * State
12912 -> UniChar.Data * (UniChar.Char * AppData * State)
12913
12914 val openExtern : int * Uri.Uri -> AppData * State
12915 -> Encoding.Encoding * HookData.TextDecl option * (UniChar.Char * AppData * State)
12916 val openDocument : Uri.Uri option -> AppData
12917 -> Encoding.Encoding * HookData.XmlDecl option * (UniChar.Char * AppData * State)
12918
12919 val skipCharRef : AppData * State -> (UniChar.Char * AppData * State)
12920 val skipReference : UniChar.Char * AppData * State -> (UniChar.Char * AppData * State)
12921 val parseGenRef : Dtd -> UniChar.Char * AppData * State
12922 -> (int * Base.GenEntity) * (AppData * State)
12923 val parseCharRefLit : UniChar.Data -> AppData * State
12924 -> UniChar.Data * (UniChar.Char * AppData * State)
12925
12926 val parseComment : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
12927 val parseProcInstr : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
12928
12929 val skipTag : Errors.Location -> AppData * State -> (UniChar.Char * AppData * State)
12930 val parseETag : Dtd -> AppData * State
12931 -> int * UniChar.Data * Errors.Position * (UniChar.Char * AppData * State)
12932 val parseSTag : Dtd -> Errors.Position -> UniChar.Char * AppData * State
12933 -> (HookData.StartTagInfo * Base.ElemInfo) * (UniChar.Char * AppData * State)
12934
12935 val skipDecl : bool -> UniChar.Char * AppData * State -> UniChar.Char * AppData * State
12936 ----------------------------------------------------------------------*)
12937 include ParseDecl
12938
12939 val parseDocTypeDecl : Dtd -> (UniChar.Char * AppData * State)
12940 -> int option * (UniChar.Char * AppData * State)
12941 end
12942
12943 (*--------------------------------------------------------------------------*)
12944 (* Structure: ParseDtd *)
12945 (* *)
12946 (* Exceptions raised by functions in this structure: *)
12947 (* parseDocTypeDecl : none *)
12948 (*--------------------------------------------------------------------------*)
12949 functor ParseDtd (structure ParseBase : ParseBase)
12950 : ParseDtd =
12951 struct
12952 structure ParseDecl = ParseDecl (structure ParseBase = ParseBase)
12953
12954 open
12955 Base UniChar Errors
12956 ParseDecl
12957
12958 (*--------------------------------------------------------------------*)
12959 (* parse a markup declaration other than a processing instruction, *)
12960 (* "<!" already consumed. The unique entity id of the initial '<!' is *)
12961 (* given as first arg. Cf. 2.8: *)
12962 (* *)
12963 (* [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl *)
12964 (* | NotationDecl | PI | Comment *)
12965 (* ... *)
12966 (* Validity Constraint: Proper Declaration/PE Nesting *)
12967 (* Parameter-entity replacement text must be properly nested with *)
12968 (* markup declarations. That is to say, if either the first *)
12969 (* character or the last character of a markup declaration *)
12970 (* (markupdecl above) is contained in the replacement text for a *)
12971 (* parameter-entity reference, both must be contained in the same *)
12972 (* replacement text. *)
12973 (* *)
12974 (* and 3.2,3.3,4.2,4.7: *)
12975 (* *)
12976 (* [45] elementdecl ::= '<!ELEMENT' ... *)
12977 (* [52] AttlistDecl ::= '<!ATTLIST' ... *)
12978 (* [70] EntityDecl ::= GEDecl | PEDecl *)
12979 (* [71] GEDecl ::= '<!ENTITY' ... *)
12980 (* [72] PEDecl ::= '<!ENTITY' ... *)
12981 (* [82] NotationDecl ::= '<!NOTATION' ... *)
12982 (* *)
12983 (* print an error an recover if something other than "--", "ELEMENT", *)
12984 (* "ENTITY", "ATTLIST", or"NOTATION" is found. *)
12985 (* *)
12986 (* return the remaining character and state. *)
12987 (*--------------------------------------------------------------------*)
12988 (* might raise: none *)
12989 (*--------------------------------------------------------------------*)
12990 fun parseMarkupDecl dtd (startEnt,startPos) (c,a,q) =
12991 case c
12992 of 0wx2D => (* #"-" *)
12993 let val (c1,a1,q1) = getChar (a,q)
12994 in if c1<>0wx2D (* #"-" *)
12995 then let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expDash,[c1])))
12996 in recoverDecl false (c1,a2,q1)
12997 end
12998 else parseComment startPos (a1,q1)
12999 end
13000 | _ => let
13001 val (name,caq1) = parseName (c,a,q)
13002 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expStartMarkup,[c])
13003 val a1 = hookError(a,(getPos q,err))
13004 in raise SyntaxError (c,a1,q)
13005 end
13006 val ext = hasExternal dtd
13007 in case name
13008 of [0wx45,0wx4c,0wx45,0wx4d,0wx45,0wx4e,0wx54] (* "ELEMENT" *) =>
13009 parseElementDecl dtd (startEnt,startPos,ext) caq1
13010 | [0wx41,0wx54,0wx54,0wx4c,0wx49,0wx53,0wx54] (* "ATTLIST" *) =>
13011 parseAttListDecl dtd (startEnt,startPos,ext) caq1
13012 | [0wx4e,0wx4f,0wx54,0wx41,0wx54,0wx49,0wx4f,0wx4e] (* "NOTATION" *) =>
13013 parseNotationDecl dtd (startEnt,startPos,ext) caq1
13014 | [0wx45,0wx4e,0wx54,0wx49,0wx54,0wx59] (* "ENTITY" *) =>
13015 parseEntityDecl dtd (startEnt,startPos,ext) caq1
13016 | _ => let val (c1,a1,q1) = caq1
13017 val err = ERR_EXPECTED(expStartMarkup,name)
13018 val a2 = hookError(a1,(getPos q,err))
13019 in recoverDecl false (c1,a2,q1)
13020 end
13021 end
13022
13023 (*--------------------------------------------------------------------*)
13024 (* skip an ignored section, starting after the '<![IGNORE[', consume *)
13025 (* the finishing "]]>". 3.4: *)
13026 (* *)
13027 (* [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' *)
13028 (* ignoreSectContents* ']]>' *)
13029 (* [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents *)
13030 (* ']]>' Ignore)* *)
13031 (* [65] Ignore ::= Char* - (Char* ('<!['|']]>') Char* ) *)
13032 (* *)
13033 (* ... If the keyword of the conditional section is IGNORE, then *)
13034 (* the contents of the conditional section are not logically part *)
13035 (* of the DTD. Note that for reliable parsing, the contents of even *)
13036 (* ignored conditional sections must be read in order to detect *)
13037 (* nested conditional sections and ensure that the end of the *)
13038 (* outermost (ignored) conditional section is properly detected. *)
13039 (* If a conditional section with a keyword of INCLUDE occurs within *)
13040 (* a larger conditional section with a keyword of IGNORE, both the *)
13041 (* outer and the inner conditional sections are ignored. *)
13042 (* *)
13043 (* print an error an finish if an entity end is encountered. *)
13044 (* *)
13045 (* return the next char and state. *)
13046 (*--------------------------------------------------------------------*)
13047 (* might raise: none *)
13048 (*--------------------------------------------------------------------*)
13049 fun skipIgnored caq =
13050 let
13051 (*--------------------------------------------------------------*)
13052 (* level counts the nesting of conditional sections. *)
13053 (* if the second char after a "<" ("]") is not a "[" ("]"), it *)
13054 (* can nevertheless start another delimiter and is therefore *)
13055 (* fed into a recursive call of doit. *)
13056 (*--------------------------------------------------------------*)
13057 fun doit level (c,a,q) =
13058 case c
13059 of 0wx00 => (c,hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_IGNORED)),q)
13060 | 0wx3C (* #"<" *) =>
13061 let val (c1,a1,q1) = getChar (a,q)
13062 in if c1=0wx21 (* #"!" *)
13063 then let val (c2,a2,q2) = (getChar(a1,q1))
13064 in if c2=0wx5B (* #"[" *) then doit (level+1) (getChar(a2,q2))
13065 else doit level (c2,a2,q2)
13066 end
13067 else doit level (c1,a1,q1)
13068 end
13069 | 0wx5D (* #"]" *) =>
13070 let val (c1,a1,q1) = getChar (a,q)
13071 in if c1=0wx5D (* #"]" *) then doit' level (getChar (a1,q1))
13072 else doit level (c1,a1,q1)
13073 end
13074 | _ => doit level (getChar (a,q))
13075 (*--------------------------------------------------------------*)
13076 (* if the second "]" is followed by a "]", then this might be *)
13077 (* the real second "]". Therefore doit' loops as long as it *)
13078 (* finds "]"'s. *)
13079 (*--------------------------------------------------------------*)
13080 and doit' level (c,a,q) =
13081 case c
13082 of 0wx3E (* #">" *) => if level>0 then doit (level-1) (getChar (a,q))
13083 else getChar (a,q)
13084 | 0wx5D (* #"]" *) => doit' level (getChar (a,q))
13085 | _ => doit level (c,a,q)
13086 in
13087 doit 0 caq
13088 end
13089
13090 (*--------------------------------------------------------------------*)
13091 (* parse the internal or external subset of the dtd. handle included *)
13092 (* sections by counting their nesting level. Cf 2.8: *)
13093 (* *)
13094 (* Validity Constraint: Proper Declaration/PE Nesting *)
13095 (* Parameter-entity replacement text must be properly nested with *)
13096 (* markup declarations. That is to say, if either the first *)
13097 (* character or the last character of a markup declaration *)
13098 (* (markupdecl above) is contained in the replacement text for a *)
13099 (* parameter-entity reference, both must be contained in the same *)
13100 (* replacement text. *)
13101 (* ... *)
13102 (* [28] doctypedecl ::= '<!DOCTYPE'[Image] S Name (S ExternalID)? *)
13103 (* S? ('[' (markupdecl | PEReference | S)* ']' S?)? '>' *)
13104 (* [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl *)
13105 (* | NotationDecl | PI | Comment *)
13106 (* [30] extSubset ::= TextDecl? extSubsetDecl *)
13107 (* [31] extSubsetDecl ::= ( markupdecl | conditionalSect *)
13108 (* | PEReference | S )* *)
13109 (* and 3.4: *)
13110 (* *)
13111 (* [61] conditionalSect ::= includeSect | ignoreSect *)
13112 (* [62] includeSect ::= '<![' S? 'INCLUDE' S? *)
13113 (* '[' extSubsetDecl ']]>' *)
13114 (* [63] ignoreSect ::= '<![' S? 'IGNORE' S? *)
13115 (* '[' ignoreSectContents* ']]>' *)
13116 (* *)
13117 (* print an error and finish if the end of document is encountered in *)
13118 (* the internal subset. *)
13119 (* print an error and raise SyntaxState if a "<" is not followed by a *)
13120 (* "!" or a "?". *)
13121 (* print an error and raise SyntaxState if a "]" is not followed by *)
13122 (* "]>". *)
13123 (* print an error if a "<![" is found in the internal subset. *)
13124 (* print an error if a "]" is found outside the internal subset. *)
13125 (* print an error if a "]]>" is found outside an included section. *)
13126 (* print an error an raise SyntaxState if something other than a *)
13127 (* markup declaration, parameter entity reference, white space or *)
13128 (* a conditional section is encountered. *)
13129 (* print an error and raise SyntaxState if a "<![" is not followed by *)
13130 (* "INCLUDE" or "IGNORE", or if the second "[" is missing. *)
13131 (* *)
13132 (* catch entity end exceptions in subfunctions by printing an error *)
13133 (* and recovering. *)
13134 (* *)
13135 (* return the remaining state and char. *)
13136 (*--------------------------------------------------------------------*)
13137 (* might raise: none *)
13138 (*--------------------------------------------------------------------*)
13139 fun parseSubset dtd caq =
13140 let
13141 datatype CondStatus = IGNORE | INCLUDE
13142
13143 fun do_data caq =
13144 let fun doit hadError ws (c,a,q) =
13145 case c
13146 of 0wx00 => (ws,(c,a,q))
13147 | 0wx09 => doit false (c::ws) (getChar(a,q))
13148 | 0wx0A => doit false (c::ws) (getChar(a,q))
13149 | 0wx20 => doit false (c::ws) (getChar(a,q))
13150 | 0wx25 => (ws,(c,a,q))
13151 | 0wx3C => (ws,(c,a,q))
13152 | 0wx5D => (ws,(c,a,q))
13153 | _ => if hadError then doit true ws (getChar(a,q))
13154 else let val err = ERR_FORBIDDEN_HERE(IT_DATA nil,LOC_SUBSET)
13155 val a1 = hookError (a,(getPos q,err))
13156 in doit true ws (getChar(a1,q))
13157 end
13158
13159 val (ws,(c1,a1,q1)) = doit false nil caq
13160 val a2 = if null ws then a1
13161 else hookWhite(a1,Data2Vector (rev ws))
13162 in (c1,a2,q1)
13163 end
13164
13165 fun doit cond (c,a,q) =
13166 case c
13167 of 0wx00 =>
13168 if isSpecial q
13169 (*---------------------------------------------------*)
13170 (* the external subset ends at and of special entity.*)
13171 (* so does the internal subset, but with error. *)
13172 (*---------------------------------------------------*)
13173 then
13174 let val a1 =
13175 if inDocEntity q
13176 then hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_INT_SUBSET))
13177 else if cond=0 then a
13178 else hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_INCLUDED))
13179 in (c,a1,q)
13180 end
13181 else let val a1 = hookEntEnd (a,getPos q)
13182 in doit cond (getChar(a1,q))
13183 end
13184
13185 (* ignore errors in parameter references -----------------*)
13186 | 0wx25 (* #"%" *) =>
13187 let
13188 val caq2 =
13189 let val ((id,ent),(a1,q1)) = parseParRef dtd (getChar(a,q))
13190 in if !O_VALIDATE orelse !O_INCLUDE_PARAM_ENTS then
13191 case ent
13192 of PE_NULL => getChar(a1,q1)
13193 | PE_INTERN(_,rep) =>
13194 let
13195 val q2 = pushIntern(q1,id,true,rep)
13196 val a2 = hookParRef(a1,((getPos q,getPos q1),id,ent,true))
13197 in getChar(a2,q2)
13198 end
13199 | PE_EXTERN extId =>
13200 let
13201 val a2 = hookParRef(a1,((getPos q,getPos q1),id,ent,true))
13202 val caq3 =
13203 #3(openExtern (id,true,resolveExtId extId) (a2,q1))
13204 handle CantOpenFile(fmsg,a)
13205 => let val err = ERR_NO_SUCH_FILE fmsg
13206 val a1 = hookError(a,(getPos q1,err))
13207 val a2 = hookEntEnd (a1,getPos q1)
13208 in (getChar(a2,q1))
13209 end
13210 in caq3
13211 end
13212 (* changed 080600: setExternal is already called by parseParRef *)
13213 else let val a2 = hookParRef(a1,((getPos q,getPos q1),id,ent,false))
13214 in getChar(a2,q1)
13215 end
13216 end
13217 handle SyntaxError caq => caq
13218 | NoSuchEntity aq => getChar aq
13219 in doit cond caq2
13220 end
13221
13222 | 0wx3C (* #"<" *) =>
13223 let val (c1,a1,q1) = getChar(a,q)
13224 in case c1
13225 of 0wx3F => (* #"?" *)
13226 let val caq2 = parseProcInstr (getPos q) (a1,q1)
13227 in doit cond caq2
13228 end
13229 | 0wx21 => (* #"!" *)
13230 let val (c2,a2,q2) = (getChar(a1,q1))
13231 in if c2=0wx5B (* #"[" *)
13232 then do_cond cond q (a2,q2)
13233 else
13234 let val caq3 = parseMarkupDecl dtd
13235 (getEntId q,getPos q) (c2,a2,q2)
13236 in doit cond caq3
13237 end
13238 end
13239 | _ => let val err = ERR_EXPECTED(expExclQuest,[c1])
13240 val a2 = hookError(a1,(getPos q1,err))
13241 val caq3 = recoverDecl false (c1,a2,q1)
13242 in doit cond caq3
13243 end
13244 end
13245
13246 | 0wx5D (* #"]" *) => do_brack cond q (getChar(a,q))
13247 | _ => let val caq1 = do_data (c,a,q)
13248 in doit cond caq1
13249 end
13250
13251 and do_brack cond q0 (c,a,q) =
13252 if inDocEntity q then (c,a,q)
13253 else if c=0wx5D (* #"]" *)
13254 then let val (c1,a1,q1) = getChar(a,q)
13255 in if c1=0wx3E (* #">" *)
13256 (* ignore wrong "]]>"'s ------------------*)
13257 then if cond=0
13258 then let val err = ERR_FORBIDDEN_HERE(IT_DATA [c,c,c1],
13259 LOC_OUT_COND)
13260 val a2 = hookError(a1,(getPos q0,err))
13261 in doit cond (getChar(a2,q1))
13262 end
13263 else doit (cond-1) (getChar(a1,q1))
13264 (* the second "]" may start another "]]>" ---*)
13265 else let val a2 = hookError(a1,(getPos q1,ERR_EXPECTED(expGt,[c1])))
13266 in do_brack cond q (c1,a2,q1)
13267 end
13268 end
13269 else let val a1 = hookError(a,(getPos q,ERR_EXPECTED(expRbrack,[c])))
13270 in doit cond (c,a1,q)
13271 end
13272
13273 and do_cond cond q0 (a,q) =
13274 let
13275 (* marked sections are forbidden in the internal subset. -*)
13276 val inInt = inDocEntity q
13277 val a1 = if inInt then hookError (a,(getPos q0,ERR_FORBIDDEN_HERE
13278 (IT_COND,LOC_INT_SUBSET)))
13279 else a
13280
13281 val caq2 as (_,_,q2) = skipPSopt dtd (getChar(a1,q))
13282
13283 val (status,caq3) =
13284 let
13285 val (name,(c3,a3,q3)) = parseName caq2
13286 (* ignore sections with bad status keyword ---------*)
13287 val (status,a4) =
13288 case name
13289 of [0wx49,0wx47,0wx4e,0wx4f,0wx52,0wx45] => (IGNORE,a3)
13290 | [0wx49,0wx4e,0wx43,0wx4c,0wx55,0wx44,0wx45] => (INCLUDE,a3)
13291 | _ => let val err = ERR_EXPECTED(expCondStatus,name)
13292 val a4 = hookError(a3,(getPos q2,err))
13293 in (IGNORE,a4)
13294 end
13295 val (c5,a5,q5) = skipPSopt dtd (c3,a4,q3)
13296 in (* ignore sections without "[" after keyword -------*)
13297 if c5=0wx5B then (status,getChar(a5,q5))
13298 else let val a6 = hookError(a5,(getPos q5,ERR_EXPECTED(expLbrack,[c5])))
13299 in (IGNORE,(c5,a6,q5))
13300 end
13301 end
13302 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expCondStatus,[c])
13303 val a1 = hookError(a,(getPos q,err))
13304 in (IGNORE,(c,a1,q))
13305 end
13306 in
13307 (* ignore sections in the internal subset ----------------*)
13308 case (status,inInt)
13309 of (INCLUDE,_) => doit (cond+1) caq3
13310 | (_,_) => doit cond (skipIgnored caq3)
13311 end
13312 in
13313 doit 0 caq
13314 end
13315
13316 (*--------------------------------------------------------------------*)
13317 (* parse the internal subset of the dtd. Cf 2.8: *)
13318 (* *)
13319 (* return the remaining character and state. *)
13320 (*--------------------------------------------------------------------*)
13321 (* might raise: none *)
13322 (*--------------------------------------------------------------------*)
13323 fun parseInternalSubset dtd (a,q) =
13324 let val a1 = hookSubset (a,getPos q)
13325 in parseSubset dtd (getChar(a1,q))
13326 end
13327
13328 (*--------------------------------------------------------------------*)
13329 (* parse the external subset of the dtd, the filename given as first *)
13330 (* argument. handle included sections by counting their nesting level.*)
13331 (* the file is opened on its own stack, and closed at the end. *)
13332 (* Cf 2.8: *)
13333 (* *)
13334 (* print an error and do nothing if the file cannot be opened. *)
13335 (* *)
13336 (* return nothing. *)
13337 (*--------------------------------------------------------------------*)
13338 (* might raise: none *)
13339 (*--------------------------------------------------------------------*)
13340 fun parseExternalSubset dtd (a,q) extId =
13341 let
13342 val uri = resolveExtId extId
13343 val (enc,textDecl,(c1,a1,q1)) = openSubset uri a
13344 val a2 = hookExtSubset (a1,(uri,enc,textDecl))
13345 val (_,a3,q3) = parseSubset dtd (c1,a2,q1)
13346 val _ = closeAll q3
13347 in a3
13348 end
13349 handle CantOpenFile(fmsg,a) => hookError(a,(getPos q,ERR_NO_SUCH_FILE fmsg))
13350
13351 (*--------------------------------------------------------------------*)
13352 (* Parse the document type declaration, the <!DOCTYPE already read. *)
13353 (* Cf. 2.8: *)
13354 (* *)
13355 (* [28] doctypedecl ::= '<!DOCTYPE'[Image] S Name (S ExternalID)? *)
13356 (* S? ('[' (markupdecl | PEReference | S)* ']' S?)? '>' *)
13357 (* *)
13358 (* print an error and raise SyntaxState if no name is found. *)
13359 (* print an error and raise SyntaxState if no final ">" is found. *)
13360 (* external identifier is found. *)
13361 (* print an error if white space is missing. *)
13362 (* *)
13363 (* return nothing. *)
13364 (*--------------------------------------------------------------------*)
13365 (* might raise: none *)
13366 (*--------------------------------------------------------------------*)
13367 fun parseDocTypeDecl dtd caq =
13368 let
13369 val _ = setHasDtd dtd
13370 val caq1 = skipS caq
13371
13372 val (doc,caq2) = parseName caq1
13373 handle NotFound (c,a,q) => let val err = ERR_EXPECTED(expAName,[c])
13374 in raise SyntaxError (c,hookError(a,(getPos q,err)),q)
13375 end
13376 val idx = Element2Index dtd doc
13377
13378 val (hadS,caq3 as (_,_,q3)) = skipSmay caq2
13379 val (ext,(c4,a4,q4)) = let val (extId,_,(c4,a4,q4)) = parseExtIdSub dtd caq3
13380 val a5 = if hadS then a4
13381 else hookError(a4,(getPos q3,ERR_MISSING_WHITE))
13382 in (SOME extId,(c4,a5,q4))
13383 end
13384 handle NotFound caq => (NONE,caq)
13385
13386 val a4' = hookDocType(a4,(idx,ext))
13387 val (c5,a5,q5) = case c4
13388 of 0wx5B (* #"[" *) =>
13389 let val caq5 = parseInternalSubset dtd (a4',q4)
13390 in skipSopt caq5
13391 end
13392 | _ => (c4,a4',q4)
13393
13394 val a6 = case ext
13395 of NONE => a5
13396 | SOME extId => let val _ = setExternal dtd
13397 in if !O_VALIDATE orelse !O_INCLUDE_PARAM_ENTS
13398 then parseExternalSubset dtd (a5,q5) extId
13399 else a5
13400 end
13401
13402 val a7 = checkMultEnum dtd (a6,q5)
13403 val a7'= checkPreDefined dtd (a7,q5)
13404 val a8 = checkUnparsed dtd a7'
13405
13406 val (c9,a9,q9) = if c5=0wx3E (* #">" *) then getChar(a8,q5)
13407 else let val err = expectedOrEnded(expGt,LOC_DOC_DECL) c5
13408 val a9 = hookError(a8,(getPos q5,err))
13409 in recoverDecl false (c5,a9,q5)
13410 end
13411 in
13412 (SOME idx,(c9,hookEndDtd(a9,getPos q9),q9))
13413 end
13414 handle exn as SyntaxError(c,a,q) =>
13415 let val a1 = if c=0wx00 then hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_DOC_DECL))
13416 else a
13417 val (c2,a2,q2) = recoverDecl true (c,a1,q)
13418 in (NONE,(c2,hookEndDtd(a2,getPos q2),q2))
13419 end
13420 end
13421 (* stop of ../../Parser/Parse/parseDtd.sml *)
13422 (* start of ../../Parser/Parse/parseContent.sml *)
13423 signature ParseContent =
13424 sig
13425 (*----------------------------------------------------------------------
13426 include ParseBase
13427
13428 val parseName : UniChar.Char * AppData * State
13429 -> UniChar.Data * (UniChar.Char * AppData * State)
13430
13431 val openDocument : Uri.Uri option -> AppData
13432 -> Encoding.Encoding * HookData.XmlDecl option * (UniChar.Char * AppData * State)
13433
13434 val skipCharRef : AppData * State -> (UniChar.Char * AppData * State)
13435 val skipReference : UniChar.Char * AppData * State -> (UniChar.Char * AppData * State)
13436
13437 val parseComment : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
13438 val parseProcInstr : Errors.Position -> AppData * State -> (UniChar.Char * AppData * State)
13439
13440 val skipTag : Errors.Location -> AppData * State -> (UniChar.Char * AppData * State)
13441 val parseSTag : Dtd -> Errors.Position -> UniChar.Char * AppData * State
13442 -> (HookData.StartTagInfo * Base.ElemInfo) * (UniChar.Char * AppData * State)
13443
13444 val skipDecl : bool -> UniChar.Char * AppData * State -> UniChar.Char * AppData * State
13445
13446 val parseDocTypeDecl : Dtd -> (UniChar.Char * AppData * State)
13447 -> int option * (UniChar.Char * AppData * State)
13448 ----------------------------------------------------------------------*)
13449 include ParseDtd
13450
13451 val skipBadSection : UniChar.Char * AppData * State -> (UniChar.Char * AppData * State)
13452
13453 val parseElement : Dtd * int list * State * (HookData.StartTagInfo * Base.ElemInfo)
13454 * (UniChar.Char * AppData * State)
13455 -> (int * UniChar.Data * Errors.Position * Errors.Position) option
13456 * (UniChar.Char * AppData * State)
13457 end
13458
13459 (*--------------------------------------------------------------------------*)
13460 (* Structure: ParseContent *)
13461 (* *)
13462 (* Exceptions raised by functions in this structure: *)
13463 (* skipBadSection : none *)
13464 (* parseElement : none *)
13465 (*--------------------------------------------------------------------------*)
13466 functor ParseContent (structure ParseBase : ParseBase)
13467 : ParseContent =
13468 struct
13469 structure ParseDtd = ParseDtd (structure ParseBase = ParseBase)
13470
13471 open
13472 Base Errors UniChar UniClasses UtilList
13473 ParseDtd
13474
13475 val THIS_MODULE = "ParseContent"
13476 val DATA_BUFSIZE = 1024
13477 val dataBuffer = Array.array(DATA_BUFSIZE,0w0:UniChar.Char)
13478
13479 (*--------------------------------------------------------------------*)
13480 (* skip a cdata section, the initial "<![" already consumed. The first*)
13481 (* arg is the type of section to be skipped. cf. 2.5: *)
13482 (* *)
13483 (* [18] CDSect ::= CDStart CData CDEnd *)
13484 (* [19] CDStart ::= '<![CDATA[' *)
13485 (* [20] CData ::= (Char* - (Char* ']]>' Char* )) [[ *)
13486 (* [21] CDEnd ::= ']]>' *)
13487 (* *)
13488 (* don't care abeout whether "CDATA[" is present. just skip until the *)
13489 (* next "]]>" or entity end. *)
13490 (* *)
13491 (* return the remaining char and state. *)
13492 (*--------------------------------------------------------------------*)
13493 (* might raise: none *)
13494 (*--------------------------------------------------------------------*)
13495 fun skipBadSection caq =
13496 let(*--------------------------------------------------------------*)
13497 (* for a sequence of "]"s, check whether the last two are *)
13498 (* followed by a ">" *)
13499 (*--------------------------------------------------------------*)
13500 fun checkEnd aq =
13501 let val (c1,a1,q1) = getChar aq
13502 in case c1
13503 of 0wx3E (* #">" *) => getChar(a1,q1)
13504 | 0wx5D (* #"]" *) => checkEnd(a1,q1)
13505 | _ => doit(c1,a1,q1)
13506 end
13507 and doit (c,a,q) =
13508 case c
13509 of 0wx00 => let val a1 = hookError(a,(getPos q,ERR_ENDED_BY_EE LOC_CDATA))
13510 in (c,a1,q)
13511 end
13512 | 0wx5D (* #"]" *) => let val (c1,a1,q1) = getChar(a,q)
13513 in if c1=0wx5D (* #"]" *) then checkEnd(a1,q1)
13514 else doit (c1,a1,q1)
13515 end
13516 | _ => doit (getChar(a,q))
13517 in doit caq
13518 end
13519
13520 (*--------------------------------------------------------------------*)
13521 (* parse a cdata section, the initial "<![CDATA[" already consumed. *)
13522 (* cf. 2.5: *)
13523 (* *)
13524 (* [18] CDSect ::= CDStart CData CDEnd *)
13525 (* [19] CDStart ::= '<![CDATA[' *)
13526 (* [20] CData ::= (Char* - (Char* ']]>' Char* )) [[ *)
13527 (* [21] CDEnd ::= ']]>' *)
13528 (* *)
13529 (* print an error and finish if an entity end is found. *)
13530 (* *)
13531 (* return the data as a Vector option and the next char & state. *)
13532 (*--------------------------------------------------------------------*)
13533 (* might raise: none *)
13534 (*--------------------------------------------------------------------*)
13535 fun parseCDataSection' (aq as (_,q)) =
13536 let
13537 (*--------------------------------------------------------------*)
13538 (* for a sequence of "]"s, check whether the last two are *)
13539 (* followed by a ">" *)
13540 (*--------------------------------------------------------------*)
13541 fun doEnd (text,q0,q1) (a2,q2) =
13542 let val (c3,a3,q3) = getChar (a2,q2)
13543 in case c3
13544 of 0wx00 =>
13545 let val a4 = hookError(a3,(getPos q3,ERR_ENDED_BY_EE LOC_CDATA))
13546 in (0wx5D::text,getPos q2,(c3,a4,q3))
13547 end
13548 | 0wx3E => (* #">" *) (text,getPos q0,getChar(a3,q3))
13549 | 0wx5D => doEnd (0wx5D::text,q1,q2) (a3,q3)
13550 | _ => doit (c3::0wx5D::0wx5D::text) (a3,q3)
13551 end
13552 and doBrack (text,q0) (a1,q1) =
13553 let val (c2,a2,q2) = getChar(a1,q1)
13554 in case c2
13555 of 0wx00 =>
13556 let val a3 = hookError(a2,(getPos q2,ERR_ENDED_BY_EE LOC_CDATA))
13557 in (0wx5D::text,getPos q1,(c2,a3,q2))
13558 end
13559 | 0wx5D (* #"]" *) => doEnd (text,q0,q1) (a2,q2)
13560 | _ => doit (c2::0wx5D::text) (a2,q2)
13561 end
13562 and doit text (a,q) =
13563 let val (c1,a1,q1) = getChar(a,q)
13564 in case c1
13565 of 0wx00 =>
13566 let val a2 = hookError(a1,(getPos q1,ERR_ENDED_BY_EE LOC_CDATA))
13567 in (text,getPos q,(c1,a2,q1))
13568 end
13569 | 0wx5D (* #"]" *) => doBrack (text,q) (a1,q1)
13570 | _ => doit (c1::text) (a1,q1)
13571 end
13572 val (c1,a1,q1) = getChar aq
13573 val startPos = getPos q1
13574 val (cs,endPos,(c2,a2,q2)) =
13575 case c1
13576 of 0wx00 =>
13577 let val a2 = hookError(a1,(getPos q1,ERR_ENDED_BY_EE LOC_CDATA))
13578 in (nil,getPos q,(c1,a2,q1))
13579 end
13580 | 0wx5D (* #"]" *) => doBrack (nil,q) (a1,q1)
13581 | _ => doit [c1] (a1,q1)
13582 val text = Data2Vector(rev cs)
13583 val a3 = hookCData(a1,((startPos,endPos),text))
13584 in (c2,a3,q2)
13585 end
13586 (*--------------------------------------------------------------------*)
13587 (* parse a cdata section, the initial "<![" already consumed. *)
13588 (* cf. 2.5: *)
13589 (* *)
13590 (* [18] CDSect ::= CDStart CData CDEnd *)
13591 (* [19] CDStart ::= '<![CDATA[' *)
13592 (* [20] CData ::= (Char* - (Char* ']]>' Char* )) [[ *)
13593 (* [21] CDEnd ::= ']]>' *)
13594 (* *)
13595 (* print an error and skip the section if no name or a name other *)
13596 (* than CDATA comes first, or no '[' follows the name. *)
13597 (* *)
13598 (* return the text of the section together with the remaining state. *)
13599 (*--------------------------------------------------------------------*)
13600 (* might raise: none *)
13601 (*--------------------------------------------------------------------*)
13602 fun parseCDataSection startPos aq =
13603 let
13604 val caq0 as (_,_,q0) = (getChar aq)
13605 val (name,(c1,a1,q1)) = parseName caq0
13606 handle NotFound (c,a,q) => let val err = expectedOrEnded(expCdata,LOC_CDATA) c
13607 in raise SyntaxError(c,hookError(a,(getPos q,err)),q)
13608 end
13609
13610 val _ = if name = [0wx43,0wx44,0wx41,0wx54,0wx41] (* "CDATA" *) then ()
13611 else let val err = ERR_EXPECTED(expCdata,name)
13612 in raise SyntaxError(c1,hookError(a1,(getPos q0,err)),q1)
13613 end
13614
13615 val _ = if c1=0wx5B (* #"[" *) then ()
13616 else let val err = expectedOrEnded(expLbrack,LOC_CDATA) c1
13617 in raise SyntaxError(c1,hookError(a1,(getPos q1,err)),q1)
13618 end
13619 in
13620 parseCDataSection'(a1,q1)
13621 end
13622 handle SyntaxError caq => skipBadSection caq
13623
13624 (*--------------------------------------------------------------------*)
13625 (* parse element or empty content. The second arg holds the unique *)
13626 (* number of the element's first characters's entity, the index of *)
13627 (* the current element, and the dfa for its content. Cf. 3: *)
13628 (* *)
13629 (* [39] element ::= EmptyElemTag *)
13630 (* | STag content ETag *)
13631 (* ... *)
13632 (* Well-Formedness Constraint: Element Type Match *)
13633 (* The Name in an element's end-tag must match the element type in *)
13634 (* the start-tag. *)
13635 (* *)
13636 (* Validity Constraint: Element Valid *)
13637 (* An element is valid if there is a declaration matching *)
13638 (* elementdecl where the Name matches the element type, and one of *)
13639 (* the following holds: *)
13640 (* *)
13641 (* 1. The declaration matches EMPTY and the element has no content. *)
13642 (* 2. The declaration matches children and the sequence of child *)
13643 (* elements belongs to the language generated by the regular *)
13644 (* expression in the content model, with optional white space *)
13645 (* (characters matching the nonterminal S) between each pair of *)
13646 (* child elements. *)
13647 (* *)
13648 (* and 3.1: *)
13649 (* *)
13650 (* [43] content ::= (element | CharData | Reference | CDSect | PI *)
13651 (* | Comment)* *)
13652 (* 2.4: *)
13653 (* The ampersand character (&) and the left angle bracket (<) may *)
13654 (* appear in their literal form only when used as markup delimiters,*)
13655 (* or within a comment, a processing instruction, or a CDATA *)
13656 (* section... If they are needed elsewhere, they must be escaped *)
13657 (* using either numeric character references or the strings "&amp;" *)
13658 (* and "&lt;" respectively... *)
13659 (* *)
13660 (* consume the content of the element, accumulating it via the user *)
13661 (* data functions (parameter a in subfunctions). trace the content *)
13662 (* model of the element with a dfa transitions on a dfa state (para- *)
13663 (* meter p in subfunctions). finish at the first end-tag, whether *)
13664 (* matching or not, or at the document end. *)
13665 (* *)
13666 (* handle all syntax and other recoverable errors from subfunctions *)
13667 (* and try to continue. *)
13668 (* *)
13669 (* return the accumulated user data and the next char and state. *)
13670 (*--------------------------------------------------------------------*)
13671 (* might raise: none *)
13672 (*--------------------------------------------------------------------*)
13673 fun parseElementContent dtd (openElems,startEnt,curr,dfa,ext,mt) caq =
13674 let
13675 (*--------------------------------------------------------------*)
13676 (* check whether the dfa allows a transition/an end tag here. *)
13677 (* print an error if not. After a transition return the new *)
13678 (* dfa state. *)
13679 (*--------------------------------------------------------------*)
13680 fun fin_elem (a,pos,dfa,p) =
13681 if dfaFinal(dfa,p) then a
13682 else hookError(a,(pos,ERR_ENDED_EARLY(Index2Element dtd curr)))
13683 fun trans_elem (a,q,dfa,p,el) =
13684 let val p1 = dfaTrans(dfa,p,el)
13685 in if p1<>dfaError then (p1,a)
13686 else let val err = ERR_BAD_ELEM(Index2Element dtd curr,Index2Element dtd el)
13687 in (p1,hookError(a,(getPos q,err)))
13688 end
13689 end
13690
13691 (*--------------------------------------------------------------*)
13692 (* consume all white space and skip all data until the next "<" *)
13693 (* or "&". print an error for each sequence of data encountered.*)
13694 (* *)
13695 (* add the white space as data to the user data. *)
13696 (* return the next char and state. *)
13697 (*--------------------------------------------------------------*)
13698 fun do_char_elem (c0,a0,q0) =
13699 let
13700 (*--------------------------------------------------------------*)
13701 (* read data characters until the next "<", "&" or entity end. *)
13702 (* add the data to the user data when an error occurs or no *)
13703 (* more data follows. *)
13704 (* *)
13705 (* return the modified user data with the next char and state. *)
13706 (*--------------------------------------------------------------*)
13707 fun data_hook(a,q,cs) =
13708 if null cs then a
13709 else hookData(a,((getPos q0,getPos q),Data2Vector(rev cs),true))
13710 fun after_error (caq as (c,a,q)) =
13711 case c
13712 of 0wx00 => caq
13713 | 0wx26 (* #"&" *) => caq
13714 | 0wx3C (* #"<" *) => caq
13715 | _ => after_error(getChar(a,q))
13716 fun do_data (yet,aq as (_,q)) =
13717 let val (c1,a1,q1) = getChar aq
13718 in case c1
13719 of 0wx00 => (c1,data_hook(a1,q,yet),q1)
13720 | 0wx26 (* #"&" *) => (c1,data_hook(a1,q,yet),q1)
13721 | 0wx3C (* #"<" *) => (c1,data_hook(a1,q,yet),q1)
13722 | _ =>
13723 if isS c1 then do_data (c1::yet,(a1,q1))
13724 else let val a2 = data_hook(a1,q,yet)
13725 val err = ERR_ELEM_CONTENT(IT_DATA nil)
13726 val a3 = hookError(a2,(getPos q1,err))
13727 in after_error (getChar(a3,q1))
13728 end
13729 end
13730 in
13731 if isS c0 then
13732 let val a1 = if not (ext andalso standsAlone dtd) then a0
13733 else let val err = ERR_STANDALONE_ELEM(Index2Element dtd curr)
13734 val _ = setStandAlone dtd (not (!O_ERROR_MINIMIZE))
13735 in hookError(a0,(getPos q0,err))
13736 end
13737 in do_data ([c0],(a1,q0))
13738 end
13739 else let val a1 = hookError(a0,(getPos q0,ERR_ELEM_CONTENT(IT_DATA nil)))
13740 in after_error(getChar(a1,q0))
13741 end
13742 end
13743 (*--------------------------------------------------------------*)
13744 (* consume a reference, handling errors by ignoring them. *)
13745 (*--------------------------------------------------------------*)
13746 fun do_ref (q,(c1,a1,q1)) =
13747 if c1=0wx23 (* #"#" *)
13748 (*------------------------------------------------------*)
13749 (* it's a character reference. *)
13750 (*------------------------------------------------------*)
13751 then let val err = ERR_ELEM_CONTENT IT_CHAR_REF
13752 val a2 = hookError(a1,(getPos q,err))
13753 in skipCharRef(a2,q1)
13754 end
13755 (*---------------------------------------------------------*)
13756 (* it's a general entity reference. *)
13757 (*---------------------------------------------------------*)
13758 else let val ((id,ent),(a2,q2)) = parseGenRef dtd (c1,a1,q1)
13759 in case ent
13760 of GE_NULL =>
13761 let val a3 = hookGenRef(a2,((getPos q,getPos q2),id,ent,false))
13762 in (getChar(a3,q2))
13763 end
13764 | GE_INTERN(_,rep) =>
13765 let
13766 val q3 = pushIntern(q2,id,false,rep)
13767 val a3 = hookGenRef(a2,((getPos q,getPos q2),id,ent,true))
13768 in (getChar(a3,q3))
13769 end
13770 | GE_EXTERN ext =>
13771 if !O_VALIDATE orelse !O_INCLUDE_EXT_PARSED
13772 then
13773 let
13774 val a3 = hookGenRef(a2,((getPos q,getPos q2),id,ent,true))
13775 val caq4 = #3(openExtern (id,false,resolveExtId ext) (a3,q2))
13776 handle CantOpenFile(fmsg,a)
13777 => let val err = ERR_NO_SUCH_FILE fmsg
13778 val a2 = hookError(a,(getPos q2,err))
13779 val a3 = hookEntEnd(a2,getPos q2)
13780 in (getChar(a3,q2))
13781 end
13782 in caq4
13783 end
13784 else let val a3 = hookGenRef(a2,((getPos q,getPos q2),id,ent,false))
13785 in getChar(a3,q2)
13786 end
13787 | GE_UNPARSED _ =>
13788 raise InternalError
13789 (THIS_MODULE,"parseElementContent",
13790 "parseGenRef returned GE_UNPARSED")
13791 end
13792 (*-------------------------------------------------------*)
13793 (* handle any errors in references by ignoring them. *)
13794 (*-------------------------------------------------------*)
13795 handle SyntaxError caq => caq
13796 | NoSuchEntity aq => getChar aq
13797
13798 (*--------------------------------------------------------------*)
13799 (* handle an end-tag. finish the element in the user data and *)
13800 (* return. *)
13801 (* *)
13802 (* print an error if the element's content is not yet finished. *)
13803 (* print an error if the end-tag is for another element. *)
13804 (* print an error if the element's first character was not in *)
13805 (* the same entity. *)
13806 (*--------------------------------------------------------------*)
13807 and do_etag (p,etag as (elem,space,startPos,endPos),(c,a,q)) =
13808 let
13809 fun checkNesting a =
13810 if getEntId q=startEnt then a
13811 else hookError(a,(startPos,ERR_ELEM_ENT_NESTING(Index2Element dtd curr)))
13812 in
13813 if elem=curr then let val a1 = fin_elem (a,startPos,dfa,p)
13814 val a2 = checkNesting a1
13815 val a3 = hookEndTag
13816 (a2,((startPos,endPos),curr,SOME(elem,space)))
13817 in (NONE,(c,a3,q))
13818 end
13819 else if member elem openElems
13820 then let val err = ERR_OMITTED_END_TAG(Index2Element dtd curr)
13821 val a1 = hookError(a,(startPos,err))
13822 val a2 = fin_elem (a1,startPos,dfa,p)
13823 val a3 = hookEndTag(a2,((startPos,endPos),curr,NONE))
13824 in (SOME etag,(c,a3,q))
13825 end
13826 else if dfaFinal(dfa,p)
13827 then let val err = ERR_ELEM_TYPE_MATCH(Index2Element dtd curr,
13828 Index2Element dtd elem)
13829 val a1 = hookError(a,(startPos,err))
13830 val a2 = checkNesting a1
13831 val a3 = hookEndTag(a2,((startPos,endPos),curr,SOME(elem,space)))
13832 in (NONE,(c,a3,q))
13833 end
13834 else let val err = ERR_IGNORED_END_TAG(Index2Element dtd curr,
13835 Index2Element dtd elem)
13836 val a1 = hookError(a,(startPos,err))
13837 in do_elem(p,(c,a1,q))
13838 end
13839 end
13840
13841 (*--------------------------------------------------------------*)
13842 (* handle a declaration, proc. instr or tag. *)
13843 (*--------------------------------------------------------------*)
13844 and do_lt (p,q,(c1,a1,q1)) =
13845 case c1
13846 of 0wx21 (* #"!" *) =>
13847 (*------------------------------------------------------*)
13848 (* its a declaration, cdata section or comment. *)
13849 (* Only comments are valid. *)
13850 (*------------------------------------------------------*)
13851 let val (c2,a2,q2) = getChar(a1,q1)
13852 val caq3 =
13853 case c2
13854 of 0wx2D (* #"-" *) =>
13855 let val (c3,a3,q3) = getChar(a2,q2)
13856 in if c3=0wx2D then parseComment (getPos q) (a3,q3)
13857 else let val err = ERR_EXPECTED(expDash,[c3])
13858 val a4 = hookError(a3,(getPos q3,err))
13859 in recoverDecl false (c3,a4,q3)
13860 end
13861 end
13862 | 0wx5B (* #"[" *) =>
13863 let val a3 = hookError(a2,(getPos q2,ERR_ELEM_CONTENT IT_CDATA))
13864 in skipBadSection (getChar(a3,q2))
13865 end
13866 | _ => (c2,hookError(a2,(getPos q2,ERR_EXPECTED(expDash,[c2]))),q2)
13867 in do_elem(p,caq3)
13868 end
13869 | 0wx2F (* #"/" *) =>
13870 (let val (elem,space,endPos,caq2) = parseETag dtd (a1,q1)
13871 in do_etag (p,(elem,space,getPos q,endPos),caq2)
13872 end
13873 handle SyntaxError caq => do_elem(p,caq))
13874 | 0wx3F (* #"?" *) => do_elem (p,parseProcInstr (getPos q) (a1,q1))
13875 | _ =>
13876 (*------------------------------------------------------*)
13877 (* it's a start tag. the recursive call to parseElement *)
13878 (* might return an end-tag that has to be consumed. *)
13879 (*------------------------------------------------------*)
13880 if isNms c1 then
13881 let val (p1,(opt,caq2)) =
13882 (let val (stag as ((_,elem,_,_,_),_),(c2,a2,q2)) =
13883 parseSTag dtd (getPos q) (c1,a1,q1)
13884 val (p1,a3) = trans_elem (a2,q1,dfa,p,elem)
13885 in (p1,parseElement (dtd,curr::openElems,q,stag,(c2,a3,q2)))
13886 end)
13887 handle SyntaxError caq => (p,(NONE,caq))
13888 in case opt
13889 of NONE => do_elem (p1,caq2)
13890 | SOME etag => do_etag (p1,etag,caq2)
13891 end
13892 else let val err = ERR_FORBIDDEN_HERE(IT_CHAR 0wx3C,LOC_CONTENT)
13893 val a2 = hookError(a1,(getPos q,err))
13894 in do_elem (p,(c1,a2,q1))
13895 end
13896
13897 (*--------------------------------------------------------------*)
13898 (* do element content. handle the document end by printing an *)
13899 (* error and finishing like with an end-tag. *)
13900 (*--------------------------------------------------------------*)
13901 and do_elem (p,(c,a,q)) =
13902 case c
13903 of 0wx00 => if isSpecial q
13904 then let val err = ERR_OMITTED_END_TAG(Index2Element dtd curr)
13905 val a1 = hookError(a,(getPos q,err))
13906 val pos = getPos q
13907 val a2 = fin_elem (a1,pos,dfa,p)
13908 val a3 = hookEndTag(a2,((pos,pos),curr,NONE))
13909 in (NONE,(c,a3,q))
13910 end
13911 else let val a1 = hookEntEnd(a,getPos q)
13912 in do_elem (p,getChar(a1,q))
13913 end
13914 | 0wx26 (* #"&" *) => do_elem (p,do_ref (q,getChar(a,q)))
13915 | 0wx3C (* #"<" *) => do_lt (p,q,getChar(a,q))
13916 | _ => do_elem (p,do_char_elem (c,a,q))
13917
13918 (*--------------------------------------------------------------*)
13919 (* do empty content. if the first thing to come is the current *)
13920 (* element's end-tag, finish it. Otherwise print an error and *)
13921 (* continue as for element content. *)
13922 (*--------------------------------------------------------------*)
13923 and do_empty (c,a,q) =
13924 if c<>0wx3C (* #"<" *)
13925 then let val a1 = hookError(a,(getPos q,ERR_NONEMPTY(Index2Element dtd curr)))
13926 in do_elem (dfaInitial,(c,a1,q))
13927 end
13928 else
13929 let val (c1,a1,q1) = getChar(a,q)
13930 in if c1<>0wx2F (* #"/" *)
13931 then let val err = ERR_NONEMPTY(Index2Element dtd curr)
13932 val a2 = hookError(a1,(getPos q,err))
13933 in do_lt (dfaInitial,q,(c1,a2,q1))
13934 end
13935 else let val (elem,space,endPos,caq2) = parseETag dtd (a1,q1)
13936 in do_etag (dfaInitial,(elem,space,getPos q,endPos),caq2)
13937 end
13938 handle SyntaxError caq => do_elem (dfaInitial,caq)
13939 end
13940
13941 in if mt then do_empty caq
13942 else do_elem (dfaInitial,caq)
13943 end
13944
13945 (*--------------------------------------------------------------------*)
13946 (* parse mixed or any content. The second arg holds the unique number *)
13947 (* of the element's first characters's entity, the idx of the current *)
13948 (* element, and a function for validating child elements. Cf. 3: *)
13949 (* *)
13950 (* [39] element ::= EmptyElemTag *)
13951 (* | STag content ETag *)
13952 (* ... *)
13953 (* Well-Formedness Constraint: Element Type Match *)
13954 (* The Name in an element's end-tag must match the element type in *)
13955 (* the start-tag. *)
13956 (* *)
13957 (* Validity Constraint: Element Valid *)
13958 (* An element is valid if there is a declaration matching *)
13959 (* elementdecl where the Name matches the element type, and one of *)
13960 (* the following holds: *)
13961 (* ... *)
13962 (* 3. The declaration matches Mixed and the content consists of *)
13963 (* character data and child elements whose types match names in *)
13964 (* the content model. *)
13965 (* 4. The declaration matches ANY, and the types of any child *)
13966 (* elements have been declared. *)
13967 (* *)
13968 (* 3.1: *)
13969 (* *)
13970 (* [43] content ::= (element | CharData | Reference | CDSect | PI *)
13971 (* | Comment)* *)
13972 (* 2.4: *)
13973 (* The ampersand character (&) and the left angle bracket (<) may *)
13974 (* appear in their literal form only when used as markup delimiters,*)
13975 (* or within a comment, a processing instruction, or a CDATA *)
13976 (* section... If they are needed elsewhere, they must be escaped *)
13977 (* using either numeric character references or the strings "&amp;" *)
13978 (* and "&lt;" respectively. The right angle bracket (>) may be *)
13979 (* represented using the string "&gt;", and must, for compatibility,*)
13980 (* be escaped using "&gt;" or a character reference when it appears *)
13981 (* in the string "]]>" in content, when that string is not marking *)
13982 (* the end of a CDATA section. *)
13983 (* *)
13984 (* consume the content of the element, accumulating it via the user *)
13985 (* data functions (parameter a in subfunctions). for each child, *)
13986 (* check whether it was specified in the element's Mixed content *)
13987 (* specification (validate). finish at the first end-tag, whether *)
13988 (* matching or not, or at the document end. *)
13989 (* *)
13990 (* handle all syntax and other recoverable errors from subfunctions *)
13991 (* and try to continue. *)
13992 (* *)
13993 (* return the accumulated user data and the next char and state. *)
13994 (*--------------------------------------------------------------------*)
13995 (* might raise: none *)
13996 (*--------------------------------------------------------------------*)
13997 and parseMixedContent dtd (openElems,startEnt,curr,validate) caq =
13998 let
13999 (*--------------------------------------------------------------*)
14000 (* read data characters until the next "<", "&" or entity end. *)
14001 (* add the data to the user data when an error occurs or no *)
14002 (* more data follows. *)
14003 (* *)
14004 (* return the modified user data with the next char and state. *)
14005 (*--------------------------------------------------------------*)
14006 fun do_data (br,(c0,a0,q0)) =
14007 let
14008 val pos0 = ref (getPos q0)
14009 val _ = Array.update(dataBuffer,0,c0)
14010
14011 fun data_hook (i,(a,q)) =
14012 hookData(a,((!pos0,getPos q),Array.extract(dataBuffer,0,SOME i),false))
14013 fun takeOne (c,qE,i,aq as (a,q)) =
14014 if i<DATA_BUFSIZE then (i+1,aq) before Array.update(dataBuffer,i,c)
14015 else let val a1 = data_hook(i,(a,qE))
14016 val _ = pos0 := getPos q
14017 val _ = Array.update(dataBuffer,0,c)
14018 in (1,(a1,q))
14019 end
14020 fun do_br (n,(i,aq as (_,q))) =
14021 let val (c1,a1,q1) = getChar aq
14022 in case c1
14023 of 0wx00 => (c1,data_hook(i,(a1,q)),q1)
14024 | 0wx26 (* #"&" *) => (c1,data_hook(i,(a1,q)),q1)
14025 | 0wx3C (* #"<" *) => (c1,data_hook(i,(a1,q)),q1)
14026 | 0wx5D (* #"]" *) => do_br (n+1,takeOne(c1,q,i,(a1,q1)))
14027 | 0wx3E (* #">" *) =>
14028 let val a2 = if n=1 then a1
14029 else hookError(a1,(getPos q1,ERR_MUST_ESCAPE c1))
14030 in doit (takeOne(c1,q,i,(a2,q1)))
14031 end
14032 | _ => doit (takeOne(c1,q,i,(a1,q1)))
14033 end
14034 and doit (i,aq as (_,q)) =
14035 let val (c1,a1,q1) = getChar aq
14036 in case c1
14037 of 0wx00 => (c1,data_hook(i,(a1,q)),q1)
14038 | 0wx26 (* #"&" *) => (c1,data_hook(i,(a1,q)),q1)
14039 | 0wx3C (* #"<" *) => (c1,data_hook(i,(a1,q)),q1)
14040 | 0wx5D (* #"]" *) => if !O_COMPATIBILITY
14041 then do_br (1,takeOne(c1,q,i,(a1,q1)))
14042 else doit (takeOne(c1,q,i,(a1,q1)))
14043 | _ => doit (takeOne(c1,q,i,(a1,q1)))
14044 end
14045 in
14046 if br then do_br (1,(1,(a0,q0)))
14047 else doit (1,(a0,q0))
14048 end
14049 (*
14050 fun do_data (br,(c0,a0,q0)) =
14051 let
14052 fun data_hook (yet,(a,q)) =
14053 hookData(a,((getPos q0,getPos q),Data2Vector(rev yet),false))
14054 fun do_br (n,yet,aq as (_,q)) =
14055 let val (c1,a1,q1) = getChar aq
14056 in case c1
14057 of 0wx00 => (c1,data_hook(yet,(a1,q)),q1)
14058 | 0wx26 (* #"&" *) => (c1,data_hook(yet,(a1,q)),q1)
14059 | 0wx3C (* #"<" *) => (c1,data_hook(yet,(a1,q)),q1)
14060 | 0wx5D (* #"]" *) => do_br (n+1,c1::yet,(a1,q1))
14061 | 0wx3E (* #">" *) =>
14062 let val a2 = if n=1 then a1
14063 else hookError(a1,(getPos q1,ERR_MUST_ESCAPE c1))
14064 in doit (c1::yet,(a2,q1))
14065 end
14066 | _ => doit (c1::yet,(a1,q1))
14067 end
14068 and doit (yet,aq as (_,q)) =
14069 let val (c1,a1,q1) = getChar aq
14070 in case c1
14071 of 0wx00 => (c1,data_hook(yet,(a1,q)),q1)
14072 | 0wx26 (* #"&" *) => (c1,data_hook(yet,(a1,q)),q1)
14073 | 0wx3C (* #"<" *) => (c1,data_hook(yet,(a1,q)),q1)
14074 | 0wx5D (* #"]" *) => if !O_COMPATIBILITY
14075 then do_br (1,c1::yet,(a1,q1))
14076 else doit (c1::yet,(a1,q1))
14077 | _ => doit (c1::yet,(a1,q1))
14078 end
14079 in
14080 if br then do_br (1,[0wx5D],(a0,q0))
14081 else doit ([c0],(a0,q0))
14082 end
14083 *)
14084
14085 (*--------------------------------------------------------------*)
14086 (* consume a reference, handling errors by ignoring them. *)
14087 (*--------------------------------------------------------------*)
14088 fun do_ref (q0,(c,a,q)) =
14089 if c=0wx23 (* #"#" *)
14090 (*------------------------------------------------------*)
14091 (* it's a character reference. *)
14092 (*------------------------------------------------------*)
14093 then let val (cs,(ch,a1,q1)) = parseCharRefLit [0wx23,0wx26] (a,q)
14094 val cv = Data2Vector(rev cs)
14095 val a2 = hookCharRef(a1,((getPos q0,getPos q1),ch,cv))
14096 in getChar(a2,q1)
14097 end
14098 handle SyntaxError caq => caq
14099 | NoSuchChar aq => getChar aq
14100 (*---------------------------------------------------------*)
14101 (* it's a general entity reference. *)
14102 (*---------------------------------------------------------*)
14103 else let val ((id,ent),(a1,q1)) = parseGenRef dtd (c,a,q)
14104 in case ent
14105 of GE_NULL =>
14106 let val a2 = hookGenRef(a1,((getPos q0,getPos q1),id,ent,false))
14107 in getChar(a2,q1)
14108 end
14109 | GE_INTERN(_,rep) =>
14110 let
14111 val q2 = pushIntern(q1,id,false,rep)
14112 val a2 = hookGenRef(a1,((getPos q0,getPos q1),id,ent,true))
14113 in getChar(a2,q2)
14114 end
14115 | GE_EXTERN ext =>
14116 if !O_VALIDATE orelse !O_INCLUDE_EXT_PARSED
14117 then
14118 let
14119 val a2 = hookGenRef(a1,((getPos q0,getPos q1),id,ent,true))
14120 val caq3 = #3(openExtern (id,false,resolveExtId ext) (a2,q1))
14121 handle CantOpenFile(fmsg,a)
14122 => let val err = ERR_NO_SUCH_FILE fmsg
14123 val a1 = hookError(a,(getPos q1,err))
14124 val a2 = hookEntEnd(a1,getPos q1)
14125 in (getChar(a2,q1))
14126 end
14127 in caq3
14128 end
14129 else let val a2 = hookGenRef(a1,((getPos q0,getPos q1),id,ent,false))
14130 in getChar(a2,q1)
14131 end
14132 | GE_UNPARSED _ =>
14133 raise InternalError
14134 ("THIS_MODULE","parseMixedContent",
14135 "parseGenRef returned GE_UNPARSED")
14136 end
14137 (*-------------------------------------------------------*)
14138 (* handle any errors in references by ignoring them. *)
14139 (*-------------------------------------------------------*)
14140 handle SyntaxError caq => caq
14141 | NoSuchEntity aq => getChar aq
14142
14143 (*--------------------------------------------------------------*)
14144 (* handle an end-tag. finish the element in the user data and *)
14145 (* return. *)
14146 (* *)
14147 (* print an error if the element's content is not yet finished. *)
14148 (* print an error if the end-tag is for another element. *)
14149 (* print an error if the element's first character was not in *)
14150 (* the same entity. *)
14151 (*--------------------------------------------------------------*)
14152 and do_etag (etag as (elem,space,startPos,endPos),(c,a,q)) =
14153 let
14154 fun checkNesting a =
14155 if getEntId q=startEnt then a
14156 else hookError(a,(startPos,ERR_ELEM_ENT_NESTING(Index2Element dtd curr)))
14157 in
14158 if elem=curr then let val a1 = checkNesting a
14159 val a2 = hookEndTag
14160 (a1,((startPos,endPos),curr,SOME(elem,space)))
14161 in (NONE,(c,a2,q))
14162 end
14163 else if member elem openElems
14164 then let val err = ERR_OMITTED_END_TAG(Index2Element dtd curr)
14165 val a1 = hookError(a,(startPos,err))
14166 val a2 = hookEndTag(a1,((startPos,endPos),curr,NONE))
14167 in (SOME etag,(c,a2,q))
14168 end
14169 else let val err = ERR_ELEM_TYPE_MATCH(Index2Element dtd curr,
14170 Index2Element dtd elem)
14171 val a1 = hookError(a,(startPos,err))
14172 val a2 = checkNesting a1
14173 val a3 = hookEndTag(a2,((startPos,endPos),curr,SOME(elem,space)))
14174 in (NONE,(c,a3,q))
14175 end
14176 end
14177
14178 (*--------------------------------------------------------------*)
14179 (* handle a declaration, proc. instr or tag. If it is an end- *)
14180 (* tag, finish the element in the user data and return. *)
14181 (* *)
14182 (* print an error if the element's content is not yet finished. *)
14183 (* print an error if the end-tag is for another element. *)
14184 (* print an error if the element's first character was not in *)
14185 (* the same entity. *)
14186 (*--------------------------------------------------------------*)
14187 and do_lt (q,(c1,a1,q1)) =
14188 case c1
14189 of 0wx21 (* #"!" *) =>
14190 (*------------------------------------------------------*)
14191 (* its a declaration, cdata section or comment. *)
14192 (* Only comments and cdata sections are valid. *)
14193 (*------------------------------------------------------*)
14194 let val (c2,a2,q2) = getChar(a1,q1)
14195 val caq3 =
14196 case c2
14197 of 0wx2D (* #"-" *) =>
14198 let val (c3,a3,q3) = getChar(a2,q2)
14199 in if c3=0wx2D then parseComment (getPos q) (a3,q3)
14200 else let val err = ERR_EXPECTED(expDash,[c3])
14201 val a4 = hookError(a3,(getPos q3,err))
14202 in recoverDecl false (c3,a4,q3)
14203 end
14204 end
14205 | 0wx5B (* #"[" *) => parseCDataSection (getPos q) (a2,q2)
14206 | _ =>
14207 (c2,hookError(a2,(getPos q2,ERR_EXPECTED(expDashLbrack,[c2]))),q2)
14208 in do_mixed caq3
14209 end
14210 | 0wx2F (* #"/" *) =>
14211 (let val (elem,space,endPos,caq2) = parseETag dtd (a1,q1)
14212 in do_etag ((elem,space,getPos q,endPos),caq2)
14213 end
14214 handle SyntaxError caq => do_mixed caq)
14215 | 0wx3F (* #"?" *) => do_mixed (parseProcInstr (getPos q) (a1,q1))
14216 | _ =>
14217 (*------------------------------------------------------*)
14218 (* it's a start tag. the recursive call to parseElement *)
14219 (* might return an end-tag that has to be consumed. *)
14220 (*------------------------------------------------------*)
14221 if isNms c1 then
14222 let val (opt,caq2) =
14223 (let val (stag as ((_,elem,_,_,_),_),(c2,a2,q2)) =
14224 parseSTag dtd (getPos q) (c1,a1,q1)
14225 val a3 = validate (a2,q1) elem
14226 in parseElement (dtd,curr::openElems,q,stag,(c2,a3,q2))
14227 end
14228 handle SyntaxError caq => (NONE,caq))
14229 in case opt
14230 of NONE => do_mixed caq2
14231 | SOME etag => do_etag (etag,caq2)
14232 end
14233 else let val err = ERR_FORBIDDEN_HERE(IT_CHAR 0wx3C,LOC_CONTENT)
14234 val a2 = hookError(a1,(getPos q,err))
14235 in do_mixed (c1,a2,q1)
14236 end
14237
14238 (*--------------------------------------------------------------*)
14239 (* do mixed content. handle the document end by printing an *)
14240 (* error and finishing like with an end-tag. *)
14241 (*--------------------------------------------------------------*)
14242 and do_mixed (c,a,q) =
14243 case c
14244 of 0wx00 => if isSpecial q
14245 then let val err = ERR_OMITTED_END_TAG(Index2Element dtd curr)
14246 val a1 = hookError(a,(getPos q,err))
14247 val pos = getPos q
14248 val a2 = hookEndTag(a1,((pos,pos),curr,NONE))
14249 in (NONE,(c,a2,q))
14250 end
14251 else let val a1 = hookEntEnd(a,getPos q)
14252 in do_mixed (getChar(a1,q))
14253 end
14254 | 0wx26 (* #"&" *) => do_mixed (do_ref (q,getChar(a,q)))
14255 | 0wx3C (* #"<" *) => do_lt (q,getChar(a,q))
14256 | 0wx5D => do_mixed (do_data (!O_COMPATIBILITY,(c,a,q)))
14257 | _ => do_mixed (do_data (false,(c,a,q)))
14258 in
14259 do_mixed caq
14260 end
14261
14262 (*--------------------------------------------------------------------*)
14263 (* parse an element, the start tag already read. the second arg holds *)
14264 (* the number of the entity of the start-tag's first char, and the *)
14265 (* start-tag information. The 1st arg is the start value for the user *)
14266 (* data. 3: *)
14267 (* *)
14268 (* [39] element ::= EmptyElemTag *)
14269 (* | STag content ETag *)
14270 (* and 3.1: *)
14271 (* *)
14272 (* Empty-element tags may be used for any element which has no *)
14273 (* content, whether or not it is declared using the keyword EMPTY. *)
14274 (* For interoperability, the empty-element tag must be used, and *)
14275 (* can only be used, for elements which are declared EMPTY. *)
14276 (*--------------------------------------------------------------------*)
14277 and parseElement (dtd,openElems,q0,(stag as (_,curr,_,_,mt),elemInfo),(c,a,q)) =
14278 let
14279 (*--------------------------------------------------------------*)
14280 (* validate whether an element is allowed in mixed/any content. *)
14281 (*--------------------------------------------------------------*)
14282 fun trans_any (a,_) _ = a
14283 fun trans_mixed is (a,q) i =
14284 if member i is then a
14285 else let val err = ERR_BAD_ELEM(Index2Element dtd curr,Index2Element dtd i)
14286 in hookError(a,(getPos q,err))
14287 end
14288 in
14289 (*-----------------------------------------------------------*)
14290 (* For empty-element tags, verify that the element's declar. *)
14291 (* allows empty content. *)
14292 (*-----------------------------------------------------------*)
14293 if mt then
14294 let val a1 =
14295 if not (!O_VALIDATE andalso hasDtd dtd) then a
14296 else
14297 case #decl elemInfo
14298 of (SOME(CT_EMPTY,_)) => a
14299 | (SOME(CT_ELEMENT(_,dfa),_)) =>
14300 if not (dfaFinal(dfa,dfaInitial))
14301 then hookError(a,(getPos q0,ERR_EMPTY_TAG(Index2Element dtd curr)))
14302 else if not (!O_INTEROPERABILITY) then a
14303 else hookError
14304 (a,(getPos q0,ERR_EMPTY_TAG_INTER (Index2Element dtd curr)))
14305 | _ => if not (!O_INTEROPERABILITY) then a
14306 else hookError(a,(getPos q0,ERR_EMPTY_TAG_INTER
14307 (Index2Element dtd curr)))
14308 in (NONE,(c,hookStartTag(a1,stag),q))
14309 end
14310 (*-----------------------------------------------------------*)
14311 (* for normal start-tags, check whether the element's decl. *)
14312 (* requires an empty-element tag, or empty content, then *)
14313 (* call the appropriate function that parses the content. *)
14314 (*-----------------------------------------------------------*)
14315 else
14316 let val startEnt = getEntId q0
14317 in if !O_VALIDATE then
14318 case getOpt(#decl elemInfo,(CT_ANY,false))
14319 of (CT_ANY,_) => parseMixedContent dtd
14320 (openElems,startEnt,curr,trans_any) (c,hookStartTag(a,stag),q)
14321 | (CT_MIXED is,_) => parseMixedContent dtd
14322 (openElems,startEnt,curr,trans_mixed is) (c,hookStartTag(a,stag),q)
14323 | (CT_ELEMENT(_,dfa),ext) => parseElementContent dtd
14324 (openElems,startEnt,curr,dfa,ext,false)
14325 (c,hookStartTag(a,stag),q)
14326 | (CT_EMPTY,_) =>
14327 let val a1 = if not (!O_INTEROPERABILITY) then a
14328 else let val err = ERR_MUST_BE_EMPTY(Index2Element dtd curr)
14329 in hookError(a,(getPos q0,err))
14330 end
14331 val a2 = hookStartTag(a1,stag)
14332 in parseElementContent dtd
14333 (openElems,startEnt,curr,emptyDfa,false,true) (c,a2,q)
14334 end
14335 else parseMixedContent dtd
14336 (openElems,startEnt,curr,trans_any) (c,hookStartTag(a,stag),q)
14337 end
14338 end
14339 end
14340 (* stop of ../../Parser/Parse/parseContent.sml *)
14341 (* start of ../../Parser/Parse/parseDocument.sml *)
14342 (*--------------------------------------------------------------------------*)
14343 (* Structure: ParseDocument *)
14344 (* *)
14345 (* Exceptions raised by functions in this structure: *)
14346 (* parseDocTypeDecl : none *)
14347 (*--------------------------------------------------------------------------*)
14348 functor Parse
14349 (structure Dtd : Dtd
14350 structure Hooks : Hooks
14351 structure Resolve : Resolve
14352 structure ParserOptions : ParserOptions) :
14353 sig
14354 val parseDocument : Uri.Uri option -> Dtd.Dtd option -> Hooks.AppData -> Hooks.AppFinal
14355 end
14356 =
14357 struct
14358 structure ParseBase = ParseBase (structure Dtd = Dtd
14359 structure Hooks = Hooks
14360 structure Resolve = Resolve
14361 structure ParserOptions = ParserOptions)
14362
14363 structure ParseContent = ParseContent (structure ParseBase = ParseBase)
14364
14365 open
14366 Base UniChar Errors UniClasses Uri
14367 ParseContent
14368
14369 val THIS_MODULE = "ParseContent"
14370
14371 datatype Where =
14372 PROLOG
14373 | EPILOG
14374 | INSTANCE of int option
14375
14376 fun locOf wher =
14377 case wher
14378 of PROLOG => LOC_PROLOG
14379 | INSTANCE _ => LOC_PROLOG
14380 | EPILOG => LOC_EPILOG
14381
14382 fun checkRoot dtd (a,q) (doc,stag as ((_,elem,_,_,_),_)) =
14383 if !O_VALIDATE
14384 then case doc
14385 of NONE => a
14386 | SOME doc =>
14387 if doc=elem then a
14388 else let val err = ERR_ROOT_ELEM(Index2Element dtd doc,
14389 Index2Element dtd elem)
14390 in hookError(a,(getPos q,err))
14391 end
14392 else a
14393
14394 fun parseDoc dtd caq =
14395 let
14396 fun do_data wher caq =
14397 let fun doit hadError ws (c,a,q) =
14398 case c
14399 of 0wx00 => (ws,(c,a,q))
14400 | 0wx26 (* #"&" *) => (ws,(c,a,q))
14401 | 0wx3C (* #"<" *) => (ws,(c,a,q))
14402 | 0wx09 (* #"\t"*) => doit hadError (c::ws) (getChar(a,q))
14403 | 0wx0A (* #"\n"*) => doit hadError (c::ws) (getChar(a,q))
14404 | 0wx20 (* #" " *) => doit hadError (c::ws) (getChar(a,q))
14405 | _ => let val a1 = if hadError then a
14406 else hookError(a,(getPos q,ERR_FORBIDDEN_HERE
14407 (IT_DATA nil,locOf wher)))
14408 in doit true ws (getChar(a1,q))
14409 end
14410
14411 val (ws,(c1,a1,q1)) = doit false nil caq
14412 val a2 = if null ws then a1
14413 else hookWhite(a1,Data2Vector (rev ws))
14414 in (c1,a2,q1)
14415 end
14416
14417 fun do_decl wher q0 (c,a,q) =
14418 case c
14419 of 0wx2D (* #"-" *) =>
14420 let val (c1,a1,q1) = getChar(a,q)
14421 in if c1=0wx2D then (wher,parseComment (getPos q0) (a1,q1))
14422 else let val err = ERR_EXPECTED(expDash,[c1])
14423 val a2 = hookError(a1,(getPos q1,err))
14424 val caq2 = recoverDecl false (c1,a2,q1)
14425 in (wher,caq2)
14426 end
14427 end
14428 | 0wx5B (* #"[" *) =>
14429 let
14430 val err = ERR_FORBIDDEN_HERE (IT_CDATA,locOf wher)
14431 val a1 = hookError(a,(getPos q0,err))
14432 val caq2 = skipBadSection (getChar(a1,q))
14433 in (wher,caq2)
14434 end
14435 | _ =>
14436 case wher
14437 of PROLOG =>
14438 (let val (name,(c1,a1,q1)) = parseName (c,a,q)
14439 handle NotFound (c,a,q) =>
14440 let val err = expectedOrEnded(expDashDocLbrk,LOC_DECL) c
14441 in raise SyntaxError (c,hookError(a,(getPos q,err)),q)
14442 end
14443
14444 val _ = if name=[0wx44,0wx4f,0wx43,0wx54,0wx59,0wx50,0wx45]
14445 (* "DOCTYPE" *) then ()
14446 else let val err = ERR_EXPECTED(expDashDocLbrk,name)
14447 val a2 = hookError(a1,(getPos q,err))
14448 in raise SyntaxError (c1,a2,q1)
14449 end
14450
14451 val (doc,caq2) = parseDocTypeDecl dtd (c1,a1,q1)
14452 in (INSTANCE doc,caq2)
14453 end
14454 handle SyntaxError caq => (PROLOG,recoverDecl true caq))
14455
14456 | _ => let val loc = if wher=EPILOG then LOC_EPILOG else LOC_AFTER_DTD
14457 val err = ERR_FORBIDDEN_HERE (IT_DECL,loc)
14458 val a1 = hookError(a,(getPos q0,err))
14459 val caq2 = skipDecl true (c,a1,q)
14460 in (wher,caq2)
14461 end
14462
14463 and doit wher (c,a,q) =
14464 case c
14465 of 0wx00 => if isSpecial q then (wher,(a,q))
14466 else doit wher (getChar(a,q))
14467 (*--------------------------------------------------------------*)
14468 (* References are forbidden outside the document element *)
14469 (*--------------------------------------------------------------*)
14470 | 0wx26 (* #"&" *) =>
14471 let
14472 val (c1,a1,q1) = getChar(a,q)
14473 val caq2 =
14474 if c1=0wx23 (* #"#" *)
14475 then let val err = ERR_FORBIDDEN_HERE(IT_CHAR_REF,locOf wher)
14476 val a2 = hookError(a1,(getPos q,err))
14477 in skipCharRef (a2,q1)
14478 end
14479 else let val err = ERR_FORBIDDEN_HERE(IT_REF,locOf wher)
14480 val a2 = hookError(a1,(getPos q,err))
14481 in skipReference (c1,a2,q1)
14482 end
14483 in doit wher caq2
14484 end
14485 | 0wx3C (* #"<" *) =>
14486 let val (c1,a1,q1) = getChar (a,q)
14487 in case c1
14488 of 0wx21 (* #"!" *) =>
14489 let val (wher1,caq2) = do_decl wher q (getChar(a1,q1))
14490 in doit wher1 caq2
14491 end
14492 | 0wx2F (* #"/" *) =>
14493 let
14494 val err = ERR_FORBIDDEN_HERE(IT_ETAG,locOf wher)
14495 val a2 = hookError(a1,(getPos q,err))
14496 val caq3 = skipTag LOC_ETAG (a2,q1)
14497 in doit wher caq3
14498 end
14499 | 0wx3F (* #"?" *) => doit wher (parseProcInstr (getPos q) (a1,q1))
14500 | _ =>
14501 if isName c1 then
14502 let val wher1 =
14503 case wher
14504 of PROLOG => INSTANCE NONE
14505 | _ => wher
14506 in case wher1
14507 of PROLOG =>
14508 raise InternalError(THIS_MODULE,"parseDoc.doit","")
14509 | EPILOG =>
14510 let
14511 val err = ERR_FORBIDDEN_HERE(IT_STAG,LOC_EPILOG)
14512 val a2 = hookError(a1,(getPos q,err))
14513 val caq3 = skipTag LOC_STAG (a2,q1)
14514 in doit EPILOG caq3
14515 end
14516 | INSTANCE doc =>
14517 (let
14518 val a2 =
14519 if not (!O_VALIDATE) orelse isSome doc then a1
14520 else hookError(a1,(getPos q,ERR_NO_DTD))
14521 val (stag,(c3,a3,q3)) = parseSTag
14522 dtd (getPos q) (c1,a2,q1)
14523 val a4 = checkRoot dtd (a3,q1) (doc,stag)
14524 val (opt,(c5,a5,q5)) = parseElement
14525 (dtd,nil,q,stag,(c3,a4,q3))
14526 val a6 = checkDefinedIds dtd (a5,q5)
14527 in case opt
14528 of NONE => doit EPILOG (c5,a6,q5)
14529 | SOME (_,_,startPos,_) =>
14530 let
14531 val err = ERR_FORBIDDEN_HERE(IT_ETAG,LOC_EPILOG)
14532 val a7 = hookError(a6,(startPos,err))
14533 in doit EPILOG (c5,a7,q5)
14534 end
14535 end
14536 handle SyntaxError caq => doit wher1 caq)
14537 end
14538 else let val err = ERR_FORBIDDEN_HERE(IT_CHAR 0wx3C,locOf wher)
14539 val a2 = hookError(a1,(getPos q,err))
14540 in doit wher (c1,a2,q1)
14541 end
14542 end
14543 | _ => let val caq1 = do_data wher (c,a,q)
14544 in doit wher caq1
14545 end
14546 in
14547 doit PROLOG caq
14548 end
14549
14550 (* to false. (cf. 2.9) *)
14551 (* *)
14552 (* ... If ... there is no standalone document declaration, the *)
14553 (* value "no" is assumed. *)
14554 fun parseDocument uriOpt dtdOpt a =
14555 let
14556 val dtd = case dtdOpt
14557 of NONE => initDtdTables ()
14558 | SOME dtd => dtd
14559 val (enc,xmlDecl,(c1,a1,q1)) = openDocument uriOpt a
14560 val uri = getUri q1
14561 val alone = case xmlDecl
14562 of (SOME(_,_,SOME sa)) => sa
14563 | _ => false
14564 val _ = if alone then setStandAlone dtd true else ()
14565 val a2 = hookXml(a1,(uri,enc,xmlDecl))
14566 val (wher,(a3,q3)) = parseDoc dtd (c1,a2,q1)
14567 val _ = closeAll q3
14568 val a4 = case wher
14569 of EPILOG => a3
14570 | _ => hookError(a3,(getPos q3,ERR_ENDED_IN_PROLOG))
14571 in hookFinish a4
14572 end
14573 handle CantOpenFile(fmsg,a) =>
14574 let val a1 = hookError(a,(nullPosition,ERR_NO_SUCH_FILE fmsg))
14575 in hookFinish a1
14576 end
14577 end
14578 (* stop of ../../Parser/Parse/parseDocument.sml *)
14579 (* start of ../../Catalog/catError.sml *)
14580
14581
14582
14583
14584
14585
14586
14587
14588
14589
14590 signature CatError =
14591 sig
14592 type Position
14593 val nullPosition : Position
14594 val Position2String : Position -> string
14595
14596 datatype Location =
14597 LOC_CATALOG
14598 | LOC_COMMENT
14599 | LOC_NOCOMMENT
14600 | LOC_PUBID
14601 | LOC_SYSID
14602
14603 datatype Expected =
14604 EXP_NAME
14605 | EXP_LITERAL
14606
14607 datatype CatError =
14608 ERR_DECODE_ERROR of Decode.Error.DecodeError
14609 | ERR_NO_SUCH_FILE of string * string
14610 | ERR_ILLEGAL_HERE of UniChar.Char * Location
14611 | ERR_MISSING_WHITE
14612 | ERR_EOF of Location
14613 | ERR_EXPECTED of Expected * UniChar.Char
14614 | ERR_XML of Errors.Error
14615 | ERR_MISSING_ATT of UniChar.Data * UniChar.Data
14616 | ERR_NON_PUBID of UniChar.Data * UniChar.Data
14617
14618 val catMessage : CatError -> string list
14619 end
14620
14621 structure CatError : CatError =
14622 struct
14623 open Errors UtilError UtilString
14624
14625 type Position = string * int * int
14626 val nullPosition = ("",0,0)
14627
14628 fun Position2String (fname,l,c) =
14629 if fname="" then ""
14630 else String.concat ["[",fname,":",Int2String l,".",Int2String c,"]"]
14631
14632 datatype Location =
14633 LOC_CATALOG
14634 | LOC_COMMENT
14635 | LOC_NOCOMMENT
14636 | LOC_PUBID
14637 | LOC_SYSID
14638
14639 fun Location2String loc =
14640 case loc
14641 of LOC_CATALOG => "catalog file"
14642 | LOC_COMMENT => "comment"
14643 | LOC_NOCOMMENT => "something other than a comment"
14644 | LOC_PUBID => "public identifier"
14645 | LOC_SYSID => "system identifier"
14646
14647 fun InLocation2String loc =
14648 case loc
14649 of LOC_CATALOG => "in a catalog file"
14650 | LOC_COMMENT => "in a comment"
14651 | LOC_NOCOMMENT => "outside of comments"
14652 | LOC_PUBID => "in a public identifier"
14653 | LOC_SYSID => "in a system identifier"
14654
14655 datatype Expected =
14656 EXP_NAME
14657 | EXP_LITERAL
14658
14659 fun Expected2String exp =
14660 case exp
14661 of EXP_NAME => "a name"
14662 | EXP_LITERAL => "a literal"
14663
14664 datatype CatError =
14665 ERR_DECODE_ERROR of Decode.Error.DecodeError
14666 | ERR_NO_SUCH_FILE of string * string
14667 | ERR_ILLEGAL_HERE of UniChar.Char * Location
14668 | ERR_MISSING_WHITE
14669 | ERR_EOF of Location
14670 | ERR_EXPECTED of Expected * UniChar.Char
14671 | ERR_XML of Error
14672 | ERR_MISSING_ATT of UniChar.Data * UniChar.Data
14673 | ERR_NON_PUBID of UniChar.Data * UniChar.Data
14674
14675 fun catMessage err =
14676 case err
14677 of ERR_DECODE_ERROR err => Decode.Error.decodeMessage err
14678 | ERR_NO_SUCH_FILE(f,msg) => ["Could not open file",quoteErrorString f,"("^msg^")"]
14679
14680 | ERR_ILLEGAL_HERE (c,loc) =>
14681 ["Character",quoteErrorChar c,"is not allowed",InLocation2String loc]
14682
14683 | ERR_MISSING_WHITE => ["Missing white space"]
14684 | ERR_EOF loc => [toUpperFirst (Location2String loc),"ended by end of file"]
14685 | ERR_EXPECTED (exp,c) =>
14686 ["Expected",Expected2String exp,"but found",quoteErrorChar c]
14687
14688 | ERR_XML err => errorMessage err
14689 | ERR_MISSING_ATT(elem,att) =>
14690 ["Element",quoteErrorData elem,"has no",quoteErrorData att,"attribute"]
14691 | ERR_NON_PUBID(att,cs) =>
14692 ["Value specified for attribute",quoteErrorData att,"contains non-PublicId",
14693 case cs
14694 of [c] => "character"^quoteErrorChar c
14695 | cs => List2xString ("characters ",", ","") quoteErrorChar cs]
14696 end
14697 (* stop of ../../Catalog/catError.sml *)
14698 (* start of ../../Catalog/catParams.sml *)
14699
14700
14701
14702
14703
14704 signature CatParams =
14705 sig
14706 val O_CATALOG_FILES : Uri.Uri list ref
14707 val O_PREFER_SOCAT : bool ref
14708 val O_PREFER_SYSID : bool ref
14709 val O_PREFER_CATALOG : bool ref
14710 val O_SUPPORT_REMAP : bool ref
14711 val O_CATALOG_ENC : Encoding.Encoding ref
14712
14713 val catError : CatError.Position * CatError.CatError -> unit
14714 end
14715
14716 (* stop of ../../Catalog/catParams.sml *)
14717 (* start of ../../Unicode/Uri/uriDict.sml *)
14718
14719
14720
14721
14722
14723
14724
14725 structure KeyUri : Key =
14726 struct
14727 type Key = Uri.Uri
14728
14729 val null = Uri.emptyUri
14730 val compare = Uri.compareUri
14731 val toString = Uri.Uri2String
14732 val hash = Uri.hashUri
14733 end
14734
14735 structure UriDict = Dict (structure Key = KeyUri)
14736 (* stop of ../../Unicode/Uri/uriDict.sml *)
14737 (* start of ../../Catalog/catData.sml *)
14738
14739
14740 structure CatData =
14741 struct
14742 datatype CatEntry =
14743 E_BASE of Uri.Uri
14744 | E_DELEGATE of string * Uri.Uri
14745 | E_EXTEND of Uri.Uri
14746 | E_MAP of string * Uri.Uri
14747 | E_REMAP of Uri.Uri * Uri.Uri
14748
14749 type Catalog = Uri.Uri * CatEntry list
14750 end
14751 (* stop of ../../Catalog/catData.sml *)
14752 (* start of ../../Catalog/catFile.sml *)
14753
14754
14755
14756
14757
14758
14759
14760
14761
14762 signature CatFile =
14763 sig
14764 type CatFile
14765 type Position
14766
14767 val catOpenFile : Uri.Uri -> CatFile
14768 val catCloseFile : CatFile -> unit
14769 val catGetChar : CatFile -> UniChar.Char * CatFile
14770 val catPos : CatFile -> CatError.Position
14771 end
14772
14773 functor CatFile ( structure Params : CatParams ) : CatFile =
14774 struct
14775 open UniChar CatError Decode Params Uri UtilError
14776
14777 (* column, line, break *)
14778 type PosInfo = int * int * bool
14779 val startPos = (0,1,false)
14780
14781 datatype CatFile =
14782 NOFILE of string * PosInfo
14783 | DIRECT of DecFile * PosInfo
14784
14785 fun catPos cf =
14786 case cf
14787 of NOFILE (uri,(col,line,_)) => (uri,line,col)
14788 | DIRECT (dec,(col,line,_)) => (decName dec,line,col)
14789
14790 fun catOpenFile uri =
14791 let val dec = decOpenUni(SOME uri,!O_CATALOG_ENC)
14792 in DIRECT(dec,startPos)
14793 end
14794 handle NoSuchFile fmsg => let val _ = catError(nullPosition,ERR_NO_SUCH_FILE fmsg)
14795 in NOFILE(Uri2String uri,startPos)
14796 end
14797
14798 fun catCloseFile cf =
14799 case cf
14800 of NOFILE _ => ()
14801 | DIRECT(dec,_) => ignore (decClose dec)
14802
14803 fun catGetChar cf =
14804 case cf
14805 of NOFILE _ => (0wx00,cf)
14806 | DIRECT(dec,(col,line,brk)) =>
14807 (let val (c,dec1) = decGetChar dec
14808 in case c
14809 of 0wx09 => (c,DIRECT(dec1,(col+1,line,false)))
14810 | 0wx0A => if brk then catGetChar(DIRECT(dec1,(col,line,false)))
14811 else (c,DIRECT(dec1,(0,line+1,false)))
14812 | 0wx0D => (0wx0A,DIRECT(dec1,(0,line+1,true)))
14813 | _ => if c>=0wx20 then (c,DIRECT(dec1,(col+1,line,false)))
14814 else let val err = ERR_ILLEGAL_HERE(c,LOC_CATALOG)
14815 val _ = catError(catPos cf,err)
14816 in catGetChar(DIRECT(dec1,(col+1,line,false)))
14817 end
14818 end
14819 handle DecEof dec => (0wx00,NOFILE(decName dec,(col,line,brk)))
14820 | DecError(dec,_,err) =>
14821 let val _ = catError(catPos cf,ERR_DECODE_ERROR err)
14822 in catGetChar(DIRECT(dec,(col,line,false)))
14823 end
14824 )
14825 end
14826
14827 (* stop of ../../Catalog/catFile.sml *)
14828 (* start of ../../Catalog/socatParse.sml *)
14829
14830
14831
14832
14833
14834
14835
14836
14837
14838 signature SocatParse =
14839 sig
14840 val parseSoCat : Uri.Uri -> CatData.Catalog
14841 end
14842
14843 functor SocatParse ( structure Params : CatParams ) : SocatParse =
14844 struct
14845 structure CatFile = CatFile ( structure Params = Params )
14846
14847 open CatData CatError CatFile Params UniChar UniClasses Uri
14848
14849 exception SyntaxError of UniChar.Char * CatFile.CatFile
14850 exception NotFound of UniChar.Char * CatFile.CatFile
14851
14852 val getChar = catGetChar
14853
14854 fun parseName' (c,f) =
14855 if isName c then let val (cs,cf1) = parseName' (getChar f)
14856 in (c::cs,cf1)
14857 end
14858 else (nil,(c,f))
14859 fun parseName (c,f) =
14860 if isNms c then let val (cs,cf1) = parseName' (getChar f)
14861 in (c::cs,cf1)
14862 end
14863 else raise NotFound (c,f)
14864
14865 datatype Keyword =
14866 KW_BASE
14867 | KW_CATALOG
14868 | KW_DELEGATE
14869 | KW_PUBLIC
14870 | KW_SYSTEM
14871 | KW_OTHER of UniChar.Data
14872
14873 fun parseKeyword cf =
14874 let
14875 val (name,cf1) = parseName cf
14876 val kw = case name
14877 of [0wx42,0wx41,0wx53,0wx45] => KW_BASE
14878 | [0wx43,0wx41,0wx54,0wx41,0wx4c,0wx4f,0wx47] => KW_CATALOG
14879 | [0wx44,0wx45,0wx4c,0wx45,0wx47,0wx41,0wx54,0wx45] => KW_DELEGATE
14880 | [0wx50,0wx55,0wx42,0wx4c,0wx49,0wx43] => KW_PUBLIC
14881 | [0wx53,0wx59,0wx53,0wx54,0wx45,0wx4d] => KW_SYSTEM
14882 | _ => KW_OTHER name
14883 in (kw,cf1)
14884 end
14885
14886 fun parseSysLit' quote f =
14887 let
14888 fun doit text (c,f) =
14889 if c=quote then (text,getChar f)
14890 else if c<>0wx0 then doit (c::text) (getChar f)
14891 else let val _ = catError(catPos f,ERR_EOF LOC_SYSID)
14892 in (text,(c,f))
14893 end
14894 val (text,cf1) = doit nil (getChar f)
14895 in (Data2Uri(rev text),cf1)
14896 end
14897 fun parseSysLit req (c,f) =
14898 if c=0wx22 orelse c=0wx27 then parseSysLit' c f
14899 else if req then let val _ = catError(catPos f,ERR_EXPECTED(EXP_LITERAL,c))
14900 in raise SyntaxError (c,f)
14901 end
14902 else raise NotFound (c,f)
14903
14904 fun parsePubLit' quote f =
14905 let
14906 fun doit (hadSpace,atStart,text) (c,f) =
14907 case c
14908 of 0wx0 => let val _ = catError(catPos f,ERR_EOF LOC_PUBID)
14909 in (text,(c,f))
14910 end
14911 | 0wx0A => doit (true,atStart,text) (getChar f)
14912 | 0wx20 => doit (true,atStart,text) (getChar f)
14913 | _ =>
14914 if c=quote then (text,getChar f)
14915 else if isPubid c
14916 then if hadSpace andalso not atStart
14917 then doit (false,false,c::0wx20::text) (getChar f)
14918 else doit (false,false,c::text) (getChar f)
14919 else let val _ = catError(catPos f,ERR_ILLEGAL_HERE(c,LOC_PUBID))
14920 in doit (hadSpace,atStart,text) (getChar f)
14921 end
14922 val (text,cf1) = doit (false,true,nil) (getChar f)
14923 in (Latin2String(rev text),cf1)
14924 end
14925 fun parsePubLit (c,f) =
14926 if c=0wx22 orelse c=0wx27 then parsePubLit' c f
14927 else let val _ = catError(catPos f,ERR_EXPECTED(EXP_LITERAL,c))
14928 in raise SyntaxError (c,f)
14929 end
14930
14931 fun skipComment (c,f) =
14932 case c
14933 of 0wx00 => let val _ = catError(catPos f,ERR_EOF LOC_COMMENT)
14934 in (c,f)
14935 end
14936 | 0wx2D => let val (c1,f1) = getChar f
14937 in if c1 = 0wx2D then (getChar f1) else skipComment (c1,f1)
14938 end
14939 | _ => skipComment (getChar f)
14940 fun skipCopt (c,f) =
14941 case c
14942 of 0wx00 => (c,f)
14943 | 0wx2D => let val (c1,f1) = getChar f
14944 in if c1=0wx2D then skipComment (getChar f1)
14945 else let val _ = catError(catPos f,ERR_ILLEGAL_HERE(c,LOC_NOCOMMENT))
14946 in (c1,f1)
14947 end
14948 end
14949 | _ => (c,f)
14950
14951 fun skipScomm req0 cf =
14952 let
14953 fun endit req (c,f) =
14954 if req andalso c<>0wx00
14955 then let val _ = catError(catPos f,ERR_MISSING_WHITE)
14956 in (c,f)
14957 end
14958 else (c,f)
14959 fun doit req (c,f) =
14960 case c
14961 of 0wx00 => endit req (c,f)
14962 | 0wx09 => doit false (getChar f)
14963 | 0wx0A => doit false (getChar f)
14964 | 0wx20 => doit false (getChar f)
14965 | 0wx22 => endit req (c,f)
14966 | 0wx27 => endit req (c,f)
14967 | 0wx2D =>
14968 let val (c1,f1) = getChar f
14969 in if c1=0wx2D
14970 then let val _ = if not req then ()
14971 else catError(catPos f1,ERR_MISSING_WHITE)
14972 val cf1 = skipComment (getChar f1)
14973 in doit true cf1
14974 end
14975 else let val _ = catError(catPos f,ERR_ILLEGAL_HERE(c,LOC_NOCOMMENT))
14976 in doit req (c1,f1)
14977 end
14978 end
14979 | _ => if isNms c then endit req (c,f)
14980 else let val _ = catError(catPos f,ERR_ILLEGAL_HERE(c,LOC_NOCOMMENT))
14981 in doit req (getChar f)
14982 end
14983 in doit req0 cf
14984 end
14985
14986 val skipWS = skipScomm true
14987 val skipCommWS = (skipScomm false) o skipCopt
14988 val skipWSComm = skipScomm false
14989
14990 fun skipOther cf =
14991 let
14992 val cf1 = skipWS cf
14993 val cf2 = let val (_,cf') = parseName cf1
14994 in skipWS cf'
14995 end
14996 handle NotFound cf => cf
14997
14998 fun doit cf =
14999 let val (_,cf1) = parseSysLit false cf
15000 in doit (skipWS cf1)
15001 end
15002 handle NotFound(c,f) => (c,f)
15003 in
15004 (NONE,doit cf2)
15005 end
15006
15007 fun parseBase cf =
15008 let
15009 val cf1 = skipWS cf
15010 val (lit,cf2) = parseSysLit true cf1
15011 val cf3 = skipWS cf2
15012 in
15013 (SOME(E_BASE lit),cf3)
15014 end
15015
15016 fun parseExtend cf =
15017 let
15018 val cf1 = skipWS cf
15019 val (lit,cf2) = parseSysLit true cf1
15020 val cf3 = skipWS cf2
15021 in
15022 (SOME(E_EXTEND lit),cf3)
15023 end
15024
15025 fun parseDelegate cf =
15026 let
15027 val cf1 = skipWS cf
15028 val (pub,cf2) = parsePubLit cf1
15029 val cf3 = skipWS cf2
15030 val (sys,cf4) = parseSysLit true cf3
15031 val cf5 = skipWS cf4
15032 in
15033 (SOME(E_DELEGATE(pub,sys)),cf5)
15034 end
15035
15036 fun parseRemap cf =
15037 let
15038 val cf1 = skipWS cf
15039 val (sys0,cf2) = parseSysLit true cf1
15040 val cf3 = skipWS cf2
15041 val (sys,cf4) = parseSysLit true cf3
15042 val cf5 = skipWS cf4
15043 in
15044 (SOME(E_REMAP(sys0,sys)),cf5)
15045 end
15046
15047 fun parseMap cf =
15048 let
15049 val cf1 = skipWS cf
15050 val (pub,cf2) = parsePubLit cf1
15051 val cf3 = skipWS cf2
15052 val (sys,cf4) = parseSysLit true cf3
15053 val cf5 = skipWS cf4
15054 in
15055 (SOME(E_MAP(pub,sys)),cf5)
15056 end
15057
15058 fun recover cf =
15059 let
15060 fun do_lit q (c,f) =
15061 if c=0wx00 then (c,f)
15062 else if c=q then getChar f
15063 else do_lit q (getChar f)
15064 fun do_com (c,f) =
15065 case c
15066 of 0wx00 => (c,f)
15067 | 0wx2D => let val (c1,f1) = getChar f
15068 in if c1=0wx2D then getChar f1
15069 else do_com (c1,f1)
15070 end
15071 | _ => do_com (getChar f)
15072 fun doit (c,f) =
15073 case c
15074 of 0wx00 => (c,f)
15075 | 0wx22 => doit (do_lit c (getChar f))
15076 | 0wx27 => doit (do_lit c (getChar f))
15077 | 0wx2D => let val (c1,f1) = getChar f
15078 in if c1=0wx2D then doit (do_com (getChar f1))
15079 else doit (c1,f1)
15080 end
15081 | _ => if isNms c then (c,f)
15082 else doit (getChar f)
15083 in doit cf
15084 end
15085
15086 fun parseEntry (cf as (c,f)) =
15087 let val (kw,cf1) = parseKeyword cf handle NotFound cf => raise SyntaxError cf
15088 in case kw
15089 of KW_BASE => parseBase cf1
15090 | KW_CATALOG => parseExtend cf1
15091 | KW_DELEGATE => parseDelegate cf1
15092 | KW_SYSTEM => parseRemap cf1
15093 | KW_PUBLIC => parseMap cf1
15094 | KW_OTHER _ => skipOther cf1
15095 end
15096 handle SyntaxError cf => (NONE,recover cf)
15097
15098 fun parseDocument cf =
15099 let
15100 fun doit (c,f) =
15101 if c=0wx0 then nil before catCloseFile f
15102 else let val (opt,cf1) = parseEntry (c,f)
15103 val entries = doit cf1
15104 in case opt
15105 of NONE => entries
15106 | SOME entry => entry::entries
15107 end
15108
15109 val cf1 = skipCommWS cf
15110 in
15111 doit cf1
15112 end
15113
15114 fun parseSoCat uri =
15115 let
15116 val f = catOpenFile uri
15117 val cf1 = getChar f
15118 in
15119 (uri,parseDocument cf1)
15120 end
15121 end
15122 (* stop of ../../Catalog/socatParse.sml *)
15123 (* start of ../../Catalog/catDtd.sml *)
15124 signature CatDtd =
15125 sig
15126 type Dtd
15127
15128 val baseIdx : int
15129 val delegateIdx : int
15130 val extendIdx : int
15131 val mapIdx : int
15132 val remapIdx : int
15133
15134 val hrefIdx : int
15135 val pubidIdx : int
15136 val sysidIdx : int
15137
15138 val Index2AttNot : Dtd -> int -> UniChar.Data
15139 val Index2Element : Dtd -> int -> UniChar.Data
15140 end
15141
15142 structure CatDtd =
15143 struct
15144 open Dtd
15145
15146 val baseGi = UniChar.String2Data "Base"
15147 val delegateGi = UniChar.String2Data "Delegate"
15148 val extendGi = UniChar.String2Data "Extend"
15149 val mapGi = UniChar.String2Data "Map"
15150 val remapGi = UniChar.String2Data "Remap"
15151
15152 val hrefAtt = UniChar.String2Data "HRef"
15153 val pubidAtt = UniChar.String2Data "PublicId"
15154 val sysidAtt = UniChar.String2Data "SystemId"
15155
15156 fun initDtdTables () =
15157 let
15158 val dtd = Dtd.initDtdTables()
15159 val _ = app (ignore o (Element2Index dtd)) [baseGi,delegateGi,extendGi,mapGi,remapGi]
15160 val _ = app (ignore o (AttNot2Index dtd)) [hrefAtt,pubidAtt,sysidAtt]
15161 in dtd
15162 end
15163
15164 local
15165 val dtd = initDtdTables()
15166 in
15167 val baseIdx = Element2Index dtd baseGi
15168 val delegateIdx = Element2Index dtd delegateGi
15169 val extendIdx = Element2Index dtd extendGi
15170 val mapIdx = Element2Index dtd mapGi
15171 val remapIdx = Element2Index dtd remapGi
15172
15173 val hrefIdx = AttNot2Index dtd hrefAtt
15174 val pubidIdx = AttNot2Index dtd pubidAtt
15175 val sysidIdx = AttNot2Index dtd sysidAtt
15176 end
15177 end
15178 (* stop of ../../Catalog/catDtd.sml *)
15179 (* start of ../../Parser/Params/ignore.sml *)
15180 structure IgnoreHooks =
15181 struct
15182 type AppData = unit
15183 type AppFinal = unit
15184
15185 fun hookXml(a,_) = a
15186 fun hookFinish a = a
15187
15188 fun hookError(a,_) = a
15189 fun hookWarning(a,_) = a
15190
15191 fun hookProcInst(a,_) = a
15192 fun hookComment(a,_) = a
15193 fun hookWhite(a,_) = a
15194 fun hookDecl (a,_) = a
15195
15196 fun hookStartTag(a,_) = a
15197 fun hookEndTag(a,_) = a
15198 fun hookCData(a,_) = a
15199 fun hookData(a,_) = a
15200
15201 fun hookCharRef(a,_) = a
15202 fun hookGenRef(a,_) = a
15203 fun hookParRef(a,_) = a
15204 fun hookEntEnd(a,_) = a
15205
15206 fun hookDocType(a,_) = a
15207 fun hookSubset(a,_) = a
15208 fun hookExtSubset(a,_) = a
15209 fun hookEndDtd(a,_) = a
15210 end
15211 (* stop of ../../Parser/Params/ignore.sml *)
15212 (* start of ../../Catalog/catHooks.sml *)
15213 signature CatHooks =
15214 sig
15215 type AppData = CatData.CatEntry list
15216
15217 val initCatHooks : unit -> AppData
15218 end
15219
15220 functor CatHooks (structure Params : CatParams
15221 structure Dtd : CatDtd ) =
15222 struct
15223 open
15224 Dtd HookData IgnoreHooks Params UniChar UniClasses Uri UtilList
15225 CatData CatError
15226
15227 type AppData = Dtd * CatEntry list
15228 type AppFinal = CatEntry list
15229
15230 fun initCatHooks dtd = (dtd,nil)
15231
15232 fun hookError (a,(pos,err)) = a before catError (pos,ERR_XML err)
15233
15234 fun getAtt dtd (pos,elem,att,trans) atts =
15235 let
15236 val cvOpt = findAndMap
15237 (fn (i,ap,_) => if i<>att then NONE
15238 else case ap
15239 of AP_DEFAULT(_,cv,_) => SOME cv
15240 | AP_PRESENT(_,cv,_) => SOME cv
15241 | _ => NONE)
15242 atts
15243 in case cvOpt
15244 of SOME cv => trans (pos,att) cv
15245 | NONE => NONE before catError
15246 (pos,ERR_MISSING_ATT(Index2Element dtd elem,Index2AttNot dtd att))
15247 end
15248
15249 fun makePubid dtd (pos,att) cv =
15250 let val (cs,bad) =
15251 Vector.foldr
15252 (fn (c,(cs,bad)) => if isPubid c then (Char2char c::cs,bad)
15253 else (cs,c::bad))
15254 (nil,nil) cv
15255 in if null bad then SOME(String.implode cs)
15256 else NONE before catError(pos,ERR_NON_PUBID(Index2AttNot dtd att,bad))
15257 end
15258
15259 fun makeUri (pos,att) cv = SOME cv
15260
15261 fun hookStartTag (a as (dtd,items),((_,pos),elem,atts,_,_)) =
15262 if elem=baseIdx
15263 then let val hrefOpt = getAtt dtd (pos,elem,hrefIdx,makeUri) atts
15264 in case hrefOpt
15265 of NONE => a
15266 | SOME href => (dtd,E_BASE (Vector2Uri href)::items)
15267 end
15268 else if elem=delegateIdx
15269 then let val hrefOpt = getAtt dtd (pos,elem,hrefIdx,makeUri) atts
15270 val pubidOpt = getAtt dtd (pos,elem,pubidIdx,makePubid dtd) atts
15271 in case (hrefOpt,pubidOpt)
15272 of (SOME href,SOME pubid) =>
15273 (dtd,E_DELEGATE(pubid,Vector2Uri href)::items)
15274 | _ => a
15275 end
15276 else if elem=extendIdx
15277 then let val hrefOpt = getAtt dtd (pos,elem,hrefIdx,makeUri) atts
15278 in case hrefOpt
15279 of NONE => a
15280 | SOME href => (dtd,E_EXTEND (Vector2Uri href)::items)
15281 end
15282 else if elem=mapIdx
15283 then let val hrefOpt = getAtt dtd (pos,elem,hrefIdx,makeUri) atts
15284 val pubidOpt = getAtt dtd (pos,elem,pubidIdx,makePubid dtd) atts
15285 in case (hrefOpt,pubidOpt)
15286 of (SOME href,SOME pubid) =>
15287 (dtd,E_MAP(pubid,Vector2Uri href)::items)
15288 | _ => a
15289 end
15290 else if elem=remapIdx
15291 then let val hrefOpt = getAtt dtd (pos,elem,hrefIdx,makeUri) atts
15292 val sysidOpt = getAtt dtd (pos,elem,sysidIdx,makeUri) atts
15293 in case (hrefOpt,sysidOpt)
15294 of (SOME href,SOME sysid) =>
15295 (dtd,E_REMAP(Vector2Uri sysid,Vector2Uri href)::items)
15296 | _ => a
15297 end
15298 else a
15299
15300 fun hookFinish (_,items) = rev items
15301 end
15302 (* stop of ../../Catalog/catHooks.sml *)
15303 (* start of ../../Catalog/catParse.sml *)
15304 signature CatParse =
15305 sig
15306 val parseCatalog : Uri.Uri -> CatData.Catalog
15307 end
15308
15309 functor CatParse (structure Params : CatParams) : CatParse =
15310 struct
15311 structure SocatParse = SocatParse (structure Params = Params)
15312
15313 structure ParserOptions =
15314 struct
15315 structure Options = ParserOptions()
15316 open Options
15317
15318 local
15319 fun setDefaults() =
15320 let
15321 val _ = setParserDefaults()
15322
15323 val _ = O_WARN_MULT_ENUM := false
15324 val _ = O_WARN_XML_DECL := false
15325 val _ = O_WARN_ATT_NO_ELEM := false
15326 val _ = O_WARN_MULT_ENT_DECL := false
15327 val _ = O_WARN_MULT_NOT_DECL := false
15328 val _ = O_WARN_MULT_ATT_DEF := false
15329 val _ = O_WARN_MULT_ATT_DECL := false
15330 val _ = O_WARN_SHOULD_DECLARE := false
15331
15332 val _ = O_VALIDATE := false
15333 val _ = O_COMPATIBILITY := false
15334 val _ = O_INTEROPERABILITY := false
15335
15336 val _ = O_INCLUDE_EXT_PARSED := true
15337 in ()
15338 end
15339 in
15340 val setParserDefaults = setDefaults
15341 end
15342
15343 end
15344 structure CatHooks = CatHooks (structure Params = Params
15345 structure Dtd = CatDtd)
15346 structure Parse = Parse (structure Dtd = CatDtd
15347 structure Hooks = CatHooks
15348 structure Resolve = ResolveNull
15349 structure ParserOptions = ParserOptions)
15350
15351 open CatHooks CatDtd Parse ParserOptions SocatParse Uri
15352
15353 fun parseXmlCat uri =
15354 let
15355 val _ = setParserDefaults()
15356 val dtd = initDtdTables()
15357 val items = parseDocument (SOME uri) (SOME dtd) (initCatHooks dtd)
15358 in
15359 (uri,items)
15360 end
15361
15362 fun isSocatSuffix x = x="soc" orelse x="SOC"
15363 fun isXmlSuffix x = x="xml" orelse x="XML"
15364
15365 fun parseCatalog uri =
15366 let val suffix = uriSuffix uri
15367 in if isSocatSuffix suffix then parseSoCat uri
15368 else (if isXmlSuffix suffix then parseXmlCat uri
15369 else (if !O_PREFER_SOCAT then parseSoCat uri
15370 else parseXmlCat uri))
15371 end
15372 end
15373 (* stop of ../../Catalog/catParse.sml *)
15374 (* start of ../../Catalog/catalog.sml *)
15375
15376
15377
15378
15379
15380
15381
15382
15383
15384 signature Catalog =
15385 sig
15386 val resolveExtId : string option * (Uri.Uri * Uri.Uri) option -> Uri.Uri option
15387 end
15388
15389 functor Catalog ( structure Params : CatParams ) : Catalog =
15390 struct
15391 structure CatParse = CatParse ( structure Params = Params )
15392
15393 open CatData CatParse Params Uri UriDict
15394
15395 val catDict = makeDict("catalog",6,NONE:Catalog option)
15396
15397 fun getCatalog uri =
15398 let val idx = getIndex(catDict,uri)
15399 in case getByIndex(catDict,idx)
15400 of SOME cat => cat
15401 | NONE => let val cat = parseCatalog uri
15402 val _ = setByIndex(catDict,idx,SOME cat)
15403 in cat
15404 end
15405 end
15406
15407 datatype SearchType =
15408 SYS of Uri
15409 | PUB of string
15410 datatype SearchResult =
15411 FOUND of Uri * Uri
15412 | NOTFOUND of Uri list
15413
15414 fun searchId id =
15415 let
15416 fun searchOne (base,other) nil = NOTFOUND other
15417 | searchOne (base,other) (entry::entries) =
15418 case entry
15419 of E_BASE path =>
15420 let val newBase = uriJoin(base,path)
15421 in searchOne (newBase,other) entries
15422 end
15423 | E_EXTEND path =>
15424 let val fullPath = uriJoin(base,path)
15425 in searchOne (base,fullPath::other) entries
15426 end
15427 | E_DELEGATE(prefix,path) =>
15428 (case id
15429 of PUB pid => if String.isPrefix prefix pid
15430 then let val fullPath = uriJoin(base,path)
15431 in searchOne (base,fullPath::other) entries
15432 end
15433 else searchOne (base,other) entries
15434 | SYS _ => searchOne (base,other) entries)
15435 | E_MAP(pubid,path) =>
15436 (case id
15437 of PUB pid => if pubid=pid then FOUND (base,path)
15438 else searchOne (base,other) entries
15439 | _ => searchOne (base,other) entries)
15440 | E_REMAP(sysid,path) =>
15441 (case id
15442 of SYS sid => if sysid=sid then FOUND(base,path)
15443 else searchOne (base,other) entries
15444 | _ => searchOne (base,other) entries)
15445
15446 fun searchLevel other nil = NOTFOUND(rev other)
15447 | searchLevel other (fname::fnames) =
15448 let
15449 val (base,entries) = getCatalog fname
15450 in
15451 case searchOne (base,other) entries
15452 of FOUND bp => FOUND bp
15453 | NOTFOUND other' => searchLevel other' fnames
15454 end
15455
15456 fun searchAll fnames =
15457 if null fnames then NONE
15458 else case searchLevel nil fnames
15459 of FOUND bp => SOME bp
15460 | NOTFOUND other => searchAll other
15461
15462 val fnames = !O_CATALOG_FILES
15463 in
15464 case id
15465 of PUB _ => searchAll fnames
15466 | SYS _ => if !O_SUPPORT_REMAP then searchAll fnames else NONE
15467 end
15468
15469 fun resolveExtId (pub,sys) =
15470 let
15471 fun resolvePubCat () =
15472 case pub
15473 of NONE => NONE
15474 | SOME id => case searchId (PUB id)
15475 of NONE => NONE
15476 | SOME(base,sysid) => case searchId (SYS sysid)
15477 of NONE => SOME(base,sysid)
15478 | new => new
15479
15480 fun resolveSysCat () =
15481 case sys
15482 of NONE => NONE
15483 | SOME(base,id) => searchId (SYS id)
15484
15485 fun resolveCat () =
15486 if !O_PREFER_SYSID
15487 then case resolveSysCat ()
15488 of NONE => resolvePubCat ()
15489 | found => found
15490 else case resolvePubCat ()
15491 of NONE => resolveSysCat ()
15492 | found => found
15493
15494 fun resolve () =
15495 if !O_PREFER_CATALOG
15496 then case resolveCat ()
15497 of NONE => (case sys
15498 of NONE => NONE
15499 | SOME(base,id) => SOME(base,id))
15500 | found => found
15501 else case sys
15502 of NONE => resolvePubCat ()
15503 | SOME(base,id) => SOME(base,id)
15504 in
15505 if null (!O_CATALOG_FILES)
15506 then case sys
15507 of NONE => NONE
15508 | SOME(base,id) => SOME (uriJoin (base,id))
15509 else case resolve ()
15510 of NONE => NONE
15511 | SOME bp => SOME (uriJoin bp)
15512 end
15513 end
15514 (* stop of ../../Catalog/catalog.sml *)
15515 (* start of ../../Catalog/catResolve.sml *)
15516
15517
15518
15519
15520
15521
15522
15523 functor ResolveCatalog ( structure Params : CatParams ) : Resolve =
15524 struct
15525 structure Catalog = Catalog ( structure Params = Params )
15526
15527 open Base Errors
15528
15529 fun resolveExtId (id as EXTID(pub,sys)) =
15530 let val pub1 = case pub
15531 of NONE => NONE
15532 | SOME (str,_) => SOME str
15533 val sys1 = case sys
15534 of NONE => NONE
15535 | SOME (base,file,_) => SOME(base,file)
15536 in case Catalog.resolveExtId (pub1,sys1)
15537 of NONE => raise NoSuchFile ("","Could not generate system identifier")
15538 | SOME uri => uri
15539 end
15540 end
15541 (* stop of ../../Catalog/catResolve.sml *)
15542 (* start of ../../Catalog/catOptions.sml *)
15543 signature CatOptions =
15544 sig
15545 val O_CATALOG_FILES : Uri.Uri list ref
15546 val O_PREFER_SOCAT : bool ref
15547 val O_PREFER_SYSID : bool ref
15548 val O_PREFER_CATALOG : bool ref
15549 val O_SUPPORT_REMAP : bool ref
15550 val O_CATALOG_ENC : Encoding.Encoding ref
15551
15552 val setCatalogDefaults : unit -> unit
15553 val setCatalogOptions : Options.Option list * (string -> unit) -> Options.Option list
15554
15555 val catalogUsage : Options.Usage
15556 end
15557
15558 functor CatOptions () : CatOptions =
15559 struct
15560 open Encoding Options Uri
15561
15562 val O_CATALOG_FILES = ref nil: Uri list ref
15563 val O_PREFER_SOCAT = ref false
15564 val O_PREFER_SYSID = ref false
15565 val O_PREFER_CATALOG = ref true
15566 val O_SUPPORT_REMAP = ref true
15567 val O_CATALOG_ENC = ref LATIN1
15568
15569 fun setCatalogDefaults() =
15570 let
15571 val _ = O_CATALOG_FILES := nil
15572 val _ = O_PREFER_SOCAT := false
15573 val _ = O_PREFER_SYSID := false
15574 val _ = O_PREFER_CATALOG := true
15575 val _ = O_SUPPORT_REMAP := true
15576 val _ = O_CATALOG_ENC := LATIN1
15577 in ()
15578 end
15579
15580 val catalogUsage =
15581 [U_ITEM(["-C <url>","--catalog=<url>"],"Use catalog <url>"),
15582 U_ITEM(["--catalog-syntax=(soc|xml)"],"Default syntax for catalogs (xml)"),
15583 U_ITEM(["--catalog-encoding=<enc>"],"Default encoding for Socat catalogs (LATIN1)"),
15584 U_ITEM(["--catalog-remap=[(yes|no)]"],"Support remapping of system identifiers (yes)"),
15585 U_ITEM(["--catalog-priority=(map|remap|sys)"],"Resolving strategy in catalogs (map)")
15586 ]
15587
15588 fun setCatalogOptions (opts,doError) =
15589 let
15590 val catalogs = ref nil:string list ref
15591
15592 fun hasNoArg key = "option "^key^" has no argument"
15593 fun mustHave key = String.concat ["option ",key," must have an argument"]
15594 fun mustBe(key,what) = String.concat ["the argument to --",key," must be ",what]
15595
15596 val yesNo = "'yes' or 'no'"
15597 val mapRemapSys = "'map', 'remap' or 'sys'"
15598 val encName = "'ascii', 'latin1', 'utf8' or 'utf16'"
15599 val syntaxName = "'soc' or 'xml'"
15600
15601 fun do_catalog valOpt =
15602 case valOpt
15603 of NONE => doError(mustHave "--catalog")
15604 | SOME s => catalogs := s::(!catalogs)
15605
15606 fun do_prio valOpt =
15607 let fun set(cat,sys) = (O_PREFER_CATALOG := cat; O_PREFER_SYSID := sys)
15608 in case valOpt
15609 of NONE => doError(mustHave "--catalog-priority")
15610 | SOME "map" => set(true,false)
15611 | SOME "remap" => set(true,true)
15612 | SOME "sys" => set(false,true)
15613 | SOME s => doError(mustBe("catalog-priority",mapRemapSys))
15614 end
15615
15616 fun do_enc valOpt =
15617 case valOpt
15618 of NONE => doError(mustHave "--catalog-encoding")
15619 | SOME s => case isEncoding s
15620 of NOENC => doError("unsupported encoding "^s)
15621 | enc => O_CATALOG_ENC := enc
15622
15623 fun do_remap valOpt =
15624 case valOpt
15625 of NONE => doError(mustHave "--catalog-remap")
15626 | SOME "no" => O_SUPPORT_REMAP := false
15627 | SOME "yes" => O_SUPPORT_REMAP := true
15628 | SOME s => doError(mustBe("catalog-remap",yesNo))
15629
15630 fun do_syntax valOpt =
15631 case valOpt
15632 of NONE => doError(mustHave "--catalog-syntax")
15633 | SOME "soc" => O_PREFER_SOCAT := true
15634 | SOME "xml" => O_PREFER_SOCAT := false
15635 | SOME s => doError(mustBe("catalog-remap",syntaxName))
15636
15637 fun do_long(key,valOpt) =
15638 case key
15639 of "catalog" => true before do_catalog valOpt
15640 | "catalog-remap" => true before do_remap valOpt
15641 | "catalog-syntax" => true before do_syntax valOpt
15642 | "catalog-encoding" => true before do_enc valOpt
15643 | "catalog-priority" => true before do_prio valOpt
15644 | _ => false
15645
15646 fun do_short cs opts =
15647 case cs
15648 of nil => doit opts
15649 | [#"C"] =>
15650 (case opts
15651 of OPT_STRING s::opts1 => (catalogs := s::(!catalogs);
15652 doit opts1)
15653 | _ => let val _ = doError (mustHave "-C")
15654 in doit opts
15655 end)
15656 | cs =>
15657 let val cs1 = List.filter
15658 (fn c => if #"C"<>c then true
15659 else false before doError (mustHave "-C")) cs
15660 in if null cs1 then doit opts else (OPT_SHORT cs1)::doit opts
15661 end
15662
15663 and doit nil = nil
15664 | doit (opt::opts) =
15665 case opt
15666 of OPT_NOOPT => opts
15667 | OPT_LONG(key,value) => if do_long(key,value) then doit opts
15668 else opt::doit opts
15669 | OPT_SHORT cs => do_short cs opts
15670 | OPT_NEG cs => opt::doit opts
15671 | OPT_STRING s => opt::doit opts
15672
15673 val opts1 = doit opts
15674 val uris = map String2Uri (!catalogs)
15675 val _ = O_CATALOG_FILES := uris
15676 in opts1
15677 end
15678 end
15679 (* stop of ../../Catalog/catOptions.sml *)
15680 (* start of nullOptions.sml *)
15681 signature NullOptions =
15682 sig
15683 val O_SILENT : bool ref
15684 val O_ERROR_DEVICE : TextIO.outstream ref
15685 val O_ERROR_LINEWIDTH : int ref
15686
15687 val setNullDefaults : unit -> unit
15688 val setNullOptions : Options.Option list * (string -> unit)
15689 -> bool * bool * string option * string option
15690
15691 val nullUsage : Options.Usage
15692 end
15693
15694 structure NullOptions : NullOptions =
15695 struct
15696 open Options
15697
15698 val O_SILENT = ref false
15699 val O_ERROR_DEVICE = ref TextIO.stdErr
15700 val O_ERROR_LINEWIDTH = ref 80
15701
15702 val nullUsage =
15703 [U_ITEM(["-s","--silent"],"Suppress reporting of errors and warnings"),
15704 U_ITEM(["-e <file>","--error-output=<file>"],"Redirect errors to file (stderr)"),
15705 U_SEP,
15706 U_ITEM(["--version"],"Print the version number and exit"),
15707 U_ITEM(["-?","--help"],"Print this text and exit"),
15708 U_ITEM(["--"],"Do not recognize remaining arguments as options")
15709 ]
15710
15711 fun setNullDefaults () =
15712 let
15713 val _ = O_SILENT := false
15714 val _ = O_ERROR_DEVICE := TextIO.stdErr
15715 in ()
15716 end
15717
15718 fun setNullOptions (opts,optError) =
15719 let
15720 fun onlyOne what = "at most one "^what^" may be specified"
15721 fun unknown pre opt = String.concat ["unknown option ",pre,opt]
15722 fun hasNoArg pre key = String.concat ["option ",pre,key," expects no argument"]
15723 fun mustHave pre key = String.concat ["option ",pre,key," must have an argument"]
15724
15725 fun check_noarg(key,valOpt) =
15726 if isSome valOpt then optError (hasNoArg "--" key) else ()
15727
15728 fun do_long (pars as (v,h,e,f)) (key,valOpt) =
15729 case key
15730 of "help" => (v,true,e,f) before check_noarg(key,valOpt)
15731 | "version" => (true,h,e,f) before check_noarg(key,valOpt)
15732 | "silent" => pars before O_SILENT := true before check_noarg(key,valOpt)
15733 | "error-output" =>
15734 (case valOpt
15735 of NONE => pars before optError (mustHave "--" key)
15736 | SOME s => (v,h,SOME s,f))
15737 | _ => pars before optError(unknown "--" key)
15738
15739 fun do_short (pars as (v,h,e,f)) (cs,opts) =
15740 case cs
15741 of nil => doit pars opts
15742 | [#"e"] => (case opts
15743 of OPT_STRING s::opts1 => doit (v,h,SOME s,f) opts1
15744 | _ => (optError (hasNoArg "-" "e"); doit pars opts))
15745 | cs => doit (foldr
15746 (fn (c,pars)
15747 => case c
15748 of #"e" => pars before optError (hasNoArg "-" "e")
15749 | #"s" => pars before O_SILENT := true
15750 | #"?" => (v,true,e,f)
15751 | c => pars before
15752 optError (unknown "-" (String.implode [c])))
15753 pars cs) opts
15754
15755 and doit pars nil = pars
15756 | doit (pars as (v,h,e,f)) (opt::opts) =
15757 case opt
15758 of OPT_LONG(key,valOpt) => doit (do_long pars (key,valOpt)) opts
15759 | OPT_SHORT cs => do_short pars (cs,opts)
15760 | OPT_STRING s => if isSome f
15761 then let val _ = optError(onlyOne "input file")
15762 in doit pars opts
15763 end
15764 else doit (v,h,e,SOME s) opts
15765 | OPT_NOOPT => doit pars opts
15766 | OPT_NEG cs => let val _ = if null cs then ()
15767 else app (fn c => optError
15768 (unknown "-n" (String.implode[c]))) cs
15769 in doit pars opts
15770 end
15771 in doit (false,false,NONE,NONE) opts
15772 end
15773 end
15774 (* stop of nullOptions.sml *)
15775 (* start of nullHooks.sml *)
15776 structure NullHooks =
15777 struct
15778 open Errors IgnoreHooks NullOptions
15779
15780 type AppData = OS.Process.status
15781 type AppFinal = AppData
15782 val nullStart = OS.Process.success
15783
15784 fun printError(pos,err) = if !O_SILENT then () else TextIO.output
15785 (!O_ERROR_DEVICE,formatMessage (4,!O_ERROR_LINEWIDTH)
15786 (Position2String pos
15787 ::(if isFatalError err then "Fatal error:" else "Error:")
15788 ::errorMessage err))
15789 fun printWarning(pos,warn) = if !O_SILENT then () else TextIO.output
15790 (!O_ERROR_DEVICE,formatMessage (4,!O_ERROR_LINEWIDTH)
15791 (Position2String pos^" Warning:"::warningMessage warn))
15792
15793 fun hookError (_,pe) = OS.Process.failure before printError pe
15794 fun hookWarning (status,pw) = status before printWarning pw
15795 end
15796 (* stop of nullHooks.sml *)
15797 (* start of null.sml *)
15798 structure Null =
15799 struct
15800 structure ParserOptions = ParserOptions ()
15801 structure CatOptions = CatOptions ()
15802 structure CatParams =
15803 struct
15804 open CatError CatOptions NullOptions Uri UtilError
15805
15806 fun catError(pos,err) = if !O_SILENT then () else TextIO.output
15807 (!O_ERROR_DEVICE,formatMessage (4,!O_ERROR_LINEWIDTH)
15808 (Position2String pos^" Error in catalog:"::catMessage err))
15809 end
15810 structure Resolve = ResolveCatalog (structure Params = CatParams)
15811 structure ParseNull = Parse (structure Dtd = Dtd
15812 structure Hooks = NullHooks
15813 structure Resolve = Resolve
15814 structure ParserOptions = ParserOptions)
15815
15816 fun parseNull uri = ParseNull.parseDocument uri NONE NullHooks.nullStart
15817
15818 open
15819 CatOptions NullOptions Options ParserOptions Uri
15820
15821 val usage = List.concat [parserUsage,[U_SEP],catalogUsage,[U_SEP],nullUsage]
15822
15823 exception Exit of OS.Process.status
15824
15825 fun null(prog,args) =
15826 let
15827 val prog = "fxp"
15828 val hadError = ref false
15829
15830 fun optError msg =
15831 let val _ = TextIO.output(TextIO.stdErr,msg^".\n")
15832 in hadError := true
15833 end
15834 fun exitError msg =
15835 let val _ = TextIO.output(TextIO.stdErr,msg^".\n")
15836 in raise Exit OS.Process.failure
15837 end
15838 fun exitHelp prog =
15839 let val _ = printUsage TextIO.stdOut prog usage
15840 in raise Exit OS.Process.success
15841 end
15842 fun exitVersion prog =
15843 let val _ = app print [prog," version ",Version.FXP_VERSION,"\n"]
15844 in raise Exit OS.Process.success
15845 end
15846
15847 fun summOpt prog = "For a summary of options type "^prog^" --help"
15848 fun noFile(f,cause) = "can't open file '"^f^"': "^exnMessage cause
15849
15850 val opts = parseOptions args
15851 val _ = setParserDefaults()
15852 val opts1 = setParserOptions (opts,optError)
15853 val _ = setCatalogDefaults()
15854 val opts2 = setCatalogOptions (opts1,optError)
15855 val _ = setNullDefaults()
15856 val (vers,help,err,file) = setNullOptions (opts2,optError)
15857 val _ = if !hadError then exitError (summOpt prog) else ()
15858 val _ = if vers then exitVersion prog else ()
15859 val _ = if help then exitHelp prog else ()
15860 val _ = case err
15861 of SOME "-" => O_ERROR_DEVICE := TextIO.stdErr
15862 | SOME f => (O_ERROR_DEVICE := TextIO.openOut f
15863 handle IO.Io {cause,...} => exitError(noFile(f,cause)))
15864 | NONE => ()
15865 val f = valOf file handle Option => "-"
15866 val uri = if f="-" then NONE else SOME(String2Uri f)
15867 val status = parseNull uri
15868 val _ = if isSome err then TextIO.closeOut (!O_ERROR_DEVICE) else ()
15869 in status
15870 end
15871 handle Exit status => status
15872 | exn =>
15873 let val _ = TextIO.output
15874 (TextIO.stdErr,prog^": Unexpected exception: "^exnMessage exn^".\n")
15875 in OS.Process.failure
15876 end
15877 end
15878
15879 (* stop of null.sml *)
15880 (* start of call-null.sml *)
15881 val _ = Null.null (CommandLine.name (), CommandLine.arguments ())
15882 (* stop of call-null.sml *)