1 (**************************************************************************)
5 (* François Pottier, INRIA Rocquencourt *)
6 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
8 (* Copyright 2005-2008 Institut National de Recherche en Informatique *)
9 (* et en Automatique. All rights reserved. This file is distributed *)
10 (* under the terms of the Q Public License version 1.0, with the change *)
11 (* described in file LICENSE. *)
13 (**************************************************************************)
17 module InfiniteArray
=
18 MenhirLib.InfiniteArray
20 (* ------------------------------------------------------------------------ *)
21 (* Symbolic lookahead information. *)
23 (* A symbolic lookahead set consists of an actual concrete set of
24 terminal symbols and of a number of set variables. Set variables as
25 encoded as integers. *)
27 module SymbolicLookahead
= struct
30 TerminalSet.t
* CompressedBitSet.t
33 (toks
, CompressedBitSet.empty
)
36 constant TerminalSet.empty
38 let union (toks1
, vars1
) ((toks2
, vars2
) as s2
) =
39 let toks = TerminalSet.union toks1 toks2
40 and vars
= CompressedBitSet.union vars1 vars2
in
41 if toks2
== toks && vars2
== vars
then
46 let variable (var
: int) : t
=
47 (TerminalSet.empty, CompressedBitSet.singleton var
)
49 let project (toks, vars
) =
50 assert (CompressedBitSet.is_empty vars
);
55 (* We will perform closure operations over symbolic lookahead sets.
56 This allows us to later represent LR(1) states as pairs of an
57 LR(0) node number and an array of concrete lookahead sets. *)
59 module SymbolicClosure
=
60 Item.Closure
(SymbolicLookahead
)
62 (* Closure operations over concrete lookahead sets are also used (when
63 explaining conflicts). One could take another instance of the
64 functor. The approach below is somewhat less elegant and makes each
65 call to [closure] somewhat slower, but saves the cost of
66 instantiating the functor again -- which is linear in the size of
69 type concretelr1state
=
70 TerminalSet.t
Item.Map.t
72 let closure (state
: concretelr1state
) : concretelr1state
=
73 Item.Map.map
SymbolicLookahead.project
74 (SymbolicClosure.closure
75 (Item.Map.map
SymbolicLookahead.constant state
))
77 (* ------------------------------------------------------------------------ *)
78 (* Finding which non-epsilon transitions leave a set of items. This
79 code is parametric in the nature of lookahead sets. *)
81 let transitions (state
: 'a
Item.Map.t
) : 'a
Item.Map.t
SymbolMap.t
=
83 Item.Map.fold
(fun item
toks transitions ->
84 match Item.classify item
with
85 | Item.Shift
(symbol
, item'
) ->
86 let items : 'a
Item.Map.t
=
88 SymbolMap.find symbol
transitions
92 SymbolMap.add symbol
(Item.Map.add item'
toks items) transitions
95 ) state
SymbolMap.empty
97 (* ------------------------------------------------------------------------ *)
98 (* Determining the reduction opportunities at a (closed) state. They
99 are represented as a list of pairs of a lookahead set and a
100 production index. This code is again parametric in the nature of
103 let reductions (state
: 'a
Item.Map.t
) : ('a
* Production.index
) list
=
104 Item.Map.fold
(fun item
toks accu
->
105 match Item.classify item
with
106 | Item.Reduce prod
->
112 (* ------------------------------------------------------------------------ *)
113 (* Construction of the the LR(0) automaton. *)
115 (* Nodes are numbered sequentially. *)
120 (* A symbolic transition is a pair of the target state number and an
121 array of symbolic lookahead sets. The variables in these sets are
122 numbered in [0,g) where g is the number of items in the source
123 LR(0) state. Items are numbered in the order of presentation by
126 type symbolic_transition_target
=
127 node
* SymbolicLookahead.t array
129 (* The automaton is represented by (growing) arrays of states (sets of
130 items), symbolic transition information, and symbolic reduction
131 information, indexed by node numbers. Conversely, a hash table maps
132 states (sets of items) to node numbers. *)
137 let states : Item.Set.t
InfiniteArray.t
=
138 InfiniteArray.make
Item.Set.empty
140 let _transitions : symbolic_transition_target
SymbolMap.t
InfiniteArray.t
=
141 InfiniteArray.make
SymbolMap.empty
143 let _reductions : (SymbolicLookahead.t
* Production.index
) list
InfiniteArray.t
=
144 InfiniteArray.make
[]
146 let map : (Item.Set.t
, node
) Hashtbl.t
=
149 (* The automaton is built depth-first. *)
151 let rec explore (state
: Item.Set.t
) : node
=
153 (* Find out whether this state was already explored. *)
156 Hashtbl.find
map state
159 (* If not, create a new node. *)
163 InfiniteArray.set
states k state
;
164 Hashtbl.add
map state
k;
166 (* Build a symbolic version of the current state, where each item
167 is associated with a distinct lookahead set variable, numbered
170 let (_
: int), (symbolic_state
: SymbolicClosure.state
) =
171 Item.Set.fold
(fun item
(i
, symbolic_state
) ->
172 i
+1, Item.Map.add item
(SymbolicLookahead.variable i
) symbolic_state
173 ) state
(0, Item.Map.empty) in
175 (* Compute the symbolic closure. *)
177 let closure = SymbolicClosure.closure symbolic_state
in
179 (* Compute symbolic information about reductions. *)
181 InfiniteArray.set
_reductions k (reductions closure);
183 (* Compute symbolic information about the transitions, and, by
184 dropping the symbolic lookahead information, explore the
185 transitions to further LR(0) states. *)
187 InfiniteArray.set
_transitions k (SymbolMap.map (fun symbolic_state
->
188 let (k : node
) = explore (Item.Map.domain symbolic_state
) in
189 let lookahead : SymbolicLookahead.t array
=
190 Array.create
(Item.Map.cardinal symbolic_state
) SymbolicLookahead.empty in
191 let (_
: int) = Item.Map.fold
(fun _ s i
->
194 ) symbolic_state
0 in
195 ((k, lookahead) : symbolic_transition_target
)
196 ) (transitions closure));
200 (* Creating a start state out of a start production. It contains a
201 single item, consisting of the start production, at position 0. *)
203 let start prod
: Item.Set.t
=
204 Item.Set.singleton
(Item.import
(prod
, 0))
206 (* This starts the construction of the automaton and records the
207 entry nodes in an array. *)
209 let entry : node
ProductionMap.t
=
210 ProductionMap.start (fun prod
->
221 Error.logA
1 (fun f
-> Printf.fprintf f
"Built an LR(0) automaton with %d states.\n" n);
222 Time.tick
"Construction of the LR(0) automaton"
224 (* ------------------------------------------------------------------------ *)
227 let items node
: Item.Set.t
=
228 InfiniteArray.get
states node
230 (* ------------------------------------------------------------------------ *)
231 (* Help for building the LR(1) automaton. *)
233 (* An LR(1) state is represented as a pair of an LR(0) state number
234 and an array of concrete lookahead sets (whose length depends on
238 node
* TerminalSet.t array
240 (* An encoded LR(1) state can be turned into a concrete representation,
241 that is, a mapping of items to concrete lookahead sets. *)
243 let export (k, toksr
) =
244 let (_
: int), items = Item.Set.fold
(fun item
(i
, items) ->
245 i
+1, Item.Map.add item toksr
.(i
) items
246 ) (InfiniteArray.get
states k) (0, Item.Map.empty) in
249 (* Displaying a concrete state. *)
251 let print_concrete (state
: concretelr1state
) =
252 let buffer = Buffer.create
1024 in
253 Item.Map.iter
(fun item
toks ->
254 Printf.bprintf
buffer "%s[ %s ]\n" (Item.print item
) (TerminalSet.print
toks)
256 Buffer.contents
buffer
258 (* Displaying a state. By default, only the kernel is displayed, not
262 print_concrete (export state
)
264 let print_closure state
=
265 print_concrete (closure (export state
))
267 (* The core of an LR(1) state is the underlying LR(0) state. *)
272 (* A sanity check. *)
274 let well_formed (k, toksr
) =
275 Array.length toksr
= Item.Set.cardinal
(InfiniteArray.get
states k)
277 (* An LR(1) start state is the combination of an LR(0) start state
278 (which consists of a single item) with a singleton lookahead set
279 that consists of the end-of-file pseudo-token. *)
282 let state = (k, [| TerminalSet.singleton
Terminal.sharp
|]) in
283 assert (well_formed state);
286 (* Interpreting a symbolic lookahead set with respect to a source
287 state. The variables in the symbolic lookahead set (which are
288 integers) are interpreted as indices into the state's array of
289 concrete lookahead sets. The result is a concrete lookahead set. *)
292 ((_
, toksr
) as state : lr1state
)
293 ((toks, vars
) : SymbolicLookahead.t
)
296 assert (well_formed state);
297 CompressedBitSet.fold
(fun var
toks ->
298 assert (var
>= 0 && var
< Array.length toksr
);
299 TerminalSet.union toksr
.(var
) toks
302 (* Out of an LR(1) state, one produces information about reductions
303 and transitions. This is done in an efficient way by interpreting
304 the precomputed symbolic information with respect to that state. *)
307 ((k, _
) as state : lr1state
)
308 : (TerminalSet.t
* Production.index
) list
=
310 List.map (fun (s
, prod
) ->
311 interpret state s
, prod
312 ) (InfiniteArray.get
_reductions k)
315 ((k, _
) as state : lr1state
)
316 : lr1state
SymbolMap.t
=
318 SymbolMap.map (fun ((k, sr
) : symbolic_transition_target
) ->
319 ((k, Array.map (interpret state) sr
) : lr1state
)
320 ) (InfiniteArray.get
_transitions k)
326 SymbolMap.domain
(InfiniteArray.get
_transitions k)
330 ((k, _
) as state : lr1state
)
333 let ((k, sr
) : symbolic_transition_target
) =
335 SymbolMap.find symbol
(InfiniteArray.get
_transitions k)
337 assert false (* no transition along this symbol *)
339 (k, Array.map (interpret state) sr
)
341 (* Equality of states. *)
343 let equal ((k1
, toksr1
) as state1
) ((k2
, toksr2
) as state2
) =
344 assert (k1
= k2
&& well_formed state1
&& well_formed state2
);
350 (TerminalSet.equal toksr1
.(i) toksr2
.(i)) && (loop i)
352 loop (Array.length toksr1
)
354 (* Subsumption between states. *)
356 let subsume ((k1
, toksr1
) as state1
) ((k2
, toksr2
) as state2
) =
357 assert (k1
= k2
&& well_formed state1
&& well_formed state2
);
363 (TerminalSet.subset toksr1
.(i) toksr2
.(i)) && (loop i)
365 loop (Array.length toksr1
)
367 (* This function determines whether two (core-equivalent) states are
368 compatible, according to a criterion that is close to Pager's weak
369 compatibility criterion.
371 Pager's criterion guarantees that if a merged state has a potential
372 conflict at [(i, j)] -- that is, some token [t] appears within the
373 lookahead sets of both item [i] and item [j] -- then there exists a
374 state in the canonical automaton that also has a potential conflict
375 at [(i, j)] -- that is, some token [u] appears within the lookahead
376 sets of both item [i] and item [j]. Note that [t] and [u] can be
379 Pager has shown that his weak compatibility criterion is stable,
380 that is, preserved by transitions and closure. This means that, if
381 two states can be merged, then so can their successors. This is
382 important, because merging two states means committing to merging
383 their successors, even though we have not even built these
386 The criterion used here is a slightly more restrictive version of
387 Pager's criterion, which guarantees equality of the tokens [t] and
388 [u]. This is done essentially by applying Pager's original
389 criterion on a token-wise basis. Pager's original criterion states
390 that two states can be merged if the new state has no conflict or
391 one of the original states has a conflict. Our more restrictive
392 criterion states that two states can be merged if, for every token
393 [t], the new state has no conflict at [t] or one of the original
394 states has a conflict at [t].
396 This modified criterion is also stable. My experiments show that it
397 is almost as effective in practice: out of more than a hundred
398 real-world sample grammars, only one automaton was affected, and
399 only one extra state appeared as a result of using the modified
400 criterion. Its advantage is to potentially make conflict
401 explanations easier: if there appears to be a conflict at [t], then
402 some conflict at [t] can be explained. This was not true when using
403 Pager's original criterion. *)
405 let compatible (k1
, toksr1
) (k2
, toksr2
) =
407 let n = Array.length toksr1
in
408 (* Two states are compatible if and only if they are compatible
409 at every pair (i, j), where i and j are distinct. *)
414 let toksr1i = toksr1
.(i)
415 and toksr2i
= toksr2
.(i) in
420 let toksr1j = toksr1
.(j
)
421 and toksr2j
= toksr2
.(j
) in
423 (* The two states are compatible at (i, j) if every conflict
424 token in the merged state already was a conflict token in
425 one of the two original states. This could be written as
429 (TerminalSet.inter (TerminalSet.union toksr1i toksr2i) (TerminalSet.union toksr1j toksr2j))
430 (TerminalSet.union (TerminalSet.inter toksr1i toksr1j) (TerminalSet.inter toksr2i toksr2j))
432 but is easily seen (on paper) to be equivalent to:
437 (TerminalSet.inter toksr2i
toksr1j)
438 (TerminalSet.union toksr1i toksr2j
)
441 (TerminalSet.inter
toksr1i toksr2j
)
442 (TerminalSet.union toksr2i
toksr1j)
446 loopj 0 && loopi (i+1)
450 (* This function determines whether two (core-equivalent) states can
451 be merged without creating an end-of-stream conflict, now or in the
454 The rule is, if an item appears in one state with the singleton "#"
455 as its lookahead set, then its lookahead set in the other state
458 So, either the second lookahead set is also the singleton "#", and
459 no end-of-stream conflict exists, or it is larger, and the second
460 state already contains an end-of-stream conflict.
462 Put another way, we do not want to merge two lookahead sets when one
463 contains "#" alone and the other does not contain "#".
465 I invented this rule to complement Pager's criterion. I believe,
466 but I am not 100% sure, that it does indeed prevent end-of-stream
467 conflicts and that it is stable.
469 Thanks to Sébastien Hinderer for reporting the bug caused by the
470 absence of this extra criterion. *)
472 let eos_compatible (k1
, toksr1
) (k2
, toksr2
) =
474 let n = Array.length toksr1
in
479 let toks1 = toksr1
.(i)
480 and toks2
= toksr2
.(i) in
482 if TerminalSet.mem
Terminal.sharp
toks1 && TerminalSet.cardinal
toks1 = 1 then
483 (* "#" is alone in one set: it must be a member of the other set. *)
484 TerminalSet.mem
Terminal.sharp toks2
485 else if TerminalSet.mem
Terminal.sharp toks2
&& TerminalSet.cardinal toks2
= 1 then
486 (* Symmetric condition. *)
487 TerminalSet.mem
Terminal.sharp
toks1
495 (* This function determines whether two (core-equivalent) states can
496 be merged without creating spurious reductions on the [error]
499 The rule is, we merge two states only if they agree on which
500 reductions are permitted on the [error] token.
502 Without this restriction, we might end up in a situation where we
503 decide to introduce an [error] token into the input stream and
504 perform a reduction, whereas a canonical LR(1) automaton,
505 confronted with the same input string, would fail normally -- that
506 is, it would introduce an [error] token into the input stream, but
507 it would not be able to perform a reduction right away: the current
508 state would be discarded.
510 In the interest of more accurate (or sane, or predictable) error
511 handling, I decided to introduce this restriction as of 20110124.
512 This will cause an increase in the size of automata for grammars
513 that use the [error] token. It might actually make the [error]
514 token somewhat easier to use.
516 Note that two sets can be in the subsumption relation and still
517 be error-incompatible. Error-compatibility requires equality of
518 the lookahead sets, restricted to [error].
520 Thanks to Didier Rémy for reporting a bug caused by the absence
521 of this extra criterion. *)
523 let error_compatible (k1
, toksr1
) (k2
, toksr2
) =
525 let n = Array.length toksr1
in
530 let toks1 = toksr1
.(i)
531 and toks2
= toksr2
.(i) in
533 if TerminalSet.mem
Terminal.error
toks1 then
534 (* [error] is a member of one set: it must be a member of the other set. *)
535 TerminalSet.mem
Terminal.error toks2
536 else if TerminalSet.mem
Terminal.error toks2
then
537 (* Symmetric condition. *)
538 TerminalSet.mem
Terminal.error
toks1
546 (* Union of two states. The two states must have the same core. The
547 new state is obtained by pointwise union of the lookahead sets. *)
549 let union (k1
, toksr1
) (k2
, toksr2
) =
551 k1
, Array.init
(Array.length toksr1
) (fun i ->
552 TerminalSet.union toksr1
.(i) toksr2
.(i)
555 (* Restriction of a state to a set of tokens of interest. Every
556 lookahead set is intersected with that set. *)
558 let restrict toks (k, toksr
) =
559 k, Array.map (fun toksri
->
560 TerminalSet.inter toksri
toks