c3b53ced0e64b74f13f8061ac38f3ad0fc4e84db
[bpt/coccinelle.git] / bundles / menhirLib / menhir-20120123 / src / lr0.ml
1 (**************************************************************************)
2 (* *)
3 (* Menhir *)
4 (* *)
5 (* François Pottier, INRIA Rocquencourt *)
6 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
7 (* *)
8 (* Copyright 2005-2008 Institut National de Recherche en Informatique *)
9 (* et en Automatique. All rights reserved. This file is distributed *)
10 (* under the terms of the Q Public License version 1.0, with the change *)
11 (* described in file LICENSE. *)
12 (* *)
13 (**************************************************************************)
14
15 open Grammar
16
17 module InfiniteArray =
18 MenhirLib.InfiniteArray
19
20 (* ------------------------------------------------------------------------ *)
21 (* Symbolic lookahead information. *)
22
23 (* A symbolic lookahead set consists of an actual concrete set of
24 terminal symbols and of a number of set variables. Set variables as
25 encoded as integers. *)
26
27 module SymbolicLookahead = struct
28
29 type t =
30 TerminalSet.t * CompressedBitSet.t
31
32 let constant toks =
33 (toks, CompressedBitSet.empty)
34
35 let empty =
36 constant TerminalSet.empty
37
38 let union (toks1, vars1) ((toks2, vars2) as s2) =
39 let toks = TerminalSet.union toks1 toks2
40 and vars = CompressedBitSet.union vars1 vars2 in
41 if toks2 == toks && vars2 == vars then
42 s2
43 else
44 (toks, vars)
45
46 let variable (var : int) : t =
47 (TerminalSet.empty, CompressedBitSet.singleton var)
48
49 let project (toks, vars) =
50 assert (CompressedBitSet.is_empty vars);
51 toks
52
53 end
54
55 (* We will perform closure operations over symbolic lookahead sets.
56 This allows us to later represent LR(1) states as pairs of an
57 LR(0) node number and an array of concrete lookahead sets. *)
58
59 module SymbolicClosure =
60 Item.Closure(SymbolicLookahead)
61
62 (* Closure operations over concrete lookahead sets are also used (when
63 explaining conflicts). One could take another instance of the
64 functor. The approach below is somewhat less elegant and makes each
65 call to [closure] somewhat slower, but saves the cost of
66 instantiating the functor again -- which is linear in the size of
67 the grammar. *)
68
69 type concretelr1state =
70 TerminalSet.t Item.Map.t
71
72 let closure (state : concretelr1state) : concretelr1state =
73 Item.Map.map SymbolicLookahead.project
74 (SymbolicClosure.closure
75 (Item.Map.map SymbolicLookahead.constant state))
76
77 (* ------------------------------------------------------------------------ *)
78 (* Finding which non-epsilon transitions leave a set of items. This
79 code is parametric in the nature of lookahead sets. *)
80
81 let transitions (state : 'a Item.Map.t) : 'a Item.Map.t SymbolMap.t =
82
83 Item.Map.fold (fun item toks transitions ->
84 match Item.classify item with
85 | Item.Shift (symbol, item') ->
86 let items : 'a Item.Map.t =
87 try
88 SymbolMap.find symbol transitions
89 with Not_found ->
90 Item.Map.empty
91 in
92 SymbolMap.add symbol (Item.Map.add item' toks items) transitions
93 | Item.Reduce _ ->
94 transitions
95 ) state SymbolMap.empty
96
97 (* ------------------------------------------------------------------------ *)
98 (* Determining the reduction opportunities at a (closed) state. They
99 are represented as a list of pairs of a lookahead set and a
100 production index. This code is again parametric in the nature of
101 lookahead sets. *)
102
103 let reductions (state : 'a Item.Map.t) : ('a * Production.index) list =
104 Item.Map.fold (fun item toks accu ->
105 match Item.classify item with
106 | Item.Reduce prod ->
107 (toks, prod) :: accu
108 | Item.Shift _ ->
109 accu
110 ) state []
111
112 (* ------------------------------------------------------------------------ *)
113 (* Construction of the the LR(0) automaton. *)
114
115 (* Nodes are numbered sequentially. *)
116
117 type node =
118 int
119
120 (* A symbolic transition is a pair of the target state number and an
121 array of symbolic lookahead sets. The variables in these sets are
122 numbered in [0,g) where g is the number of items in the source
123 LR(0) state. Items are numbered in the order of presentation by
124 [Item.Set.fold]. *)
125
126 type symbolic_transition_target =
127 node * SymbolicLookahead.t array
128
129 (* The automaton is represented by (growing) arrays of states (sets of
130 items), symbolic transition information, and symbolic reduction
131 information, indexed by node numbers. Conversely, a hash table maps
132 states (sets of items) to node numbers. *)
133
134 let n =
135 ref 0
136
137 let states : Item.Set.t InfiniteArray.t =
138 InfiniteArray.make Item.Set.empty
139
140 let _transitions : symbolic_transition_target SymbolMap.t InfiniteArray.t =
141 InfiniteArray.make SymbolMap.empty
142
143 let _reductions : (SymbolicLookahead.t * Production.index) list InfiniteArray.t =
144 InfiniteArray.make []
145
146 let map : (Item.Set.t, node) Hashtbl.t =
147 Hashtbl.create 50021
148
149 (* The automaton is built depth-first. *)
150
151 let rec explore (state : Item.Set.t) : node =
152
153 (* Find out whether this state was already explored. *)
154
155 try
156 Hashtbl.find map state
157 with Not_found ->
158
159 (* If not, create a new node. *)
160
161 let k = !n in
162 n := k + 1;
163 InfiniteArray.set states k state;
164 Hashtbl.add map state k;
165
166 (* Build a symbolic version of the current state, where each item
167 is associated with a distinct lookahead set variable, numbered
168 consecutively. *)
169
170 let (_ : int), (symbolic_state : SymbolicClosure.state) =
171 Item.Set.fold (fun item (i, symbolic_state) ->
172 i+1, Item.Map.add item (SymbolicLookahead.variable i) symbolic_state
173 ) state (0, Item.Map.empty) in
174
175 (* Compute the symbolic closure. *)
176
177 let closure = SymbolicClosure.closure symbolic_state in
178
179 (* Compute symbolic information about reductions. *)
180
181 InfiniteArray.set _reductions k (reductions closure);
182
183 (* Compute symbolic information about the transitions, and, by
184 dropping the symbolic lookahead information, explore the
185 transitions to further LR(0) states. *)
186
187 InfiniteArray.set _transitions k (SymbolMap.map (fun symbolic_state ->
188 let (k : node) = explore (Item.Map.domain symbolic_state) in
189 let lookahead : SymbolicLookahead.t array =
190 Array.create (Item.Map.cardinal symbolic_state) SymbolicLookahead.empty in
191 let (_ : int) = Item.Map.fold (fun _ s i ->
192 lookahead.(i) <- s;
193 i+1
194 ) symbolic_state 0 in
195 ((k, lookahead) : symbolic_transition_target)
196 ) (transitions closure));
197
198 k
199
200 (* Creating a start state out of a start production. It contains a
201 single item, consisting of the start production, at position 0. *)
202
203 let start prod : Item.Set.t =
204 Item.Set.singleton (Item.import (prod, 0))
205
206 (* This starts the construction of the automaton and records the
207 entry nodes in an array. *)
208
209 let entry : node ProductionMap.t =
210 ProductionMap.start (fun prod ->
211 explore (start prod)
212 )
213
214 let () =
215 Hashtbl.clear map
216
217 let n =
218 !n
219
220 let () =
221 Error.logA 1 (fun f -> Printf.fprintf f "Built an LR(0) automaton with %d states.\n" n);
222 Time.tick "Construction of the LR(0) automaton"
223
224 (* ------------------------------------------------------------------------ *)
225 (* Accessors. *)
226
227 let items node : Item.Set.t =
228 InfiniteArray.get states node
229
230 (* ------------------------------------------------------------------------ *)
231 (* Help for building the LR(1) automaton. *)
232
233 (* An LR(1) state is represented as a pair of an LR(0) state number
234 and an array of concrete lookahead sets (whose length depends on
235 the LR(0) state). *)
236
237 type lr1state =
238 node * TerminalSet.t array
239
240 (* An encoded LR(1) state can be turned into a concrete representation,
241 that is, a mapping of items to concrete lookahead sets. *)
242
243 let export (k, toksr) =
244 let (_ : int), items = Item.Set.fold (fun item (i, items) ->
245 i+1, Item.Map.add item toksr.(i) items
246 ) (InfiniteArray.get states k) (0, Item.Map.empty) in
247 items
248
249 (* Displaying a concrete state. *)
250
251 let print_concrete (state : concretelr1state) =
252 let buffer = Buffer.create 1024 in
253 Item.Map.iter (fun item toks ->
254 Printf.bprintf buffer "%s[ %s ]\n" (Item.print item) (TerminalSet.print toks)
255 ) state;
256 Buffer.contents buffer
257
258 (* Displaying a state. By default, only the kernel is displayed, not
259 the closure. *)
260
261 let print state =
262 print_concrete (export state)
263
264 let print_closure state =
265 print_concrete (closure (export state))
266
267 (* The core of an LR(1) state is the underlying LR(0) state. *)
268
269 let core (k, _) =
270 k
271
272 (* A sanity check. *)
273
274 let well_formed (k, toksr) =
275 Array.length toksr = Item.Set.cardinal (InfiniteArray.get states k)
276
277 (* An LR(1) start state is the combination of an LR(0) start state
278 (which consists of a single item) with a singleton lookahead set
279 that consists of the end-of-file pseudo-token. *)
280
281 let start k =
282 let state = (k, [| TerminalSet.singleton Terminal.sharp |]) in
283 assert (well_formed state);
284 state
285
286 (* Interpreting a symbolic lookahead set with respect to a source
287 state. The variables in the symbolic lookahead set (which are
288 integers) are interpreted as indices into the state's array of
289 concrete lookahead sets. The result is a concrete lookahead set. *)
290
291 let interpret
292 ((_, toksr) as state : lr1state)
293 ((toks, vars) : SymbolicLookahead.t)
294 : TerminalSet.t =
295
296 assert (well_formed state);
297 CompressedBitSet.fold (fun var toks ->
298 assert (var >= 0 && var < Array.length toksr);
299 TerminalSet.union toksr.(var) toks
300 ) vars toks
301
302 (* Out of an LR(1) state, one produces information about reductions
303 and transitions. This is done in an efficient way by interpreting
304 the precomputed symbolic information with respect to that state. *)
305
306 let reductions
307 ((k, _) as state : lr1state)
308 : (TerminalSet.t * Production.index) list =
309
310 List.map (fun (s, prod) ->
311 interpret state s, prod
312 ) (InfiniteArray.get _reductions k)
313
314 let transitions
315 ((k, _) as state : lr1state)
316 : lr1state SymbolMap.t =
317
318 SymbolMap.map (fun ((k, sr) : symbolic_transition_target) ->
319 ((k, Array.map (interpret state) sr) : lr1state)
320 ) (InfiniteArray.get _transitions k)
321
322 let outgoing_symbols
323 (k : node)
324 : Symbol.t list =
325
326 SymbolMap.domain (InfiniteArray.get _transitions k)
327
328 let transition
329 symbol
330 ((k, _) as state : lr1state)
331 : lr1state =
332
333 let ((k, sr) : symbolic_transition_target) =
334 try
335 SymbolMap.find symbol (InfiniteArray.get _transitions k)
336 with Not_found ->
337 assert false (* no transition along this symbol *)
338 in
339 (k, Array.map (interpret state) sr)
340
341 (* Equality of states. *)
342
343 let equal ((k1, toksr1) as state1) ((k2, toksr2) as state2) =
344 assert (k1 = k2 && well_formed state1 && well_formed state2);
345 let rec loop i =
346 if i = 0 then
347 true
348 else
349 let i = i - 1 in
350 (TerminalSet.equal toksr1.(i) toksr2.(i)) && (loop i)
351 in
352 loop (Array.length toksr1)
353
354 (* Subsumption between states. *)
355
356 let subsume ((k1, toksr1) as state1) ((k2, toksr2) as state2) =
357 assert (k1 = k2 && well_formed state1 && well_formed state2);
358 let rec loop i =
359 if i = 0 then
360 true
361 else
362 let i = i - 1 in
363 (TerminalSet.subset toksr1.(i) toksr2.(i)) && (loop i)
364 in
365 loop (Array.length toksr1)
366
367 (* This function determines whether two (core-equivalent) states are
368 compatible, according to a criterion that is close to Pager's weak
369 compatibility criterion.
370
371 Pager's criterion guarantees that if a merged state has a potential
372 conflict at [(i, j)] -- that is, some token [t] appears within the
373 lookahead sets of both item [i] and item [j] -- then there exists a
374 state in the canonical automaton that also has a potential conflict
375 at [(i, j)] -- that is, some token [u] appears within the lookahead
376 sets of both item [i] and item [j]. Note that [t] and [u] can be
377 distinct.
378
379 Pager has shown that his weak compatibility criterion is stable,
380 that is, preserved by transitions and closure. This means that, if
381 two states can be merged, then so can their successors. This is
382 important, because merging two states means committing to merging
383 their successors, even though we have not even built these
384 successors yet.
385
386 The criterion used here is a slightly more restrictive version of
387 Pager's criterion, which guarantees equality of the tokens [t] and
388 [u]. This is done essentially by applying Pager's original
389 criterion on a token-wise basis. Pager's original criterion states
390 that two states can be merged if the new state has no conflict or
391 one of the original states has a conflict. Our more restrictive
392 criterion states that two states can be merged if, for every token
393 [t], the new state has no conflict at [t] or one of the original
394 states has a conflict at [t].
395
396 This modified criterion is also stable. My experiments show that it
397 is almost as effective in practice: out of more than a hundred
398 real-world sample grammars, only one automaton was affected, and
399 only one extra state appeared as a result of using the modified
400 criterion. Its advantage is to potentially make conflict
401 explanations easier: if there appears to be a conflict at [t], then
402 some conflict at [t] can be explained. This was not true when using
403 Pager's original criterion. *)
404
405 let compatible (k1, toksr1) (k2, toksr2) =
406 assert (k1 = k2);
407 let n = Array.length toksr1 in
408 (* Two states are compatible if and only if they are compatible
409 at every pair (i, j), where i and j are distinct. *)
410 let rec loopi i =
411 if i = n then
412 true
413 else
414 let toksr1i = toksr1.(i)
415 and toksr2i = toksr2.(i) in
416 let rec loopj j =
417 if j = i then
418 true
419 else
420 let toksr1j = toksr1.(j)
421 and toksr2j = toksr2.(j) in
422
423 (* The two states are compatible at (i, j) if every conflict
424 token in the merged state already was a conflict token in
425 one of the two original states. This could be written as
426 follows:
427
428 TerminalSet.subset
429 (TerminalSet.inter (TerminalSet.union toksr1i toksr2i) (TerminalSet.union toksr1j toksr2j))
430 (TerminalSet.union (TerminalSet.inter toksr1i toksr1j) (TerminalSet.inter toksr2i toksr2j))
431
432 but is easily seen (on paper) to be equivalent to:
433
434 *)
435
436 TerminalSet.subset
437 (TerminalSet.inter toksr2i toksr1j)
438 (TerminalSet.union toksr1i toksr2j)
439 &&
440 TerminalSet.subset
441 (TerminalSet.inter toksr1i toksr2j)
442 (TerminalSet.union toksr2i toksr1j)
443 &&
444 loopj (j+1)
445 in
446 loopj 0 && loopi (i+1)
447 in
448 loopi 0
449
450 (* This function determines whether two (core-equivalent) states can
451 be merged without creating an end-of-stream conflict, now or in the
452 future.
453
454 The rule is, if an item appears in one state with the singleton "#"
455 as its lookahead set, then its lookahead set in the other state
456 must contain "#".
457
458 So, either the second lookahead set is also the singleton "#", and
459 no end-of-stream conflict exists, or it is larger, and the second
460 state already contains an end-of-stream conflict.
461
462 Put another way, we do not want to merge two lookahead sets when one
463 contains "#" alone and the other does not contain "#".
464
465 I invented this rule to complement Pager's criterion. I believe,
466 but I am not 100% sure, that it does indeed prevent end-of-stream
467 conflicts and that it is stable.
468
469 Thanks to Sébastien Hinderer for reporting the bug caused by the
470 absence of this extra criterion. *)
471
472 let eos_compatible (k1, toksr1) (k2, toksr2) =
473 assert (k1 = k2);
474 let n = Array.length toksr1 in
475 let rec loop i =
476 if i = n then
477 true
478 else
479 let toks1 = toksr1.(i)
480 and toks2 = toksr2.(i) in
481 begin
482 if TerminalSet.mem Terminal.sharp toks1 && TerminalSet.cardinal toks1 = 1 then
483 (* "#" is alone in one set: it must be a member of the other set. *)
484 TerminalSet.mem Terminal.sharp toks2
485 else if TerminalSet.mem Terminal.sharp toks2 && TerminalSet.cardinal toks2 = 1 then
486 (* Symmetric condition. *)
487 TerminalSet.mem Terminal.sharp toks1
488 else
489 true
490 end
491 && loop (i+1)
492 in
493 loop 0
494
495 (* This function determines whether two (core-equivalent) states can
496 be merged without creating spurious reductions on the [error]
497 token.
498
499 The rule is, we merge two states only if they agree on which
500 reductions are permitted on the [error] token.
501
502 Without this restriction, we might end up in a situation where we
503 decide to introduce an [error] token into the input stream and
504 perform a reduction, whereas a canonical LR(1) automaton,
505 confronted with the same input string, would fail normally -- that
506 is, it would introduce an [error] token into the input stream, but
507 it would not be able to perform a reduction right away: the current
508 state would be discarded.
509
510 In the interest of more accurate (or sane, or predictable) error
511 handling, I decided to introduce this restriction as of 20110124.
512 This will cause an increase in the size of automata for grammars
513 that use the [error] token. It might actually make the [error]
514 token somewhat easier to use.
515
516 Note that two sets can be in the subsumption relation and still
517 be error-incompatible. Error-compatibility requires equality of
518 the lookahead sets, restricted to [error].
519
520 Thanks to Didier Rémy for reporting a bug caused by the absence
521 of this extra criterion. *)
522
523 let error_compatible (k1, toksr1) (k2, toksr2) =
524 assert (k1 = k2);
525 let n = Array.length toksr1 in
526 let rec loop i =
527 if i = n then
528 true
529 else
530 let toks1 = toksr1.(i)
531 and toks2 = toksr2.(i) in
532 begin
533 if TerminalSet.mem Terminal.error toks1 then
534 (* [error] is a member of one set: it must be a member of the other set. *)
535 TerminalSet.mem Terminal.error toks2
536 else if TerminalSet.mem Terminal.error toks2 then
537 (* Symmetric condition. *)
538 TerminalSet.mem Terminal.error toks1
539 else
540 true
541 end
542 && loop (i+1)
543 in
544 loop 0
545
546 (* Union of two states. The two states must have the same core. The
547 new state is obtained by pointwise union of the lookahead sets. *)
548
549 let union (k1, toksr1) (k2, toksr2) =
550 assert (k1 = k2);
551 k1, Array.init (Array.length toksr1) (fun i ->
552 TerminalSet.union toksr1.(i) toksr2.(i)
553 )
554
555 (* Restriction of a state to a set of tokens of interest. Every
556 lookahead set is intersected with that set. *)
557
558 let restrict toks (k, toksr) =
559 k, Array.map (fun toksri ->
560 TerminalSet.inter toksri toks
561 ) toksr
562