bundles/menhirLib/menhir-20120123/src/lr0.ml

   1 (**************************************************************************)
   2 (*                                                                        *)
   3 (*  Menhir                                                                *)
   4 (*                                                                        *)
   5 (*  François Pottier, INRIA Rocquencourt                                  *)
   6 (*  Yann Régis-Gianas, PPS, Université Paris Diderot                      *)
   7 (*                                                                        *)
   8 (*  Copyright 2005-2008 Institut National de Recherche en Informatique    *)
   9 (*  et en Automatique. All rights reserved. This file is distributed      *)
  10 (*  under the terms of the Q Public License version 1.0, with the change  *)
  11 (*  described in file LICENSE.                                            *)
  12 (*                                                                        *)
  13 (**************************************************************************)
  14
  15 open Grammar
  16
  17 module InfiniteArray =
  18   MenhirLib.InfiniteArray
  19
  20 (* ------------------------------------------------------------------------ *)
  21 (* Symbolic lookahead information. *)
  22
  23 (* A symbolic lookahead set consists of an actual concrete set of
  24    terminal symbols and of a number of set variables. Set variables as
  25    encoded as integers. *)
  26
  27 module SymbolicLookahead = struct
  28
  29   type t =
  30     TerminalSet.t * CompressedBitSet.t
  31
  32   let constant toks =
  33     (toks, CompressedBitSet.empty)
  34
  35   let empty =
  36     constant TerminalSet.empty
  37
  38   let union (toks1, vars1) ((toks2, vars2) as s2) =
  39     let toks = TerminalSet.union toks1 toks2
  40     and vars = CompressedBitSet.union vars1 vars2 in
  41     if toks2 == toks && vars2 == vars then
  42       s2
  43     else
  44       (toks, vars)
  45
  46   let variable (var : int) : t =
  47     (TerminalSet.empty, CompressedBitSet.singleton var)
  48
  49   let project (toks, vars) =
  50     assert (CompressedBitSet.is_empty vars);
  51     toks
  52
  53 end
  54
  55 (* We will perform closure operations over symbolic lookahead sets.
  56    This allows us to later represent LR(1) states as pairs of an
  57    LR(0) node number and an array of concrete lookahead sets. *)
  58
  59 module SymbolicClosure =
  60   Item.Closure(SymbolicLookahead)
  61
  62 (* Closure operations over concrete lookahead sets are also used (when
  63    explaining conflicts). One could take another instance of the
  64    functor. The approach below is somewhat less elegant and makes each
  65    call to [closure] somewhat slower, but saves the cost of
  66    instantiating the functor again -- which is linear in the size of
  67    the grammar. *)
  68
  69 type concretelr1state =
  70     TerminalSet.t Item.Map.t
  71
  72 let closure (state : concretelr1state) : concretelr1state =
  73    Item.Map.map SymbolicLookahead.project
  74      (SymbolicClosure.closure
  75        (Item.Map.map SymbolicLookahead.constant state))
  76
  77 (* ------------------------------------------------------------------------ *)
  78 (* Finding which non-epsilon transitions leave a set of items. This
  79    code is parametric in the nature of lookahead sets. *)
  80
  81 let transitions (state : 'a Item.Map.t) : 'a Item.Map.t SymbolMap.t =
  82
  83   Item.Map.fold (fun item toks transitions ->
  84     match Item.classify item with
  85     | Item.Shift (symbol, item') ->
  86         let items : 'a Item.Map.t =
  87           try
  88             SymbolMap.find symbol transitions
  89           with Not_found ->
  90             Item.Map.empty
  91         in
  92         SymbolMap.add symbol (Item.Map.add item' toks items) transitions
  93     | Item.Reduce _ ->
  94         transitions
  95   ) state SymbolMap.empty
  96
  97 (* ------------------------------------------------------------------------ *)
  98 (* Determining the reduction opportunities at a (closed) state. They
  99    are represented as a list of pairs of a lookahead set and a
 100    production index. This code is again parametric in the nature of
 101    lookahead sets. *)
 102
 103 let reductions (state : 'a Item.Map.t) : ('a * Production.index) list =
 104   Item.Map.fold (fun item toks accu ->
 105     match Item.classify item with
 106     | Item.Reduce prod ->
 107         (toks, prod) :: accu
 108     | Item.Shift _ ->
 109         accu
 110   ) state []
 111
 112 (* ------------------------------------------------------------------------ *)
 113 (* Construction of the the LR(0) automaton. *)
 114
 115 (* Nodes are numbered sequentially. *)
 116
 117 type node =
 118     int
 119
 120 (* A symbolic transition is a pair of the target state number and an
 121    array of symbolic lookahead sets. The variables in these sets are
 122    numbered in [0,g) where g is the number of items in the source
 123    LR(0) state. Items are numbered in the order of presentation by
 124    [Item.Set.fold]. *)
 125
 126 type symbolic_transition_target =
 127     node * SymbolicLookahead.t array
 128
 129 (* The automaton is represented by (growing) arrays of states (sets of
 130    items), symbolic transition information, and symbolic reduction
 131    information, indexed by node numbers. Conversely, a hash table maps
 132    states (sets of items) to node numbers. *)
 133
 134 let n =
 135   ref 0
 136
 137 let states : Item.Set.t InfiniteArray.t =
 138   InfiniteArray.make Item.Set.empty
 139
 140 let _transitions : symbolic_transition_target SymbolMap.t InfiniteArray.t =
 141   InfiniteArray.make SymbolMap.empty
 142
 143 let _reductions : (SymbolicLookahead.t * Production.index) list InfiniteArray.t =
 144   InfiniteArray.make []
 145
 146 let map : (Item.Set.t, node) Hashtbl.t =
 147   Hashtbl.create 50021
 148
 149 (* The automaton is built depth-first. *)
 150
 151 let rec explore (state : Item.Set.t) : node =
 152
 153   (* Find out whether this state was already explored. *)
 154
 155   try
 156     Hashtbl.find map state
 157   with Not_found ->
 158
 159     (* If not, create a new node. *)
 160
 161     let k = !n in
 162     n := k + 1;
 163     InfiniteArray.set states k state;
 164     Hashtbl.add map state k;
 165
 166     (* Build a symbolic version of the current state, where each item
 167        is associated with a distinct lookahead set variable, numbered
 168        consecutively. *)
 169
 170     let (_ : int), (symbolic_state : SymbolicClosure.state) =
 171       Item.Set.fold (fun item (i, symbolic_state) ->
 172         i+1, Item.Map.add item (SymbolicLookahead.variable i) symbolic_state
 173       ) state (0, Item.Map.empty) in
 174
 175     (* Compute the symbolic closure. *)
 176
 177     let closure = SymbolicClosure.closure symbolic_state in
 178
 179     (* Compute symbolic information about reductions. *)
 180
 181     InfiniteArray.set _reductions k (reductions closure);
 182
 183     (* Compute symbolic information about the transitions, and, by
 184        dropping the symbolic lookahead information, explore the
 185        transitions to further LR(0) states. *)
 186
 187     InfiniteArray.set _transitions k (SymbolMap.map (fun symbolic_state ->
 188       let (k : node) = explore (Item.Map.domain symbolic_state) in
 189       let lookahead : SymbolicLookahead.t array =
 190         Array.create (Item.Map.cardinal symbolic_state) SymbolicLookahead.empty in
 191       let (_ : int) = Item.Map.fold (fun _ s i ->
 192         lookahead.(i) <- s;
 193         i+1
 194       ) symbolic_state 0 in
 195       ((k, lookahead) : symbolic_transition_target)
 196     ) (transitions closure));
 197
 198     k
 199
 200 (* Creating a start state out of a start production. It contains a
 201    single item, consisting of the start production, at position 0. *)
 202
 203 let start prod : Item.Set.t =
 204    Item.Set.singleton (Item.import (prod, 0))
 205
 206 (* This starts the construction of the automaton and records the
 207    entry nodes in an array. *)
 208
 209 let entry : node ProductionMap.t =
 210   ProductionMap.start (fun prod ->
 211     explore (start prod)
 212   )
 213
 214 let () =
 215   Hashtbl.clear map
 216
 217 let n =
 218   !n
 219
 220 let () =
 221   Error.logA 1 (fun f -> Printf.fprintf f "Built an LR(0) automaton with %d states.\n" n);
 222   Time.tick "Construction of the LR(0) automaton"
 223
 224 (* ------------------------------------------------------------------------ *)
 225 (* Accessors. *)
 226
 227 let items node : Item.Set.t =
 228   InfiniteArray.get states node
 229
 230 (* ------------------------------------------------------------------------ *)
 231 (* Help for building the LR(1) automaton. *)
 232
 233 (* An LR(1) state is represented as a pair of an LR(0) state number
 234    and an array of concrete lookahead sets (whose length depends on
 235    the LR(0) state). *)
 236
 237 type lr1state =
 238     node * TerminalSet.t array
 239
 240 (* An encoded LR(1) state can be turned into a concrete representation,
 241    that is, a mapping of items to concrete lookahead sets. *)
 242
 243 let export (k, toksr) =
 244   let (_ : int), items = Item.Set.fold (fun item (i, items) ->
 245     i+1, Item.Map.add item toksr.(i) items
 246   ) (InfiniteArray.get states k) (0, Item.Map.empty) in
 247   items
 248
 249 (* Displaying a concrete state. *)
 250
 251 let print_concrete (state : concretelr1state) =
 252   let buffer = Buffer.create 1024 in
 253   Item.Map.iter (fun item toks ->
 254     Printf.bprintf buffer "%s[ %s ]\n" (Item.print item) (TerminalSet.print toks)
 255   ) state;
 256   Buffer.contents buffer
 257
 258 (* Displaying a state. By default, only the kernel is displayed, not
 259    the closure. *)
 260
 261 let print state =
 262   print_concrete (export state)
 263
 264 let print_closure state =
 265   print_concrete (closure (export state))
 266
 267 (* The core of an LR(1) state is the underlying LR(0) state. *)
 268
 269 let core (k, _) =
 270   k
 271
 272 (* A sanity check. *)
 273
 274 let well_formed (k, toksr) =
 275   Array.length toksr = Item.Set.cardinal (InfiniteArray.get states k)
 276
 277 (* An LR(1) start state is the combination of an LR(0) start state
 278    (which consists of a single item) with a singleton lookahead set
 279    that consists of the end-of-file pseudo-token. *)
 280
 281 let start k =
 282   let state = (k, [| TerminalSet.singleton Terminal.sharp |]) in
 283   assert (well_formed state);
 284   state
 285
 286 (* Interpreting a symbolic lookahead set with respect to a source
 287    state. The variables in the symbolic lookahead set (which are
 288    integers) are interpreted as indices into the state's array of
 289    concrete lookahead sets. The result is a concrete lookahead set. *)
 290
 291 let interpret
 292     ((_, toksr) as state : lr1state)
 293     ((toks, vars) : SymbolicLookahead.t)
 294     : TerminalSet.t =
 295
 296   assert (well_formed state);
 297   CompressedBitSet.fold (fun var toks ->
 298     assert (var >= 0 && var < Array.length toksr);
 299     TerminalSet.union toksr.(var) toks
 300   ) vars toks
 301
 302 (* Out of an LR(1) state, one produces information about reductions
 303    and transitions. This is done in an efficient way by interpreting
 304    the precomputed symbolic information with respect to that state. *)
 305
 306 let reductions
 307     ((k, _) as state : lr1state)
 308     : (TerminalSet.t * Production.index) list =
 309
 310   List.map (fun (s, prod) ->
 311     interpret state s, prod
 312   ) (InfiniteArray.get _reductions k)
 313
 314 let transitions
 315     ((k, _) as state : lr1state)
 316     : lr1state SymbolMap.t =
 317
 318   SymbolMap.map (fun ((k, sr) : symbolic_transition_target) ->
 319     ((k, Array.map (interpret state) sr) : lr1state)
 320   ) (InfiniteArray.get _transitions k)
 321
 322 let outgoing_symbols
 323     (k : node)
 324     : Symbol.t list =
 325
 326   SymbolMap.domain (InfiniteArray.get _transitions k)
 327
 328 let transition
 329     symbol
 330     ((k, _) as state : lr1state)
 331     : lr1state =
 332
 333   let ((k, sr) : symbolic_transition_target) =
 334     try
 335       SymbolMap.find symbol (InfiniteArray.get _transitions k)
 336     with Not_found ->
 337       assert false (* no transition along this symbol *)
 338   in
 339   (k, Array.map (interpret state) sr)
 340
 341 (* Equality of states. *)
 342
 343 let equal ((k1, toksr1) as state1) ((k2, toksr2) as state2) =
 344   assert (k1 = k2 && well_formed state1 && well_formed state2);
 345   let rec loop i =
 346     if i = 0 then
 347       true
 348     else
 349       let i = i - 1 in
 350       (TerminalSet.equal toksr1.(i) toksr2.(i)) && (loop i)
 351   in
 352   loop (Array.length toksr1)
 353
 354 (* Subsumption between states. *)
 355
 356 let subsume ((k1, toksr1) as state1) ((k2, toksr2) as state2) =
 357   assert (k1 = k2 && well_formed state1 && well_formed state2);
 358   let rec loop i =
 359     if i = 0 then
 360       true
 361     else
 362       let i = i - 1 in
 363       (TerminalSet.subset toksr1.(i) toksr2.(i)) && (loop i)
 364   in
 365   loop (Array.length toksr1)
 366
 367 (* This function determines whether two (core-equivalent) states are
 368    compatible, according to a criterion that is close to Pager's weak
 369    compatibility criterion.
 370
 371    Pager's criterion guarantees that if a merged state has a potential
 372    conflict at [(i, j)] -- that is, some token [t] appears within the
 373    lookahead sets of both item [i] and item [j] -- then there exists a
 374    state in the canonical automaton that also has a potential conflict
 375    at [(i, j)] -- that is, some token [u] appears within the lookahead
 376    sets of both item [i] and item [j]. Note that [t] and [u] can be
 377    distinct.
 378
 379    Pager has shown that his weak compatibility criterion is stable,
 380    that is, preserved by transitions and closure. This means that, if
 381    two states can be merged, then so can their successors. This is
 382    important, because merging two states means committing to merging
 383    their successors, even though we have not even built these
 384    successors yet.
 385
 386    The criterion used here is a slightly more restrictive version of
 387    Pager's criterion, which guarantees equality of the tokens [t] and
 388    [u]. This is done essentially by applying Pager's original
 389    criterion on a token-wise basis. Pager's original criterion states
 390    that two states can be merged if the new state has no conflict or
 391    one of the original states has a conflict. Our more restrictive
 392    criterion states that two states can be merged if, for every token
 393    [t], the new state has no conflict at [t] or one of the original
 394    states has a conflict at [t].
 395
 396    This modified criterion is also stable. My experiments show that it
 397    is almost as effective in practice: out of more than a hundred
 398    real-world sample grammars, only one automaton was affected, and
 399    only one extra state appeared as a result of using the modified
 400    criterion. Its advantage is to potentially make conflict
 401    explanations easier: if there appears to be a conflict at [t], then
 402    some conflict at [t] can be explained. This was not true when using
 403    Pager's original criterion. *)
 404
 405 let compatible (k1, toksr1) (k2, toksr2) =
 406   assert (k1 = k2);
 407   let n = Array.length toksr1 in
 408   (* Two states are compatible if and only if they are compatible
 409      at every pair (i, j), where i and j are distinct. *)
 410   let rec loopi i =
 411     if i = n then
 412       true
 413     else
 414       let toksr1i = toksr1.(i)
 415       and toksr2i = toksr2.(i) in
 416       let rec loopj j =
 417         if j = i then
 418           true
 419         else
 420           let toksr1j = toksr1.(j)
 421           and toksr2j = toksr2.(j) in
 422
 423           (* The two states are compatible at (i, j) if every conflict
 424              token in the merged state already was a conflict token in
 425              one of the two original states. This could be written as
 426              follows:
 427
 428             TerminalSet.subset
 429               (TerminalSet.inter (TerminalSet.union toksr1i toksr2i) (TerminalSet.union toksr1j toksr2j))
 430               (TerminalSet.union (TerminalSet.inter toksr1i toksr1j) (TerminalSet.inter toksr2i toksr2j))
 431
 432              but is easily seen (on paper) to be equivalent to:
 433
 434           *)
 435
 436              TerminalSet.subset
 437                (TerminalSet.inter toksr2i toksr1j)
 438                (TerminalSet.union toksr1i toksr2j)
 439           &&
 440              TerminalSet.subset
 441                (TerminalSet.inter toksr1i toksr2j)
 442                (TerminalSet.union toksr2i toksr1j)
 443           &&
 444              loopj (j+1)
 445       in
 446       loopj 0 && loopi (i+1)
 447   in
 448   loopi 0
 449
 450 (* This function determines whether two (core-equivalent) states can
 451    be merged without creating an end-of-stream conflict, now or in the
 452    future.
 453
 454    The rule is, if an item appears in one state with the singleton "#"
 455    as its lookahead set, then its lookahead set in the other state
 456    must contain "#".
 457
 458    So, either the second lookahead set is also the singleton "#", and
 459    no end-of-stream conflict exists, or it is larger, and the second
 460    state already contains an end-of-stream conflict.
 461
 462    Put another way, we do not want to merge two lookahead sets when one
 463    contains "#" alone and the other does not contain "#".
 464
 465    I invented this rule to complement Pager's criterion. I believe,
 466    but I am not 100% sure, that it does indeed prevent end-of-stream
 467    conflicts and that it is stable.
 468
 469    Thanks to Sébastien Hinderer for reporting the bug caused by the
 470    absence of this extra criterion. *)
 471
 472 let eos_compatible  (k1, toksr1) (k2, toksr2) =
 473   assert (k1 = k2);
 474   let n = Array.length toksr1 in
 475   let rec loop i =
 476     if i = n then
 477       true
 478     else
 479       let toks1 = toksr1.(i)
 480       and toks2 = toksr2.(i) in
 481       begin
 482         if TerminalSet.mem Terminal.sharp toks1 && TerminalSet.cardinal toks1 = 1 then
 483           (* "#" is alone in one set: it must be a member of the other set. *)
 484           TerminalSet.mem Terminal.sharp toks2
 485         else if TerminalSet.mem Terminal.sharp toks2 && TerminalSet.cardinal toks2 = 1 then
 486           (* Symmetric condition. *)
 487           TerminalSet.mem Terminal.sharp toks1
 488         else
 489           true
 490       end
 491       && loop (i+1)
 492   in
 493   loop 0
 494
 495 (* This function determines whether two (core-equivalent) states can
 496    be merged without creating spurious reductions on the [error]
 497    token.
 498
 499    The rule is, we merge two states only if they agree on which
 500    reductions are permitted on the [error] token.
 501
 502    Without this restriction, we might end up in a situation where we
 503    decide to introduce an [error] token into the input stream and
 504    perform a reduction, whereas a canonical LR(1) automaton,
 505    confronted with the same input string, would fail normally -- that
 506    is, it would introduce an [error] token into the input stream, but
 507    it would not be able to perform a reduction right away: the current
 508    state would be discarded.
 509
 510    In the interest of more accurate (or sane, or predictable) error
 511    handling, I decided to introduce this restriction as of 20110124.
 512    This will cause an increase in the size of automata for grammars
 513    that use the [error] token. It might actually make the [error]
 514    token somewhat easier to use.
 515
 516    Note that two sets can be in the subsumption relation and still
 517    be error-incompatible. Error-compatibility requires equality of
 518    the lookahead sets, restricted to [error].
 519
 520    Thanks to Didier Rémy for reporting a bug caused by the absence
 521    of this extra criterion. *)
 522
 523 let error_compatible  (k1, toksr1) (k2, toksr2) =
 524   assert (k1 = k2);
 525   let n = Array.length toksr1 in
 526   let rec loop i =
 527     if i = n then
 528       true
 529     else
 530       let toks1 = toksr1.(i)
 531       and toks2 = toksr2.(i) in
 532       begin
 533         if TerminalSet.mem Terminal.error toks1 then
 534           (* [error] is a member of one set: it must be a member of the other set. *)
 535           TerminalSet.mem Terminal.error toks2
 536         else if TerminalSet.mem Terminal.error toks2 then
 537           (* Symmetric condition. *)
 538           TerminalSet.mem Terminal.error toks1
 539         else
 540           true
 541       end
 542       && loop (i+1)
 543   in
 544   loop 0
 545
 546 (* Union of two states. The two states must have the same core. The
 547    new state is obtained by pointwise union of the lookahead sets. *)
 548
 549 let union (k1, toksr1) (k2, toksr2) =
 550   assert (k1 = k2);
 551   k1, Array.init (Array.length toksr1) (fun i ->
 552     TerminalSet.union toksr1.(i) toksr2.(i)
 553   )
 554
 555 (* Restriction of a state to a set of tokens of interest. Every
 556    lookahead set is intersected with that set. *)
 557
 558 let restrict toks (k, toksr) =
 559   k, Array.map (fun toksri ->
 560     TerminalSet.inter toksri toks
 561   ) toksr
 562