bundles/menhirLib/menhir-20120123/src/engineTypes.ml

   1 (**************************************************************************)
   2 (*                                                                        *)
   3 (*  Menhir                                                                *)
   4 (*                                                                        *)
   5 (*  François Pottier, INRIA Rocquencourt                                  *)
   6 (*  Yann Régis-Gianas, PPS, Université Paris Diderot                      *)
   7 (*                                                                        *)
   8 (*  Copyright 2005-2008 Institut National de Recherche en Informatique    *)
   9 (*  et en Automatique. All rights reserved. This file is distributed      *)
  10 (*  under the terms of the GNU Library General Public License, with the   *)
  11 (*  special exception on linking described in file LICENSE.               *)
  12 (*                                                                        *)
  13 (**************************************************************************)
  14
  15 (* This file defines several types and module types that are used in the
  16    specification of module [Engine]. *)
  17
  18 (* --------------------------------------------------------------------------- *)
  19
  20 (* It would be nice if we could keep the structure of stacks and environments
  21    hidden. However, stacks and environments must be accessible to semantic
  22    actions, so the following data structure definitions must be public. *)
  23
  24 (* --------------------------------------------------------------------------- *)
  25
  26 (* A stack is a linked list of cells. A sentinel cell -- which is its own
  27    successor -- is used to mark the bottom of the stack. The sentinel cell
  28    itself is not significant -- it contains dummy values. *)
  29
  30 type ('state, 'semantic_value) stack = {
  31
  32   (* The state that we should go back to if we pop this stack cell. *)
  33
  34   (* This convention means that the state contained in the top stack cell is
  35      not the current state [env.current]. It also means that the state found
  36      within the sentinel is a dummy -- it is never consulted. This convention
  37      is the same as that adopted by the code-based back-end. *)
  38
  39   state: 'state;
  40
  41   (* The semantic value associated with the chunk of input that this cell
  42      represents. *)
  43
  44   semv: 'semantic_value;
  45
  46   (* The start and end positions of the chunk of input that this cell
  47      represents. *)
  48
  49   startp: Lexing.position;
  50   endp: Lexing.position;
  51
  52   (* The next cell down in the stack. If this is a self-pointer, then this
  53      cell is the sentinel, and the stack is conceptually empty. *)
  54
  55   next: ('state, 'semantic_value) stack;
  56
  57 }
  58
  59 (* --------------------------------------------------------------------------- *)
  60
  61 (* A parsing environment contains basically all of the automaton's state. *)
  62
  63 type ('state, 'semantic_value, 'token) env = {
  64
  65   (* The lexer. *)
  66
  67   lexer: Lexing.lexbuf -> 'token;
  68
  69   (* The lexing buffer. It is used as an argument to the lexer, and also
  70      accessed directly when extracting positions. *)
  71
  72   lexbuf: Lexing.lexbuf;
  73
  74   (* The last token that was obtained from the lexer. *)
  75
  76   mutable token: 'token;
  77
  78   (* A count of how many tokens were shifted since the beginning, or since
  79      the last [error] token was encountered. By convention, if [shifted]
  80      is (-1), then the current lookahead token is [error]. *)
  81
  82   mutable shifted: int;
  83
  84   (* A copy of the value of [shifted] just before the most recent error
  85      was detected. This value is not used by the automaton itself, but
  86      is made accessible to semantic actions. *)
  87
  88   mutable previouserror: int;
  89
  90   (* The stack. In [CodeBackend], it is passed around on its own,
  91      whereas, here, it is accessed via the environment. *)
  92
  93   mutable stack: ('state, 'semantic_value) stack;
  94
  95   (* The current state. In [CodeBackend], it is passed around on its
  96      own, whereas, here, it is accessed via the environment. *)
  97
  98   mutable current: 'state;
  99
 100 }
 101
 102 (* --------------------------------------------------------------------------- *)
 103
 104 (* This signature describes the parameters that must be supplied to the LR
 105    engine. *)
 106
 107 module type TABLE = sig
 108
 109   (* The type of automaton states. *)
 110
 111   type state
 112
 113   (* The type of tokens. These can be thought of as real tokens, that is,
 114      tokens returned by the lexer. They carry a semantic value. This type
 115      does not include the [error] pseudo-token. *)
 116
 117   type token
 118
 119   (* The type of terminal symbols. These can be thought of as integer codes.
 120      They do not carry a semantic value. This type does include the [error]
 121      pseudo-token. *)
 122
 123   type terminal
 124
 125   (* The type of semantic values. *)
 126
 127   type semantic_value
 128
 129   (* A token is conceptually a pair of a (non-[error]) terminal symbol and
 130      a semantic value. The following two functions are the pair projections. *)
 131
 132   val token2terminal: token -> terminal
 133   val token2value: token -> semantic_value
 134
 135   (* Even though the [error] pseudo-token is not a real token, it is a
 136      terminal symbol. Furthermore, for regularity, it must have a semantic
 137      value. *)
 138
 139   val error_terminal: terminal
 140   val error_value: semantic_value
 141
 142   (* The type of productions. *)
 143
 144   type production
 145
 146   (* If a state [s] has a default reduction on production [prod], then, upon
 147      entering [s], the automaton should reduce [prod] without consulting the
 148      lookahead token. The following function allows determining which states
 149      have default reductions. *)
 150
 151   (* Instead of returning a value of a sum type -- either [DefRed prod], or
 152      [NoDefRed] -- it accepts two continuations, and invokes just one of
 153      them. This mechanism allows avoiding a memory allocation. *)
 154
 155   val default_reduction:
 156     state ->
 157     ('env -> production -> 'answer) ->
 158     ('env -> 'answer) ->
 159     'env -> 'answer
 160
 161   (* An LR automaton can normally take three kinds of actions: shift, reduce,
 162      or fail. (Acceptance is a particular case of reduction: it consists in
 163      reducing a start production.) *)
 164
 165   (* There are two variants of the shift action. [shift/discard s] instructs
 166      the automaton to discard the current token, request a new one from the
 167      lexer, and move to state [s]. [shift/nodiscard s] instructs it to move to
 168      state [s] without requesting a new token. This instruction should be used
 169      when [s] has a default reduction on [#]. See [CodeBackend.gettoken] for
 170      details. *)
 171
 172   (* This is the automaton's action table. It maps a pair of a state and a
 173      terminal symbol to an action. *)
 174
 175   (* Instead of returning a value of a sum type -- one of shift/discard,
 176      shift/nodiscard, reduce, or fail -- this function accepts three
 177      continuations, and invokes just one them. This mechanism allows avoiding
 178      a memory allocation. *)
 179
 180   (* In summary, the parameters to [action] are as follows:
 181
 182      - the first two parameters, a state and a terminal symbol, are used to
 183        look up the action table;
 184
 185      - the next parameter is the semantic value associated with the above
 186        terminal symbol; it is not used, only passed along to the shift
 187        continuation, as explained below;
 188
 189      - the shift continuation expects an environment; a flag that tells
 190        whether to discard the current token; the terminal symbol that
 191        is being shifted; its semantic value; and the target state of
 192        the transition;
 193
 194      - the reduce continuation expects an environment and a production;
 195
 196      - the fail continuation expects an environment;
 197
 198      - the last parameter is the environment; it is not used, only passed
 199        along to the selected continuation. *)
 200
 201   val action:
 202     state ->
 203     terminal ->
 204     semantic_value ->
 205     ('env -> bool -> terminal -> semantic_value -> state -> 'answer) ->
 206     ('env -> production -> 'answer) ->
 207     ('env -> 'answer) ->
 208     'env -> 'answer
 209
 210   (* This is the automaton's goto table. It maps a pair of a state and a
 211      production to a new state.
 212
 213      This convention is slightly different from the textbook approach. The
 214      goto table is usually indexed by a state and a non-terminal symbol. *)
 215
 216   val goto: state -> production -> state
 217
 218   (* By convention, a semantic action is responsible for:
 219
 220      1. fetching whatever semantic values and positions it needs off the stack;
 221
 222      2. popping an appropriate number of cells off the stack, as dictated
 223      by the length of the right-hand side of the production; this involves
 224      updating [env.stack];
 225
 226      3. computing a new semantic value, as well as new start and end positions;
 227
 228      4. pushing a new stack cell, which contains the three values
 229      computed in step 3; this again involves updating [env.stack]
 230      (only one update is necessary).
 231
 232      Point 1 is essentially forced upon us: if semantic values were fetched
 233      off the stack by this interpreter, then the calling convention for
 234      semantic actions would be variadic: not all semantic actions would have
 235      the same number of arguments. The rest follows rather naturally. *)
 236
 237   (* If production [prod] is an accepting production, then the semantic action
 238      is responsible for raising exception [Accept], instead of returning
 239      normally. This convention allows us to not distinguish between regular
 240      productions and accepting productions. All we have to do is catch that
 241      exception at top level. *)
 242
 243   (* Semantic actions are allowed to raise [Error]. *)
 244
 245   exception Accept of semantic_value
 246   exception Error
 247
 248   type semantic_action =
 249       (state, semantic_value, token) env -> unit
 250
 251   val semantic_action: production -> semantic_action
 252
 253   (* The LR engine can attempt error recovery. This consists in discarding
 254      tokens, just after an error has been successfully handled, until a token
 255      that can be successfully handled is found. This mechanism is optional.
 256      The following flag enables it. *)
 257
 258   val recovery: bool
 259
 260   (* The LR engine requires a number of hooks, which are used for logging. *)
 261
 262   (* The comments below indicate the conventional messages that correspond
 263      to these hooks in the code-based back-end; see [CodeBackend]. *)
 264
 265   module Log : sig
 266
 267     (* State %d: *)
 268
 269     val state: state -> unit
 270
 271     (* Shifting (<terminal>) to state <state> *)
 272
 273     val shift: terminal -> state -> unit
 274
 275     (* Reducing a production should be logged either as a reduction
 276        event (for regular productions) or as an acceptance event (for
 277        start productions). *)
 278
 279     (* Reducing production <production> / Accepting *)
 280
 281     val reduce_or_accept: production -> unit
 282
 283     (* Lookahead token is now <terminal> (<pos>-<pos>) *)
 284
 285     val lookahead_token: Lexing.lexbuf -> terminal -> unit
 286
 287     (* Initiating error handling *)
 288
 289     val initiating_error_handling: unit -> unit
 290
 291     (* Resuming error handling *)
 292
 293     val resuming_error_handling: unit -> unit
 294
 295     (* Handling error in state <state> *)
 296
 297     val handling_error: state -> unit
 298
 299     (* Discarding last token read (<terminal>) *)
 300
 301     val discarding_last_token: terminal -> unit
 302
 303   end
 304
 305 end
 306
 307 (* --------------------------------------------------------------------------- *)
 308
 309 (* This signature describes the LR engine. *)
 310
 311 module type ENGINE = sig
 312
 313   type state
 314
 315   type token
 316
 317   type semantic_value
 318
 319   (* An entry point to the engine requires a start state, a lexer, and a lexing
 320      buffer. It either succeeds and produces a semantic value, or fails and
 321      raises [Error]. *)
 322
 323   exception Error
 324
 325   val entry:
 326     state ->
 327     (Lexing.lexbuf -> token) ->
 328     Lexing.lexbuf ->
 329     semantic_value
 330
 331 end