1 (**************************************************************************)
5 (* François Pottier, INRIA Rocquencourt *)
6 (* Yann Régis-Gianas, PPS, Université Paris Diderot *)
8 (* Copyright 2005-2008 Institut National de Recherche en Informatique *)
9 (* et en Automatique. All rights reserved. This file is distributed *)
10 (* under the terms of the GNU Library General Public License, with the *)
11 (* special exception on linking described in file LICENSE. *)
13 (**************************************************************************)
15 (* This file defines several types and module types that are used in the
16 specification of module [Engine]. *)
18 (* --------------------------------------------------------------------------- *)
20 (* It would be nice if we could keep the structure of stacks and environments
21 hidden. However, stacks and environments must be accessible to semantic
22 actions, so the following data structure definitions must be public. *)
24 (* --------------------------------------------------------------------------- *)
26 (* A stack is a linked list of cells. A sentinel cell -- which is its own
27 successor is used to mark the bottom of the stack. The sentinel cell
28 itself is not significant -- it contains dummy values. *)
30 type ('state
, 'semantic_value
) stack
= {
32 (* The state that we should go back to if we pop this stack cell. *)
34 (* This convention means that the state contained in the top stack cell is
35 not the current state [env.current]. It also means that the state found
36 within the sentinel is a dummy -- it is never consulted. This convention
37 is the same as that adopted by the code-based back-end. *)
41 (* The semantic value associated with the chunk of input that this cell
44 semv
: 'semantic_value
;
46 (* The start and end positions of the chunk of input that this cell
49 startp
: Lexing.position
;
50 endp
: Lexing.position
;
52 (* The next cell down in the stack. If this is a self-pointer, then this
53 cell is the sentinel, and the stack is conceptually empty. *)
55 next
: ('state
, 'semantic_value
) stack
;
59 (* --------------------------------------------------------------------------- *)
61 (* A parsing environment contains basically all of the automaton's state. *)
63 type ('state
, 'semantic_value
, 'token
) env
= {
67 lexer
: Lexing.lexbuf
-> 'token
;
69 (* The lexing buffer. It is used as an argument to the lexer, and also
70 accessed directly when extracting positions. *)
72 lexbuf
: Lexing.lexbuf
;
74 (* The last token that was obtained from the lexer. *)
76 mutable token
: 'token
;
78 (* A count of how many tokens were shifted since the beginning, or since
79 the last [error] token was encountered. By convention, if [shifted]
80 is (-1), then the current lookahead token is [error]. *)
84 (* A copy of the value of [shifted] just before the most recent error
85 was detected. This value is not used by the automaton itself, but
86 is made accessible to semantic actions. *)
88 mutable previouserror
: int;
90 (* The stack. In [CodeBackend], it is passed around on its own,
91 whereas, here, it is accessed via the environment. *)
93 mutable stack
: ('state
, 'semantic_value
) stack
;
95 (* The current state. In [CodeBackend], it is passed around on its
96 own, whereas, here, it is accessed via the environment. *)
98 mutable current
: 'state
;
102 (* --------------------------------------------------------------------------- *)
104 (* This signature describes the parameters that must be supplied to the LR
107 module type TABLE
= sig
109 (* The type of automaton states. *)
113 (* The type of tokens. These can be thought of as real tokens, that is,
114 tokens returned by the lexer. They carry a semantic value. This type
115 does not include the [error] pseudo-token. *)
119 (* The type of terminal symbols. These can be thought of as integer codes.
120 They do not carry a semantic value. This type does include the [error]
125 (* The type of semantic values. *)
129 (* A token is conceptually a pair of a (non-[error]) terminal symbol and
130 a semantic value. The following two functions are the pair projections. *)
132 val token2terminal
: token
-> terminal
133 val token2value
: token
-> semantic_value
135 (* Even though the [error] pseudo-token is not a real token, it is a
136 terminal symbol. Furthermore, for regularity, it must have a semantic
139 val error_terminal
: terminal
140 val error_value
: semantic_value
142 (* The type of productions. *)
146 (* If a state [s] has a default reduction on production [prod], then, upon
147 entering [s], the automaton should reduce [prod] without consulting the
148 lookahead token. The following function allows determining which states
149 have default reductions. *)
151 (* Instead of returning a value of a sum type -- either [DefRed prod], or
152 [NoDefRed] -- it accepts two continuations, and invokes just one of
153 them. This mechanism allows avoiding a memory allocation. *)
155 val default_reduction
:
157 ('env
-> production
-> 'answer
) ->
161 (* An LR automaton can normally take three kinds of actions: shift, reduce,
162 or fail. (Acceptance is a particular case of reduction: it consists in
163 reducing a start production.) *)
165 (* There are two variants of the shift action. [shift/discard s] instructs
166 the automaton to discard the current token, request a new one from the
167 lexer, and move to state [s]. [shift/nodiscard s] instructs it to move to
168 state [s] without requesting a new token. This instruction should be used
169 when [s] has a default reduction on [#]. See [CodeBackend.gettoken] for
172 (* This is the automaton's action table. It maps a pair of a state and a
173 terminal symbol to an action. *)
175 (* Instead of returning a value of a sum type -- one of shift/discard,
176 shift/nodiscard, reduce, or fail -- this function accepts three
177 continuations, and invokes just one them. This mechanism allows avoiding
178 a memory allocation. *)
180 (* In summary, the parameters to [action] are as follows:
182 - the first two parameters, a state and a terminal symbol, are used to
183 look up the action table;
185 - the next parameter is the semantic value associated with the above
186 terminal symbol; it is not used, only passed along to the shift
187 continuation, as explained below;
189 - the shift continuation expects an environment; a flag that tells
190 whether to discard the current token; the terminal symbol that
191 is being shifted; its semantic value; and the target state of
194 - the reduce continuation expects an environment and a production;
196 - the fail continuation expects an environment;
198 - the last parameter is the environment; it is not used, only passed
199 along to the selected continuation. *)
205 ('env
-> bool -> terminal
-> semantic_value
-> state
-> 'answer
) ->
206 ('env
-> production
-> 'answer
) ->
210 (* This is the automaton's goto table. It maps a pair of a state and a
211 production to a new state.
213 This convention is slightly different from the textbook approach. The
214 goto table is usually indexed by a state and a non-terminal symbol. *)
216 val goto
: state
-> production
-> state
218 (* By convention, a semantic action is responsible for:
220 1. fetching whatever semantic values and positions it needs off the stack;
222 2. popping an appropriate number of cells off the stack, as dictated
223 by the length of the right-hand side of the production; this involves
224 updating [env.stack];
226 3. computing a new semantic value, as well as new start and end positions;
228 4. pushing a new stack cell, which contains the three values
229 computed in step 3; this again involves updating [env.stack]
230 (only one update is necessary).
232 Point 1 is essentially forced upon us: if semantic values were fetched
233 off the stack by this interpreter, then the calling convention for
234 semantic actions would be variadic: not all semantic actions would have
235 the same number of arguments. The rest follows rather naturally. *)
237 (* If production [prod] is an accepting production, then the semantic action
238 is responsible for raising exception [Accept], instead of returning
239 normally. This convention allows us to not distinguish between regular
240 productions and accepting productions. All we have to do is catch that
241 exception at top level. *)
243 (* Semantic actions are allowed to raise [Error]. *)
245 exception Accept
of semantic_value
248 type semantic_action
=
249 (state
, semantic_value
, token
) env
-> unit
251 val semantic_action
: production
-> semantic_action
253 (* The LR engine can attempt error recovery. This consists in discarding
254 tokens, just after an error has been successfully handled, until a token
255 that can be successfully handled is found. This mechanism is optional.
256 The following flag enables it. *)
260 (* The LR engine requires a number of hooks, which are used for logging. *)
262 (* The comments below indicate the conventional messages that correspond
263 to these hooks in the code-based back-end; see [CodeBackend]. *)
269 val state
: state
-> unit
271 (* Shifting (<terminal>) to state <state> *)
273 val shift
: terminal
-> state
-> unit
275 (* Reducing a production should be logged either as a reduction
276 event (for regular productions) or as an acceptance event (for
277 start productions). *)
279 (* Reducing production <production> / Accepting *)
281 val reduce_or_accept
: production
-> unit
283 (* Lookahead token is now <terminal> (<pos>-<pos>) *)
285 val lookahead_token
: Lexing.lexbuf
-> terminal
-> unit
287 (* Initiating error handling *)
289 val initiating_error_handling
: unit -> unit
291 (* Resuming error handling *)
293 val resuming_error_handling
: unit -> unit
295 (* Handling error in state <state> *)
297 val handling_error
: state
-> unit
299 (* Discarding last token read (<terminal>) *)
301 val discarding_last_token
: terminal
-> unit
307 (* --------------------------------------------------------------------------- *)
309 (* This signature describes the LR engine. *)
311 module type ENGINE
= sig
319 (* An entry point to the engine requires a start state, a lexer, and a lexing
320 buffer. It either succeeds and produces a semantic value, or fails and
327 (Lexing.lexbuf
-> token
) ->