1 (*****************************************************************************)
3 (*****************************************************************************)
6 * "Certain applications are extremely well suited to functional
7 * programming and parsing is one of them. Specifically, the ability to
8 * write functional combinators that allow parsers for everything from
9 * integers up to symbolic expressions to be composed is more general
10 * and provides more opportunity for code reuse than the use of
11 * conventional parser generators such as ocamllex and ocamlyacc. This
12 * article explains how parser combinators may be designed and
13 * implemented in OCaml, using the standard example of a calculator."
15 * Based on haskell articles I guess like meijer functional pearl or
16 * graham hutton articles. Also maybe based on haskell parsec.
18 * pad: a few bugfix. I also put more restrictive and descriptive types.
19 * pad: I remember having coded such a library, maybe not in ocaml.
20 * Or maybe it was during a "TP compilation" at INSA ? I remember having
21 * a generic lexer. Or maybe it was genlex ?
26 * alternatives: genlex + parser extension of ocaml (streams).
29 * Example: a lexer suitable for a desk calculator is obtained by
30 * let lexer = make_lexer ["+";"-";"*";"/";"let";"="; "("; ")"]
31 * let parse_expr = parser
33 * | [< 'Kwd "("; n = parse_expr; 'Kwd ")" >] -> n
34 * | [< n1 = parse_expr; n2 = parse_remainder n1 >] -> n2
35 * and parse_remainder n1 = parser
36 * [< 'Kwd "+"; n2 = parse_expr >] -> n1+n2
47 * Cf also ocaml manual
48 * let rec parse_expr = parser
49 * [< e1 = parse_mult; e = parse_more_adds e1 >] -> e
50 * and parse_more_adds e1 = parser
51 * [< 'Kwd "+"; e2 = parse_mult; e = parse_more_adds (Sum(e1, e2)) >] -> e
52 * | [< 'Kwd "-"; e2 = parse_mult; e = parse_more_adds (Diff(e1, e2)) >] -> e
54 * and parse_mult = parser
55 * [< e1 = parse_simple; e = parse_more_mults e1 >] -> e
56 * and parse_more_mults e1 = parser
57 * [< 'Kwd "*"; e2 = parse_simple; e = parse_more_mults (Prod(e1, e2)) >] -> e
58 * | [< 'Kwd "/"; e2 = parse_simple; e = parse_more_mults (Quot(e1, e2)) >] -> e
60 * and parse_simple = parser
61 * [< 'Ident s >] -> Var s
62 * | [< 'Int i >] -> Const(float i)
63 * | [< 'Float f >] -> Const f
64 * | [< 'Kwd "("; e = parse_expr; 'Kwd ")" >] -> e;;
65 * But see how they are forced to use a LL(1) grammar which denatures the
66 * grammar "parse_more_xxx"
70 (*****************************************************************************)
71 (* Parser Combinators *)
72 (*****************************************************************************)
74 (* src: Jon Harrop. pad: a few bugfix *)
76 type ('a
, 'b
) genp
= 'a list
-> 'b
* 'a list
77 let val_of_parser = fst
79 (* lexer = parser of char list *)
80 (* type 'a lexer = (char, 'a) genp *)
82 (* grammer = parser ot tokens *)
83 (* type 'a p = (token, 'a) genp *)
86 (* pad: could also do it by returning a Maybe and use monad *)
101 let es, s
= many p s
in
111 (* was called 'some', but confusing *)
112 let pred p
= function
113 | h
::t
when p h
-> h
, t
114 | _
-> raise Not_found
116 let a x
= pred (( = ) x
)
118 let several p
= many (pred p
)
130 | [] as t
-> Abstr.x, t
131 | _
-> raise Not_found
134 (*****************************************************************************)
136 (*****************************************************************************)
137 (* a generic lexer *)
144 | '
a'
..'z'
| 'A'
..'Z'
-> true
148 let symbol = function
153 | '
+'
| '
-'
| '
*'
| '
/'
161 | ' '
| '
\t'
| '
\n'
-> true
164 let stringquote = function
173 let alphanum c = digit c || alpha c
176 let alphanum_underscore c = digit c || alpha c || (c = '_')
177 let alphanum_minus c = digit c || alpha c || (c = '-')
178 let alphanum_under_minus c = digit c || alpha c || (c = '-') || (c = '_')
183 let string_of_chars cs =
184 cs +> List.map (String.make 1) +> String.concat ""
188 String.concat "" (List.map (String.make 1) (h::t))
191 String.concat "" (List.map (String.make 1) (xs))
193 let list_of_string string =
195 String.iter (fun c -> list := c :: !list) string;
199 (*****************************************************************************)
201 (*****************************************************************************)
210 let string_of_token = function
211 | IDENT string -> "IDENT
:" ^ string
212 | KWD string -> "KWD
:" ^ string
213 | INT string -> "INT
:" ^ string
214 | SYM string -> "SYM
:" ^ string
215 | STR string -> "STR
:" ^ string
218 type lexer = (char, token) genp
222 pred digit +++ several digit >| fun x -> INT(collect x)
224 pred alpha +++ several alphanum >| fun x -> IDENT(collect x)
226 pred symbol +++ several symbol >| fun x -> SYM(collect x)
229 let p c = not(space c) && not(digit c) in
230 pred p +++ several p >| fun x -> KWD(collect x)
233 (* todo: handle antislash *)
236 several (fun c -> not (stringquote c)) +++
238 >| (fun ((c1, cs), c3) ->
239 let s = string_of_chars cs in
240 STR s (* exclude the marker *)
244 let lex_gen tokenf str =
245 let alltoks = (many tokenf) +++ fin >| fst in
246 val_of_parser (alltoks (list_of_string str))
248 let parse_gen tokenf grammarf p string =
249 val_of_parser (grammarf (lex_gen tokenf string))
251 (*****************************************************************************)
252 (* not generic anymore *)
253 (*****************************************************************************)
254 (* the order is important if some "rules
" overlap, as in ocamllex *)
256 (rawident ||| rawnumber ||| rawkeyword) +++ several space >| fst
258 (* pad: bugfix: was not defined in jon harrop article *)
259 let tokens = many token
262 tokens +++ fin >| fst
264 let lex (string : string) =
265 val_of_parser (alltokens (list_of_string string))
271 (lex "a x^
2 + b
x + c
"
273 [IDENT "a"; IDENT "x"; KWD "^
"; INT "2"; KWD "+"; IDENT "b
"; IDENT "x";
277 (*****************************************************************************)
279 (*****************************************************************************)
287 type 'a pparser = (token, 'a) genp
291 # let rec print_expr ff = function
292 | Int n -> fprintf ff "%d
" n
293 | Var x -> fprintf ff "%s
" x
295 fprintf ff "%a
+ %a
" print_expr f print_expr g
297 fprintf ff "%a %a
" print_mul f print_mul g
298 and print_mul ff = function
299 | Add _ as e -> fprintf ff "(%a
)" print_expr e
300 | e -> fprintf ff "%a
" print_expr e
301 #install_printer print_expr
305 | IDENT x :: t -> x, t
306 | _ -> raise Not_found
310 | _ -> raise Not_found
312 let string = function
314 | _ -> raise Not_found
317 * "This style
of parsing
, known
as recursive descent parsing
, has one
318 * important caveat
. If
a rule tries
to match itself immediately
, even
if
319 * that is succeeded by other parsers
, then the resulting program will go
320 * into an infinite loops
with the parser
for that rule calling itself
321 * indefinitely until
a stack overflow occurs
. Consequently
, our
322 * implementation
of the factor parser is careful
to parse an atom first
,
323 * and term calls factor first
, to avoid this problem
."
325 * pad: bugfix, added the KWD "*".
328 (* pad: I think I remembered you cant eta-factorize the parameter
329 * when you use mutually recursive
333 (int >| fun n -> Int(int_of_string n))
335 (ident >| fun x -> Var x)
337 (a (KWD "(") +++ term +++ a (KWD ")") >| fun ((_, e), _) -> e)
341 (atom +++ a (KWD "*") +++ factor >| fun ((f, _), g) -> Mul (f,g))
347 (factor +++ a (KWD "+") +++ term >| fun ((f, _), g) -> Add (f,g))
357 val_of_parser(p(lex string))
360 parse expr "a x x + b
x + c
"
364 (*****************************************************************************)
366 module Infix = struct