Release coccinelle-0.1.1
[bpt/coccinelle.git] / commons / ocamlextra / parser_combinators.ml
1 (*****************************************************************************)
2 (* *)
3 (*****************************************************************************)
4 (* src: Jon Harrop.
5 *
6 * "Certain applications are extremely well suited to functional
7 * programming and parsing is one of them. Specifically, the ability to
8 * write functional combinators that allow parsers for everything from
9 * integers up to symbolic expressions to be composed is more general
10 * and provides more opportunity for code reuse than the use of
11 * conventional parser generators such as ocamllex and ocamlyacc. This
12 * article explains how parser combinators may be designed and
13 * implemented in OCaml, using the standard example of a calculator."
14 *
15 * Based on haskell articles I guess like meijer functional pearl or
16 * graham hutton articles. Also maybe based on haskell parsec.
17 *
18 * pad: a few bugfix. I also put more restrictive and descriptive types.
19 * pad: I remember having coded such a library, maybe not in ocaml.
20 * Or maybe it was during a "TP compilation" at INSA ? I remember having
21 * a generic lexer. Or maybe it was genlex ?
22 *
23 *
24 *
25 *
26 * alternatives: genlex + parser extension of ocaml (streams).
27 * cf genlex doc:
28 *
29 * Example: a lexer suitable for a desk calculator is obtained by
30 * let lexer = make_lexer ["+";"-";"*";"/";"let";"="; "("; ")"]
31 * let parse_expr = parser
32 * [< 'Int n >] -> n
33 * | [< 'Kwd "("; n = parse_expr; 'Kwd ")" >] -> n
34 * | [< n1 = parse_expr; n2 = parse_remainder n1 >] -> n2
35 * and parse_remainder n1 = parser
36 * [< 'Kwd "+"; n2 = parse_expr >] -> n1+n2
37 * | ...
38 * type token =
39 * | Kwd of string
40 * | Ident of string
41 * | Int of int
42 * | Float of float
43 * | String of string
44 * | Char of char
45 *
46 *
47 * Cf also ocaml manual
48 * let rec parse_expr = parser
49 * [< e1 = parse_mult; e = parse_more_adds e1 >] -> e
50 * and parse_more_adds e1 = parser
51 * [< 'Kwd "+"; e2 = parse_mult; e = parse_more_adds (Sum(e1, e2)) >] -> e
52 * | [< 'Kwd "-"; e2 = parse_mult; e = parse_more_adds (Diff(e1, e2)) >] -> e
53 * | [< >] -> e1
54 * and parse_mult = parser
55 * [< e1 = parse_simple; e = parse_more_mults e1 >] -> e
56 * and parse_more_mults e1 = parser
57 * [< 'Kwd "*"; e2 = parse_simple; e = parse_more_mults (Prod(e1, e2)) >] -> e
58 * | [< 'Kwd "/"; e2 = parse_simple; e = parse_more_mults (Quot(e1, e2)) >] -> e
59 * | [< >] -> e1
60 * and parse_simple = parser
61 * [< 'Ident s >] -> Var s
62 * | [< 'Int i >] -> Const(float i)
63 * | [< 'Float f >] -> Const f
64 * | [< 'Kwd "("; e = parse_expr; 'Kwd ")" >] -> e;;
65 * But see how they are forced to use a LL(1) grammar which denatures the
66 * grammar "parse_more_xxx"
67 *
68 *)
69
70 (*****************************************************************************)
71 (* Parser Combinators *)
72 (*****************************************************************************)
73
74 (* src: Jon Harrop. pad: a few bugfix *)
75
76 type ('a, 'b) genp = 'a list -> 'b * 'a list
77 let val_of_parser = fst
78
79 (* lexer = parser of char list *)
80 (* type 'a lexer = (char, 'a) genp *)
81
82 (* grammer = parser ot tokens *)
83 (* type 'a p = (token, 'a) genp *)
84
85
86 (* pad: could also do it by returning a Maybe and use monad *)
87 let ( ||| ) p1 p2 s =
88 try
89 p1 s
90 with Not_found ->
91 p2 s
92
93 let ( +++ ) p1 p2 s =
94 let e1, s = p1 s in
95 let e2, s = p2 s in
96 (e1, e2), s
97
98 let rec many p s =
99 try
100 let e, s = p s in
101 let es, s = many p s in
102 e::es, s
103 with Not_found ->
104 [], s
105
106
107 let ( >| ) p k i =
108 let e, s = p i in
109 k e, s
110
111 (* was called 'some', but confusing *)
112 let pred p = function
113 | h::t when p h -> h, t
114 | _ -> raise Not_found
115
116 let a x = pred (( = ) x)
117
118 let several p = many (pred p)
119
120
121 module Abstr : sig
122 type t
123 val x : t
124 end = struct
125 type t = int
126 let x = 0
127 end
128
129 let fin = function
130 | [] as t -> Abstr.x, t
131 | _ -> raise Not_found
132
133
134 (*****************************************************************************)
135 (* Lexing *)
136 (*****************************************************************************)
137 (* a generic lexer *)
138
139 let digit = function
140 | '0'..'9' -> true
141 | _ -> false
142
143 let alpha = function
144 | 'a'..'z' | 'A'..'Z' -> true
145 | _ -> false
146
147
148 let symbol = function
149 | '(' | ')'
150 | '{' | '}'
151 | '[' | ']'
152 | '<' | '>'
153 | '+' | '-' | '*' | '/'
154 | '&' | '|' | '!'
155
156 | '=' | '~' | '@'
157 -> true
158 | _ -> false
159
160 let space = function
161 | ' ' | '\t' | '\n' -> true
162 | _ -> false
163
164 let stringquote = function
165 | '"' -> true
166 | _ -> false
167
168 let quote = function
169 | '\'' -> true
170 | _ -> false
171
172
173 let alphanum c = digit c || alpha c
174
175
176 let alphanum_underscore c = digit c || alpha c || (c = '_')
177 let alphanum_minus c = digit c || alpha c || (c = '-')
178 let alphanum_under_minus c = digit c || alpha c || (c = '-') || (c = '_')
179
180
181
182 let (+>) o f = f o
183 let string_of_chars cs =
184 cs +> List.map (String.make 1) +> String.concat ""
185
186
187 let collect(h, t) =
188 String.concat "" (List.map (String.make 1) (h::t))
189
190 let collectbis(xs) =
191 String.concat "" (List.map (String.make 1) (xs))
192
193 let list_of_string string =
194 let list = ref [] in
195 String.iter (fun c -> list := c :: !list) string;
196 List.rev !list
197
198
199 (*****************************************************************************)
200 (* still generic *)
201 (*****************************************************************************)
202
203 type token =
204 | IDENT of string
205 | KWD of string
206 | INT of string
207 | SYM of string
208 | STR of string
209
210 let string_of_token = function
211 | IDENT string -> "IDENT:" ^ string
212 | KWD string -> "KWD:" ^ string
213 | INT string -> "INT:" ^ string
214 | SYM string -> "SYM:" ^ string
215 | STR string -> "STR:" ^ string
216
217
218 type lexer = (char, token) genp
219
220
221 let rawnumber =
222 pred digit +++ several digit >| fun x -> INT(collect x)
223 let rawident =
224 pred alpha +++ several alphanum >| fun x -> IDENT(collect x)
225 let rawsymbol =
226 pred symbol +++ several symbol >| fun x -> SYM(collect x)
227
228 let rawkeyword =
229 let p c = not(space c) && not(digit c) in
230 pred p +++ several p >| fun x -> KWD(collect x)
231
232
233 (* todo: handle antislash *)
234 let rawstring =
235 pred stringquote +++
236 several (fun c -> not (stringquote c)) +++
237 pred stringquote
238 >| (fun ((c1, cs), c3) ->
239 let s = string_of_chars cs in
240 STR s (* exclude the marker *)
241 )
242
243
244 let lex_gen tokenf str =
245 let alltoks = (many tokenf) +++ fin >| fst in
246 val_of_parser (alltoks (list_of_string str))
247
248 let parse_gen tokenf grammarf p string =
249 val_of_parser (grammarf (lex_gen tokenf string))
250
251 (*****************************************************************************)
252 (* not generic anymore *)
253 (*****************************************************************************)
254 (* the order is important if some "rules" overlap, as in ocamllex *)
255 let token =
256 (rawident ||| rawnumber ||| rawkeyword) +++ several space >| fst
257
258 (* pad: bugfix: was not defined in jon harrop article *)
259 let tokens = many token
260
261 let alltokens =
262 tokens +++ fin >| fst
263
264 let lex (string : string) =
265 val_of_parser (alltokens (list_of_string string))
266
267
268
269 let test1 () =
270 Common.example
271 (lex "a x^2 + b x + c"
272 =
273 [IDENT "a"; IDENT "x"; KWD "^"; INT "2"; KWD "+"; IDENT "b"; IDENT "x";
274 KWD "+"; IDENT "c"]
275 )
276
277 (*****************************************************************************)
278 (* Parsing *)
279 (*****************************************************************************)
280
281 type expr =
282 | Int of int
283 | Var of string
284 | Add of expr * expr
285 | Mul of expr * expr
286
287 type 'a pparser = (token, 'a) genp
288
289 (*
290 open Format;;
291 # let rec print_expr ff = function
292 | Int n -> fprintf ff "%d" n
293 | Var x -> fprintf ff "%s" x
294 | Add(f, g) ->
295 fprintf ff "%a + %a" print_expr f print_expr g
296 | Mul(f, g) ->
297 fprintf ff "%a %a" print_mul f print_mul g
298 and print_mul ff = function
299 | Add _ as e -> fprintf ff "(%a)" print_expr e
300 | e -> fprintf ff "%a" print_expr e
301 #install_printer print_expr
302 *)
303
304 let ident = function
305 | IDENT x :: t -> x, t
306 | _ -> raise Not_found
307
308 let int = function
309 | INT n :: t -> n, t
310 | _ -> raise Not_found
311
312 let string = function
313 | STR x :: t -> x, t
314 | _ -> raise Not_found
315
316 (* src: Jon Harrop
317 * "This style of parsing, known as recursive descent parsing , has one
318 * important caveat. If a rule tries to match itself immediately, even if
319 * that is succeeded by other parsers, then the resulting program will go
320 * into an infinite loops with the parser for that rule calling itself
321 * indefinitely until a stack overflow occurs. Consequently, our
322 * implementation of the factor parser is careful to parse an atom first,
323 * and term calls factor first, to avoid this problem."
324 *
325 * pad: bugfix, added the KWD "*".
326 *)
327
328 (* pad: I think I remembered you cant eta-factorize the parameter
329 * when you use mutually recursive
330 *)
331 let rec atom s =
332 (
333 (int >| fun n -> Int(int_of_string n))
334 |||
335 (ident >| fun x -> Var x)
336 |||
337 (a (KWD "(") +++ term +++ a (KWD ")") >| fun ((_, e), _) -> e)
338 ) s
339 and factor s =
340 (
341 (atom +++ a (KWD "*") +++ factor >| fun ((f, _), g) -> Mul (f,g))
342 |||
343 atom
344 ) s
345 and term s =
346 (
347 (factor +++ a (KWD "+") +++ term >| fun ((f, _), g) -> Add (f,g))
348 |||
349 factor
350 ) s
351
352
353 let expr =
354 term +++ fin >| fst
355
356 let parse p string =
357 val_of_parser(p(lex string))
358
359 (*
360 parse expr "a x x + b x + c"
361 *)
362
363
364 (*****************************************************************************)
365
366 module Infix = struct
367 let (|||) = (|||)
368 let (+++) = (+++)
369 let (>|) = (>|)
370 end