Commit | Line | Data |
---|---|---|
34e49164 C |
1 | (*****************************************************************************) |
2 | (* *) | |
3 | (*****************************************************************************) | |
ae4735db | 4 | (* src: Jon Harrop. |
34e49164 C |
5 | * |
6 | * "Certain applications are extremely well suited to functional | |
7 | * programming and parsing is one of them. Specifically, the ability to | |
8 | * write functional combinators that allow parsers for everything from | |
9 | * integers up to symbolic expressions to be composed is more general | |
10 | * and provides more opportunity for code reuse than the use of | |
11 | * conventional parser generators such as ocamllex and ocamlyacc. This | |
12 | * article explains how parser combinators may be designed and | |
13 | * implemented in OCaml, using the standard example of a calculator." | |
ae4735db | 14 | * |
34e49164 C |
15 | * Based on haskell articles I guess like meijer functional pearl or |
16 | * graham hutton articles. Also maybe based on haskell parsec. | |
ae4735db | 17 | * |
34e49164 | 18 | * pad: a few bugfix. I also put more restrictive and descriptive types. |
ae4735db | 19 | * pad: I remember having coded such a library, maybe not in ocaml. |
34e49164 C |
20 | * Or maybe it was during a "TP compilation" at INSA ? I remember having |
21 | * a generic lexer. Or maybe it was genlex ? | |
ae4735db C |
22 | * |
23 | * | |
24 | * | |
25 | * | |
34e49164 C |
26 | * alternatives: genlex + parser extension of ocaml (streams). |
27 | * cf genlex doc: | |
ae4735db | 28 | * |
34e49164 | 29 | * Example: a lexer suitable for a desk calculator is obtained by |
ae4735db | 30 | * let lexer = make_lexer ["+";"-";"*";"/";"let";"="; "("; ")"] |
34e49164 C |
31 | * let parse_expr = parser |
32 | * [< 'Int n >] -> n | |
33 | * | [< 'Kwd "("; n = parse_expr; 'Kwd ")" >] -> n | |
34 | * | [< n1 = parse_expr; n2 = parse_remainder n1 >] -> n2 | |
35 | * and parse_remainder n1 = parser | |
36 | * [< 'Kwd "+"; n2 = parse_expr >] -> n1+n2 | |
37 | * | ... | |
38 | * type token = | |
39 | * | Kwd of string | |
40 | * | Ident of string | |
41 | * | Int of int | |
42 | * | Float of float | |
43 | * | String of string | |
44 | * | Char of char | |
ae4735db C |
45 | * |
46 | * | |
34e49164 C |
47 | * Cf also ocaml manual |
48 | * let rec parse_expr = parser | |
49 | * [< e1 = parse_mult; e = parse_more_adds e1 >] -> e | |
50 | * and parse_more_adds e1 = parser | |
51 | * [< 'Kwd "+"; e2 = parse_mult; e = parse_more_adds (Sum(e1, e2)) >] -> e | |
52 | * | [< 'Kwd "-"; e2 = parse_mult; e = parse_more_adds (Diff(e1, e2)) >] -> e | |
53 | * | [< >] -> e1 | |
54 | * and parse_mult = parser | |
55 | * [< e1 = parse_simple; e = parse_more_mults e1 >] -> e | |
56 | * and parse_more_mults e1 = parser | |
57 | * [< 'Kwd "*"; e2 = parse_simple; e = parse_more_mults (Prod(e1, e2)) >] -> e | |
58 | * | [< 'Kwd "/"; e2 = parse_simple; e = parse_more_mults (Quot(e1, e2)) >] -> e | |
59 | * | [< >] -> e1 | |
60 | * and parse_simple = parser | |
61 | * [< 'Ident s >] -> Var s | |
62 | * | [< 'Int i >] -> Const(float i) | |
63 | * | [< 'Float f >] -> Const f | |
64 | * | [< 'Kwd "("; e = parse_expr; 'Kwd ")" >] -> e;; | |
65 | * But see how they are forced to use a LL(1) grammar which denatures the | |
66 | * grammar "parse_more_xxx" | |
ae4735db | 67 | * |
34e49164 C |
68 | *) |
69 | ||
70 | (*****************************************************************************) | |
71 | (* Parser Combinators *) | |
72 | (*****************************************************************************) | |
73 | ||
74 | (* src: Jon Harrop. pad: a few bugfix *) | |
75 | ||
76 | type ('a, 'b) genp = 'a list -> 'b * 'a list | |
77 | let val_of_parser = fst | |
78 | ||
79 | (* lexer = parser of char list *) | |
80 | (* type 'a lexer = (char, 'a) genp *) | |
81 | ||
82 | (* grammer = parser ot tokens *) | |
83 | (* type 'a p = (token, 'a) genp *) | |
84 | ||
85 | ||
86 | (* pad: could also do it by returning a Maybe and use monad *) | |
87 | let ( ||| ) p1 p2 s = | |
ae4735db C |
88 | try |
89 | p1 s | |
90 | with Not_found -> | |
34e49164 | 91 | p2 s |
ae4735db | 92 | |
34e49164 C |
93 | let ( +++ ) p1 p2 s = |
94 | let e1, s = p1 s in | |
95 | let e2, s = p2 s in | |
96 | (e1, e2), s | |
97 | ||
98 | let rec many p s = | |
99 | try | |
100 | let e, s = p s in | |
101 | let es, s = many p s in | |
102 | e::es, s | |
103 | with Not_found -> | |
104 | [], s | |
105 | ||
106 | ||
107 | let ( >| ) p k i = | |
108 | let e, s = p i in | |
109 | k e, s | |
110 | ||
111 | (* was called 'some', but confusing *) | |
112 | let pred p = function | |
113 | | h::t when p h -> h, t | |
114 | | _ -> raise Not_found | |
115 | ||
116 | let a x = pred (( = ) x) | |
117 | ||
118 | let several p = many (pred p) | |
119 | ||
120 | ||
121 | module Abstr : sig | |
122 | type t | |
123 | val x : t | |
124 | end = struct | |
125 | type t = int | |
126 | let x = 0 | |
127 | end | |
128 | ||
129 | let fin = function | |
130 | | [] as t -> Abstr.x, t | |
131 | | _ -> raise Not_found | |
132 | ||
133 | ||
134 | (*****************************************************************************) | |
135 | (* Lexing *) | |
136 | (*****************************************************************************) | |
137 | (* a generic lexer *) | |
138 | ||
139 | let digit = function | |
140 | | '0'..'9' -> true | |
141 | | _ -> false | |
142 | ||
143 | let alpha = function | |
144 | | 'a'..'z' | 'A'..'Z' -> true | |
145 | | _ -> false | |
146 | ||
147 | ||
148 | let symbol = function | |
ae4735db C |
149 | | '(' | ')' |
150 | | '{' | '}' | |
151 | | '[' | ']' | |
152 | | '<' | '>' | |
153 | | '+' | '-' | '*' | '/' | |
154 | | '&' | '|' | '!' | |
34e49164 C |
155 | |
156 | | '=' | '~' | '@' | |
157 | -> true | |
158 | | _ -> false | |
159 | ||
160 | let space = function | |
161 | | ' ' | '\t' | '\n' -> true | |
162 | | _ -> false | |
163 | ||
164 | let stringquote = function | |
165 | | '"' -> true | |
166 | | _ -> false | |
167 | ||
168 | let quote = function | |
169 | | '\'' -> true | |
170 | | _ -> false | |
171 | ||
172 | ||
173 | let alphanum c = digit c || alpha c | |
174 | ||
175 | ||
176 | let alphanum_underscore c = digit c || alpha c || (c = '_') | |
177 | let alphanum_minus c = digit c || alpha c || (c = '-') | |
178 | let alphanum_under_minus c = digit c || alpha c || (c = '-') || (c = '_') | |
179 | ||
180 | ||
181 | ||
182 | let (+>) o f = f o | |
ae4735db | 183 | let string_of_chars cs = |
34e49164 C |
184 | cs +> List.map (String.make 1) +> String.concat "" |
185 | ||
186 | ||
187 | let collect(h, t) = | |
188 | String.concat "" (List.map (String.make 1) (h::t)) | |
189 | ||
190 | let collectbis(xs) = | |
191 | String.concat "" (List.map (String.make 1) (xs)) | |
192 | ||
193 | let list_of_string string = | |
194 | let list = ref [] in | |
195 | String.iter (fun c -> list := c :: !list) string; | |
196 | List.rev !list | |
197 | ||
198 | ||
199 | (*****************************************************************************) | |
200 | (* still generic *) | |
201 | (*****************************************************************************) | |
202 | ||
203 | type token = | |
204 | | IDENT of string | |
205 | | KWD of string | |
206 | | INT of string | |
207 | | SYM of string | |
208 | | STR of string | |
209 | ||
210 | let string_of_token = function | |
211 | | IDENT string -> "IDENT:" ^ string | |
212 | | KWD string -> "KWD:" ^ string | |
213 | | INT string -> "INT:" ^ string | |
214 | | SYM string -> "SYM:" ^ string | |
215 | | STR string -> "STR:" ^ string | |
216 | ||
217 | ||
218 | type lexer = (char, token) genp | |
219 | ||
220 | ||
221 | let rawnumber = | |
222 | pred digit +++ several digit >| fun x -> INT(collect x) | |
223 | let rawident = | |
224 | pred alpha +++ several alphanum >| fun x -> IDENT(collect x) | |
225 | let rawsymbol = | |
226 | pred symbol +++ several symbol >| fun x -> SYM(collect x) | |
227 | ||
228 | let rawkeyword = | |
229 | let p c = not(space c) && not(digit c) in | |
230 | pred p +++ several p >| fun x -> KWD(collect x) | |
231 | ||
232 | ||
233 | (* todo: handle antislash *) | |
ae4735db C |
234 | let rawstring = |
235 | pred stringquote +++ | |
34e49164 C |
236 | several (fun c -> not (stringquote c)) +++ |
237 | pred stringquote | |
ae4735db | 238 | >| (fun ((c1, cs), c3) -> |
34e49164 C |
239 | let s = string_of_chars cs in |
240 | STR s (* exclude the marker *) | |
241 | ) | |
242 | ||
243 | ||
ae4735db | 244 | let lex_gen tokenf str = |
34e49164 C |
245 | let alltoks = (many tokenf) +++ fin >| fst in |
246 | val_of_parser (alltoks (list_of_string str)) | |
247 | ||
248 | let parse_gen tokenf grammarf p string = | |
249 | val_of_parser (grammarf (lex_gen tokenf string)) | |
250 | ||
251 | (*****************************************************************************) | |
252 | (* not generic anymore *) | |
253 | (*****************************************************************************) | |
254 | (* the order is important if some "rules" overlap, as in ocamllex *) | |
255 | let token = | |
256 | (rawident ||| rawnumber ||| rawkeyword) +++ several space >| fst | |
257 | ||
258 | (* pad: bugfix: was not defined in jon harrop article *) | |
259 | let tokens = many token | |
260 | ||
261 | let alltokens = | |
262 | tokens +++ fin >| fst | |
263 | ||
264 | let lex (string : string) = | |
265 | val_of_parser (alltokens (list_of_string string)) | |
266 | ||
267 | ||
268 | ||
ae4735db | 269 | let test1 () = |
34e49164 | 270 | Common.example |
ae4735db | 271 | (lex "a x^2 + b x + c" |
34e49164 C |
272 | = |
273 | [IDENT "a"; IDENT "x"; KWD "^"; INT "2"; KWD "+"; IDENT "b"; IDENT "x"; | |
274 | KWD "+"; IDENT "c"] | |
275 | ) | |
276 | ||
277 | (*****************************************************************************) | |
278 | (* Parsing *) | |
279 | (*****************************************************************************) | |
280 | ||
281 | type expr = | |
282 | | Int of int | |
283 | | Var of string | |
284 | | Add of expr * expr | |
285 | | Mul of expr * expr | |
286 | ||
287 | type 'a pparser = (token, 'a) genp | |
288 | ||
289 | (* | |
290 | open Format;; | |
291 | # let rec print_expr ff = function | |
292 | | Int n -> fprintf ff "%d" n | |
293 | | Var x -> fprintf ff "%s" x | |
294 | | Add(f, g) -> | |
295 | fprintf ff "%a + %a" print_expr f print_expr g | |
296 | | Mul(f, g) -> | |
297 | fprintf ff "%a %a" print_mul f print_mul g | |
298 | and print_mul ff = function | |
299 | | Add _ as e -> fprintf ff "(%a)" print_expr e | |
300 | | e -> fprintf ff "%a" print_expr e | |
301 | #install_printer print_expr | |
302 | *) | |
303 | ||
304 | let ident = function | |
305 | | IDENT x :: t -> x, t | |
306 | | _ -> raise Not_found | |
307 | ||
308 | let int = function | |
309 | | INT n :: t -> n, t | |
310 | | _ -> raise Not_found | |
311 | ||
312 | let string = function | |
313 | | STR x :: t -> x, t | |
314 | | _ -> raise Not_found | |
315 | ||
316 | (* src: Jon Harrop | |
317 | * "This style of parsing, known as recursive descent parsing , has one | |
318 | * important caveat. If a rule tries to match itself immediately, even if | |
319 | * that is succeeded by other parsers, then the resulting program will go | |
320 | * into an infinite loops with the parser for that rule calling itself | |
321 | * indefinitely until a stack overflow occurs. Consequently, our | |
322 | * implementation of the factor parser is careful to parse an atom first, | |
323 | * and term calls factor first, to avoid this problem." | |
ae4735db | 324 | * |
34e49164 C |
325 | * pad: bugfix, added the KWD "*". |
326 | *) | |
327 | ||
ae4735db | 328 | (* pad: I think I remembered you cant eta-factorize the parameter |
34e49164 C |
329 | * when you use mutually recursive |
330 | *) | |
331 | let rec atom s = | |
332 | ( | |
ae4735db | 333 | (int >| fun n -> Int(int_of_string n)) |
34e49164 | 334 | ||| |
ae4735db | 335 | (ident >| fun x -> Var x) |
34e49164 C |
336 | ||| |
337 | (a (KWD "(") +++ term +++ a (KWD ")") >| fun ((_, e), _) -> e) | |
338 | ) s | |
339 | and factor s = | |
340 | ( | |
ae4735db | 341 | (atom +++ a (KWD "*") +++ factor >| fun ((f, _), g) -> Mul (f,g)) |
34e49164 C |
342 | ||| |
343 | atom | |
344 | ) s | |
345 | and term s = | |
346 | ( | |
ae4735db | 347 | (factor +++ a (KWD "+") +++ term >| fun ((f, _), g) -> Add (f,g)) |
34e49164 C |
348 | ||| |
349 | factor | |
350 | ) s | |
351 | ||
352 | ||
353 | let expr = | |
354 | term +++ fin >| fst | |
355 | ||
356 | let parse p string = | |
357 | val_of_parser(p(lex string)) | |
358 | ||
ae4735db | 359 | (* |
34e49164 C |
360 | parse expr "a x x + b x + c" |
361 | *) | |
362 | ||
363 | ||
364 | (*****************************************************************************) | |
365 | ||
366 | module Infix = struct | |
367 | let (|||) = (|||) | |
368 | let (+++) = (+++) | |
369 | let (>|) = (>|) | |
370 | end |