| 1 | (*****************************************************************************) |
| 2 | (* *) |
| 3 | (*****************************************************************************) |
| 4 | (* src: Jon Harrop. |
| 5 | * |
| 6 | * "Certain applications are extremely well suited to functional |
| 7 | * programming and parsing is one of them. Specifically, the ability to |
| 8 | * write functional combinators that allow parsers for everything from |
| 9 | * integers up to symbolic expressions to be composed is more general |
| 10 | * and provides more opportunity for code reuse than the use of |
| 11 | * conventional parser generators such as ocamllex and ocamlyacc. This |
| 12 | * article explains how parser combinators may be designed and |
| 13 | * implemented in OCaml, using the standard example of a calculator." |
| 14 | * |
| 15 | * Based on haskell articles I guess like meijer functional pearl or |
| 16 | * graham hutton articles. Also maybe based on haskell parsec. |
| 17 | * |
| 18 | * pad: a few bugfix. I also put more restrictive and descriptive types. |
| 19 | * pad: I remember having coded such a library, maybe not in ocaml. |
| 20 | * Or maybe it was during a "TP compilation" at INSA ? I remember having |
| 21 | * a generic lexer. Or maybe it was genlex ? |
| 22 | * |
| 23 | * |
| 24 | * |
| 25 | * |
| 26 | * alternatives: genlex + parser extension of ocaml (streams). |
| 27 | * cf genlex doc: |
| 28 | * |
| 29 | * Example: a lexer suitable for a desk calculator is obtained by |
| 30 | * let lexer = make_lexer ["+";"-";"*";"/";"let";"="; "("; ")"] |
| 31 | * let parse_expr = parser |
| 32 | * [< 'Int n >] -> n |
| 33 | * | [< 'Kwd "("; n = parse_expr; 'Kwd ")" >] -> n |
| 34 | * | [< n1 = parse_expr; n2 = parse_remainder n1 >] -> n2 |
| 35 | * and parse_remainder n1 = parser |
| 36 | * [< 'Kwd "+"; n2 = parse_expr >] -> n1+n2 |
| 37 | * | ... |
| 38 | * type token = |
| 39 | * | Kwd of string |
| 40 | * | Ident of string |
| 41 | * | Int of int |
| 42 | * | Float of float |
| 43 | * | String of string |
| 44 | * | Char of char |
| 45 | * |
| 46 | * |
| 47 | * Cf also ocaml manual |
| 48 | * let rec parse_expr = parser |
| 49 | * [< e1 = parse_mult; e = parse_more_adds e1 >] -> e |
| 50 | * and parse_more_adds e1 = parser |
| 51 | * [< 'Kwd "+"; e2 = parse_mult; e = parse_more_adds (Sum(e1, e2)) >] -> e |
| 52 | * | [< 'Kwd "-"; e2 = parse_mult; e = parse_more_adds (Diff(e1, e2)) >] -> e |
| 53 | * | [< >] -> e1 |
| 54 | * and parse_mult = parser |
| 55 | * [< e1 = parse_simple; e = parse_more_mults e1 >] -> e |
| 56 | * and parse_more_mults e1 = parser |
| 57 | * [< 'Kwd "*"; e2 = parse_simple; e = parse_more_mults (Prod(e1, e2)) >] -> e |
| 58 | * | [< 'Kwd "/"; e2 = parse_simple; e = parse_more_mults (Quot(e1, e2)) >] -> e |
| 59 | * | [< >] -> e1 |
| 60 | * and parse_simple = parser |
| 61 | * [< 'Ident s >] -> Var s |
| 62 | * | [< 'Int i >] -> Const(float i) |
| 63 | * | [< 'Float f >] -> Const f |
| 64 | * | [< 'Kwd "("; e = parse_expr; 'Kwd ")" >] -> e;; |
| 65 | * But see how they are forced to use a LL(1) grammar which denatures the |
| 66 | * grammar "parse_more_xxx" |
| 67 | * |
| 68 | *) |
| 69 | |
| 70 | (*****************************************************************************) |
| 71 | (* Parser Combinators *) |
| 72 | (*****************************************************************************) |
| 73 | |
| 74 | (* src: Jon Harrop. pad: a few bugfix *) |
| 75 | |
| 76 | type ('a, 'b) genp = 'a list -> 'b * 'a list |
| 77 | let val_of_parser = fst |
| 78 | |
| 79 | (* lexer = parser of char list *) |
| 80 | (* type 'a lexer = (char, 'a) genp *) |
| 81 | |
| 82 | (* grammer = parser ot tokens *) |
| 83 | (* type 'a p = (token, 'a) genp *) |
| 84 | |
| 85 | |
| 86 | (* pad: could also do it by returning a Maybe and use monad *) |
| 87 | let ( ||| ) p1 p2 s = |
| 88 | try |
| 89 | p1 s |
| 90 | with Not_found -> |
| 91 | p2 s |
| 92 | |
| 93 | let ( +++ ) p1 p2 s = |
| 94 | let e1, s = p1 s in |
| 95 | let e2, s = p2 s in |
| 96 | (e1, e2), s |
| 97 | |
| 98 | let rec many p s = |
| 99 | try |
| 100 | let e, s = p s in |
| 101 | let es, s = many p s in |
| 102 | e::es, s |
| 103 | with Not_found -> |
| 104 | [], s |
| 105 | |
| 106 | |
| 107 | let ( >| ) p k i = |
| 108 | let e, s = p i in |
| 109 | k e, s |
| 110 | |
| 111 | (* was called 'some', but confusing *) |
| 112 | let pred p = function |
| 113 | | h::t when p h -> h, t |
| 114 | | _ -> raise Not_found |
| 115 | |
| 116 | let a x = pred (( = ) x) |
| 117 | |
| 118 | let several p = many (pred p) |
| 119 | |
| 120 | |
| 121 | module Abstr : sig |
| 122 | type t |
| 123 | val x : t |
| 124 | end = struct |
| 125 | type t = int |
| 126 | let x = 0 |
| 127 | end |
| 128 | |
| 129 | let fin = function |
| 130 | | [] as t -> Abstr.x, t |
| 131 | | _ -> raise Not_found |
| 132 | |
| 133 | |
| 134 | (*****************************************************************************) |
| 135 | (* Lexing *) |
| 136 | (*****************************************************************************) |
| 137 | (* a generic lexer *) |
| 138 | |
| 139 | let digit = function |
| 140 | | '0'..'9' -> true |
| 141 | | _ -> false |
| 142 | |
| 143 | let alpha = function |
| 144 | | 'a'..'z' | 'A'..'Z' -> true |
| 145 | | _ -> false |
| 146 | |
| 147 | |
| 148 | let symbol = function |
| 149 | | '(' | ')' |
| 150 | | '{' | '}' |
| 151 | | '[' | ']' |
| 152 | | '<' | '>' |
| 153 | | '+' | '-' | '*' | '/' |
| 154 | | '&' | '|' | '!' |
| 155 | |
| 156 | | '=' | '~' | '@' |
| 157 | -> true |
| 158 | | _ -> false |
| 159 | |
| 160 | let space = function |
| 161 | | ' ' | '\t' | '\n' -> true |
| 162 | | _ -> false |
| 163 | |
| 164 | let stringquote = function |
| 165 | | '"' -> true |
| 166 | | _ -> false |
| 167 | |
| 168 | let quote = function |
| 169 | | '\'' -> true |
| 170 | | _ -> false |
| 171 | |
| 172 | |
| 173 | let alphanum c = digit c || alpha c |
| 174 | |
| 175 | |
| 176 | let alphanum_underscore c = digit c || alpha c || (c = '_') |
| 177 | let alphanum_minus c = digit c || alpha c || (c = '-') |
| 178 | let alphanum_under_minus c = digit c || alpha c || (c = '-') || (c = '_') |
| 179 | |
| 180 | |
| 181 | |
| 182 | let (+>) o f = f o |
| 183 | let string_of_chars cs = |
| 184 | cs +> List.map (String.make 1) +> String.concat "" |
| 185 | |
| 186 | |
| 187 | let collect(h, t) = |
| 188 | String.concat "" (List.map (String.make 1) (h::t)) |
| 189 | |
| 190 | let collectbis(xs) = |
| 191 | String.concat "" (List.map (String.make 1) (xs)) |
| 192 | |
| 193 | let list_of_string string = |
| 194 | let list = ref [] in |
| 195 | String.iter (fun c -> list := c :: !list) string; |
| 196 | List.rev !list |
| 197 | |
| 198 | |
| 199 | (*****************************************************************************) |
| 200 | (* still generic *) |
| 201 | (*****************************************************************************) |
| 202 | |
| 203 | type token = |
| 204 | | IDENT of string |
| 205 | | KWD of string |
| 206 | | INT of string |
| 207 | | SYM of string |
| 208 | | STR of string |
| 209 | |
| 210 | let string_of_token = function |
| 211 | | IDENT string -> "IDENT:" ^ string |
| 212 | | KWD string -> "KWD:" ^ string |
| 213 | | INT string -> "INT:" ^ string |
| 214 | | SYM string -> "SYM:" ^ string |
| 215 | | STR string -> "STR:" ^ string |
| 216 | |
| 217 | |
| 218 | type lexer = (char, token) genp |
| 219 | |
| 220 | |
| 221 | let rawnumber = |
| 222 | pred digit +++ several digit >| fun x -> INT(collect x) |
| 223 | let rawident = |
| 224 | pred alpha +++ several alphanum >| fun x -> IDENT(collect x) |
| 225 | let rawsymbol = |
| 226 | pred symbol +++ several symbol >| fun x -> SYM(collect x) |
| 227 | |
| 228 | let rawkeyword = |
| 229 | let p c = not(space c) && not(digit c) in |
| 230 | pred p +++ several p >| fun x -> KWD(collect x) |
| 231 | |
| 232 | |
| 233 | (* todo: handle antislash *) |
| 234 | let rawstring = |
| 235 | pred stringquote +++ |
| 236 | several (fun c -> not (stringquote c)) +++ |
| 237 | pred stringquote |
| 238 | >| (fun ((c1, cs), c3) -> |
| 239 | let s = string_of_chars cs in |
| 240 | STR s (* exclude the marker *) |
| 241 | ) |
| 242 | |
| 243 | |
| 244 | let lex_gen tokenf str = |
| 245 | let alltoks = (many tokenf) +++ fin >| fst in |
| 246 | val_of_parser (alltoks (list_of_string str)) |
| 247 | |
| 248 | let parse_gen tokenf grammarf p string = |
| 249 | val_of_parser (grammarf (lex_gen tokenf string)) |
| 250 | |
| 251 | (*****************************************************************************) |
| 252 | (* not generic anymore *) |
| 253 | (*****************************************************************************) |
| 254 | (* the order is important if some "rules" overlap, as in ocamllex *) |
| 255 | let token = |
| 256 | (rawident ||| rawnumber ||| rawkeyword) +++ several space >| fst |
| 257 | |
| 258 | (* pad: bugfix: was not defined in jon harrop article *) |
| 259 | let tokens = many token |
| 260 | |
| 261 | let alltokens = |
| 262 | tokens +++ fin >| fst |
| 263 | |
| 264 | let lex (string : string) = |
| 265 | val_of_parser (alltokens (list_of_string string)) |
| 266 | |
| 267 | |
| 268 | |
| 269 | let test1 () = |
| 270 | Common.example |
| 271 | (lex "a x^2 + b x + c" |
| 272 | = |
| 273 | [IDENT "a"; IDENT "x"; KWD "^"; INT "2"; KWD "+"; IDENT "b"; IDENT "x"; |
| 274 | KWD "+"; IDENT "c"] |
| 275 | ) |
| 276 | |
| 277 | (*****************************************************************************) |
| 278 | (* Parsing *) |
| 279 | (*****************************************************************************) |
| 280 | |
| 281 | type expr = |
| 282 | | Int of int |
| 283 | | Var of string |
| 284 | | Add of expr * expr |
| 285 | | Mul of expr * expr |
| 286 | |
| 287 | type 'a pparser = (token, 'a) genp |
| 288 | |
| 289 | (* |
| 290 | open Format;; |
| 291 | # let rec print_expr ff = function |
| 292 | | Int n -> fprintf ff "%d" n |
| 293 | | Var x -> fprintf ff "%s" x |
| 294 | | Add(f, g) -> |
| 295 | fprintf ff "%a + %a" print_expr f print_expr g |
| 296 | | Mul(f, g) -> |
| 297 | fprintf ff "%a %a" print_mul f print_mul g |
| 298 | and print_mul ff = function |
| 299 | | Add _ as e -> fprintf ff "(%a)" print_expr e |
| 300 | | e -> fprintf ff "%a" print_expr e |
| 301 | #install_printer print_expr |
| 302 | *) |
| 303 | |
| 304 | let ident = function |
| 305 | | IDENT x :: t -> x, t |
| 306 | | _ -> raise Not_found |
| 307 | |
| 308 | let int = function |
| 309 | | INT n :: t -> n, t |
| 310 | | _ -> raise Not_found |
| 311 | |
| 312 | let string = function |
| 313 | | STR x :: t -> x, t |
| 314 | | _ -> raise Not_found |
| 315 | |
| 316 | (* src: Jon Harrop |
| 317 | * "This style of parsing, known as recursive descent parsing , has one |
| 318 | * important caveat. If a rule tries to match itself immediately, even if |
| 319 | * that is succeeded by other parsers, then the resulting program will go |
| 320 | * into an infinite loops with the parser for that rule calling itself |
| 321 | * indefinitely until a stack overflow occurs. Consequently, our |
| 322 | * implementation of the factor parser is careful to parse an atom first, |
| 323 | * and term calls factor first, to avoid this problem." |
| 324 | * |
| 325 | * pad: bugfix, added the KWD "*". |
| 326 | *) |
| 327 | |
| 328 | (* pad: I think I remembered you cant eta-factorize the parameter |
| 329 | * when you use mutually recursive |
| 330 | *) |
| 331 | let rec atom s = |
| 332 | ( |
| 333 | (int >| fun n -> Int(int_of_string n)) |
| 334 | ||| |
| 335 | (ident >| fun x -> Var x) |
| 336 | ||| |
| 337 | (a (KWD "(") +++ term +++ a (KWD ")") >| fun ((_, e), _) -> e) |
| 338 | ) s |
| 339 | and factor s = |
| 340 | ( |
| 341 | (atom +++ a (KWD "*") +++ factor >| fun ((f, _), g) -> Mul (f,g)) |
| 342 | ||| |
| 343 | atom |
| 344 | ) s |
| 345 | and term s = |
| 346 | ( |
| 347 | (factor +++ a (KWD "+") +++ term >| fun ((f, _), g) -> Add (f,g)) |
| 348 | ||| |
| 349 | factor |
| 350 | ) s |
| 351 | |
| 352 | |
| 353 | let expr = |
| 354 | term +++ fin >| fst |
| 355 | |
| 356 | let parse p string = |
| 357 | val_of_parser(p(lex string)) |
| 358 | |
| 359 | (* |
| 360 | parse expr "a x x + b x + c" |
| 361 | *) |
| 362 | |
| 363 | |
| 364 | (*****************************************************************************) |
| 365 | |
| 366 | module Infix = struct |
| 367 | let (|||) = (|||) |
| 368 | let (+++) = (+++) |
| 369 | let (>|) = (>|) |
| 370 | end |