--- /dev/null
+(*****************************************************************************)
+(* *)
+(*****************************************************************************)
+(* src: Jon Harrop.
+ *
+ * "Certain applications are extremely well suited to functional
+ * programming and parsing is one of them. Specifically, the ability to
+ * write functional combinators that allow parsers for everything from
+ * integers up to symbolic expressions to be composed is more general
+ * and provides more opportunity for code reuse than the use of
+ * conventional parser generators such as ocamllex and ocamlyacc. This
+ * article explains how parser combinators may be designed and
+ * implemented in OCaml, using the standard example of a calculator."
+ *
+ * Based on haskell articles I guess like meijer functional pearl or
+ * graham hutton articles. Also maybe based on haskell parsec.
+ *
+ * pad: a few bugfix. I also put more restrictive and descriptive types.
+ * pad: I remember having coded such a library, maybe not in ocaml.
+ * Or maybe it was during a "TP compilation" at INSA ? I remember having
+ * a generic lexer. Or maybe it was genlex ?
+ *
+ *
+ *
+ *
+ * alternatives: genlex + parser extension of ocaml (streams).
+ * cf genlex doc:
+ *
+ * Example: a lexer suitable for a desk calculator is obtained by
+ * let lexer = make_lexer ["+";"-";"*";"/";"let";"="; "("; ")"]
+ * let parse_expr = parser
+ * [< 'Int n >] -> n
+ * | [< 'Kwd "("; n = parse_expr; 'Kwd ")" >] -> n
+ * | [< n1 = parse_expr; n2 = parse_remainder n1 >] -> n2
+ * and parse_remainder n1 = parser
+ * [< 'Kwd "+"; n2 = parse_expr >] -> n1+n2
+ * | ...
+ * type token =
+ * | Kwd of string
+ * | Ident of string
+ * | Int of int
+ * | Float of float
+ * | String of string
+ * | Char of char
+ *
+ *
+ * Cf also ocaml manual
+ * let rec parse_expr = parser
+ * [< e1 = parse_mult; e = parse_more_adds e1 >] -> e
+ * and parse_more_adds e1 = parser
+ * [< 'Kwd "+"; e2 = parse_mult; e = parse_more_adds (Sum(e1, e2)) >] -> e
+ * | [< 'Kwd "-"; e2 = parse_mult; e = parse_more_adds (Diff(e1, e2)) >] -> e
+ * | [< >] -> e1
+ * and parse_mult = parser
+ * [< e1 = parse_simple; e = parse_more_mults e1 >] -> e
+ * and parse_more_mults e1 = parser
+ * [< 'Kwd "*"; e2 = parse_simple; e = parse_more_mults (Prod(e1, e2)) >] -> e
+ * | [< 'Kwd "/"; e2 = parse_simple; e = parse_more_mults (Quot(e1, e2)) >] -> e
+ * | [< >] -> e1
+ * and parse_simple = parser
+ * [< 'Ident s >] -> Var s
+ * | [< 'Int i >] -> Const(float i)
+ * | [< 'Float f >] -> Const f
+ * | [< 'Kwd "("; e = parse_expr; 'Kwd ")" >] -> e;;
+ * But see how they are forced to use a LL(1) grammar which denatures the
+ * grammar "parse_more_xxx"
+ *
+ *)
+
+(*****************************************************************************)
+(* Parser Combinators *)
+(*****************************************************************************)
+
+(* src: Jon Harrop. pad: a few bugfix *)
+
+type ('a, 'b) genp = 'a list -> 'b * 'a list
+let val_of_parser = fst
+
+(* lexer = parser of char list *)
+(* type 'a lexer = (char, 'a) genp *)
+
+(* grammer = parser ot tokens *)
+(* type 'a p = (token, 'a) genp *)
+
+
+(* pad: could also do it by returning a Maybe and use monad *)
+let ( ||| ) p1 p2 s =
+ try
+ p1 s
+ with Not_found ->
+ p2 s
+
+let ( +++ ) p1 p2 s =
+ let e1, s = p1 s in
+ let e2, s = p2 s in
+ (e1, e2), s
+
+let rec many p s =
+ try
+ let e, s = p s in
+ let es, s = many p s in
+ e::es, s
+ with Not_found ->
+ [], s
+
+
+let ( >| ) p k i =
+ let e, s = p i in
+ k e, s
+
+(* was called 'some', but confusing *)
+let pred p = function
+ | h::t when p h -> h, t
+ | _ -> raise Not_found
+
+let a x = pred (( = ) x)
+
+let several p = many (pred p)
+
+
+module Abstr : sig
+ type t
+ val x : t
+ end = struct
+ type t = int
+ let x = 0
+ end
+
+let fin = function
+ | [] as t -> Abstr.x, t
+ | _ -> raise Not_found
+
+
+(*****************************************************************************)
+(* Lexing *)
+(*****************************************************************************)
+(* a generic lexer *)
+
+let digit = function
+ | '0'..'9' -> true
+ | _ -> false
+
+let alpha = function
+ | 'a'..'z' | 'A'..'Z' -> true
+ | _ -> false
+
+
+let symbol = function
+ | '(' | ')'
+ | '{' | '}'
+ | '[' | ']'
+ | '<' | '>'
+ | '+' | '-' | '*' | '/'
+ | '&' | '|' | '!'
+
+ | '=' | '~' | '@'
+ -> true
+ | _ -> false
+
+let space = function
+ | ' ' | '\t' | '\n' -> true
+ | _ -> false
+
+let stringquote = function
+ | '"' -> true
+ | _ -> false
+
+let quote = function
+ | '\'' -> true
+ | _ -> false
+
+
+let alphanum c = digit c || alpha c
+
+
+let alphanum_underscore c = digit c || alpha c || (c = '_')
+let alphanum_minus c = digit c || alpha c || (c = '-')
+let alphanum_under_minus c = digit c || alpha c || (c = '-') || (c = '_')
+
+
+
+let (+>) o f = f o
+let string_of_chars cs =
+ cs +> List.map (String.make 1) +> String.concat ""
+
+
+let collect(h, t) =
+ String.concat "" (List.map (String.make 1) (h::t))
+
+let collectbis(xs) =
+ String.concat "" (List.map (String.make 1) (xs))
+
+let list_of_string string =
+ let list = ref [] in
+ String.iter (fun c -> list := c :: !list) string;
+ List.rev !list
+
+
+(*****************************************************************************)
+(* still generic *)
+(*****************************************************************************)
+
+type token =
+ | IDENT of string
+ | KWD of string
+ | INT of string
+ | SYM of string
+ | STR of string
+
+let string_of_token = function
+ | IDENT string -> "IDENT:" ^ string
+ | KWD string -> "KWD:" ^ string
+ | INT string -> "INT:" ^ string
+ | SYM string -> "SYM:" ^ string
+ | STR string -> "STR:" ^ string
+
+
+type lexer = (char, token) genp
+
+
+let rawnumber =
+ pred digit +++ several digit >| fun x -> INT(collect x)
+let rawident =
+ pred alpha +++ several alphanum >| fun x -> IDENT(collect x)
+let rawsymbol =
+ pred symbol +++ several symbol >| fun x -> SYM(collect x)
+
+let rawkeyword =
+ let p c = not(space c) && not(digit c) in
+ pred p +++ several p >| fun x -> KWD(collect x)
+
+
+(* todo: handle antislash *)
+let rawstring =
+ pred stringquote +++
+ several (fun c -> not (stringquote c)) +++
+ pred stringquote
+ >| (fun ((c1, cs), c3) ->
+ let s = string_of_chars cs in
+ STR s (* exclude the marker *)
+ )
+
+
+let lex_gen tokenf str =
+ let alltoks = (many tokenf) +++ fin >| fst in
+ val_of_parser (alltoks (list_of_string str))
+
+let parse_gen tokenf grammarf p string =
+ val_of_parser (grammarf (lex_gen tokenf string))
+
+(*****************************************************************************)
+(* not generic anymore *)
+(*****************************************************************************)
+(* the order is important if some "rules" overlap, as in ocamllex *)
+let token =
+ (rawident ||| rawnumber ||| rawkeyword) +++ several space >| fst
+
+(* pad: bugfix: was not defined in jon harrop article *)
+let tokens = many token
+
+let alltokens =
+ tokens +++ fin >| fst
+
+let lex (string : string) =
+ val_of_parser (alltokens (list_of_string string))
+
+
+
+let test1 () =
+ Common.example
+ (lex "a x^2 + b x + c"
+ =
+ [IDENT "a"; IDENT "x"; KWD "^"; INT "2"; KWD "+"; IDENT "b"; IDENT "x";
+ KWD "+"; IDENT "c"]
+ )
+
+(*****************************************************************************)
+(* Parsing *)
+(*****************************************************************************)
+
+type expr =
+ | Int of int
+ | Var of string
+ | Add of expr * expr
+ | Mul of expr * expr
+
+type 'a pparser = (token, 'a) genp
+
+(*
+open Format;;
+# let rec print_expr ff = function
+ | Int n -> fprintf ff "%d" n
+ | Var x -> fprintf ff "%s" x
+ | Add(f, g) ->
+ fprintf ff "%a + %a" print_expr f print_expr g
+ | Mul(f, g) ->
+ fprintf ff "%a %a" print_mul f print_mul g
+ and print_mul ff = function
+ | Add _ as e -> fprintf ff "(%a)" print_expr e
+ | e -> fprintf ff "%a" print_expr e
+#install_printer print_expr
+*)
+
+let ident = function
+ | IDENT x :: t -> x, t
+ | _ -> raise Not_found
+
+let int = function
+ | INT n :: t -> n, t
+ | _ -> raise Not_found
+
+let string = function
+ | STR x :: t -> x, t
+ | _ -> raise Not_found
+
+(* src: Jon Harrop
+ * "This style of parsing, known as recursive descent parsing , has one
+ * important caveat. If a rule tries to match itself immediately, even if
+ * that is succeeded by other parsers, then the resulting program will go
+ * into an infinite loops with the parser for that rule calling itself
+ * indefinitely until a stack overflow occurs. Consequently, our
+ * implementation of the factor parser is careful to parse an atom first,
+ * and term calls factor first, to avoid this problem."
+ *
+ * pad: bugfix, added the KWD "*".
+ *)
+
+(* pad: I think I remembered you cant eta-factorize the parameter
+ * when you use mutually recursive
+ *)
+let rec atom s =
+ (
+ (int >| fun n -> Int(int_of_string n))
+ |||
+ (ident >| fun x -> Var x)
+ |||
+ (a (KWD "(") +++ term +++ a (KWD ")") >| fun ((_, e), _) -> e)
+ ) s
+and factor s =
+ (
+ (atom +++ a (KWD "*") +++ factor >| fun ((f, _), g) -> Mul (f,g))
+ |||
+ atom
+ ) s
+and term s =
+ (
+ (factor +++ a (KWD "+") +++ term >| fun ((f, _), g) -> Add (f,g))
+ |||
+ factor
+ ) s
+
+
+let expr =
+ term +++ fin >| fst
+
+let parse p string =
+ val_of_parser(p(lex string))
+
+(*
+parse expr "a x x + b x + c"
+*)
+
+
+(*****************************************************************************)
+
+module Infix = struct
+ let (|||) = (|||)
+ let (+++) = (+++)
+ let (>|) = (>|)
+end