bundles/menhirLib/menhir-20120123/src/lexer.mll

   1 (**************************************************************************)
   2 (*                                                                        *)
   3 (*  Menhir                                                                *)
   4 (*                                                                        *)
   5 (*  François Pottier, INRIA Rocquencourt                                  *)
   6 (*  Yann Régis-Gianas, PPS, Université Paris Diderot                      *)
   7 (*                                                                        *)
   8 (*  Copyright 2005-2008 Institut National de Recherche en Informatique    *)
   9 (*  et en Automatique. All rights reserved. This file is distributed      *)
  10 (*  under the terms of the Q Public License version 1.0, with the change  *)
  11 (*  described in file LICENSE.                                            *)
  12 (*                                                                        *)
  13 (**************************************************************************)
  14
  15 {
  16
  17   open Lexing
  18   open Parser
  19   open Positions
  20
  21   (* This wrapper saves the current lexeme start, invokes its argument,
  22      and restores it. This allows transmitting better positions to the
  23      parser. *)
  24
  25   let savestart lexbuf f =
  26     let startp = lexbuf.lex_start_p in
  27     let token = f lexbuf in
  28     lexbuf.lex_start_p <- startp;
  29     token
  30
  31   (* Updates the line counter, which is used in some error messages. *)
  32
  33   let update_loc lexbuf =
  34     let pos = lexbuf.lex_curr_p in
  35     lexbuf.lex_curr_p <- { pos with
  36       pos_lnum = pos.pos_lnum + 1;
  37       pos_bol = pos.pos_cnum;
  38     }
  39
  40   (* Extracts a chunk out of the source file. *)
  41
  42   let chunk ofs1 ofs2 =
  43     let contents = Error.get_file_contents() in
  44     let len = ofs2 - ofs1 in
  45     String.sub contents ofs1 len
  46
  47   (* Extracts a chunk out of the source file, delimited by
  48      one position and extending to the end of the file. *)
  49
  50   let echunk ofs1 =
  51     let contents = Error.get_file_contents() in
  52     let len = String.length contents - ofs1 in
  53     String.sub contents ofs1 len
  54
  55   (* Overwrites an old character with a new one at a specified
  56      offset in a string. *)
  57
  58   let overwrite content offset c1 c2 =
  59     assert (content.[offset] = c1);
  60     content.[offset] <- c2
  61
  62   (* Creates a stretch. *)
  63
  64   let mk_stretch parenthesize pos1 pos2 pkeywords =
  65     let ofs1 = pos1.pos_cnum
  66     and ofs2 = pos2.pos_cnum in
  67     let raw_content = chunk ofs1 ofs2 in
  68     let content = String.copy raw_content in
  69     (* Turn our keywords into valid Objective Caml identifiers
  70        by replacing '$', '(', and ')' with '_'. Bloody. *)
  71     List.iter (function { value = keyword; position = pos } ->
  72       let pos = start_of_position pos in
  73       let ofs = pos.pos_cnum - ofs1 in
  74       overwrite content ofs '$' '_';
  75       match keyword with
  76       | Keyword.Dollar _
  77       | Keyword.Position (Keyword.Left, _, _)
  78       | Keyword.PreviousError ->
  79           ()
  80       | Keyword.SyntaxError ->
  81           (* $syntaxerror is replaced with
  82              (raise _eRR) *)
  83           let source = "(raise _eRR)" in
  84           String.blit source 0 content ofs (String.length source)
  85       | Keyword.Position (subject, where, _) ->
  86           let ofslpar =
  87             match where with
  88             | Keyword.WhereStart ->
  89                 ofs + 9
  90             | Keyword.WhereEnd ->
  91                 ofs + 7
  92           in
  93           overwrite content ofslpar '(' '_';
  94           match subject with
  95           | Keyword.Left ->
  96               assert false
  97           | Keyword.RightDollar i ->
  98               overwrite content (ofslpar + 1) '$' '_';
  99               overwrite content (ofslpar + 2 + String.length (string_of_int i)) ')' '_'
 100           | Keyword.RightNamed id ->
 101               overwrite content (ofslpar + 1 + String.length id) ')' '_'
 102     ) pkeywords;
 103     (* Add whitespace so that the column numbers match those of the source file.
 104        If requested, add parentheses so that the semantic action can be inserted
 105        into other code without ambiguity. *)
 106     let content =
 107       if parenthesize then
 108           (String.make (pos1.pos_cnum - pos1.pos_bol - 1) ' ') ^ "(" ^ content ^ ")"
 109       else
 110         (String.make (pos1.pos_cnum - pos1.pos_bol) ' ') ^ content
 111     in
 112     {
 113       Stretch.stretch_filename = Error.get_filename();
 114       Stretch.stretch_linenum = pos1.pos_lnum;
 115       Stretch.stretch_linecount = pos2.pos_lnum - pos1.pos_lnum;
 116       Stretch.stretch_content = content;
 117       Stretch.stretch_raw_content = raw_content;
 118       Stretch.stretch_keywords = pkeywords
 119     }
 120
 121   (* Translates the family of position-related keywords to abstract
 122      syntax. *)
 123
 124   let mk_keyword lexbuf w f n id =
 125     let where =
 126       match w with
 127       | Some _ ->
 128           Keyword.WhereStart
 129       | None ->
 130           Keyword.WhereEnd
 131     and flavor =
 132       match f with
 133       | Some _ ->
 134           Keyword.FlavorPosition
 135       | None ->
 136           Keyword.FlavorOffset
 137     and subject =
 138       match n, id with
 139       | Some n, None ->
 140           Keyword.RightDollar (int_of_string n)
 141       | None, Some id ->
 142           Keyword.RightNamed id
 143       | None, None ->
 144           Keyword.Left
 145       | Some _, Some _ ->
 146           assert false
 147     in
 148     let keyword = Keyword.Position (subject, where, flavor) in
 149     with_cpos lexbuf keyword
 150
 151   (* Objective Caml's reserved words. *)
 152
 153   let reserved =
 154     let table = Hashtbl.create 149 in
 155     List.iter (fun word -> Hashtbl.add table word ()) [
 156       "and";
 157       "as";
 158       "assert";
 159       "begin";
 160       "class";
 161       "constraint";
 162       "do";
 163       "done";
 164       "downto";
 165       "else";
 166       "end";
 167       "exception";
 168       "external";
 169       "false";
 170       "for";
 171       "fun";
 172       "function";
 173       "functor";
 174       "if";
 175       "in";
 176       "include";
 177       "inherit";
 178       "initializer";
 179       "lazy";
 180       "let";
 181       "match";
 182       "method";
 183       "module";
 184       "mutable";
 185       "new";
 186       "object";
 187       "of";
 188       "open";
 189       "or";
 190       "parser";
 191       "private";
 192       "rec";
 193       "sig";
 194       "struct";
 195       "then";
 196       "to";
 197       "true";
 198       "try";
 199       "type";
 200       "val";
 201       "virtual";
 202       "when";
 203       "while";
 204       "with";
 205       "mod";
 206       "land";
 207       "lor";
 208       "lxor";
 209       "lsl";
 210       "lsr";
 211       "asr";
 212     ];
 213     table
 214
 215   (* A short-hand. *)
 216
 217   let error1 pos msg =
 218     Error.error (Positions.one pos) msg
 219
 220 }
 221
 222 let newline = ('\010' | '\013' | "\013\010")
 223
 224 let whitespace = [ ' ' '\t' ';' ]
 225
 226 let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
 227
 228 let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
 229
 230 let identchar = ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '0'-'9'] (* '\'' forbidden *)
 231
 232 let poskeyword =
 233   '$'
 234   (("start" as w) | "end")
 235   (("pos" as f) | "ofs")
 236   ( '(' ( '$' (['0'-'9']+ as n) | ((lowercase identchar*) as id)) ')')?
 237
 238 let previouserror =
 239   "$previouserror"
 240
 241 let syntaxerror =
 242   "$syntaxerror"
 243
 244 rule main = parse
 245 | "%token"
 246     { TOKEN }
 247 | "%type"
 248     { TYPE }
 249 | "%left"
 250     { LEFT }
 251 | "%right"
 252     { RIGHT }
 253 | "%nonassoc"
 254     { NONASSOC }
 255 | "%start"
 256     { START }
 257 | "%prec"
 258     { PREC }
 259 | "%public"
 260     { PUBLIC }
 261 | "%parameter"
 262     { PARAMETER }
 263 | "%inline"
 264     { INLINE }
 265 | "%%"
 266     { let ofs = lexeme_end lexbuf in
 267       PERCENTPERCENT (lazy (echunk ofs)) }
 268 | ":"
 269     { COLON }
 270 | ","
 271     { COMMA }
 272 | "="
 273     { EQUAL }
 274 | "("
 275     { LPAREN }
 276 | ")"
 277     { RPAREN }
 278 | "|"
 279     { BAR }
 280 | "?"
 281     { QUESTION }
 282 | "*"
 283     { STAR }
 284 | "+"
 285     { PLUS }
 286 | (lowercase identchar *) as id
 287     { if Hashtbl.mem reserved id then
 288         Error.errorp
 289           (Positions.with_poss (lexeme_start_p lexbuf) (lexeme_end_p lexbuf) ())
 290           "this is an Objective Caml reserved word."
 291       else
 292         LID (with_pos (cpos lexbuf) id)
 293     }
 294 | (uppercase identchar *) as id
 295     { UID (with_pos (cpos lexbuf) id) }
 296 | "//" [^ '\010' '\013']* newline (* skip C++ style comment *)
 297 | newline
 298     { update_loc lexbuf; main lexbuf }
 299 | whitespace+
 300     { main lexbuf }
 301 | "/*"
 302     { comment (lexeme_start_p lexbuf) lexbuf; main lexbuf }
 303 | "(*"
 304     { ocamlcomment (lexeme_start_p lexbuf) lexbuf; main lexbuf }
 305 | "<"
 306     { savestart lexbuf (ocamltype (lexeme_end_p lexbuf)) }
 307 | "%{"
 308     { savestart lexbuf (fun lexbuf ->
 309         let openingpos = lexeme_end_p lexbuf in
 310         let closingpos, _ = action true openingpos [] lexbuf in
 311         (* TEMPORARY if keyword list nonempty, issue an error *)
 312         HEADER (mk_stretch false openingpos closingpos [])
 313       ) }
 314 | "{"
 315     { savestart lexbuf (fun lexbuf ->
 316         let openingpos = lexeme_end_p lexbuf in
 317         let closingpos, pkeywords = action false openingpos [] lexbuf in
 318         let stretch = mk_stretch true openingpos closingpos pkeywords in
 319         ACTION (Action.from_stretch stretch)
 320       ) }
 321 (* TEMPORARY comprendre si la différence entre header et action est bien
 322              justifiée et si certains choix comme le parenthésage et le
 323              traitement des keywords ne pourraient pas être effectués
 324              plus loin. *)
 325 | eof
 326     { EOF }
 327 | _
 328     { error1 (lexeme_start_p lexbuf) "unexpected character(s)." }
 329
 330 (* Skip C style comments. *)
 331
 332 and comment openingpos = parse
 333 | newline
 334     { update_loc lexbuf; comment openingpos lexbuf }
 335 | "*/"
 336     { () }
 337 | eof
 338     { error1 openingpos "unterminated comment." }
 339 | _
 340     { comment openingpos lexbuf }
 341
 342 (* Collect an O'Caml type delimited by angle brackets. Angle brackets can
 343    appear as part of O'Caml function types. They might also appear as part
 344    of O'Caml variant types, but we ignore that possibility for the moment. *)
 345
 346 and ocamltype openingpos = parse
 347 | "->"
 348     { ocamltype openingpos lexbuf }
 349 | '>'
 350     { OCAMLTYPE (Stretch.Declared (mk_stretch true openingpos (lexeme_start_p lexbuf) [])) }
 351 | "(*"
 352     { ocamlcomment (lexeme_start_p lexbuf) lexbuf; ocamltype openingpos lexbuf }
 353 | newline
 354     { update_loc lexbuf; ocamltype openingpos lexbuf }
 355 | eof
 356     { error1 openingpos "unterminated Objective Caml type." }
 357 | _
 358     { ocamltype openingpos lexbuf }
 359
 360 (* Collect O'Caml code delimited by curly brackets. Any occurrences of
 361    the special ``$i'' identifiers are recorded in the accumulating
 362    parameter [pkeywords]. Nested curly brackets must be properly
 363    counted. Nested parentheses are also kept track of, so as to better
 364    report errors when they are not balanced. *)
 365
 366 and action percent openingpos pkeywords = parse
 367 | '{'
 368     { let _, pkeywords = action false (lexeme_end_p lexbuf) pkeywords lexbuf in
 369       action percent openingpos pkeywords lexbuf }
 370 | ("}" | "%}") as delimiter
 371     { match percent, delimiter with
 372       | true, "%}"
 373       | false, "}" ->
 374           (* This is the delimiter we were instructed to look for. *)
 375           lexeme_start_p lexbuf, pkeywords
 376       | _, _ ->
 377           (* This is not it. *)
 378           error1 openingpos "unbalanced opening brace."
 379     }
 380 | '('
 381     { let _, pkeywords = parentheses (lexeme_end_p lexbuf) pkeywords lexbuf in
 382       action percent openingpos pkeywords lexbuf }
 383 | '$' (['0'-'9']+ as n)
 384     { let pkeyword = with_cpos lexbuf (Keyword.Dollar (int_of_string n)) in
 385       action percent openingpos (pkeyword :: pkeywords) lexbuf }
 386 | poskeyword
 387     { let pkeyword = mk_keyword lexbuf w f n id in
 388       action percent openingpos (pkeyword :: pkeywords) lexbuf }
 389 | previouserror
 390     { let pkeyword = with_cpos lexbuf Keyword.PreviousError in
 391       action percent openingpos (pkeyword :: pkeywords) lexbuf }
 392 | syntaxerror
 393     { let pkeyword = with_cpos lexbuf Keyword.SyntaxError in
 394       action percent openingpos (pkeyword :: pkeywords) lexbuf }
 395 | '"'
 396     { string (lexeme_start_p lexbuf) lexbuf;
 397       action percent openingpos pkeywords lexbuf }
 398 | "'"
 399     { char lexbuf;
 400       action percent openingpos pkeywords lexbuf }
 401 | "(*"
 402     { ocamlcomment (lexeme_start_p lexbuf) lexbuf;
 403       action percent openingpos pkeywords lexbuf }
 404 | newline
 405     { update_loc lexbuf;
 406       action percent openingpos pkeywords lexbuf }
 407 | ')'
 408 | eof
 409     { error1 openingpos "unbalanced opening brace." }
 410 | _
 411     { action percent openingpos pkeywords lexbuf }
 412
 413 and parentheses openingpos pkeywords = parse
 414 | '('
 415     { let _, pkeywords = parentheses (lexeme_end_p lexbuf) pkeywords lexbuf in
 416       parentheses openingpos pkeywords lexbuf }
 417 | ')'
 418     { lexeme_start_p lexbuf, pkeywords }
 419 | '{'
 420     { let _, pkeywords = action false (lexeme_end_p lexbuf) pkeywords lexbuf in
 421       parentheses openingpos pkeywords lexbuf }
 422 | '$' (['0'-'9']+ as n)
 423     { let pkeyword = with_cpos lexbuf (Keyword.Dollar (int_of_string n)) in
 424       parentheses openingpos (pkeyword :: pkeywords) lexbuf }
 425 | poskeyword
 426     { let pkeyword = mk_keyword lexbuf w f n id in
 427       parentheses openingpos (pkeyword :: pkeywords) lexbuf }
 428 | previouserror
 429     { let pkeyword = with_cpos lexbuf Keyword.PreviousError in
 430       parentheses openingpos (pkeyword :: pkeywords) lexbuf }
 431 | syntaxerror
 432     { let pkeyword = with_cpos lexbuf Keyword.SyntaxError in
 433       parentheses openingpos (pkeyword :: pkeywords) lexbuf }
 434 | '"'
 435     { string (lexeme_start_p lexbuf) lexbuf; parentheses openingpos pkeywords lexbuf }
 436 | "'"
 437     { char lexbuf; parentheses openingpos pkeywords lexbuf }
 438 | "(*"
 439     { ocamlcomment (lexeme_start_p lexbuf) lexbuf; parentheses openingpos pkeywords lexbuf }
 440 | newline
 441     { update_loc lexbuf; parentheses openingpos pkeywords lexbuf }
 442 | '}'
 443 | eof
 444     { error1 openingpos "unbalanced opening parenthesis." }
 445 | _
 446     { parentheses openingpos pkeywords lexbuf }
 447
 448 (* Skip O'Caml comments. Comments can be nested and can contain
 449    strings or characters, which must be correctly analyzed. (A string
 450    could contain begin-of-comment or end-of-comment sequences, which
 451    must be ignored; a character could contain a begin-of-string
 452    sequence.) *)
 453
 454 and ocamlcomment openingpos = parse
 455 | "*)"
 456     { () }
 457 | "(*"
 458     { ocamlcomment (lexeme_start_p lexbuf) lexbuf; ocamlcomment openingpos lexbuf }
 459 | '"'
 460     { string (lexeme_start_p lexbuf) lexbuf; ocamlcomment openingpos lexbuf }
 461 | "'"
 462     { char lexbuf; ocamlcomment openingpos lexbuf }
 463 | newline
 464     { update_loc lexbuf; ocamlcomment openingpos lexbuf }
 465 | eof
 466     { error1 openingpos "unterminated Objective Caml comment." }
 467 | _
 468     { ocamlcomment openingpos lexbuf }
 469
 470 (* Skip O'Caml strings. *)
 471
 472 and string openingpos = parse
 473 | '"'
 474    { () }
 475 | '\\' newline
 476 | newline
 477    { update_loc lexbuf; string openingpos lexbuf }
 478 | '\\' _
 479    (* Upon finding a backslash, skip the character that follows,
 480       unless it is a newline. Pretty crude, but should work. *)
 481    { string openingpos lexbuf }
 482 | eof
 483    { error1 openingpos "unterminated Objective Caml string." }
 484 | _
 485    { string openingpos lexbuf }
 486
 487 (* Skip O'Caml characters. A lone quote character is legal inside
 488    a comment, so if we don't recognize the matching closing quote,
 489    we simply abandon. *)
 490
 491 and char = parse
 492 | '\\'? newline "'"
 493    { update_loc lexbuf }
 494 | [^ '\\' '\''] "'"
 495 | '\\' _ "'"
 496 | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
 497 | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "'"
 498 | ""
 499    { () }
 500