parsing_c/lexer_c.mll

   1 {
   2 (* Yoann Padioleau
   3  *
   4  * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License (GPL)
   8  * version 2 as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * file license.txt for more details.
  14  *)
  15 open Common
  16
  17 open Parser_c
  18
  19 open Ast_c (* to factorise tokens, OpAssign, ... *)
  20
  21 (*****************************************************************************)
  22 (*
  23  * subtle: ocamllex uses side effects on lexbuf, so one must take care.
  24  * For instance one must write
  25  *
  26  *  let info = tokinfo lexbuf in
  27  *  TComment (info +> tok_add_s (comment lexbuf))
  28  *
  29  * and not
  30  *
  31  *   TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
  32  *
  33  * because of the "weird" order of evaluation of OCaml.
  34  *
  35  *
  36  *
  37  * note: can't use Lexer_parser._lexer_hint here to do different
  38  * things, because now we call the lexer to get all the tokens
  39  * (tokens_all), and then we parse. So we can't have the _lexer_hint
  40  * info here. We can have it only in parse_c. For the same reason, the
  41  * typedef handling here is now useless.
  42  *)
  43 (*****************************************************************************)
  44
  45 (*****************************************************************************)
  46 (* Wrappers *)
  47 (*****************************************************************************)
  48 let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing
  49
  50 (*****************************************************************************)
  51
  52
  53 exception Lexical of string
  54
  55 let tok     lexbuf  = Lexing.lexeme lexbuf
  56
  57 let tokinfo lexbuf  =
  58   {
  59     pinfo = Ast_c.OriginTok {
  60       Common.charpos = Lexing.lexeme_start lexbuf;
  61       Common.str     = Lexing.lexeme lexbuf;
  62       (* info filled in a post-lexing phase *)
  63       Common.line = -1;
  64       Common.column = -1;
  65       Common.file = "";
  66     };
  67    (* must generate a new ref each time, otherwise share *)
  68     cocci_tag = ref Ast_c.emptyAnnot;
  69     comments_tag = ref Ast_c.emptyComments;
  70   }
  71
  72 (* cppext: must generate a new ref each time, otherwise share *)
  73 let no_ifdef_mark () = ref (None: (int * int) option)
  74
  75 let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
  76
  77
  78 (* opti: less convenient, but using a hash is faster than using a match *)
  79 let keyword_table = Common.hash_of_list [
  80
  81   (* c: *)
  82   "void",   (fun ii -> Tvoid ii);
  83   "char",   (fun ii -> Tchar ii);
  84   "short",  (fun ii -> Tshort ii);
  85   "int",    (fun ii -> Tint ii);
  86   "long",   (fun ii -> Tlong ii);
  87   "float",  (fun ii -> Tfloat ii);
  88   "double", (fun ii -> Tdouble ii);
  89   "size_t", (fun ii -> Tsize_t ii);
  90   "ssize_t", (fun ii -> Tssize_t ii);
  91   "ptrdiff_t", (fun ii -> Tptrdiff_t ii);
  92
  93   "unsigned", (fun ii -> Tunsigned ii);
  94   "signed",   (fun ii -> Tsigned ii);
  95
  96   "auto",     (fun ii -> Tauto ii);
  97   "register", (fun ii -> Tregister ii);
  98   "extern",   (fun ii -> Textern ii);
  99   "static",   (fun ii -> Tstatic ii);
 100
 101   "const",    (fun ii -> Tconst ii);
 102   "volatile", (fun ii -> Tvolatile ii);
 103
 104   "struct",  (fun ii -> Tstruct ii);
 105   "union",   (fun ii -> Tunion ii);
 106   "enum",    (fun ii -> Tenum ii);
 107   "typedef", (fun ii -> Ttypedef ii);
 108
 109   "if",      (fun ii -> Tif ii);
 110   "else",     (fun ii -> Telse ii);
 111   "break",   (fun ii -> Tbreak ii);
 112   "continue", (fun ii -> Tcontinue ii);
 113   "switch",  (fun ii -> Tswitch ii);
 114   "case",     (fun ii -> Tcase ii);
 115   "default", (fun ii -> Tdefault ii);
 116   "for",     (fun ii -> Tfor ii);
 117   "do",      (fun ii -> Tdo ii);
 118   "while",   (fun ii -> Twhile ii);
 119   "return",  (fun ii -> Treturn ii);
 120   "goto",    (fun ii -> Tgoto ii);
 121
 122   "sizeof", (fun ii -> Tsizeof ii);
 123
 124
 125   (* gccext: cppext: linuxext: synonyms *)
 126   "asm",     (fun ii -> Tasm ii);
 127   "__asm__", (fun ii -> Tasm ii);
 128   "__asm",   (fun ii -> Tasm ii);
 129
 130   "inline",     (fun ii -> Tinline ii);
 131   "__inline__", (fun ii -> Tinline ii);
 132   "__inline",   (fun ii -> Tinline ii);
 133
 134   "__attribute__", (fun ii -> Tattribute ii);
 135   "__attribute", (fun ii -> Tattribute ii);
 136
 137   "typeof", (fun ii -> Ttypeof ii);
 138   "__typeof__", (fun ii -> Ttypeof ii);
 139   "__typeof", (fun ii -> Ttypeof ii);
 140
 141         (* found a lot in expanded code *)
 142   "__extension__", (fun ii -> TattributeNoarg ii);
 143
 144
 145   (* gccext: alias *)
 146   "__signed__",     (fun ii -> Tsigned ii);
 147
 148   "__const__",     (fun ii -> Tconst ii);
 149   "__const",     (fun ii -> Tconst ii);
 150
 151   "__volatile__",  (fun ii -> Tvolatile ii);
 152   "__volatile",    (fun ii -> Tvolatile ii);
 153
 154   (* windowsext: *)
 155   "__declspec", (fun ii -> Tattribute ii);
 156
 157   "__stdcall", (fun ii -> TattributeNoarg ii);
 158   "__cdecl", (fun ii -> TattributeNoarg ii);
 159   "WINAPI", (fun ii -> TattributeNoarg ii);
 160   "APIENTRY", (fun ii -> TattributeNoarg ii);
 161   "CALLBACK", (fun ii -> TattributeNoarg ii);
 162
 163   (* c99:  *)
 164   (* no just "restrict" ? maybe for backward compatibility they avoided
 165    * to use restrict which people may have used in their program already
 166    *)
 167   "__restrict",    (fun ii -> Trestrict ii);
 168   "__restrict__",    (fun ii -> Trestrict ii);
 169
 170  ]
 171
 172 let error_radix s =
 173   ("numeric " ^ s ^ " constant contains digits beyond the radix:")
 174
 175 (* julia: functions for figuring out the type of integers *)
 176
 177 let is_long_dec s int uint long ulong =
 178   match !Flag_parsing_c.int_thresholds with
 179     None -> int
 180   | Some (_,_,uint_threshold,long_threshold,ulong_threshold) ->
 181       let bn = Big_int.big_int_of_string s in
 182       if Big_int.ge_big_int bn ulong_threshold
 183       then ulong
 184       else
 185         if Big_int.ge_big_int bn long_threshold
 186         then long
 187         else
 188           if Big_int.ge_big_int bn uint_threshold
 189           then long
 190           else int
 191
 192 let is_long_ho s int uint long ulong drop bpd count =
 193   match !Flag_parsing_c.int_thresholds with
 194     None -> int
 195   | Some (uint_sz,ulong_sz,_,_,_) ->
 196       let len = String.length s in
 197       (* this assumes that all of the hex/oct digits are significant *)
 198       (* drop is 2 for hex (0x) and 1 for oct (0) *)
 199       let s = String.sub s drop (len - drop) in
 200       let len =
 201         ((len-drop) * bpd) -
 202           (count (int_of_string("0x"^(String.sub s 0 1)))) in
 203       if len < uint_sz
 204       then int
 205       else
 206         if len = uint_sz
 207         then uint
 208         else
 209           if len < ulong_sz
 210           then long
 211           else ulong
 212
 213 let is_long_oct s int uint long ulong =
 214   is_long_ho s int uint long ulong 1 3
 215     (* stupid, but probably more efficient than taking logs *)
 216     (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0)
 217 let is_long_hex s int uint long ulong =
 218   is_long_ho s int uint long ulong 2 4
 219     (* stupid, but probably more efficient than taking logs *)
 220     (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1
 221       | _ -> 0)
 222
 223 let sint = (Signed,CInt)
 224 let uint = (UnSigned,CInt)
 225 let slong = (Signed,CLong)
 226 let ulong = (UnSigned,CLong)
 227
 228 }
 229
 230 (*****************************************************************************)
 231 let letter = ['A'-'Z' 'a'-'z' '_']
 232 let extended_letter = ['A'-'Z' 'a'-'z' '_' ':' '<' '>' '~'](*for c++, not used*)
 233 let digit  = ['0'-'9']
 234
 235 (* not used for the moment *)
 236 let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
 237                    ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
 238 let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
 239 let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
 240 (* 7 = \a = bell in C. this is not the only char allowed !!
 241  * ex @ and $ ` are valid too
 242  *)
 243
 244 let cchar = (letter | digit | punctuation | additionnal)
 245
 246 let sp = [' ' '\t']+
 247 let spopt = [' ' '\t']*
 248
 249 let dec = ['0'-'9']
 250 let oct = ['0'-'7']
 251 let hex = ['0'-'9' 'a'-'f' 'A'-'F']
 252
 253 let decimal = ('0' | (['1'-'9'] dec*))
 254 let octal   = ['0']        oct+
 255 let hexa    = ("0x" |"0X") hex+
 256
 257
 258 let pent   = dec+
 259 let pfract = dec+
 260 let sign = ['-' '+']
 261 let exp  = ['e''E'] sign? dec+
 262 let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
 263
 264 let id = letter (letter | digit) *
 265
 266 (*****************************************************************************)
 267 rule token = parse
 268
 269   (* ----------------------------------------------------------------------- *)
 270   (* spacing/comments *)
 271   (* ----------------------------------------------------------------------- *)
 272
 273   (* note: this lexer generate tokens for comments!! so can not give
 274    * this lexer as-is to the parsing function. The caller must preprocess
 275    * it, e.g. by using techniques like cur_tok ref in parse_c.ml.
 276    *
 277    * update: we now also generate a separate token for newlines, so now
 278    * the caller may also have to reagglomerate all those commentspace
 279    * tokens if he was assuming that spaces were agglomerate in a single
 280    * token.
 281    *)
 282
 283   | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
 284       (* starting a new line; the newline character followed by whitespace *)
 285       { TCommentNewline (tokinfo lexbuf) }
 286   | [' ' '\t' '\r' '\011' '\012' ]+
 287       { TCommentSpace (tokinfo lexbuf) }
 288   | "/*"
 289       { let info = tokinfo lexbuf in
 290         let com = comment lexbuf in
 291
 292         let info' = info +> tok_add_s com in
 293         let s = Ast_c.str_of_info info' in
 294         (* could be more flexible, use [\t ]* instead of hardcoded
 295          * single space. *)
 296         match s with
 297         | "/* {{coccinelle:skip_start}} */" ->
 298             TCommentSkipTagStart (info')
 299         | "/* {{coccinelle:skip_end}} */" ->
 300             TCommentSkipTagEnd (info')
 301         | _ -> TComment(info')
 302       }
 303
 304
 305   (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
 306    * So need this here only when dont call cpp before.
 307    * note that we don't keep the trailing \n; it will be in another token.
 308    *)
 309   | "//" [^'\r' '\n' '\011']*    { TComment (tokinfo lexbuf) }
 310
 311   (* ----------------------------------------------------------------------- *)
 312   (* cpp *)
 313   (* ----------------------------------------------------------------------- *)
 314
 315   (* old:
 316    *   | '#'            { endline lexbuf} // should be line, and not endline
 317    *   and endline = parse  | '\n'      { token lexbuf}
 318    *                        |   _       { endline lexbuf}
 319    *)
 320
 321   (* less?:
 322    *  have found a # #else  in "newfile-2.6.c",  legal ?   and also a  #/* ...
 323    *    => just "#" -> token {lexbuf} (that is, ignore it)
 324    *  there is 1 #elif  with nothing after it
 325    *  there is 1 #error with nothing after it
 326    *  there are 2  mov dede, #xxx    which then raise an exn because they
 327    *  are surrounded by #if 0
 328    *  => do as for comments, call a comment_cpp that ends the comment when
 329    *   it meets #endif and raises an exn on any other cpp stuff
 330    *  there are about 10  #if(xxx)  where the ( is directly attached
 331    *  there are some include"" and include<
 332    *  there is 1 ` (behind an #ifndef linux)
 333    *)
 334
 335
 336
 337   (* ---------------------- *)
 338   (* misc *)
 339   (* ---------------------- *)
 340
 341   (* bugfix: I want now to keep comments for the cComment study
 342    * so cant do:    sp [^'\n']+ '\n'
 343    * http://gcc.gnu.org/onlinedocs/gcc/Pragmas.html
 344    *)
 345
 346   | "#" spopt "pragma"  sp  [^'\n']* '\n'
 347   | "#" spopt "ident"   sp  [^'\n']* '\n'
 348   | "#" spopt "line"    sp  [^'\n']* '\n'
 349   | "#" spopt "error"   sp  [^'\n']* '\n'
 350   | "#" spopt "warning" sp  [^'\n']* '\n'
 351   | "#" spopt "abort"   sp  [^'\n']* '\n'
 352       { TCppDirectiveOther (tokinfo lexbuf) }
 353
 354   | "#" [' ' '\t']* '\n'
 355       { TCppDirectiveOther (tokinfo lexbuf) }
 356
 357   (* only after cpp, ex: # 1 "include/linux/module.h" 1 *)
 358   | "#" sp pent sp  '"' [^ '"']* '"' (spopt pent)*  spopt '\n'
 359       { TCppDirectiveOther (tokinfo lexbuf) }
 360
 361
 362
 363   (* ---------------------- *)
 364   (* #define, #undef *)
 365   (* ---------------------- *)
 366
 367   (* the rest of the lexing/parsing of define is done in fix_tokens_define
 368    * where we parse until a TCppEscapedNewline and generate a TDefEol
 369    *)
 370   | "#" [' ' '\t']* "define" { TDefine (tokinfo lexbuf) }
 371
 372   (* note: in some cases can have stuff after the ident as in #undef XXX 50,
 373    * but I currently don't handle it cos I think it's bad code.
 374    *)
 375   | "#" [' ' '\t']* "undef" { TUndef (tokinfo lexbuf) }
 376
 377   (* ---------------------- *)
 378   (* #include *)
 379   (* ---------------------- *)
 380
 381   (* The difference between a local "" and standard <> include is computed
 382    * later in parser_c.mly. So redo a little bit of lexing there; ugly but
 383    * simpler to generate a single token here.  *)
 384   | (("#" [' ''\t']* "include" [' ' '\t']*) as includes)
 385     (('"' ([^ '"']+) '"' |
 386      '<' [^ '>']+ '>' |
 387       ['A'-'Z''_']+
 388     ) as filename)
 389       { let info = tokinfo lexbuf in
 390         TInclude (includes, filename, Ast_c.noInIfdef(), info)
 391       }
 392   (* gccext: found in glibc *)
 393   | (("#" [' ''\t']* "include_next" [' ' '\t']*) as includes)
 394     (('"' ([^ '"']+) '"' |
 395      '<' [^ '>']+ '>' |
 396       ['A'-'Z''_']+
 397     ) as filename)
 398       { let info = tokinfo lexbuf in
 399         TInclude (includes, filename, Ast_c.noInIfdef(), info)
 400       }
 401
 402   (* ---------------------- *)
 403   (* #ifdef *)
 404   (* ---------------------- *)
 405
 406   (* The ifdef_mark will be set later in Parsing_hacks.set_ifdef_parenthize_info
 407    * when working on the ifdef view.
 408    *)
 409
 410   (* '0'+ because sometimes it is a #if 000 *)
 411   | "#" [' ' '\t']* "if" [' ' '\t']* '0'+           (* [^'\n']*  '\n' *)
 412       { let info = tokinfo lexbuf in
 413         TIfdefBool (false, no_ifdef_mark(), info)
 414           (* +> tok_add_s (cpp_eat_until_nl lexbuf)*)
 415       }
 416
 417   | "#" [' ' '\t']* "if" [' ' '\t']* '1'   (* [^'\n']*  '\n' *)
 418       { let info = tokinfo lexbuf in
 419         TIfdefBool (true, no_ifdef_mark(), info)
 420
 421       }
 422
 423  (* DO NOT cherry pick to lexer_cplusplus !!! often used for the extern "C" { *)
 424   | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n']* '\n'
 425       { let info = tokinfo lexbuf in
 426         TIfdefMisc (false, no_ifdef_mark(), info)
 427       }
 428
 429  (* DO NOT cherry pick to lexer_cplusplus !!! *)
 430   | "#" [' ' '\t']* "ifdef" [' ' '\t']* "__cplusplus"   [^'\n']*  '\n'
 431       { let info = tokinfo lexbuf in
 432         TIfdefMisc (false, no_ifdef_mark(), info)
 433       }
 434
 435   (* in glibc *)
 436   | "#" spopt ("ifdef"|"if") sp "__STDC__"
 437       { let info = tokinfo lexbuf in
 438         TIfdefVersion (true, no_ifdef_mark(),
 439                       info +> tok_add_s (cpp_eat_until_nl lexbuf))
 440       }
 441
 442
 443   (* linuxext: different possible variations (we do not manage all of them):
 444
 445     #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
 446     #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,2)
 447     #if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
 448     #if LINUX_VERSION_CODE > KERNEL_VERSION(2,3,0)
 449     #if LINUX_VERSION_CODE < 0x020600
 450     #if LINUX_VERSION_CODE >= 0x2051c
 451     #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 452     #if !(LINUX_VERSION_CODE > KERNEL_VERSION(2,5,73))
 453     #if STREAMER_IOCTL && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 454     #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20)  &&  LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
 455     #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && \
 456     # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
 457     #if LINUX_VERSION_CODE > LinuxVersionCode(2,3,12)
 458     #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,93)
 459     #ifndef LINUX_VERSION_CODE
 460     #if LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,2,0) || \
 461     (LINUX_VERSION_CODE > ASC_LINUX_VERSION(2,3,0) && \
 462     LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,4,0))
 463     #if (KERNEL_VERSION(2,4,0) > LINUX_VERSION_CODE)
 464     #if LINUX_VERSION_CODE >= ASC_LINUX_VERSION(1,3,0)
 465     # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
 466
 467   *)
 468
 469   (* linuxext: must be before the generic rules for if and ifdef *)
 470   | "#" spopt "if" sp "("?  "LINUX_VERSION_CODE" sp (">=" | ">") sp
 471       { let info = tokinfo lexbuf in
 472         TIfdefVersion (true, no_ifdef_mark(),
 473                       info +> tok_add_s (cpp_eat_until_nl lexbuf))
 474       }
 475   (* linuxext: *)
 476   | "#" spopt "if" sp "!" "("?  "LINUX_VERSION_CODE" sp (">=" | ">") sp
 477   | "#" spopt "if" sp ['(']?  "LINUX_VERSION_CODE" sp ("<=" | "<") sp
 478
 479       { let info = tokinfo lexbuf in
 480         TIfdefVersion (false, no_ifdef_mark(),
 481                       info +> tok_add_s (cpp_eat_until_nl lexbuf))
 482       }
 483
 484
 485
 486
 487   (* can have some ifdef 0  hence the letter|digit even at beginning of word *)
 488   | "#" [' ''\t']* "ifdef"  [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
 489       { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
 490   | "#" [' ''\t']* "ifndef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
 491       { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
 492   | "#" [' ''\t']* "if" [' ' '\t']+
 493       { let info = tokinfo lexbuf in
 494         TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
 495       }
 496   | "#" [' ' '\t']* "if" '('
 497       { let info = tokinfo lexbuf in
 498         TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
 499       }
 500
 501   | "#" [' ' '\t']* "elif" [' ' '\t']+
 502       { let info = tokinfo lexbuf in
 503         TIfdefelif (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
 504       }
 505
 506
 507   | "#" [' ''\t']* "endif"  [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
 508       { TEndif (no_ifdef_mark(), tokinfo lexbuf) }
 509   (* bugfix: can have #endif LINUX  but at the same time if I eat everything
 510    * until next line, I may miss some TComment which for some tools
 511    * are important such as aComment
 512    *)
 513   | "#" [' ' '\t']* "endif" (*[^'\n']* '\n'*) {
 514       TEndif     (no_ifdef_mark(), tokinfo lexbuf)
 515     }
 516   (* can be at eof *)
 517   (*| "#" [' ' '\t']* "endif"                { TEndif     (tokinfo lexbuf) }*)
 518
 519   | "#" [' ' '\t']* "else" [' ' '\t' '\n']
 520       { TIfdefelse (no_ifdef_mark(), tokinfo lexbuf) }
 521
 522
 523
 524
 525   (* ---------------------- *)
 526   (* #define body *)
 527   (* ---------------------- *)
 528
 529   (* only in cpp directives normally *)
 530   | "\\" '\n' { TCppEscapedNewline (tokinfo lexbuf) }
 531
 532   (* We must generate separate tokens for #, ## and extend the grammar.
 533    * Note there can be "elaborated" idents in many different places, in
 534    * expression but also in declaration, in function name. So having 3 tokens
 535    * for an ident does not work well with how we add info in
 536    * ast_c. Was easier to generate just one token, just one info,
 537    * even if have later to reanalyse those tokens and unsplit. But then,
 538    * handling C++ lead to having not just a string for ident but something
 539    * more complex. Also when we want to parse elaborated function headers
 540    * (e.g. void METH(foo)(int x)), we need anyway to go from a string
 541    * to something more. So having also for C something more than just
 542    * string for ident is natural.
 543    *
 544    * todo: our heuristics in parsing_hacks rely on TIdent. So maybe
 545    * an easier solution would be to augment the TIdent type such as
 546    *   TIdent of string * info * cpp_ident_additionnal_info
 547    *
 548    * old:
 549    * |  id   ([' ''\t']* "##" [' ''\t']* id)+
 550    *   { let info = tokinfo lexbuf in
 551    *     TIdent (tok lexbuf, info)
 552    *   }
 553    * |  "##" spopt id
 554    *   { let info = tokinfo lexbuf in
 555    *     TIdent (tok lexbuf, info)
 556    *   }
 557    *
 558    *)
 559   (* cppext: string concatenation of idents, also ##args for variadic macro. *)
 560   | "##" { TCppConcatOp (tokinfo lexbuf) }
 561
 562   (* cppext: stringification.
 563    * bugfix: this case must be after the other cases such as #endif
 564    * otherwise take precedent.
 565    *)
 566   |  "#" spopt id
 567       { let info = tokinfo lexbuf in
 568         TIdent (tok lexbuf, info)
 569       }
 570   (* the ... next to id, e.g. arg..., works with ##, e.g. ##arg *)
 571   | ((id as s)  "...")
 572       { TDefParamVariadic (s, tokinfo lexbuf) }
 573
 574
 575
 576
 577
 578   (* ----------------------------------------------------------------------- *)
 579   (* C symbols *)
 580   (* ----------------------------------------------------------------------- *)
 581    (* stdC:
 582     ...   &&   -=   >=   ~   +   ;   ]
 583     <<=   &=   ->   >>   %   ,   <   ^
 584     >>=   *=   /=   ^=   &   -   =   {
 585     !=    ++   <<   |=   (   .   >   |
 586     %=    +=   <=   ||   )   /   ?   }
 587         --   ==   !    *   :   [
 588     recent addition:    <:  :>  <%  %>
 589     only at processing: %:  %:%: # ##
 590    *)
 591
 592
 593   | '[' { TOCro(tokinfo lexbuf) }   | ']' { TCCro(tokinfo lexbuf) }
 594   | '(' { TOPar(tokinfo lexbuf)   } | ')' { TCPar(tokinfo lexbuf)   }
 595   | '{' { TOBrace(tokinfo lexbuf) } | '}' { TCBrace(tokinfo lexbuf) }
 596
 597   | '+' { TPlus(tokinfo lexbuf) }   | '*' { TMul(tokinfo lexbuf) }
 598   | '-' { TMinus(tokinfo lexbuf) }  | '/' { TDiv(tokinfo lexbuf) }
 599   | '%' { TMod(tokinfo lexbuf) }
 600
 601   | "++"{ TInc(tokinfo lexbuf) }    | "--"{ TDec(tokinfo lexbuf) }
 602
 603   | "="  { TEq(tokinfo lexbuf) }
 604
 605   | "-=" { TAssign (OpAssign Minus, (tokinfo lexbuf))}
 606   | "+=" { TAssign (OpAssign Plus, (tokinfo lexbuf))}
 607   | "*=" { TAssign (OpAssign Mul, (tokinfo lexbuf))}
 608   | "/=" { TAssign (OpAssign Div, (tokinfo lexbuf))}
 609   | "%=" { TAssign (OpAssign Mod, (tokinfo lexbuf))}
 610   | "&=" { TAssign (OpAssign And, (tokinfo lexbuf))}
 611   | "|=" { TAssign (OpAssign Or, (tokinfo lexbuf)) }
 612   | "^=" { TAssign (OpAssign Xor, (tokinfo lexbuf))}
 613   | "<<=" {TAssign (OpAssign DecLeft, (tokinfo lexbuf)) }
 614   | ">>=" {TAssign (OpAssign DecRight, (tokinfo lexbuf))}
 615
 616   | "==" { TEqEq(tokinfo lexbuf) }  | "!=" { TNotEq(tokinfo lexbuf) }
 617   | ">=" { TSupEq(tokinfo lexbuf) } | "<=" { TInfEq(tokinfo lexbuf) }
 618   | "<"  { TInf(tokinfo lexbuf) }   | ">"  { TSup(tokinfo lexbuf) }
 619
 620   | "&&" { TAndLog(tokinfo lexbuf) } | "||" { TOrLog(tokinfo lexbuf) }
 621   | ">>" { TShr(tokinfo lexbuf) }    | "<<" { TShl(tokinfo lexbuf) }
 622   | "&"  { TAnd(tokinfo lexbuf) }    | "|" { TOr(tokinfo lexbuf) }
 623   | "^"  { TXor(tokinfo lexbuf) }
 624   | "..." { TEllipsis(tokinfo lexbuf) }
 625   | "->"   { TPtrOp(tokinfo lexbuf) }  | '.'  { TDot(tokinfo lexbuf) }
 626   | ','    { TComma(tokinfo lexbuf) }
 627   | ";"    { TPtVirg(tokinfo lexbuf) }
 628   | "?"    { TWhy(tokinfo lexbuf) }    | ":"   { TDotDot(tokinfo lexbuf) }
 629   | "!"    { TBang(tokinfo lexbuf) }   | "~"   { TTilde(tokinfo lexbuf) }
 630
 631   | "<:" { TOCro(tokinfo lexbuf) } | ":>" { TCCro(tokinfo lexbuf) }
 632   | "<%" { TOBrace(tokinfo lexbuf) } | "%>" { TCBrace(tokinfo lexbuf) }
 633
 634
 635
 636   (* ----------------------------------------------------------------------- *)
 637   (* C keywords and ident *)
 638   (* ----------------------------------------------------------------------- *)
 639
 640   (* StdC: must handle at least name of length > 509, but can
 641    * truncate to 31 when compare and truncate to 6 and even lowerise
 642    * in the external linkage phase
 643    *)
 644   | letter (letter | digit) *
 645       { let info = tokinfo lexbuf in
 646         let s = tok lexbuf in
 647         Common.profile_code "C parsing.lex_ident" (fun () ->
 648           match Common.optionise (fun () -> Hashtbl.find keyword_table s)
 649           with
 650           | Some f -> f info
 651
 652            (* parse_typedef_fix.
 653             *    if Lexer_parser.is_typedef s
 654             *    then TypedefIdent (s, info)
 655             *    else TIdent (s, info)
 656             *
 657             * update: now this is no more useful, cos
 658             * as we use tokens_all, it first parse all as an ident and
 659             * later transform an indent in a typedef. so the typedef job is
 660             * now done in parse_c.ml.
 661             *)
 662
 663           | None -> TIdent (s, info)
 664         )
 665       }
 666   (* gccext: apparently gcc allows dollar in variable names. found such
 667    * thing a few time in linux and in glibc. No need look in keyword_table
 668    * here.
 669    *)
 670   | (letter | '$') (letter | digit | '$') *
 671       {
 672         let info = tokinfo lexbuf in
 673         let s = tok lexbuf in
 674         pr2 ("LEXER: identifier with dollar: "  ^ s);
 675         TIdent (s, info)
 676       }
 677   | (letter | '$') (letter | digit | '$' | '~') *
 678     ("::" (letter | '$' | '~') (letter | digit | '$' | '~') *
 679       ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?) *
 680
 681       {
 682         if !Flag.c_plus_plus
 683         then
 684           begin
 685             let info = tokinfo lexbuf in
 686             let s = tok lexbuf in
 687             TIdent (s, info)
 688           end
 689         else
 690           raise
 691             (Lexical "~ and :: not allowed in C identifiers, try -c++ option")
 692       }
 693
 694
 695   (* ----------------------------------------------------------------------- *)
 696   (* C constant *)
 697   (* ----------------------------------------------------------------------- *)
 698
 699   | "'"
 700       { let info = tokinfo lexbuf in
 701         let s = char lexbuf   in
 702         TChar     ((s,   IsChar),  (info +> tok_add_s (s ^ "'")))
 703       }
 704   | '"'
 705       { let info = tokinfo lexbuf in
 706         let s = string lexbuf in
 707         TString   ((s,   IsChar),  (info +> tok_add_s (s ^ "\"")))
 708       }
 709   (* wide character encoding, TODO L'toto' valid ? what is allowed ? *)
 710   | 'L' "'"
 711       { let info = tokinfo lexbuf in
 712         let s = char lexbuf   in
 713         TChar     ((s,   IsWchar),  (info +> tok_add_s (s ^ "'")))
 714       }
 715   | 'L' '"'
 716       { let info = tokinfo lexbuf in
 717         let s = string lexbuf in
 718         TString   ((s,   IsWchar),  (info +> tok_add_s (s ^ "\"")))
 719       }
 720
 721
 722   (* Take care of the order ? No because lex tries the longest match. The
 723    * strange diff between decimal and octal constant semantic is not
 724    * understood too by refman :) refman:11.1.4, and ritchie.
 725    *)
 726
 727   | decimal as x
 728       { TInt ((x, is_long_dec x sint slong slong ulong), tokinfo lexbuf) }
 729   | hexa as x
 730       { TInt ((x, is_long_hex x sint uint slong ulong), tokinfo lexbuf) }
 731   | octal as x
 732       { TInt ((x, is_long_oct x sint uint slong ulong), tokinfo lexbuf) }
 733   | ((decimal as s) ['u' 'U']) as x
 734       { TInt ((x, is_long_dec s uint uint ulong ulong), tokinfo lexbuf) }
 735   | ((hexa as s) ['u' 'U']) as x
 736       { TInt ((x, is_long_hex s uint uint ulong ulong), tokinfo lexbuf) }
 737   | ((octal as s) ['u' 'U']) as x
 738       { TInt ((x, is_long_oct s uint uint ulong ulong), tokinfo lexbuf) }
 739   | (( decimal as s) ['l' 'L']) as x
 740       { TInt ((x, is_long_dec s slong slong slong ulong), tokinfo lexbuf) }
 741   | ((hexa as s) ['l' 'L']) as x
 742       { TInt ((x, is_long_hex s slong slong slong ulong), tokinfo lexbuf) }
 743   | ((octal as s) ['l' 'L']) as x
 744       { TInt ((x, is_long_oct s slong slong slong ulong), tokinfo lexbuf) }
 745   | ((( decimal | hexa | octal) ['l' 'L'] ['u' 'U'])
 746   | (( decimal | hexa | octal) ['u' 'U'] ['l' 'L'])) as x
 747       { TInt ((x, (UnSigned,CLong)), tokinfo lexbuf) }
 748   | (( decimal | hexa | octal) ['l' 'L'] ['l' 'L']) as x
 749       { TInt ((x, (Signed,CLongLong)), tokinfo lexbuf) }
 750   | (( decimal | hexa | octal) ['u' 'U'] ['l' 'L'] ['l' 'L']) as x
 751       { TInt ((x, (UnSigned,CLongLong)), tokinfo lexbuf) }
 752
 753   | (real ['f' 'F']) as x { TFloat ((x, CFloat),      tokinfo lexbuf) }
 754   | (real ['l' 'L']) as x { TFloat ((x, CLongDouble), tokinfo lexbuf) }
 755   | (real as x)           { TFloat ((x, CDouble),     tokinfo lexbuf) }
 756
 757   | ['0'] ['0'-'9']+
 758       { pr2 ("LEXER: " ^ error_radix "octal" ^ tok lexbuf);
 759         TUnknown (tokinfo lexbuf)
 760       }
 761   | ("0x" |"0X") ['0'-'9' 'a'-'z' 'A'-'Z']+
 762       { pr2 ("LEXER: " ^ error_radix "hexa" ^ tok lexbuf);
 763         TUnknown (tokinfo lexbuf)
 764       }
 765
 766
 767  (* !!! to put after other rules !!! otherwise 0xff
 768   * will be parsed as an ident.
 769   *)
 770   | ['0'-'9']+ letter (letter | digit) *
 771       { pr2 ("LEXER: ZARB integer_string, certainly a macro:" ^ tok lexbuf);
 772         TIdent (tok lexbuf, tokinfo lexbuf)
 773       }
 774
 775 (* gccext: http://gcc.gnu.org/onlinedocs/gcc/Binary-constants.html *)
 776 (*
 777  | "0b" ['0'-'1'] { TInt (((tok lexbuf)<!!>(??,??)) +> int_of_stringbits) }
 778  | ['0'-'1']+'b' { TInt (((tok lexbuf)<!!>(0,-2)) +> int_of_stringbits) }
 779 *)
 780
 781
 782   (*------------------------------------------------------------------------ *)
 783   | eof { EOF (tokinfo lexbuf +> Ast_c.rewrap_str "") }
 784
 785   | _
 786       {
 787         if !Flag_parsing_c.verbose_lexing
 788         then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
 789         TUnknown (tokinfo lexbuf)
 790       }
 791
 792
 793
 794 (*****************************************************************************)
 795 and char = parse
 796   | (_ as x)                                    "'"  { String.make 1 x }
 797   (* todo?: as for octal, do exception  beyond radix exception ? *)
 798   | (("\\" (oct | oct oct | oct oct oct)) as x  "'") { x }
 799   (* this rule must be after the one with octal, lex try first longest
 800    * and when \7  we want an octal, not an exn.
 801    *)
 802   | (("\\x" ((hex | hex hex))) as x        "'")      { x }
 803   | (("\\" (_ as v))           as x        "'")
 804         {
 805           (match v with (* Machine specific ? *)
 806           | 'n' -> ()  | 't' -> ()   | 'v' -> ()  | 'b' -> () | 'r' -> ()
 807           | 'f' -> () | 'a' -> ()
 808           | '\\' -> () | '?'  -> () | '\'' -> ()  | '"' -> ()
 809           | 'e' -> () (* linuxext: ? *)
 810           | _ ->
 811               pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
 812           );
 813           x
 814         }
 815   | _
 816       { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
 817         tok lexbuf
 818       }
 819
 820
 821
 822 (*****************************************************************************)
 823
 824 (* todo? factorise code with char ? but not same ending token so hard. *)
 825 and string  = parse
 826   | '"'                                       { "" }
 827   | (_ as x)                                  { string_of_char x^string lexbuf}
 828   | ("\\" (oct | oct oct | oct oct oct)) as x { x ^ string lexbuf }
 829   | ("\\x" (hex | hex hex)) as x              { x ^ string lexbuf }
 830   | ("\\" (_ as v)) as x
 831        {
 832          (match v with (* Machine specific ? *)
 833          | 'n' -> ()  | 't' -> ()   | 'v' -> ()  | 'b' -> () | 'r' -> ()
 834          | 'f' -> () | 'a' -> ()
 835          | '\\' -> () | '?'  -> () | '\'' -> ()  | '"' -> ()
 836          | 'e' -> () (* linuxext: ? *)
 837
 838          (* old: "x" -> 10 gccext ? todo ugly, I put a fake value *)
 839
 840          (* cppext:  can have   \ for multiline in string too *)
 841          | '\n' -> ()
 842          | _ -> pr2 ("LEXER: unrecognised symbol in string:"^tok lexbuf);
 843          );
 844           x ^ string lexbuf
 845        }
 846
 847   | eof { pr2 "LEXER: WIERD end of file in string"; ""}
 848
 849  (* Bug if add following code, cos match also the '"' that is needed
 850   * to finish the string, and so go until end of file.
 851   *)
 852  (*
 853   | [^ '\\']+
 854     { let cs = lexbuf +> tok +> list_of_string +> List.map Char.code in
 855       cs ++ string lexbuf
 856     }
 857   *)
 858
 859
 860
 861 (*****************************************************************************)
 862
 863 (* less: allow only char-'*' ? *)
 864 and comment = parse
 865   | "*/"     { tok lexbuf }
 866   (* noteopti: *)
 867   | [^ '*']+ { let s = tok lexbuf in s ^ comment lexbuf }
 868   | [ '*']   { let s = tok lexbuf in s ^ comment lexbuf }
 869   | eof { pr2 "LEXER: end of file in comment"; "*/"}
 870   | _
 871       { let s = tok lexbuf in
 872         pr2 ("LEXER: unrecognised symbol in comment:"^s);
 873         s ^ comment lexbuf
 874       }
 875
 876
 877
 878 (*****************************************************************************)
 879
 880 (* cpp recognize C comments, so when #define xx (yy) /* comment \n ... */
 881  * then he has already erased the /* comment. So:
 882  * - dont eat the start of the comment otherwise afterwards we are in the middle
 883  *   of a comment and so will problably get a parse error somewhere.
 884  * - have to recognize comments in cpp_eat_until_nl.
 885  *)
 886
 887 and cpp_eat_until_nl = parse
 888   (* bugfix: *)
 889   | "/*"
 890       { let s = tok lexbuf in
 891         let s2 = comment lexbuf in
 892         let s3 = cpp_eat_until_nl lexbuf in
 893         s ^ s2 ^ s3
 894       }
 895   | '\\' "\n" { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
 896
 897   | "\n"      { tok lexbuf }
 898   (* noteopti:
 899    * update: need also deal with comments chars now
 900    *)
 901   | [^ '\n' '\\'      '/' '*'  ]+
 902      { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
 903   | eof { pr2 "LEXER: end of file in cpp_eat_until_nl"; ""}
 904   | _   { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }