permit multiline comments and strings in macros
[bpt/coccinelle.git] / parsing_c / lexer_c.mll
1 {
2 (* Yoann Padioleau
3 *
4 * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License (GPL)
8 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * file license.txt for more details.
14 *)
15 open Common
16
17 open Parser_c
18
19 open Ast_c (* to factorise tokens, OpAssign, ... *)
20
21 (*****************************************************************************)
22 (*
23 * subtle: ocamllex uses side effects on lexbuf, so one must take care.
24 * For instance must do
25 *
26 * let info = tokinfo lexbuf in
27 * TComment (info +> tok_add_s (comment lexbuf))
28 *
29 * and not
30 *
31 * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
32 *
33 * because of the "weird" (unspecified) order of evaluation of OCaml.
34 *
35 *
36 *
37 * note: can't use Lexer_parser._lexer_hint here to do different
38 * things, because now we call the lexer to get all the tokens
39 * (tokens_all), and then we parse. So we can't have the _lexer_hint
40 * info here. We can have it only in parse_c. For the same reason, the
41 * typedef handling here is now useless.
42 *)
43 (*****************************************************************************)
44
45 (*****************************************************************************)
46 (* Wrappers *)
47 (*****************************************************************************)
(* Diagnostic printers: [pr2] prints a message, [pr2_once] prints each
 * distinct message at most once; both are gated by
 * Flag_parsing_c.verbose_lexing. *)
48 let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing
49
50 (*****************************************************************************)
51
52
(* Raised on unrecoverable lexing errors; carries a diagnostic message. *)
53 exception Lexical of string
54
(* [tok lexbuf] returns the text of the lexeme just matched. *)
55 let tok lexbuf = Lexing.lexeme lexbuf
56
(* [tokinfo lexbuf] builds a fresh token-info record for the lexeme just
 * matched: the byte offset and matched text are filled in now; line,
 * column and file are left as placeholders (-1, -1, "") and completed in
 * a post-lexing phase.  Each mutable tag field gets its own fresh [ref]
 * so that distinct tokens never share state. *)
57 let tokinfo lexbuf =
58 {
59 pinfo = Ast_c.OriginTok {
60 Common.charpos = Lexing.lexeme_start lexbuf;
61 Common.str = Lexing.lexeme lexbuf;
62 (* info filled in a post-lexing phase *)
63 Common.line = -1;
64 Common.column = -1;
65 Common.file = "";
66 };
67 (* must generate a new ref each time, otherwise share *)
68 cocci_tag = ref Ast_c.emptyAnnot;
69 annots_tag = Token_annot.empty;
70 comments_tag = ref Ast_c.emptyComments;
71 }
72
73 (* cppext: must generate a new ref each time, otherwise share *)
(* Fresh, initially-empty mark used to pair up #if/#endif tokens; it is
 * filled in later (see the #ifdef rules below, which pass it to the
 * TIfdef* constructors). *)
74 let no_ifdef_mark () = ref (None: (int * int) option)
75
(* [tok_add_s s ii] appends [s] to the string stored in token info [ii]
 * (used to glue multi-rule lexemes, e.g. a comment body, onto the token
 * that started them). *)
76 let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
77
78
79 (* opti: less convenient, but using a hash is faster than using a match *)
(* Maps C keywords — plus gcc, msvc/windows and c99 extension spellings —
 * to functions building the corresponding token from a token info.
 * Identifiers not present here are lexed as plain TIdent by the rules
 * below. *)
80 let keyword_table = Common.hash_of_list [
81
82 (* c: *)
83 "void", (fun ii -> Tvoid ii);
84 "char", (fun ii -> Tchar ii);
85 "short", (fun ii -> Tshort ii);
86 "int", (fun ii -> Tint ii);
87 "long", (fun ii -> Tlong ii);
88 "float", (fun ii -> Tfloat ii);
89 "double", (fun ii -> Tdouble ii);
90 "size_t", (fun ii -> Tsize_t ii);
91 "ssize_t", (fun ii -> Tssize_t ii);
92 "ptrdiff_t", (fun ii -> Tptrdiff_t ii);
93
94 "unsigned", (fun ii -> Tunsigned ii);
95 "signed", (fun ii -> Tsigned ii);
96
97 "auto", (fun ii -> Tauto ii);
98 "register", (fun ii -> Tregister ii);
99 "extern", (fun ii -> Textern ii);
100 "static", (fun ii -> Tstatic ii);
101
102 "const", (fun ii -> Tconst ii);
103 "volatile", (fun ii -> Tvolatile ii);
104
105 "struct", (fun ii -> Tstruct ii);
106 "union", (fun ii -> Tunion ii);
107 "enum", (fun ii -> Tenum ii);
108 "typedef", (fun ii -> Ttypedef ii);
109
110 "if", (fun ii -> Tif ii);
111 "else", (fun ii -> Telse ii);
112 "break", (fun ii -> Tbreak ii);
113 "continue", (fun ii -> Tcontinue ii);
114 "switch", (fun ii -> Tswitch ii);
115 "case", (fun ii -> Tcase ii);
116 "default", (fun ii -> Tdefault ii);
117 "for", (fun ii -> Tfor ii);
118 "do", (fun ii -> Tdo ii);
119 "while", (fun ii -> Twhile ii);
120 "return", (fun ii -> Treturn ii);
121 "goto", (fun ii -> Tgoto ii);
122
123 "sizeof", (fun ii -> Tsizeof ii);
124
125
126 (* gccext: cppext: linuxext: synonyms *)
127 "asm", (fun ii -> Tasm ii);
128 "__asm__", (fun ii -> Tasm ii);
129 "__asm", (fun ii -> Tasm ii);
130
131 "inline", (fun ii -> Tinline ii);
132 "__inline__", (fun ii -> Tinline ii);
133 "__inline", (fun ii -> Tinline ii);
134
135 "__attribute__", (fun ii -> Tattribute ii);
136 "__attribute", (fun ii -> Tattribute ii);
137
138 "typeof", (fun ii -> Ttypeof ii);
139 "__typeof__", (fun ii -> Ttypeof ii);
140 "__typeof", (fun ii -> Ttypeof ii);
141
142 (* found a lot in expanded code *)
143 "__extension__", (fun ii -> TattributeNoarg ii);
144
145
146 (* gccext: alias *)
147 "__signed__", (fun ii -> Tsigned ii);
148
149 "__const__", (fun ii -> Tconst ii);
150 "__const", (fun ii -> Tconst ii);
151
152 "__volatile__", (fun ii -> Tvolatile ii);
153 "__volatile", (fun ii -> Tvolatile ii);
154
155 (* windowsext: *)
156 "__declspec", (fun ii -> Tattribute ii);
157
158 "__stdcall", (fun ii -> TattributeNoarg ii);
159 "__cdecl", (fun ii -> TattributeNoarg ii);
160 "WINAPI", (fun ii -> TattributeNoarg ii);
161 "APIENTRY", (fun ii -> TattributeNoarg ii);
162 "CALLBACK", (fun ii -> TattributeNoarg ii);
163
164 (* c99: *)
165 (* no just "restrict" ? maybe for backward compatibility they avoided
166 * to use restrict which people may have used in their program already
167 *)
168 "__restrict", (fun ii -> Trestrict ii);
169 "__restrict__", (fun ii -> Trestrict ii);
170
171 ]
172
(* Extra keywords recognised only when the -c++ flag is set (consulted
 * before [keyword_table] in the identifier rule below).  Note the
 * deliberate hack: "using" is mapped to TComment, i.e. the whole
 * directive is effectively discarded by the parser. *)
173 let cpp_keyword_table = Common.hash_of_list [
174 "namespace", (fun ii -> Tnamespace ii);
175 "new", (fun ii -> Tnew ii);
176 "delete", (fun ii -> Tdelete ii);
177 "using", (fun ii -> TComment ii) ]
178
(* [error_radix kind] builds the message prefix for a malformed numeric
 * literal of the given [kind] ("octal" or "hexa"); the offending lexeme
 * is appended by the caller. *)
179 let error_radix s =
180 ("numeric " ^ s ^ " constant contains digits beyond the radix:")
181
182 (* julia: functions for figuring out the type of integers *)
183
(* [is_long_dec s int uint long ulong] selects one of the four candidate
 * results for the decimal literal [s], by comparing its value (as a
 * Big_int) against the configured thresholds; returns [int] when no
 * thresholds are configured.  Note: when the value only exceeds the
 * uint threshold the result is [long], not [uint] — the [uint] argument
 * is never used below.  This matches C's rule that an unsuffixed decimal
 * constant never gets an unsigned type (it goes int -> long -> ...);
 * the parameter is presumably kept for symmetry with the hex/oct
 * variants. *)
184 let is_long_dec s int uint long ulong =
185 match !Flag_parsing_c.int_thresholds with
186 None -> int
187 | Some (_,_,uint_threshold,long_threshold,ulong_threshold) ->
188 let bn = Big_int.big_int_of_string s in
189 if Big_int.ge_big_int bn ulong_threshold
190 then ulong
191 else
192 if Big_int.ge_big_int bn long_threshold
193 then long
194 else
195 if Big_int.ge_big_int bn uint_threshold
196 then long
197 else int
198
(* [is_long_ho s int uint long ulong drop bpd count] classifies a
 * hex/octal literal [s] by its significant bit length rather than its
 * value: [drop] is the prefix length to strip ("0x" = 2, "0" = 1),
 * [bpd] the number of bits per digit (4 for hex, 3 for oct), and
 * [count] gives the number of leading zero bits of the first digit
 * (which is parsed as a hex digit — valid for octal digits too).
 * The bit length is then compared with the configured uint/ulong sizes;
 * returns [int] when no thresholds are configured. *)
199 let is_long_ho s int uint long ulong drop bpd count =
200 match !Flag_parsing_c.int_thresholds with
201 None -> int
202 | Some (uint_sz,ulong_sz,_,_,_) ->
203 let len = String.length s in
204 (* this assumes that all of the hex/oct digits are significant *)
205 (* drop is 2 for hex (0x) and 1 for oct (0) *)
206 let s = String.sub s drop (len - drop) in
207 let len =
208 ((len-drop) * bpd) -
209 (count (int_of_string("0x"^(String.sub s 0 1)))) in
210 if len < uint_sz
211 then int
212 else
213 if len = uint_sz
214 then uint
215 else
216 if len < ulong_sz
217 then long
218 else ulong
219
(* Octal instantiation of [is_long_ho]: strip 1 prefix char ("0"),
 * 3 bits per digit; the function argument maps the leading digit to its
 * number of leading zero bits within those 3 bits. *)
220 let is_long_oct s int uint long ulong =
221 is_long_ho s int uint long ulong 1 3
222 (* stupid, but probably more efficient than taking logs *)
223 (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0)
(* Hex instantiation: strip 2 prefix chars ("0x"/"0X"), 4 bits per digit,
 * leading-zero-bit table over 4 bits. *)
224 let is_long_hex s int uint long ulong =
225 is_long_ho s int uint long ulong 2 4
226 (* stupid, but probably more efficient than taking logs *)
227 (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1
228 | _ -> 0)
229
(* Shorthands for the four (signedness, size) pairs passed to the
 * is_long_* classifiers in the integer-constant rules below. *)
230 let sint = (Signed,CInt)
231 let uint = (UnSigned,CInt)
232 let slong = (Signed,CLong)
233 let ulong = (UnSigned,CLong)
234
235 }
236
237 (*****************************************************************************)
(* Named character classes and regular-expression abbreviations shared by
 * the lexing rules below. *)
238 let letter = ['A'-'Z' 'a'-'z' '_']
239 let extended_letter = ['A'-'Z' 'a'-'z' '_' ':' '<' '>' '~'](*for c++, not used*)
240 let digit = ['0'-'9']
241
(* gccext: '$' is allowed in identifiers; cplusplus_ident_ext also admits
 * '~' (destructor names). *)
242 let cplusplus_ident = (letter | '$') (letter | digit | '$') *
243 let cplusplus_ident_ext = (letter | '~' | '$') (letter | digit | '~' | '$') *
244
245 (* not used for the moment *)
246 let punctuation = ['!' '\"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
247 ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
248 let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
249 let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
250 (* 7 = \a = bell in C. this is not the only char allowed !!
251 * ex @ and $ ` are valid too
252 *)
253
254 let cchar = (letter | digit | punctuation | additionnal)
255
(* sp: mandatory whitespace; spopt: optional whitespace (no newlines). *)
256 let sp = [' ' '\t']+
257 let spopt = [' ' '\t']*
258
259 let dec = ['0'-'9']
260 let oct = ['0'-'7']
261 let hex = ['0'-'9' 'a'-'f' 'A'-'F']
262
263 let decimal = ('0' | (['1'-'9'] dec*))
264 let octal = ['0'] oct+
265 let hexa = ("0x" |"0X") hex+
266
267
(* Components of floating-point literals: integer part, fraction,
 * exponent. *)
268 let pent = dec+
269 let pfract = dec+
270 let sign = ['-' '+']
271 let exp = ['e''E'] sign? dec+
272 let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
273
274 let id = letter (letter | digit) *
275
276 (*****************************************************************************)
(* Main entry point: returns the next token, including explicit tokens
 * for whitespace, newlines, comments and cpp directives (the caller is
 * expected to filter/reagglomerate them).  Case order matters: cpp
 * directive cases must precede the generic "#" stringification case,
 * and the "macro-looking number" case must come after the numeric
 * literal cases. *)
277 rule token = parse
278
279 (* ----------------------------------------------------------------------- *)
280 (* spacing/comments *)
281 (* ----------------------------------------------------------------------- *)
282
283 (* note: this lexer generate tokens for comments!! so can not give
284 * this lexer as-is to the parsing function. The caller must preprocess
285 * it, e.g. by using techniques like cur_tok ref in parse_c.ml.
286 *
287 * update: we now also generate a separate token for newlines, so now
288 * the caller may also have to reagglomerate all those commentspace
289 * tokens if he was assuming that spaces were agglomerate in a single
290 * token.
291 *)
292
293 | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
294 (* starting a new line; the newline character followed by whitespace *)
295 { TCommentNewline (tokinfo lexbuf) }
296 | [' ' '\t' '\r' '\011' '\012' ]+
297 { TCommentSpace (tokinfo lexbuf) }
298 | "/*"
299 { let info = tokinfo lexbuf in
300 let com = comment lexbuf in
301
302 let info' = info +> tok_add_s com in
303 let s = Ast_c.str_of_info info' in
304 (* could be more flexible, use [\t ]* instead of hardcoded
305 * single space. *)
306 match s with
307 | "/* {{coccinelle:skip_start}} */" ->
308 TCommentSkipTagStart (info')
309 | "/* {{coccinelle:skip_end}} */" ->
310 TCommentSkipTagEnd (info')
311 | _ -> TComment(info')
312 }
313
314
315 (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
316 * So need this here only when dont call cpp before.
317 * note that we don't keep the trailing \n; it will be in another token.
318 *)
319 | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
320
321 (* ----------------------------------------------------------------------- *)
322 (* cpp *)
323 (* ----------------------------------------------------------------------- *)
324
325 (* old:
326 * | '#' { endline lexbuf} // should be line, and not endline
327 * and endline = parse | '\n' { token lexbuf}
328 * | _ { endline lexbuf}
329 *)
330
331 (* less?:
332 * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
333 * => just "#" -> token {lexbuf} (that is ignore)
334 * there is 1 #elif with nothing after it
335 * there is 1 #error with nothing after it
336 * there are 2 "mov dede, #xxx" that consequently raise an exception
337 * because they are surrounded by #if 0
338 * => make as for comment, call a comment_cpp that when #endif finish the
339 * comment and if other cpp stuff raise exn
340 * there are about 10 #if(xxx) where the ( is glued right after
341 * there are some include"" and include<
342 * there is 1 ` (behind a #ifndef linux)
343 *)
344
345
346
347 (* ---------------------- *)
348 (* misc *)
349 (* ---------------------- *)
350
351 (* bugfix: I want now to keep comments for the cComment study
352 * so cant do: sp [^'\n']+ '\n'
353 * http://gcc.gnu.org/onlinedocs/gcc/Pragmas.html
354 *)
355
(* Whole-line directives the parser does not interpret further. *)
356 | "#" spopt "pragma" sp [^'\n' '\r']* ('\n' | "\r\n")
357 | "#" spopt "ident" sp [^'\n' '\r']* ('\n' | "\r\n")
358 | "#" spopt "line" sp [^'\n' '\r']* ('\n' | "\r\n")
359 | "#" spopt "error" sp [^'\n' '\r']* ('\n' | "\r\n")
360 | "#" spopt "warning" sp [^'\n' '\r']* ('\n' | "\r\n")
361 | "#" spopt "abort" sp [^'\n' '\r']* ('\n' | "\r\n")
362 { TCppDirectiveOther (tokinfo lexbuf) }
363
364 | "#" [' ' '\t']* ('\n' | "\r\n")
365 { TCppDirectiveOther (tokinfo lexbuf) }
366
367 (* only after cpp, ex: # 1 "include/linux/module.h" 1 *)
368 | "#" sp pent sp '\"' [^ '\"']* '\"' (spopt pent)* spopt ('\n' | "\r\n")
369 { TCppDirectiveOther (tokinfo lexbuf) }
370
371
372
373 (* ---------------------- *)
374 (* #define, #undef *)
375 (* ---------------------- *)
376
377 (* the rest of the lexing/parsing of define is done in fix_tokens_define
378 * where we parse until a TCppEscapedNewline and generate a TDefEol
379 *)
380 | "#" [' ' '\t']* "define" { TDefine (tokinfo lexbuf) }
381
382 (* note: in some cases can have stuff after the ident as in #undef XXX 50,
383 * but I currently don't handle it cos I think it's bad code.
384 *)
385 | "#" [' ' '\t']* "undef" { TUndef (tokinfo lexbuf) }
386
387 (* ---------------------- *)
388 (* #include *)
389 (* ---------------------- *)
390
391 (* The difference between a local "" and standard <> include is computed
392 * later in parser_c.mly. So redo a little bit of lexing there; ugly but
393 * simpler to generate a single token here. *)
394 | (("#" [' ''\t']* "include" [' ' '\t']*) as includes)
395 (('\"' ([^ '\"']+) '\"' |
396 '<' [^ '>']+ '>' |
397 ['A'-'Z''_']+
398 ) as filename)
399 { let info = tokinfo lexbuf in
400 TInclude (includes, filename, Ast_c.noInIfdef(), info)
401 }
402 (* gccext: found in glibc *)
403 | (("#" [' ''\t']* "include_next" [' ' '\t']*) as includes)
404 (('\"' ([^ '\"']+) '\"' |
405 '<' [^ '>']+ '>' |
406 ['A'-'Z''_']+
407 ) as filename)
408 { let info = tokinfo lexbuf in
409 TInclude (includes, filename, Ast_c.noInIfdef(), info)
410 }
411
412 (* ---------------------- *)
413 (* #ifdef *)
414 (* ---------------------- *)
415
416 (* The ifdef_mark will be set later in Parsing_hacks.set_ifdef_parenthize_info
417 * when working on the ifdef view.
418 *)
419
420 (* '0'+ because sometimes it is a #if 000 *)
421 | "#" [' ' '\t']* "if" [' ' '\t']* '0'+ (* [^'\n']* '\n' *)
422 { let info = tokinfo lexbuf in
423 TIfdefBool (false, no_ifdef_mark(), info)
424 (* +> tok_add_s (cpp_eat_until_nl lexbuf)*)
425 }
426
427 | "#" [' ' '\t']* "if" [' ' '\t']* '1' (* [^'\n']* '\n' *)
428 { let info = tokinfo lexbuf in
429 TIfdefBool (true, no_ifdef_mark(), info)
430
431 }
432
433 (* DO NOT cherry pick to lexer_cplusplus !!! often used for the extern "C" { *)
434 | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n' '\r']* ('\n' | "\r\n")
435 { let info = tokinfo lexbuf in
436 TIfdefMisc (false, no_ifdef_mark(), info)
437 }
438
439 (* DO NOT cherry pick to lexer_cplusplus !!! *)
440 | "#" [' ' '\t']* "ifdef" [' ' '\t']* "__cplusplus" [^'\n']* '\n'
441 { let info = tokinfo lexbuf in
442 TIfdefMisc (false, no_ifdef_mark(), info)
443 }
444
445 (* in glibc *)
446 | "#" spopt ("ifdef"|"if") sp "__STDC__"
447 { let info = tokinfo lexbuf in
448 TIfdefVersion (true, no_ifdef_mark(),
449 info +> tok_add_s (cpp_eat_until_nl lexbuf))
450 }
451
452
453 (* linuxext: different possible variations (we do not manage all of them):
454
455 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
456 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,2)
457 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
458 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,3,0)
459 #if LINUX_VERSION_CODE < 0x020600
460 #if LINUX_VERSION_CODE >= 0x2051c
461 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
462 #if !(LINUX_VERSION_CODE > KERNEL_VERSION(2,5,73))
463 #if STREAMER_IOCTL && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
464 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
465 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && \
466 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
467 #if LINUX_VERSION_CODE > LinuxVersionCode(2,3,12)
468 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,93)
469 #ifndef LINUX_VERSION_CODE
470 #if LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,2,0) || \
471 (LINUX_VERSION_CODE > ASC_LINUX_VERSION(2,3,0) && \
472 LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,4,0))
473 #if (KERNEL_VERSION(2,4,0) > LINUX_VERSION_CODE)
474 #if LINUX_VERSION_CODE >= ASC_LINUX_VERSION(1,3,0)
475 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
476
477 *)
478
479 (* linuxext: must be before the generic rules for if and ifdef *)
480 | "#" spopt "if" sp "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
481 { let info = tokinfo lexbuf in
482 TIfdefVersion (true, no_ifdef_mark(),
483 info +> tok_add_s (cpp_eat_until_nl lexbuf))
484 }
485 (* linuxext: *)
486 | "#" spopt "if" sp "!" "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
487 | "#" spopt "if" sp ['(']? "LINUX_VERSION_CODE" sp ("<=" | "<") sp
488
489 { let info = tokinfo lexbuf in
490 TIfdefVersion (false, no_ifdef_mark(),
491 info +> tok_add_s (cpp_eat_until_nl lexbuf))
492 }
493
494
495
496
497 (* can have some ifdef 0 hence the letter|digit even at beginning of word *)
498 | "#" [' ''\t']* "ifdef" [' ''\t']+
499 (((letter|digit) ((letter|digit)*)) as x) [' ''\t']*
500 { if List.mem x !Flag_parsing_c.undefined
501 then TIfdefBool (false, no_ifdef_mark(), tokinfo lexbuf)
502 else if List.mem x !Flag_parsing_c.defined
503 then TIfdefBool (true, no_ifdef_mark(), tokinfo lexbuf)
504 else TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
505 | "#" [' ''\t']* "ifndef" [' ''\t']+
506 (((letter|digit) ((letter|digit)*)) as x) [' ''\t']*
507 { if List.mem x !Flag_parsing_c.defined
508 then TIfdefBool (false, no_ifdef_mark(), tokinfo lexbuf)
509 else if List.mem x !Flag_parsing_c.undefined
510 then TIfdefBool (true, no_ifdef_mark(), tokinfo lexbuf)
511 else TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
512 | "#" [' ''\t']* "if" [' ' '\t']+
513 { let info = tokinfo lexbuf in
514 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
515 }
516 | "#" [' ' '\t']* "if" '('
517 { let info = tokinfo lexbuf in
518 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
519 }
520
521 | "#" [' ' '\t']* "elif" [' ' '\t']+
522 { let info = tokinfo lexbuf in
523 TIfdefelif (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
524 }
525
526
527 | "#" [' ''\t']* "endif" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
528 { TEndif (no_ifdef_mark(), tokinfo lexbuf) }
529 (* bugfix: can have #endif LINUX but at the same time if I eat everything
530 * until next line, I may miss some TComment which for some tools
531 * are important such as aComment
532 *)
533 | "#" [' ' '\t']* "endif" (*[^'\n']* '\n'*) {
534 TEndif (no_ifdef_mark(), tokinfo lexbuf)
535 }
536 (* can be at eof *)
537 (*| "#" [' ' '\t']* "endif" { TEndif (tokinfo lexbuf) }*)
538
539 | "#" [' ' '\t']* "else" ([' ' '\t' '\n'] | "\r\n")
540 { TIfdefelse (no_ifdef_mark(), tokinfo lexbuf) }
541
542
543
544
545 (* ---------------------- *)
546 (* #define body *)
547 (* ---------------------- *)
548
549 (* only in cpp directives normally *)
550 | "\\" ('\n' | "\r\n") { TCppEscapedNewline (tokinfo lexbuf) }
551
552 (* We must generate separate tokens for #, ## and extend the grammar.
553 * Note there can be "elaborated" idents in many different places, in
554 * expression but also in declaration, in function name. So having 3 tokens
555 * for an ident does not work well with how we add info in
556 * ast_c. Was easier to generate just one token, just one info,
557 * even if have later to reanalyse those tokens and unsplit. But then,
558 * handling C++ lead to having not just a string for ident but something
559 * more complex. Also when we want to parse elaborated function headers
560 * (e.g. void METH(foo)(int x)), we need anyway to go from a string
561 * to something more. So having also for C something more than just
562 * string for ident is natural.
563 *
564 * todo: our heuristics in parsing_hacks rely on TIdent. So maybe
565 * an easier solution would be to augment the TIdent type such as
566 * TIdent of string * info * cpp_ident_additionnal_info
567 *
568 * old:
569 * | id ([' ''\t']* "##" [' ''\t']* id)+
570 * { let info = tokinfo lexbuf in
571 * TIdent (tok lexbuf, info)
572 * }
573 * | "##" spopt id
574 * { let info = tokinfo lexbuf in
575 * TIdent (tok lexbuf, info)
576 * }
577 *
578 *)
579 (* cppext: string concatenation of idents, also ##args for variadic macro. *)
580 | "##" { TCppConcatOp (tokinfo lexbuf) }
581
582 (* cppext: stringification.
583 * bugfix: this case must be after the other cases such as #endif
584 * otherwise take precedent.
585 *)
586 | "#" spopt id
587 { let info = tokinfo lexbuf in
588 TIdent (tok lexbuf, info)
589 }
590 (* the ... next to id, e.g. arg..., works with ##, e.g. ##arg *)
591 | ((id as s) "...")
592 { TDefParamVariadic (s, tokinfo lexbuf) }
593
594
595
596
597
598 (* ----------------------------------------------------------------------- *)
599 (* C symbols *)
600 (* ----------------------------------------------------------------------- *)
601 (* stdC:
602 ... && -= >= ~ + ; ]
603 <<= &= -> >> % , < ^
604 >>= *= /= ^= & - = {
605 != ++ << |= ( . > |
606 %= += <= || ) / ? }
607 -- == ! * : [
608 recent addition: <: :> <% %>
609 only at processing: %: %:%: # ##
610 *)
611
612
613 | '[' { TOCro(tokinfo lexbuf) } | ']' { TCCro(tokinfo lexbuf) }
614 | '(' { TOPar(tokinfo lexbuf) } | ')' { TCPar(tokinfo lexbuf) }
615 | '{' { TOBrace(tokinfo lexbuf) } | '}' { TCBrace(tokinfo lexbuf) }
616
(* ">?" / "<?" and their assignment forms below are the gcc max/min
 * operator extension. *)
617 | '+' { TPlus(tokinfo lexbuf) } | '*' { TMul(tokinfo lexbuf) }
618 | '-' { TMinus(tokinfo lexbuf) } | '/' { TDiv(tokinfo lexbuf) }
619 | '%' { TMod(tokinfo lexbuf) } | ">?" { TMax(tokinfo lexbuf) }
620 | "<?" { TMin(tokinfo lexbuf) }
621
622 | "++"{ TInc(tokinfo lexbuf) } | "--"{ TDec(tokinfo lexbuf) }
623
624 | "=" { TEq(tokinfo lexbuf) }
625
626 | "-=" { TAssign (OpAssign Minus, (tokinfo lexbuf))}
627 | "+=" { TAssign (OpAssign Plus, (tokinfo lexbuf))}
628 | "*=" { TAssign (OpAssign Mul, (tokinfo lexbuf))}
629 | "/=" { TAssign (OpAssign Div, (tokinfo lexbuf))}
630 | "%=" { TAssign (OpAssign Mod, (tokinfo lexbuf))}
631 | "&=" { TAssign (OpAssign And, (tokinfo lexbuf))}
632 | "|=" { TAssign (OpAssign Or, (tokinfo lexbuf)) }
633 | "^=" { TAssign (OpAssign Xor, (tokinfo lexbuf))}
634 | "<<=" {TAssign (OpAssign DecLeft, (tokinfo lexbuf)) }
635 | ">>=" {TAssign (OpAssign DecRight, (tokinfo lexbuf))}
636 | ">?=" {TAssign (OpAssign Max, (tokinfo lexbuf))}
637 | "<?=" {TAssign (OpAssign Min, (tokinfo lexbuf))}
638
639 | "==" { TEqEq(tokinfo lexbuf) } | "!=" { TNotEq(tokinfo lexbuf) }
640 | ">=" { TSupEq(tokinfo lexbuf) } | "<=" { TInfEq(tokinfo lexbuf) }
641 | "<" { TInf(tokinfo lexbuf) } | ">" { TSup(tokinfo lexbuf) }
642
643 | "&&" { TAndLog(tokinfo lexbuf) } | "||" { TOrLog(tokinfo lexbuf) }
644 | ">>" { TShr(tokinfo lexbuf) } | "<<" { TShl(tokinfo lexbuf) }
645 | "&" { TAnd(tokinfo lexbuf) } | "|" { TOr(tokinfo lexbuf) }
646 | "^" { TXor(tokinfo lexbuf) }
647 | "..." { TEllipsis(tokinfo lexbuf) }
648 | "->" { TPtrOp(tokinfo lexbuf) } | '.' { TDot(tokinfo lexbuf) }
649 | ',' { TComma(tokinfo lexbuf) }
650 | ";" { TPtVirg(tokinfo lexbuf) }
651 | "?" { TWhy(tokinfo lexbuf) } | ":" { TDotDot(tokinfo lexbuf) }
652 | "!" { TBang(tokinfo lexbuf) } | "~" { TTilde(tokinfo lexbuf) }
653
(* C95 digraphs, mapped to the same tokens as the chars they stand for. *)
654 | "<:" { TOCro(tokinfo lexbuf) } | ":>" { TCCro(tokinfo lexbuf) }
655 | "<%" { TOBrace(tokinfo lexbuf) } | "%>" { TCBrace(tokinfo lexbuf) }
656
657
658
659 (* ----------------------------------------------------------------------- *)
660 (* C keywords and ident *)
661 (* ----------------------------------------------------------------------- *)
662
663 (* StdC: must handle at least name of length > 509, but can
664 * truncate to 31 when compare and truncate to 6 and even lowerise
665 * in the external linkage phase
666 *)
667 | letter (letter | digit) *
668 { let info = tokinfo lexbuf in
669 let s = tok lexbuf in
670 Common.profile_code "C parsing.lex_ident" (fun () ->
671 let tok =
(* C++ keywords are consulted first, and only when -c++ is set. *)
672 if !Flag.c_plus_plus
673 then Common.optionise (fun () -> Hashtbl.find cpp_keyword_table s)
674 else None in
675 match tok with
676 Some f -> f info
677 | None ->
678 match Common.optionise (fun () -> Hashtbl.find keyword_table s)
679 with
680 | Some f -> f info
681
682 (* parse_typedef_fix.
683 * if Lexer_parser.is_typedef s
684 * then TypedefIdent (s, info)
685 * else TIdent (s, info)
686 *
687 * update: now this is no more useful, cos
688 * as we use tokens_all, it first parse all as an ident and
689 * later transform an indent in a typedef. so the typedef job is
690 * now done in parse_c.ml.
691 *)
692
693 | None -> TIdent (s, info)
694 )
695 }
696 (* gccext: apparently gcc allows dollar in variable names. found such
697 * thing a few time in linux and in glibc. No need look in keyword_table
698 * here.
699 *)
700 | (cplusplus_ident "::")+ "operator new"
701 {
702 let info = tokinfo lexbuf in
703 let s = tok lexbuf in
704 TIdent (s, info)
705 }
706 | cplusplus_ident
707 {
708 let info = tokinfo lexbuf in
709 let s = tok lexbuf in
710 pr2 ("LEXER: identifier with dollar: " ^ s);
711 TIdent (s, info)
712 }
713
(* C++ destructor-qualified names (possibly with template arguments):
 * e.g. Foo<T>::~Foo — a constructor name when -c++ is set. *)
714 | cplusplus_ident
715 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
716 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?
717 ("::~" cplusplus_ident
718 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
719 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?) +
720
721 {
722 let info = tokinfo lexbuf in
723 let s = tok lexbuf in
724 if !Flag.c_plus_plus
725 then Tconstructorname (s, info)
726 else
727 begin
728 pr2_once "~ and :: not allowed in C identifiers, try -c++ option";
729 TIdent (s, info)
730 end
731 }
(* A template instantiation alone, e.g. vector<int>: treated as a
 * typedef name in C++ mode. *)
732 | cplusplus_ident
733 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
734 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>')
735
736 {
737 let info = tokinfo lexbuf in
738 let s = tok lexbuf in
739 if !Flag.c_plus_plus
740 then TypedefIdent (s, info)
741 else
742 begin
743 pr2_once "<> detected, try -c++ option";
744 TIdent (s, info)
745 end
746 }
747
748
(* Fully qualified names A::B::...; in C++ mode, A::A is recognised as a
 * constructor name. *)
749 | (cplusplus_ident as first)
750 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
751 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?
752 "::" (cplusplus_ident as second)
753 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
754 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?
755 ("::" cplusplus_ident
756 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
757 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?) *
758
759 {
760 let info = tokinfo lexbuf in
761 let s = tok lexbuf in
762 if !Flag.c_plus_plus
763 then
764 begin
765 if first = second
766 then Tconstructorname (s, info)
767 else TIdent (s, info)
768 end
769 else
770 begin
771 pr2_once "~ and :: not allowed in C identifiers, try -c++ option";
772 TIdent (s, info)
773 end
774 }
775
(* Names qualified from the global namespace: ::A::B... *)
776 | "::" cplusplus_ident
777 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
778 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?
779 ("::" cplusplus_ident
780 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
781 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?) *
782
783 {
784 let info = tokinfo lexbuf in
785 let s = tok lexbuf in
786 (if not !Flag.c_plus_plus
787 then
788 pr2_once "~ and :: not allowed in C identifiers, try -c++ option");
789 TIdent (s, info)
790 }
791
792 (* ----------------------------------------------------------------------- *)
793 (* C constant *)
794 (* ----------------------------------------------------------------------- *)
795
(* Character and string literals: the opening quote is matched here, the
 * body by the [char]/[string] sub-lexers, and the full text is glued
 * back onto the token info with tok_add_s. *)
796 | "'"
797 { let info = tokinfo lexbuf in
798 let s = char lexbuf in
799 TChar ((s, IsChar), (info +> tok_add_s (s ^ "'")))
800 }
801 | '\"'
802 { let info = tokinfo lexbuf in
803 let s = string lexbuf in
804 TString ((s, IsChar), (info +> tok_add_s (s ^ "\"")))
805 }
806 (* wide character encoding, TODO L'toto' valid ? what is allowed ? *)
807 | 'L' "'"
808 { let info = tokinfo lexbuf in
809 let s = char lexbuf in
810 TChar ((s, IsWchar), (info +> tok_add_s (s ^ "'")))
811 }
812 | 'L' '\"'
813 { let info = tokinfo lexbuf in
814 let s = string lexbuf in
815 TString ((s, IsWchar), (info +> tok_add_s (s ^ "\"")))
816 }
817
818
819 (* Take care of the order ? No because lex tries the longest match. The
820 * strange diff between decimal and octal constant semantic is not
821 * understood too by refman :) refman:11.1.4, and ritchie.
822 *)
823
(* Integer literals; the is_long_* helpers pick the (signedness, size)
 * according to the suffix and the configured thresholds. *)
824 | decimal as x
825 { TInt ((x, is_long_dec x sint slong slong ulong), tokinfo lexbuf) }
826 | hexa as x
827 { TInt ((x, is_long_hex x sint uint slong ulong), tokinfo lexbuf) }
828 | octal as x
829 { TInt ((x, is_long_oct x sint uint slong ulong), tokinfo lexbuf) }
830 | ((decimal as s) ['u' 'U']) as x
831 { TInt ((x, is_long_dec s uint uint ulong ulong), tokinfo lexbuf) }
832 | ((hexa as s) ['u' 'U']) as x
833 { TInt ((x, is_long_hex s uint uint ulong ulong), tokinfo lexbuf) }
834 | ((octal as s) ['u' 'U']) as x
835 { TInt ((x, is_long_oct s uint uint ulong ulong), tokinfo lexbuf) }
836 | (( decimal as s) ['l' 'L']) as x
837 { TInt ((x, is_long_dec s slong slong slong ulong), tokinfo lexbuf) }
838 | ((hexa as s) ['l' 'L']) as x
839 { TInt ((x, is_long_hex s slong slong slong ulong), tokinfo lexbuf) }
840 | ((octal as s) ['l' 'L']) as x
841 { TInt ((x, is_long_oct s slong slong slong ulong), tokinfo lexbuf) }
842 | ((( decimal | hexa | octal) ['l' 'L'] ['u' 'U'])
843 | (( decimal | hexa | octal) ['u' 'U'] ['l' 'L'])) as x
844 { TInt ((x, (UnSigned,CLong)), tokinfo lexbuf) }
845 | (( decimal | hexa | octal) ['l' 'L'] ['l' 'L']) as x
846 { TInt ((x, (Signed,CLongLong)), tokinfo lexbuf) }
847 | (( decimal | hexa | octal) ['u' 'U'] ['l' 'L'] ['l' 'L']) as x
848 { TInt ((x, (UnSigned,CLongLong)), tokinfo lexbuf) }
849
850 | (real ['f' 'F']) as x { TFloat ((x, CFloat), tokinfo lexbuf) }
851 | (real ['l' 'L']) as x { TFloat ((x, CLongDouble), tokinfo lexbuf) }
852 | (real as x) { TFloat ((x, CDouble), tokinfo lexbuf) }
853
(* Malformed literals: digits beyond the radix. *)
854 | ['0'] ['0'-'9']+
855 { pr2 ("LEXER: " ^ error_radix "octal" ^ tok lexbuf);
856 TUnknown (tokinfo lexbuf)
857 }
858 | ("0x" |"0X") ['0'-'9' 'a'-'z' 'A'-'Z']+
859 { pr2 ("LEXER: " ^ error_radix "hexa" ^ tok lexbuf);
860 TUnknown (tokinfo lexbuf)
861 }
862
863
864 (* !!! to put after other rules !!! otherwise 0xff
865 * will be parsed as an ident.
866 *)
867 | ['0'-'9']+ letter (letter | digit) *
868 { pr2 ("LEXER: ZARB integer_string, certainly a macro:" ^ tok lexbuf);
869 TIdent (tok lexbuf, tokinfo lexbuf)
870 }
871
872 (* gccext: http://gcc.gnu.org/onlinedocs/gcc/Binary-constants.html *)
873 (*
874 | "0b" ['0'-'1'] { TInt (((tok lexbuf)<!!>(??,??)) +> int_of_stringbits) }
875 | ['0'-'1']+'b' { TInt (((tok lexbuf)<!!>(0,-2)) +> int_of_stringbits) }
876 *)
877
878
879 (*------------------------------------------------------------------------ *)
880 | eof { EOF (tokinfo lexbuf +> Ast_c.rewrap_str "") }
881
(* Catch-all: any other character becomes TUnknown. *)
882 | _
883 {
884 if !Flag_parsing_c.verbose_lexing
885 then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
886 TUnknown (tokinfo lexbuf)
887 }
888
889
890
891 (*****************************************************************************)
(* Lexes the body of a character literal after the opening quote,
 * returning its text (without the quotes); delegates to [restchars]
 * until the closing quote.  Escape handling: octal and hex escapes are
 * kept verbatim; other backslash escapes are checked against the known
 * set and merely warned about when unknown. *)
892 and char = parse
893 | (_ as x) { String.make 1 x ^ restchars lexbuf }
894 (* todo?: as for octal, do exception beyond radix exception ? *)
895 | (("\\" (oct | oct oct | oct oct oct)) as x ) { x ^ restchars lexbuf }
896 (* this rule must be after the one with octal, lex try first longest
897 * and when \7 we want an octal, not an exn.
898 *)
899 | (("\\x" ((hex | hex hex))) as x ) { x ^ restchars lexbuf }
900 | (("\\" (_ as v)) as x )
901 {
902 (match v with (* Machine specific ? *)
903 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
904 | 'f' -> () | 'a' -> ()
905 | '\\' -> () | '?' -> () | '\'' -> () | '\"' -> ()
906 | 'e' -> () (* linuxext: ? *)
907 | _ ->
908 pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
909 );
910 x ^ restchars lexbuf
911 }
912 | _
913 { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
914 tok lexbuf ^ restchars lexbuf
915 }
916
(* Continues lexing a character literal after [char] has read its first
 * element; stops at (and consumes) the closing single quote, returning
 * the accumulated text of the literal's body. *)
and restchars = parse
  (* closing quote: the literal is finished *)
  | "'"                                         { "" }
  | (_ as x)                                    { String.make 1 x ^ restchars lexbuf }
  (* todo?: as for octal, do exception beyond radix exception ? *)
  | (("\\" (oct | oct oct | oct oct oct)) as x ) { x ^ restchars lexbuf }
  (* this rule must be after the one with octal, lex try first longest
   * and when \7 we want an octal, not an exn.
   *)
  | (("\\x" ((hex | hex hex))) as x ) { x ^ restchars lexbuf }
  | (("\\" (_ as v)) as x )
      {
        (* the standard C escapes are accepted silently; any other escaped
         * character is kept verbatim but reported (Machine specific ?) *)
        (match v with
        | 'n' | 't' | 'v' | 'b' | 'r' | 'f' | 'a'
        | '\\' | '?' | '\'' | '\"' -> ()
        | 'e' -> () (* linuxext: ? *)
        | _ ->
            pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
        );
        x ^ restchars lexbuf
      }
  | _
      { (* bugfix: bind the lexeme BEFORE the recursive call, otherwise
         * [tok lexbuf ^ restchars lexbuf] may evaluate the recursive call
         * first (operand order of (^) is unspecified) and [tok] would then
         * return a stale lexeme -- see the warning in the file header. *)
        let s = tok lexbuf in
        pr2 ("LEXER: unrecognised symbol in char:"^s);
        s ^ restchars lexbuf
      }
942
943
944 (*****************************************************************************)
945
(* todo? factorise code with char ? but not same ending token so hard. *)
(* Lexes the body of a string literal, returning its text; the opening
 * double quote has already been consumed by the caller, and the closing
 * one terminates the rule. *)
and string = parse
  | '\"' { "" }
  | (_ as x) { string_of_char x^string lexbuf}
  | ("\\" (oct | oct oct | oct oct oct)) as x { x ^ string lexbuf }
  | ("\\x" (hex | hex hex)) as x { x ^ string lexbuf }
  | ("\\" (_ as c)) as x
      {
        (* the standard C escapes are accepted silently; anything else is
         * reported but the escaped text is kept verbatim (Machine
         * specific ?) *)
        (match c with
        | 'n' | 't' | 'v' | 'b' | 'r' | 'f' | 'a'
        | '\\' | '?' | '\'' | '\"' -> ()
        | 'e' -> () (* linuxext: ? *)
        (* old: "x" -> 10 gccext ? todo ugly, I put a fake value *)
        (* cppext: can have \ for multiline in string too *)
        | '\n' -> ()
        | _ -> pr2 ("LEXER: unrecognised symbol in string:"^tok lexbuf);
        );
        x ^ string lexbuf
      }

  | eof { pr2 "LEXER: WIERD end of file in string"; ""}

 (* Bug if add following code, cos match also the '"' that is needed
  * to finish the string, and so go until end of file.
  *)
 (*
  | [^ '\\']+
    { let cs = lexbuf +> tok +> list_of_string +> List.map Char.code in
      cs ++ string lexbuf
    }
  *)
980
981
982
983 (*****************************************************************************)
984
985 (* less: allow only char-'*' ? *)
(* Consumes the rest of a C comment (the opening "/*" was already eaten by
 * the caller) and returns the consumed text including the closing "* /". *)
(* less: allow only char-'*' ? *)
and comment = parse
  | "*/" { tok lexbuf }
  (* noteopti: gobble either a run of non-star characters, or a lone star
   * that does not begin the closing "* /", in a single step.  Longest
   * match guarantees "* /" above still wins at the comment's end. *)
  | ([^ '*']+ | '*')
      { let consumed = tok lexbuf in consumed ^ comment lexbuf }
  | eof { pr2 "LEXER: end of file in comment"; "*/"}
  | _
      { let consumed = tok lexbuf in
        pr2 ("LEXER: unrecognised symbol in comment:"^consumed);
        consumed ^ comment lexbuf
      }
997
998
999
1000 (*****************************************************************************)
1001
1002 (* cpp recognize C comments, so when #define xx (yy) /* comment \n ... */
1003 * then he has already erased the /* comment. So:
1004 * - dont eat the start of the comment otherwise afterwards we are in the middle
1005 * of a comment and so will problably get a parse error somewhere.
1006 * - have to recognize comments in cpp_eat_until_nl.
1007 *)
1008
(* Consumes the remainder of a cpp directive line and returns the consumed
 * text.  Handles the two ways a directive can extend beyond plain text:
 * embedded C comments and backslash line-continuations. *)
and cpp_eat_until_nl = parse
  (* bugfix: a comment inside a directive must be skipped as a unit (via
   * the [comment] rule); otherwise a newline inside the comment would
   * wrongly be taken as the end of the directive *)
  | "/*"
      { let s = tok lexbuf in
        let s2 = comment lexbuf in
        let s3 = cpp_eat_until_nl lexbuf in
        s ^ s2 ^ s3
      }
  (* backslash-newline: the directive continues on the next line *)
  | '\\' "\n" { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }

  (* an unescaped newline ends the directive (and is included in the
   * returned text) *)
  | "\n" { tok lexbuf }
  (* noteopti:
   * update: need also deal with comments chars now
   *)
  (* a run of ordinary characters; '\\', '/', '*', '\r' are excluded so the
   * continuation and comment rules above get a chance to fire *)
  | [^ '\n' '\r' '\\' '/' '*' ]+
     { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
  | eof { pr2 "LEXER: end of file in cpp_eat_until_nl"; ""}
  (* any leftover single character ('/', '*', '\r', or a '\\' not followed
   * by a newline) is consumed as-is *)
  | _ { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }