parsing_c/lexer_c.mll

   1 {
   2 (* Copyright (C) 2002, 2006, 2007, 2008 Yoann Padioleau
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public License (GPL)
   6  * version 2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11  * file license.txt for more details.
  12  *)
  13 open Common
  14
  15 open Parser_c
  16
  17 open Ast_c (* to factorise tokens, OpAssign, ... *)
  18
  19 (*****************************************************************************)
  20 (*
  21  * subtil: ocamllex use side effect on lexbuf, so must take care.
  22  * For instance must do
  23  *
  24  *  let info = tokinfo lexbuf in
  25  *  TComment (info +> tok_add_s (comment lexbuf))
  26  *
  27  * and not
  28  *
  29  *   TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
  30  *
  31  * because of the "wierd" order of evaluation of OCaml.
  32  *
  33  * note: can't use Lexer_parser._lexer_hint here to do different
  34  * things, because now we call the lexer to get all the tokens
  35  * (tokens_all), and then we parse. So we can't have the _lexer_hint
  36  * info here. We can have it only in parse_c. For the same reason, the
  37  * typedef handling here is now useless.
  38  *)
  39 (*****************************************************************************)
  40
  41 (*****************************************************************************)
  42 (* Wrappers *)
  43 (*****************************************************************************)
  44 let pr2 s =
  45   if !Flag_parsing_c.verbose_lexing
  46   then Common.pr2 s
  47
  48 (*****************************************************************************)
  49
  50
  51 exception Lexical of string
  52
  53 let tok     lexbuf  = Lexing.lexeme lexbuf
  54
  55 let tokinfo lexbuf  =
  56   {
  57     pinfo = Ast_c.OriginTok {
  58       Common.charpos = Lexing.lexeme_start lexbuf;
  59       Common.str     = Lexing.lexeme lexbuf;
  60       (* info filled in a post-lexing phase *)
  61       Common.line = -1;
  62       Common.column = -1;
  63       Common.file = "";
  64     };
  65    (* must generate a new ref each time, otherwise share *)
  66     cocci_tag = ref Ast_c.emptyAnnot;
  67     comments_tag = ref Ast_c.emptyComments;
  68   }
  69
  70 (* must generate a new ref each time, otherwise share *)
  71 let no_ifdef_mark () = ref (None: (int * int) option)
  72
  73 let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
  74
  75
  76 (* opti: less convenient, but using a hash is faster than using a match *)
  77 let keyword_table = Common.hash_of_list [
  78
  79   (* c: *)
  80   "void",   (fun ii -> Tvoid ii);
  81   "char",   (fun ii -> Tchar ii);
  82   "short",  (fun ii -> Tshort ii);
  83   "int",    (fun ii -> Tint ii);
  84   "long",   (fun ii -> Tlong ii);
  85   "float",  (fun ii -> Tfloat ii);
  86   "double", (fun ii -> Tdouble ii);
  87
  88   "unsigned", (fun ii -> Tunsigned ii);
  89   "signed",   (fun ii -> Tsigned ii);
  90
  91   "auto",     (fun ii -> Tauto ii);
  92   "register", (fun ii -> Tregister ii);
  93   "extern",   (fun ii -> Textern ii);
  94   "static",   (fun ii -> Tstatic ii);
  95
  96   "const",    (fun ii -> Tconst ii);
  97   "volatile", (fun ii -> Tvolatile ii);
  98
  99   "struct",  (fun ii -> Tstruct ii);
 100   "union",   (fun ii -> Tunion ii);
 101   "enum",    (fun ii -> Tenum ii);
 102   "typedef", (fun ii -> Ttypedef ii);
 103
 104   "if",      (fun ii -> Tif ii);
 105   "else",     (fun ii -> Telse ii);
 106   "break",   (fun ii -> Tbreak ii);
 107   "continue", (fun ii -> Tcontinue ii);
 108   "switch",  (fun ii -> Tswitch ii);
 109   "case",     (fun ii -> Tcase ii);
 110   "default", (fun ii -> Tdefault ii);
 111   "for",     (fun ii -> Tfor ii);
 112   "do",      (fun ii -> Tdo ii);
 113   "while",   (fun ii -> Twhile ii);
 114   "return",  (fun ii -> Treturn ii);
 115   "goto",    (fun ii -> Tgoto ii);
 116
 117   "sizeof", (fun ii -> Tsizeof ii);
 118
 119
 120   (* gccext: cppext: linuxext: synonyms *)
 121   "asm",     (fun ii -> Tasm ii);
 122   "__asm__", (fun ii -> Tasm ii);
 123   "__asm",   (fun ii -> Tasm ii);
 124
 125   "inline",     (fun ii -> Tinline ii);
 126   "__inline__", (fun ii -> Tinline ii);
 127   "__inline",   (fun ii -> Tinline ii);
 128
 129   "__attribute__", (fun ii -> Tattribute ii);
 130   "__attribute", (fun ii -> Tattribute ii);
 131
 132   "typeof", (fun ii -> Ttypeof ii);
 133   "__typeof__", (fun ii -> Ttypeof ii);
 134   "__typeof", (fun ii -> Ttypeof ii);
 135
 136
 137   (* gccext: alias *)
 138   "__signed__",     (fun ii -> Tsigned ii);
 139
 140   "__const__",     (fun ii -> Tconst ii);
 141   "__const",     (fun ii -> Tconst ii);
 142
 143   "__volatile__",  (fun ii -> Tvolatile ii);
 144   "__volatile",    (fun ii -> Tvolatile ii);
 145
 146
 147   (* c99:  *)
 148   (* no just "restrict" ? maybe for backward compatibility they avoided
 149    * to use restrict which people may have used in their program already
 150    *)
 151   "__restrict",    (fun ii -> Trestrict ii);
 152   "__restrict__",    (fun ii -> Trestrict ii);
 153
 154  ]
 155
 156 let error_radix s =
 157   ("numeric " ^ s ^ " constant contains digits beyond the radix:")
 158
 159 }
 160
 161 (*****************************************************************************)
 162 let letter = ['A'-'Z' 'a'-'z' '_']
 163 let digit  = ['0'-'9']
 164
 165 (* not used for the moment *)
 166 let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
 167                    ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
 168 let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
 169 let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
 170 (* 7 = \a = bell in C. this is not the only char allowed !!
 171  * ex @ and $ ` are valid too
 172  *)
 173
 174 let cchar = (letter | digit | punctuation | additionnal)
 175
 176 let sp = [' ' '\t']+
 177 let spopt = [' ' '\t']*
 178
 179 let dec = ['0'-'9']
 180 let oct = ['0'-'7']
 181 let hex = ['0'-'9' 'a'-'f' 'A'-'F']
 182
 183 let decimal = ('0' | (['1'-'9'] dec*))
 184 let octal   = ['0']        oct+
 185 let hexa    = ("0x" |"0X") hex+
 186
 187
 188 let pent   = dec+
 189 let pfract = dec+
 190 let sign = ['-' '+']
 191 let exp  = ['e''E'] sign? dec+
 192 let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
 193
 194 let id = letter (letter | digit) *
 195
 196 (*****************************************************************************)
 197 rule token = parse
 198
 199   (* ----------------------------------------------------------------------- *)
 200   (* spacing/comments *)
 201   (* ----------------------------------------------------------------------- *)
 202
 203   (* note: this lexer generate tokens for comments!! so can not give
 204    * this lexer as-is to the parsing function. Must preprocess it, hence
 205    * use techniques like cur_tok ref in parse_c.ml
 206    *)
 207
 208   | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
 209       (* starting a new line; the newline character followed by whitespace *)
 210       { TCommentNewline (tokinfo lexbuf) }
 211   | [' ' '\t' '\r' '\011' '\012' ]+
 212       { TCommentSpace (tokinfo lexbuf) }
 213   | "/*"
 214       { let info = tokinfo lexbuf in
 215         let com = comment lexbuf in
 216         TComment(info +> tok_add_s com)
 217       }
 218
 219
 220   (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
 221    * So need this here only when dont call cpp before.
 222    * note that we don't keep the trailing \n; it will be in another token.
 223    *)
 224   | "//" [^'\r' '\n' '\011']*    { TComment (tokinfo lexbuf) }
 225
 226   (* ----------------------------------------------------------------------- *)
 227   (* cpp *)
 228   (* ----------------------------------------------------------------------- *)
 229
 230   (* old:
 231    *   | '#'            { endline lexbuf} // should be line, and not endline
 232    *   and endline = parse  | '\n'      { token lexbuf}
 233    *                        |   _       { endline lexbuf}
 234    *)
 235
 236   (* todo?:
 237    *  have found a # #else  in "newfile-2.6.c",  legal ?   and also a  #/* ...
 238    *    => just "#" -> token {lexbuf} (that is ignore)
 239    *  il y'a 1 #elif  sans rien  apres
 240    *  il y'a 1 #error sans rien  apres
 241    *  il y'a 2  mov dede, #xxx    qui genere du coup exn car
 242    *  entouré par des #if 0
 243    *  => make as for comment, call a comment_cpp that when #endif finish the
 244    *   comment and if other cpp stuff raise exn
 245    *  il y'a environ 10  #if(xxx)  ou le ( est collé direct
 246    *  il y'a des include"" et include<
 247    *  il y'a 1 ` (derriere un #ifndef linux)
 248    *)
 249
 250
 251
 252   (* ---------------------- *)
 253   (* misc *)
 254   (* ---------------------- *)
 255
 256   (* bugfix: I want now to keep comments for the cComment study
 257    * so cant do:    sp [^'\n']+ '\n'
 258    * http://gcc.gnu.org/onlinedocs/gcc/Pragmas.html
 259    *)
 260
 261   | "#" spopt "pragma"  sp [^'\n']*  '\n'
 262   | "#" spopt "ident"   sp  [^'\n']* '\n'
 263   | "#" spopt "line"    sp  [^'\n']* '\n'
 264   | "#" spopt "error"   sp  [^'\n']* '\n'
 265   | "#" spopt "warning" sp  [^'\n']* '\n'
 266   | "#" spopt "abort"   sp  [^'\n']* '\n'
 267       { TCppDirectiveOther (tokinfo lexbuf) }
 268
 269   | "#" [' ' '\t']* '\n'
 270       { TCppDirectiveOther (tokinfo lexbuf) }
 271
 272   (* only after cpp, ex: # 1 "include/linux/module.h" 1 *)
 273   | "#" sp pent sp  '"' [^ '"']* '"' (spopt pent)*  spopt '\n'
 274       { TCppDirectiveOther (tokinfo lexbuf) }
 275
 276
 277
 278   (* ---------------------- *)
 279   (* #define, #undef *)
 280   (* ---------------------- *)
 281
 282   (* the rest of the lexing/parsing of define is done in fix_tokens_define
 283    * where we parse until a TCppEscapedNewline and generate a TDefEol
 284    *)
 285   | "#" [' ' '\t']* "define" { TDefine (tokinfo lexbuf) }
 286
 287   (* note: in some cases can have stuff after the ident as in #undef XXX 50,
 288    * but I currently don't handle it cos I think it's bad code.
 289    *)
 290   | (("#" [' ' '\t']* "undef" [' ' '\t']+) as _undef) (id as id)
 291       { let info = tokinfo lexbuf in
 292         TUndef (id, info)
 293         (*+> tok_add_s (cpp_eat_until_nl lexbuf))*)
 294       }
 295
 296
 297   (* ---------------------- *)
 298   (* #include *)
 299   (* ---------------------- *)
 300
 301   (* The difference between a local "" and standard <> include is computed
 302    * later in parser_c.mly. So redo a little bit of lexing there; ugly but
 303    * simpler to generate a single token here.  *)
 304   | (("#" [' ''\t']* "include" [' ' '\t']*) as includes)
 305     (('"' ([^ '"']+) '"' |
 306      '<' [^ '>']+ '>' |
 307       ['A'-'Z''_']+
 308     ) as filename)
 309       { let info = tokinfo lexbuf in
 310         TInclude (includes, filename, Ast_c.noInIfdef(), info)
 311       }
 312   (* gccext: found in glibc *)
 313   | (("#" [' ''\t']* "include_next" [' ' '\t']*) as includes)
 314     (('"' ([^ '"']+) '"' |
 315      '<' [^ '>']+ '>' |
 316       ['A'-'Z''_']+
 317     ) as filename)
 318       { let info = tokinfo lexbuf in
 319         TInclude (includes, filename, Ast_c.noInIfdef(), info)
 320       }
 321
 322   (* ---------------------- *)
 323   (* #ifdef *)
 324   (* ---------------------- *)
 325
 326   (* The ifdef_mark will be set later in Parsing_hacks.set_ifdef_parenthize_info
 327    * when working on the ifdef view.
 328    *)
 329
 330   (* '0'+ because sometimes it is a #if 000 *)
 331   | "#" [' ' '\t']* "if" [' ' '\t']* '0'+           (* [^'\n']*  '\n' *)
 332       { let info = tokinfo lexbuf in
 333         TIfdefBool (false, no_ifdef_mark(), info)
 334           (* +> tok_add_s (cpp_eat_until_nl lexbuf)*)
 335       }
 336
 337   | "#" [' ' '\t']* "if" [' ' '\t']* '1'   (* [^'\n']*  '\n' *)
 338       { let info = tokinfo lexbuf in
 339         TIfdefBool (true, no_ifdef_mark(), info)
 340
 341       }
 342
 343  (* DO NOT cherry pick to lexer_cplusplus !!! often used for the extern "C" { *)
 344   | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n']* '\n'
 345       { let info = tokinfo lexbuf in
 346         TIfdefMisc (false, no_ifdef_mark(), info)
 347       }
 348
 349  (* DO NOT cherry pick to lexer_cplusplus !!! *)
 350   | "#" [' ' '\t']* "ifdef" [' ' '\t']* "__cplusplus"   [^'\n']*  '\n'
 351       { let info = tokinfo lexbuf in
 352         TIfdefMisc (false, no_ifdef_mark(), info)
 353       }
 354
 355   (* in glibc *)
 356   | "#" spopt ("ifdef"|"if") sp "__STDC__"
 357       { let info = tokinfo lexbuf in
 358         TIfdefVersion (true, no_ifdef_mark(),
 359                       info +> tok_add_s (cpp_eat_until_nl lexbuf))
 360       }
 361
 362
 363   (* linuxext: different possible variations (we do not manage all of them):
 364
 365     #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
 366     #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,2)
 367     #if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
 368     #if LINUX_VERSION_CODE > KERNEL_VERSION(2,3,0)
 369     #if LINUX_VERSION_CODE < 0x020600
 370     #if LINUX_VERSION_CODE >= 0x2051c
 371     #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 372     #if !(LINUX_VERSION_CODE > KERNEL_VERSION(2,5,73))
 373     #if STREAMER_IOCTL && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 374     #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20)  &&  LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
 375     #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && \
 376     # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
 377     #if LINUX_VERSION_CODE > LinuxVersionCode(2,3,12)
 378     #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,93)
 379     #ifndef LINUX_VERSION_CODE
 380     #if LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,2,0) || \
 381     (LINUX_VERSION_CODE > ASC_LINUX_VERSION(2,3,0) && \
 382     LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,4,0))
 383     #if (KERNEL_VERSION(2,4,0) > LINUX_VERSION_CODE)
 384     #if LINUX_VERSION_CODE >= ASC_LINUX_VERSION(1,3,0)
 385     # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
 386
 387   *)
 388
 389   (* linuxext: must be before the generic rules for if and ifdef *)
 390   | "#" spopt "if" sp "("?  "LINUX_VERSION_CODE" sp (">=" | ">") sp
 391       { let info = tokinfo lexbuf in
 392         TIfdefVersion (true, no_ifdef_mark(),
 393                       info +> tok_add_s (cpp_eat_until_nl lexbuf))
 394       }
 395   (* linuxext: *)
 396   | "#" spopt "if" sp "!" "("?  "LINUX_VERSION_CODE" sp (">=" | ">") sp
 397   | "#" spopt "if" sp ['(']?  "LINUX_VERSION_CODE" sp ("<=" | "<") sp
 398
 399       { let info = tokinfo lexbuf in
 400         TIfdefVersion (false, no_ifdef_mark(),
 401                       info +> tok_add_s (cpp_eat_until_nl lexbuf))
 402       }
 403
 404
 405
 406
 407   (* can have some ifdef 0  hence the letter|digit even at beginning of word *)
 408   | "#" [' ''\t']* "ifdef"  [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
 409       { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
 410   | "#" [' ''\t']* "ifndef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
 411       { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
 412   | "#" [' ''\t']* "if" [' ' '\t']+
 413       { let info = tokinfo lexbuf in
 414         TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
 415       }
 416   | "#" [' ' '\t']* "if" '('
 417       { let info = tokinfo lexbuf in
 418         TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
 419       }
 420
 421   | "#" [' ' '\t']* "elif" [' ' '\t']+
 422       { let info = tokinfo lexbuf in
 423         TIfdefelif (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
 424       }
 425
 426
 427   (* bugfix: can have #endif LINUX  but at the same time if I eat everything
 428    * until next line, I may miss some TComment which for some tools
 429    * are important such as aComment
 430    *)
 431   | "#" [' ' '\t']* "endif" (*[^'\n']* '\n'*) {
 432       TEndif     (no_ifdef_mark(), tokinfo lexbuf)
 433     }
 434   (* can be at eof *)
 435   (*| "#" [' ' '\t']* "endif"                { TEndif     (tokinfo lexbuf) }*)
 436
 437   | "#" [' ' '\t']* "else" [' ' '\t' '\n']
 438       { TIfdefelse (no_ifdef_mark(), tokinfo lexbuf) }
 439
 440
 441
 442
 443   (* ---------------------- *)
 444   (* #define body *)
 445   (* ---------------------- *)
 446
 447   (* only in cpp directives normally *)
 448   | "\\" '\n' { TCppEscapedNewline (tokinfo lexbuf) }
 449
 450
 451   | ((id as s)  "...")
 452       { TDefParamVariadic (s, tokinfo lexbuf) }
 453
 454
 455   (* could generate separate token for #, ## and then extend grammar,
 456    * but there can be ident in many different places, in expression
 457    * but also in declaration, in function name. So having 3 tokens
 458    * for an ident does not work well with how we add info in
 459    * ast_c. So better to generate just one token, for now, just one info,
 460    * even if have later to reanalyse those tokens and unsplit.
 461    *
 462    * todo: our heuristics in parsing_hacks rely on TIdent. So maybe
 463    * an easier solution would be to augment the TIdent type such as
 464    *   TIdent of string * info * cpp_ident_additionnal_info
 465    *)
 466
 467
 468   (* cppext: string concatenation of idents *)
 469   |  id   ([' ''\t']* "##" [' ''\t']* id)+
 470       { let info = tokinfo lexbuf in
 471         TIdent (tok lexbuf, info)
 472       }
 473
 474   (* cppext: stringification.
 475    * bugfix: this case must be after the other cases such as #endif
 476    * otherwise take precedent.
 477    *)
 478   |  "#" spopt id
 479       { let info = tokinfo lexbuf in
 480         TIdent (tok lexbuf, info)
 481       }
 482
 483   (* cppext: gccext: ##args for variadic macro *)
 484   |  "##" spopt id
 485       { let info = tokinfo lexbuf in
 486         TIdent (tok lexbuf, info)
 487       }
 488
 489
 490
 491   (* ----------------------------------------------------------------------- *)
 492   (* C symbols *)
 493   (* ----------------------------------------------------------------------- *)
 494    (* stdC:
 495     ...   &&   -=   >=   ~   +   ;   ]
 496     <<=   &=   ->   >>   %   ,   <   ^
 497     >>=   *=   /=   ^=   &   -   =   {
 498     !=    ++   <<   |=   (   .   >   |
 499     %=    +=   <=   ||   )   /   ?   }
 500         --   ==   !    *   :   [
 501     recent addition:    <:  :>  <%  %>
 502     only at processing: %:  %:%: # ##
 503    *)
 504
 505
 506   | '[' { TOCro(tokinfo lexbuf) }   | ']' { TCCro(tokinfo lexbuf) }
 507   | '(' { TOPar(tokinfo lexbuf)   } | ')' { TCPar(tokinfo lexbuf)   }
 508   | '{' { TOBrace(tokinfo lexbuf) } | '}' { TCBrace(tokinfo lexbuf) }
 509
 510   | '+' { TPlus(tokinfo lexbuf) }   | '*' { TMul(tokinfo lexbuf) }
 511   | '-' { TMinus(tokinfo lexbuf) }  | '/' { TDiv(tokinfo lexbuf) }
 512   | '%' { TMod(tokinfo lexbuf) }
 513
 514   | "++"{ TInc(tokinfo lexbuf) }    | "--"{ TDec(tokinfo lexbuf) }
 515
 516   | "="  { TEq(tokinfo lexbuf) }
 517
 518   | "-=" { TAssign (OpAssign Minus, (tokinfo lexbuf))}
 519   | "+=" { TAssign (OpAssign Plus, (tokinfo lexbuf))}
 520   | "*=" { TAssign (OpAssign Mul, (tokinfo lexbuf))}
 521   | "/=" { TAssign (OpAssign Div, (tokinfo lexbuf))}
 522   | "%=" { TAssign (OpAssign Mod, (tokinfo lexbuf))}
 523   | "&=" { TAssign (OpAssign And, (tokinfo lexbuf))}
 524   | "|=" { TAssign (OpAssign Or, (tokinfo lexbuf)) }
 525   | "^=" { TAssign(OpAssign Xor, (tokinfo lexbuf))}
 526   | "<<=" {TAssign (OpAssign DecLeft, (tokinfo lexbuf)) }
 527   | ">>=" {TAssign (OpAssign DecRight, (tokinfo lexbuf))}
 528
 529   | "==" { TEqEq(tokinfo lexbuf) }  | "!=" { TNotEq(tokinfo lexbuf) }
 530   | ">=" { TSupEq(tokinfo lexbuf) } | "<=" { TInfEq(tokinfo lexbuf) }
 531   | "<"  { TInf(tokinfo lexbuf) }   | ">"  { TSup(tokinfo lexbuf) }
 532
 533   | "&&" { TAndLog(tokinfo lexbuf) } | "||" { TOrLog(tokinfo lexbuf) }
 534   | ">>" { TShr(tokinfo lexbuf) }    | "<<" { TShl(tokinfo lexbuf) }
 535   | "&"  { TAnd(tokinfo lexbuf) }    | "|" { TOr(tokinfo lexbuf) }
 536   | "^"  { TXor(tokinfo lexbuf) }
 537   | "..." { TEllipsis(tokinfo lexbuf) }
 538   | "->"   { TPtrOp(tokinfo lexbuf) }  | '.'  { TDot(tokinfo lexbuf) }
 539   | ','    { TComma(tokinfo lexbuf) }
 540   | ";"    { TPtVirg(tokinfo lexbuf) }
 541   | "?"    { TWhy(tokinfo lexbuf) }    | ":"   { TDotDot(tokinfo lexbuf) }
 542   | "!"    { TBang(tokinfo lexbuf) }   | "~"   { TTilde(tokinfo lexbuf) }
 543
 544   | "<:" { TOCro(tokinfo lexbuf) } | ":>" { TCCro(tokinfo lexbuf) }
 545   | "<%" { TOBrace(tokinfo lexbuf) } | "%>" { TCBrace(tokinfo lexbuf) }
 546
 547
 548
 549   (* ----------------------------------------------------------------------- *)
 550   (* C keywords and ident *)
 551   (* ----------------------------------------------------------------------- *)
 552
 553   (* StdC: must handle at least name of length > 509, but can
 554    * truncate to 31 when compare and truncate to 6 and even lowerise
 555    * in the external linkage phase
 556    *)
 557   | letter (letter | digit) *
 558       { let info = tokinfo lexbuf in
 559         let s = tok lexbuf in
 560         Common.profile_code "C parsing.lex_ident" (fun () ->
 561           match Common.optionise (fun () -> Hashtbl.find keyword_table s)
 562           with
 563           | Some f -> f info
 564
 565            (* parse_typedef_fix.
 566             *    if Lexer_parser.is_typedef s
 567             *    then TypedefIdent (s, info)
 568             *    else TIdent (s, info)
 569             *
 570             * update: now this is no more useful, cos
 571             * as we use tokens_all, it first parse all as an ident and
 572             * later transform an indent in a typedef. so the typedef job is
 573             * now done in parse_c.ml.
 574             *)
 575
 576           | None -> TIdent (s, info)
 577         )
 578       }
 579   (* gccext: apparently gcc allows dollar in variable names. found such
 580    * thing a few time in linux and in glibc. No need look in keyword_table
 581    * here.
 582    *)
 583   | (letter | '$') (letter | digit | '$') *
 584       {
 585         let info = tokinfo lexbuf in
 586         let s = tok lexbuf in
 587         pr2 ("LEXER: identifier with dollar: "  ^ s);
 588         TIdent (s, info)
 589       }
 590
 591
 592   (* ----------------------------------------------------------------------- *)
 593   (* C constant *)
 594   (* ----------------------------------------------------------------------- *)
 595
 596   | "'"
 597       { let info = tokinfo lexbuf in
 598         let s = char lexbuf   in
 599         TChar     ((s,   IsChar),  (info +> tok_add_s (s ^ "'")))
 600       }
 601   | '"'
 602       { let info = tokinfo lexbuf in
 603         let s = string lexbuf in
 604         TString   ((s,   IsChar),  (info +> tok_add_s (s ^ "\"")))
 605       }
 606   (* wide character encoding, TODO L'toto' valid ? what is allowed ? *)
 607   | 'L' "'"
 608       { let info = tokinfo lexbuf in
 609         let s = char lexbuf   in
 610         TChar     ((s,   IsWchar),  (info +> tok_add_s (s ^ "'")))
 611       }
 612   | 'L' '"'
 613       { let info = tokinfo lexbuf in
 614         let s = string lexbuf in
 615         TString   ((s,   IsWchar),  (info +> tok_add_s (s ^ "\"")))
 616       }
 617
 618
 619   (* Take care of the order ? No because lex try the longest match. The
 620    * strange diff between decimal and octal constant semantic is not
 621    * understood too by refman :) refman:11.1.4, and ritchie.
 622    *)
 623
 624   | (( decimal | hexa | octal)
 625         ( ['u' 'U']
 626         | ['l' 'L']
 627         | (['l' 'L'] ['u' 'U'])
 628         | (['u' 'U'] ['l' 'L'])
 629         | (['u' 'U'] ['l' 'L'] ['l' 'L'])
 630         | (['l' 'L'] ['l' 'L'])
 631         )?
 632     ) as x { TInt (x, tokinfo lexbuf) }
 633
 634
 635   | (real ['f' 'F']) as x { TFloat ((x, CFloat),      tokinfo lexbuf) }
 636   | (real ['l' 'L']) as x { TFloat ((x, CLongDouble), tokinfo lexbuf) }
 637   | (real as x)           { TFloat ((x, CDouble),     tokinfo lexbuf) }
 638
 639   | ['0'] ['0'-'9']+
 640       { pr2 ("LEXER: " ^ error_radix "octal" ^ tok lexbuf);
 641         TUnknown (tokinfo lexbuf)
 642       }
 643   | ("0x" |"0X") ['0'-'9' 'a'-'z' 'A'-'Z']+
 644       { pr2 ("LEXER: " ^ error_radix "hexa" ^ tok lexbuf);
 645         TUnknown (tokinfo lexbuf)
 646       }
 647
 648
 649  (* !!! to put after other rules !!! otherwise 0xff
 650   * will be parsed as an ident.
 651   *)
 652   | ['0'-'9']+ letter (letter | digit) *
 653       { pr2 ("LEXER: ZARB integer_string, certainly a macro:" ^ tok lexbuf);
 654         TIdent (tok lexbuf, tokinfo lexbuf)
 655       }
 656
 657 (* gccext: http://gcc.gnu.org/onlinedocs/gcc/Binary-constants.html *)
 658 (*
 659  | "0b" ['0'-'1'] { TInt (((tok lexbuf)<!!>(??,??)) +> int_of_stringbits) }
 660  | ['0'-'1']+'b' { TInt (((tok lexbuf)<!!>(0,-2)) +> int_of_stringbits) }
 661 *)
 662
 663
 664   (*------------------------------------------------------------------------ *)
 665   | eof { EOF (tokinfo lexbuf +> Ast_c.rewrap_str "") }
 666
 667   | _
 668       {
 669         if !Flag_parsing_c.verbose_lexing
 670         then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
 671         TUnknown (tokinfo lexbuf)
 672       }
 673
 674
 675
 676 (*****************************************************************************)
 677 and char = parse
 678   | (_ as x)                                    "'"  { String.make 1 x }
 679   (* todo?: as for octal, do exception  beyond radix exception ? *)
 680   | (("\\" (oct | oct oct | oct oct oct)) as x  "'") { x }
 681   (* this rule must be after the one with octal, lex try first longest
 682    * and when \7  we want an octal, not an exn.
 683    *)
 684   | (("\\x" ((hex | hex hex))) as x        "'")      { x }
 685   | (("\\" (_ as v))           as x        "'")
 686         {
 687           (match v with (* Machine specific ? *)
 688           | 'n' -> ()  | 't' -> ()   | 'v' -> ()  | 'b' -> () | 'r' -> ()
 689           | 'f' -> () | 'a' -> ()
 690           | '\\' -> () | '?'  -> () | '\'' -> ()  | '"' -> ()
 691           | 'e' -> () (* linuxext: ? *)
 692           | _ ->
 693               pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
 694           );
 695           x
 696         }
 697   | _
 698       { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
 699         tok lexbuf
 700       }
 701
 702
 703
 704 (*****************************************************************************)
 705
 706 (* todo? factorise code with char ? but not same ending token so hard. *)
 707 and string  = parse
 708   | '"'                                       { "" }
 709   | (_ as x)                                  { string_of_char x^string lexbuf}
 710   | ("\\" (oct | oct oct | oct oct oct)) as x { x ^ string lexbuf }
 711   | ("\\x" (hex | hex hex)) as x              { x ^ string lexbuf }
 712   | ("\\" (_ as v)) as x
 713        {
 714          (match v with (* Machine specific ? *)
 715          | 'n' -> ()  | 't' -> ()   | 'v' -> ()  | 'b' -> () | 'r' -> ()
 716          | 'f' -> () | 'a' -> ()
 717          | '\\' -> () | '?'  -> () | '\'' -> ()  | '"' -> ()
 718          | 'e' -> () (* linuxext: ? *)
 719
 720          (* old: "x" -> 10 gccext ? todo ugly, I put a fake value *)
 721
 722          (* cppext:  can have   \ for multiline in string too *)
 723          | '\n' -> ()
 724          | _ -> pr2 ("LEXER: unrecognised symbol in string:"^tok lexbuf);
 725          );
 726           x ^ string lexbuf
 727        }
 728
 729   | eof { pr2 "LEXER: WIERD end of file in string"; ""}
 730
 731  (* Bug if add following code, cos match also the '"' that is needed
 732   * to finish the string, and so go until end of file.
 733   *)
 734  (*
 735   | [^ '\\']+
 736     { let cs = lexbuf +> tok +> list_of_string +> List.map Char.code in
 737       cs ++ string lexbuf
 738     }
 739   *)
 740
 741
 742
 743 (*****************************************************************************)
 744
 745 (* less: allow only char-'*' ? *)
 746 and comment = parse
 747   | "*/"     { tok lexbuf }
 748   (* noteopti: *)
 749   | [^ '*']+ { let s = tok lexbuf in s ^ comment lexbuf }
 750   | [ '*']   { let s = tok lexbuf in s ^ comment lexbuf }
 751   | _
 752       { let s = tok lexbuf in
 753         pr2 ("LEXER: unrecognised symbol in comment:"^s);
 754         s ^ comment lexbuf
 755       }
 756
 757
 758
 759 (*****************************************************************************)
 760
 761 (* cpp recognize C comments, so when #define xx (yy) /* comment \n ... */
 762  * then he has already erased the /* comment. So:
 763  * - dont eat the start of the comment otherwise afterwards we are in the middle
 764  *   of a comment and so will problably get a parse error somewhere.
 765  * - have to recognize comments in cpp_eat_until_nl.
 766  *)
 767
 768 and cpp_eat_until_nl = parse
 769   (* bugfix: *)
 770   | "/*"
 771       { let s = tok lexbuf in
 772         let s2 = comment lexbuf in
 773         let s3 = cpp_eat_until_nl lexbuf in
 774         s ^ s2 ^ s3
 775       }
 776   | '\\' "\n" { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
 777
 778   | "\n"      { tok lexbuf }
 779   (* noteopti:
 780    * update: need also deal with comments chars now
 781    *)
 782   | [^ '\n' '\\'      '/' '*'  ]+
 783      { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
 784   | eof { pr2 "LEXER: end of file in cpp_eat_until_nl"; ""}
 785   | _   { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }