parsing_c/lexer_c.mll

   1 {
   2 (* Yoann Padioleau
   3  *
   4  * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License (GPL)
   8  * version 2 as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * file license.txt for more details.
  14  *)
  15 open Common
  16
  17 open Parser_c
  18
  19 open Ast_c (* to factorise tokens, OpAssign, ... *)
  20
  21 (*****************************************************************************)
  22 (*
  23  * subtil: ocamllex use side effect on lexbuf, so must take care.
  24  * For instance must do
  25  *
  26  *  let info = tokinfo lexbuf in
  27  *  TComment (info +> tok_add_s (comment lexbuf))
  28  *
  29  * and not
  30  *
  31  *   TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
  32  *
  33  * because of the "wierd" order of evaluation of OCaml.
  34  *
  35  *
  36  *
  37  * note: can't use Lexer_parser._lexer_hint here to do different
  38  * things, because now we call the lexer to get all the tokens
  39  * (tokens_all), and then we parse. So we can't have the _lexer_hint
  40  * info here. We can have it only in parse_c. For the same reason, the
  41  * typedef handling here is now useless.
  42  *)
  43 (*****************************************************************************)
  44
  45 (*****************************************************************************)
  46 (* Wrappers *)
  47 (*****************************************************************************)
  48 let pr2 s =
  49   if !Flag_parsing_c.verbose_lexing
  50   then Common.pr2 s
  51
  52 (*****************************************************************************)
  53
  54
  55 exception Lexical of string
  56
  57 let tok     lexbuf  = Lexing.lexeme lexbuf
  58
  59 let tokinfo lexbuf  =
  60   {
  61     pinfo = Ast_c.OriginTok {
  62       Common.charpos = Lexing.lexeme_start lexbuf;
  63       Common.str     = Lexing.lexeme lexbuf;
  64       (* info filled in a post-lexing phase *)
  65       Common.line = -1;
  66       Common.column = -1;
  67       Common.file = "";
  68     };
  69    (* must generate a new ref each time, otherwise share *)
  70     cocci_tag = ref Ast_c.emptyAnnot;
  71     comments_tag = ref Ast_c.emptyComments;
  72   }
  73
  74 (* cppext: must generate a new ref each time, otherwise share *)
  75 let no_ifdef_mark () = ref (None: (int * int) option)
  76
  77 let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
  78
  79
  80 (* opti: less convenient, but using a hash is faster than using a match *)
  81 let keyword_table = Common.hash_of_list [
  82
  83   (* c: *)
  84   "void",   (fun ii -> Tvoid ii);
  85   "char",   (fun ii -> Tchar ii);
  86   "short",  (fun ii -> Tshort ii);
  87   "int",    (fun ii -> Tint ii);
  88   "long",   (fun ii -> Tlong ii);
  89   "float",  (fun ii -> Tfloat ii);
  90   "double", (fun ii -> Tdouble ii);
  91
  92   "unsigned", (fun ii -> Tunsigned ii);
  93   "signed",   (fun ii -> Tsigned ii);
  94
  95   "auto",     (fun ii -> Tauto ii);
  96   "register", (fun ii -> Tregister ii);
  97   "extern",   (fun ii -> Textern ii);
  98   "static",   (fun ii -> Tstatic ii);
  99
 100   "const",    (fun ii -> Tconst ii);
 101   "volatile", (fun ii -> Tvolatile ii);
 102
 103   "struct",  (fun ii -> Tstruct ii);
 104   "union",   (fun ii -> Tunion ii);
 105   "enum",    (fun ii -> Tenum ii);
 106   "typedef", (fun ii -> Ttypedef ii);
 107
 108   "if",      (fun ii -> Tif ii);
 109   "else",     (fun ii -> Telse ii);
 110   "break",   (fun ii -> Tbreak ii);
 111   "continue", (fun ii -> Tcontinue ii);
 112   "switch",  (fun ii -> Tswitch ii);
 113   "case",     (fun ii -> Tcase ii);
 114   "default", (fun ii -> Tdefault ii);
 115   "for",     (fun ii -> Tfor ii);
 116   "do",      (fun ii -> Tdo ii);
 117   "while",   (fun ii -> Twhile ii);
 118   "return",  (fun ii -> Treturn ii);
 119   "goto",    (fun ii -> Tgoto ii);
 120
 121   "sizeof", (fun ii -> Tsizeof ii);
 122
 123
 124   (* gccext: cppext: linuxext: synonyms *)
 125   "asm",     (fun ii -> Tasm ii);
 126   "__asm__", (fun ii -> Tasm ii);
 127   "__asm",   (fun ii -> Tasm ii);
 128
 129   "inline",     (fun ii -> Tinline ii);
 130   "__inline__", (fun ii -> Tinline ii);
 131   "__inline",   (fun ii -> Tinline ii);
 132
 133   "__attribute__", (fun ii -> Tattribute ii);
 134   "__attribute", (fun ii -> Tattribute ii);
 135
 136   "typeof", (fun ii -> Ttypeof ii);
 137   "__typeof__", (fun ii -> Ttypeof ii);
 138   "__typeof", (fun ii -> Ttypeof ii);
 139
 140
 141   (* gccext: alias *)
 142   "__signed__",     (fun ii -> Tsigned ii);
 143
 144   "__const__",     (fun ii -> Tconst ii);
 145   "__const",     (fun ii -> Tconst ii);
 146
 147   "__volatile__",  (fun ii -> Tvolatile ii);
 148   "__volatile",    (fun ii -> Tvolatile ii);
 149
 150
 151   (* c99:  *)
 152   (* no just "restrict" ? maybe for backward compatibility they avoided
 153    * to use restrict which people may have used in their program already
 154    *)
 155   "__restrict",    (fun ii -> Trestrict ii);
 156   "__restrict__",    (fun ii -> Trestrict ii);
 157
 158  ]
 159
 160 let error_radix s =
 161   ("numeric " ^ s ^ " constant contains digits beyond the radix:")
 162
 163 }
 164
 165 (*****************************************************************************)
 166 let letter = ['A'-'Z' 'a'-'z' '_']
 167 let digit  = ['0'-'9']
 168
 169 (* not used for the moment *)
 170 let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
 171                    ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
 172 let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
 173 let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
 174 (* 7 = \a = bell in C. this is not the only char allowed !!
 175  * ex @ and $ ` are valid too
 176  *)
 177
 178 let cchar = (letter | digit | punctuation | additionnal)
 179
 180 let sp = [' ' '\t']+
 181 let spopt = [' ' '\t']*
 182
 183 let dec = ['0'-'9']
 184 let oct = ['0'-'7']
 185 let hex = ['0'-'9' 'a'-'f' 'A'-'F']
 186
 187 let decimal = ('0' | (['1'-'9'] dec*))
 188 let octal   = ['0']        oct+
 189 let hexa    = ("0x" |"0X") hex+
 190
 191
 192 let pent   = dec+
 193 let pfract = dec+
 194 let sign = ['-' '+']
 195 let exp  = ['e''E'] sign? dec+
 196 let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
 197
 198 let id = letter (letter | digit) *
 199
 200 (*****************************************************************************)
 201 rule token = parse
 202
 203   (* ----------------------------------------------------------------------- *)
 204   (* spacing/comments *)
 205   (* ----------------------------------------------------------------------- *)
 206
 207   (* note: this lexer generate tokens for comments!! so can not give
 208    * this lexer as-is to the parsing function. Must preprocess it, hence
 209    * use techniques like cur_tok ref in parse_c.ml
 210    *)
 211
 212   | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
 213       (* starting a new line; the newline character followed by whitespace *)
 214       { TCommentNewline (tokinfo lexbuf) }
 215   | [' ' '\t' '\r' '\011' '\012' ]+
 216       { TCommentSpace (tokinfo lexbuf) }
 217   | "/*"
 218       { let info = tokinfo lexbuf in
 219         let com = comment lexbuf in
 220
 221         let info' = info +> tok_add_s com in
 222         let s = Ast_c.str_of_info info' in
 223         (* could be more flexible, use [\t ]* instead of hardcoded
 224          * single space. *)
 225         match s with
 226         | "/* {{coccinelle:skip_start}} */" ->
 227             TCommentSkipTagStart (info')
 228         | "/* {{coccinelle:skip_end}} */" ->
 229             TCommentSkipTagEnd (info')
 230         | _ -> TComment(info')
 231       }
 232
 233
 234   (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
 235    * So need this here only when dont call cpp before.
 236    * note that we don't keep the trailing \n; it will be in another token.
 237    *)
 238   | "//" [^'\r' '\n' '\011']*    { TComment (tokinfo lexbuf) }
 239
 240   (* ----------------------------------------------------------------------- *)
 241   (* cpp *)
 242   (* ----------------------------------------------------------------------- *)
 243
 244   (* old:
 245    *   | '#'            { endline lexbuf} // should be line, and not endline
 246    *   and endline = parse  | '\n'      { token lexbuf}
 247    *                        |   _       { endline lexbuf}
 248    *)
 249
 250   (* less?:
 251    *  have found a # #else  in "newfile-2.6.c",  legal ?   and also a  #/* ...
 252    *    => just "#" -> token {lexbuf} (that is ignore)
 253    *  il y'a 1 #elif  sans rien  apres
 254    *  il y'a 1 #error sans rien  apres
 255    *  il y'a 2  mov dede, #xxx    qui genere du coup exn car
 256    *  entouré par des #if 0
 257    *  => make as for comment, call a comment_cpp that when #endif finish the
 258    *   comment and if other cpp stuff raise exn
 259    *  il y'a environ 10  #if(xxx)  ou le ( est collé direct
 260    *  il y'a des include"" et include<
 261    *  il y'a 1 ` (derriere un #ifndef linux)
 262    *)
 263
 264
 265
 266   (* ---------------------- *)
 267   (* misc *)
 268   (* ---------------------- *)
 269
 270   (* bugfix: I want now to keep comments for the cComment study
 271    * so cant do:    sp [^'\n']+ '\n'
 272    * http://gcc.gnu.org/onlinedocs/gcc/Pragmas.html
 273    *)
 274
 275   | "#" spopt "pragma"  sp  [^'\n']* '\n'
 276   | "#" spopt "ident"   sp  [^'\n']* '\n'
 277   | "#" spopt "line"    sp  [^'\n']* '\n'
 278   | "#" spopt "error"   sp  [^'\n']* '\n'
 279   | "#" spopt "warning" sp  [^'\n']* '\n'
 280   | "#" spopt "abort"   sp  [^'\n']* '\n'
 281       { TCppDirectiveOther (tokinfo lexbuf) }
 282
 283   | "#" [' ' '\t']* '\n'
 284       { TCppDirectiveOther (tokinfo lexbuf) }
 285
 286   (* only after cpp, ex: # 1 "include/linux/module.h" 1 *)
 287   | "#" sp pent sp  '"' [^ '"']* '"' (spopt pent)*  spopt '\n'
 288       { TCppDirectiveOther (tokinfo lexbuf) }
 289
 290
 291
 292   (* ---------------------- *)
 293   (* #define, #undef *)
 294   (* ---------------------- *)
 295
 296   (* the rest of the lexing/parsing of define is done in fix_tokens_define
 297    * where we parse until a TCppEscapedNewline and generate a TDefEol
 298    *)
 299   | "#" [' ' '\t']* "define" { TDefine (tokinfo lexbuf) }
 300
 301   (* note: in some cases can have stuff after the ident as in #undef XXX 50,
 302    * but I currently don't handle it cos I think it's bad code.
 303    *)
 304   | (("#" [' ' '\t']* "undef" [' ' '\t']+) as _undef) (id as id)
 305       { let info = tokinfo lexbuf in
 306         TUndef (id, info)
 307         (*+> tok_add_s (cpp_eat_until_nl lexbuf))*)
 308       }
 309
 310
 311   (* ---------------------- *)
 312   (* #include *)
 313   (* ---------------------- *)
 314
 315   (* The difference between a local "" and standard <> include is computed
 316    * later in parser_c.mly. So redo a little bit of lexing there; ugly but
 317    * simpler to generate a single token here.  *)
 318   | (("#" [' ''\t']* "include" [' ' '\t']*) as includes)
 319     (('"' ([^ '"']+) '"' |
 320      '<' [^ '>']+ '>' |
 321       ['A'-'Z''_']+
 322     ) as filename)
 323       { let info = tokinfo lexbuf in
 324         TInclude (includes, filename, Ast_c.noInIfdef(), info)
 325       }
 326   (* gccext: found in glibc *)
 327   | (("#" [' ''\t']* "include_next" [' ' '\t']*) as includes)
 328     (('"' ([^ '"']+) '"' |
 329      '<' [^ '>']+ '>' |
 330       ['A'-'Z''_']+
 331     ) as filename)
 332       { let info = tokinfo lexbuf in
 333         TInclude (includes, filename, Ast_c.noInIfdef(), info)
 334       }
 335
 336   (* ---------------------- *)
 337   (* #ifdef *)
 338   (* ---------------------- *)
 339
 340   (* The ifdef_mark will be set later in Parsing_hacks.set_ifdef_parenthize_info
 341    * when working on the ifdef view.
 342    *)
 343
 344   (* '0'+ because sometimes it is a #if 000 *)
 345   | "#" [' ' '\t']* "if" [' ' '\t']* '0'+           (* [^'\n']*  '\n' *)
 346       { let info = tokinfo lexbuf in
 347         TIfdefBool (false, no_ifdef_mark(), info)
 348           (* +> tok_add_s (cpp_eat_until_nl lexbuf)*)
 349       }
 350
 351   | "#" [' ' '\t']* "if" [' ' '\t']* '1'   (* [^'\n']*  '\n' *)
 352       { let info = tokinfo lexbuf in
 353         TIfdefBool (true, no_ifdef_mark(), info)
 354
 355       }
 356
 357  (* DO NOT cherry pick to lexer_cplusplus !!! often used for the extern "C" { *)
 358   | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n']* '\n'
 359       { let info = tokinfo lexbuf in
 360         TIfdefMisc (false, no_ifdef_mark(), info)
 361       }
 362
 363  (* DO NOT cherry pick to lexer_cplusplus !!! *)
 364   | "#" [' ' '\t']* "ifdef" [' ' '\t']* "__cplusplus"   [^'\n']*  '\n'
 365       { let info = tokinfo lexbuf in
 366         TIfdefMisc (false, no_ifdef_mark(), info)
 367       }
 368
 369   (* in glibc *)
 370   | "#" spopt ("ifdef"|"if") sp "__STDC__"
 371       { let info = tokinfo lexbuf in
 372         TIfdefVersion (true, no_ifdef_mark(),
 373                       info +> tok_add_s (cpp_eat_until_nl lexbuf))
 374       }
 375
 376
 377   (* linuxext: different possible variations (we do not manage all of them):
 378
 379     #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
 380     #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,2)
 381     #if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
 382     #if LINUX_VERSION_CODE > KERNEL_VERSION(2,3,0)
 383     #if LINUX_VERSION_CODE < 0x020600
 384     #if LINUX_VERSION_CODE >= 0x2051c
 385     #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 386     #if !(LINUX_VERSION_CODE > KERNEL_VERSION(2,5,73))
 387     #if STREAMER_IOCTL && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 388     #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20)  &&  LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
 389     #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && \
 390     # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
 391     #if LINUX_VERSION_CODE > LinuxVersionCode(2,3,12)
 392     #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,93)
 393     #ifndef LINUX_VERSION_CODE
 394     #if LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,2,0) || \
 395     (LINUX_VERSION_CODE > ASC_LINUX_VERSION(2,3,0) && \
 396     LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,4,0))
 397     #if (KERNEL_VERSION(2,4,0) > LINUX_VERSION_CODE)
 398     #if LINUX_VERSION_CODE >= ASC_LINUX_VERSION(1,3,0)
 399     # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
 400
 401   *)
 402
 403   (* linuxext: must be before the generic rules for if and ifdef *)
 404   | "#" spopt "if" sp "("?  "LINUX_VERSION_CODE" sp (">=" | ">") sp
 405       { let info = tokinfo lexbuf in
 406         TIfdefVersion (true, no_ifdef_mark(),
 407                       info +> tok_add_s (cpp_eat_until_nl lexbuf))
 408       }
 409   (* linuxext: *)
 410   | "#" spopt "if" sp "!" "("?  "LINUX_VERSION_CODE" sp (">=" | ">") sp
 411   | "#" spopt "if" sp ['(']?  "LINUX_VERSION_CODE" sp ("<=" | "<") sp
 412
 413       { let info = tokinfo lexbuf in
 414         TIfdefVersion (false, no_ifdef_mark(),
 415                       info +> tok_add_s (cpp_eat_until_nl lexbuf))
 416       }
 417
 418
 419
 420
 421   (* can have some ifdef 0  hence the letter|digit even at beginning of word *)
 422   | "#" [' ''\t']* "ifdef"  [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
 423       { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
 424   | "#" [' ''\t']* "ifndef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
 425       { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
 426   | "#" [' ''\t']* "if" [' ' '\t']+
 427       { let info = tokinfo lexbuf in
 428         TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
 429       }
 430   | "#" [' ' '\t']* "if" '('
 431       { let info = tokinfo lexbuf in
 432         TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
 433       }
 434
 435   | "#" [' ' '\t']* "elif" [' ' '\t']+
 436       { let info = tokinfo lexbuf in
 437         TIfdefelif (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
 438       }
 439
 440
 441   (* bugfix: can have #endif LINUX  but at the same time if I eat everything
 442    * until next line, I may miss some TComment which for some tools
 443    * are important such as aComment
 444    *)
 445   | "#" [' ' '\t']* "endif" (*[^'\n']* '\n'*) {
 446       TEndif     (no_ifdef_mark(), tokinfo lexbuf)
 447     }
 448   (* can be at eof *)
 449   (*| "#" [' ' '\t']* "endif"                { TEndif     (tokinfo lexbuf) }*)
 450
 451   | "#" [' ' '\t']* "else" [' ' '\t' '\n']
 452       { TIfdefelse (no_ifdef_mark(), tokinfo lexbuf) }
 453
 454
 455
 456
 457   (* ---------------------- *)
 458   (* #define body *)
 459   (* ---------------------- *)
 460
 461   (* only in cpp directives normally *)
 462   | "\\" '\n' { TCppEscapedNewline (tokinfo lexbuf) }
 463
 464   (* We must generate separate tokens for #, ## and extend the grammar.
 465    * Note there can be "elaborated" idents in many different places, in
 466    * expression but also in declaration, in function name. So having 3 tokens
 467    * for an ident does not work well with how we add info in
 468    * ast_c. Was easier to generate just one token, just one info,
 469    * even if have later to reanalyse those tokens and unsplit. But then,
 470    * handling C++ lead to having not just a string for ident but something
 471    * more complex. Also when we want to parse elaborated function headers
 472    * (e.g. void METH(foo)(int x)), we need anyway to go from a string
 473    * to something more. So having also for C something more than just
 474    * string for ident is natural.
 475    *
 476    * todo: our heuristics in parsing_hacks rely on TIdent. So maybe
 477    * an easier solution would be to augment the TIdent type such as
 478    *   TIdent of string * info * cpp_ident_additionnal_info
 479    *
 480    * old:
 481    * |  id   ([' ''\t']* "##" [' ''\t']* id)+
 482    *   { let info = tokinfo lexbuf in
 483    *     TIdent (tok lexbuf, info)
 484    *   }
 485    * |  "##" spopt id
 486    *   { let info = tokinfo lexbuf in
 487    *     TIdent (tok lexbuf, info)
 488    *   }
 489    *
 490    *)
 491   (* cppext: string concatenation of idents, also ##args for variadic macro. *)
 492   | "##" { TCppConcatOp (tokinfo lexbuf) }
 493
 494   (* cppext: stringification.
 495    * bugfix: this case must be after the other cases such as #endif
 496    * otherwise take precedent.
 497    *)
 498   |  "#" spopt id
 499       { let info = tokinfo lexbuf in
 500         TIdent (tok lexbuf, info)
 501       }
 502   (* the ... next to id, e.g. arg..., works with ##, e.g. ##arg *)
 503   | ((id as s)  "...")
 504       { TDefParamVariadic (s, tokinfo lexbuf) }
 505
 506
 507
 508
 509
 510   (* ----------------------------------------------------------------------- *)
 511   (* C symbols *)
 512   (* ----------------------------------------------------------------------- *)
 513    (* stdC:
 514     ...   &&   -=   >=   ~   +   ;   ]
 515     <<=   &=   ->   >>   %   ,   <   ^
 516     >>=   *=   /=   ^=   &   -   =   {
 517     !=    ++   <<   |=   (   .   >   |
 518     %=    +=   <=   ||   )   /   ?   }
 519         --   ==   !    *   :   [
 520     recent addition:    <:  :>  <%  %>
 521     only at processing: %:  %:%: # ##
 522    *)
 523
 524
 525   | '[' { TOCro(tokinfo lexbuf) }   | ']' { TCCro(tokinfo lexbuf) }
 526   | '(' { TOPar(tokinfo lexbuf)   } | ')' { TCPar(tokinfo lexbuf)   }
 527   | '{' { TOBrace(tokinfo lexbuf) } | '}' { TCBrace(tokinfo lexbuf) }
 528
 529   | '+' { TPlus(tokinfo lexbuf) }   | '*' { TMul(tokinfo lexbuf) }
 530   | '-' { TMinus(tokinfo lexbuf) }  | '/' { TDiv(tokinfo lexbuf) }
 531   | '%' { TMod(tokinfo lexbuf) }
 532
 533   | "++"{ TInc(tokinfo lexbuf) }    | "--"{ TDec(tokinfo lexbuf) }
 534
 535   | "="  { TEq(tokinfo lexbuf) }
 536
 537   | "-=" { TAssign (OpAssign Minus, (tokinfo lexbuf))}
 538   | "+=" { TAssign (OpAssign Plus, (tokinfo lexbuf))}
 539   | "*=" { TAssign (OpAssign Mul, (tokinfo lexbuf))}
 540   | "/=" { TAssign (OpAssign Div, (tokinfo lexbuf))}
 541   | "%=" { TAssign (OpAssign Mod, (tokinfo lexbuf))}
 542   | "&=" { TAssign (OpAssign And, (tokinfo lexbuf))}
 543   | "|=" { TAssign (OpAssign Or, (tokinfo lexbuf)) }
 544   | "^=" { TAssign (OpAssign Xor, (tokinfo lexbuf))}
 545   | "<<=" {TAssign (OpAssign DecLeft, (tokinfo lexbuf)) }
 546   | ">>=" {TAssign (OpAssign DecRight, (tokinfo lexbuf))}
 547
 548   | "==" { TEqEq(tokinfo lexbuf) }  | "!=" { TNotEq(tokinfo lexbuf) }
 549   | ">=" { TSupEq(tokinfo lexbuf) } | "<=" { TInfEq(tokinfo lexbuf) }
 550   | "<"  { TInf(tokinfo lexbuf) }   | ">"  { TSup(tokinfo lexbuf) }
 551
 552   | "&&" { TAndLog(tokinfo lexbuf) } | "||" { TOrLog(tokinfo lexbuf) }
 553   | ">>" { TShr(tokinfo lexbuf) }    | "<<" { TShl(tokinfo lexbuf) }
 554   | "&"  { TAnd(tokinfo lexbuf) }    | "|" { TOr(tokinfo lexbuf) }
 555   | "^"  { TXor(tokinfo lexbuf) }
 556   | "..." { TEllipsis(tokinfo lexbuf) }
 557   | "->"   { TPtrOp(tokinfo lexbuf) }  | '.'  { TDot(tokinfo lexbuf) }
 558   | ','    { TComma(tokinfo lexbuf) }
 559   | ";"    { TPtVirg(tokinfo lexbuf) }
 560   | "?"    { TWhy(tokinfo lexbuf) }    | ":"   { TDotDot(tokinfo lexbuf) }
 561   | "!"    { TBang(tokinfo lexbuf) }   | "~"   { TTilde(tokinfo lexbuf) }
 562
 563   | "<:" { TOCro(tokinfo lexbuf) } | ":>" { TCCro(tokinfo lexbuf) }
 564   | "<%" { TOBrace(tokinfo lexbuf) } | "%>" { TCBrace(tokinfo lexbuf) }
 565
 566
 567
 568   (* ----------------------------------------------------------------------- *)
 569   (* C keywords and ident *)
 570   (* ----------------------------------------------------------------------- *)
 571
 572   (* StdC: must handle at least name of length > 509, but can
 573    * truncate to 31 when compare and truncate to 6 and even lowerise
 574    * in the external linkage phase
 575    *)
 576   | letter (letter | digit) *
 577       { let info = tokinfo lexbuf in
 578         let s = tok lexbuf in
 579         Common.profile_code "C parsing.lex_ident" (fun () ->
 580           match Common.optionise (fun () -> Hashtbl.find keyword_table s)
 581           with
 582           | Some f -> f info
 583
 584            (* parse_typedef_fix.
 585             *    if Lexer_parser.is_typedef s
 586             *    then TypedefIdent (s, info)
 587             *    else TIdent (s, info)
 588             *
 589             * update: now this is no more useful, cos
 590             * as we use tokens_all, it first parse all as an ident and
 591             * later transform an indent in a typedef. so the typedef job is
 592             * now done in parse_c.ml.
 593             *)
 594
 595           | None -> TIdent (s, info)
 596         )
 597       }
 598   (* gccext: apparently gcc allows dollar in variable names. found such
 599    * thing a few time in linux and in glibc. No need look in keyword_table
 600    * here.
 601    *)
 602   | (letter | '$') (letter | digit | '$') *
 603       {
 604         let info = tokinfo lexbuf in
 605         let s = tok lexbuf in
 606         pr2 ("LEXER: identifier with dollar: "  ^ s);
 607         TIdent (s, info)
 608       }
 609
 610
 611   (* ----------------------------------------------------------------------- *)
 612   (* C constant *)
 613   (* ----------------------------------------------------------------------- *)
 614
 615   | "'"
 616       { let info = tokinfo lexbuf in
 617         let s = char lexbuf   in
 618         TChar     ((s,   IsChar),  (info +> tok_add_s (s ^ "'")))
 619       }
 620   | '"'
 621       { let info = tokinfo lexbuf in
 622         let s = string lexbuf in
 623         TString   ((s,   IsChar),  (info +> tok_add_s (s ^ "\"")))
 624       }
 625   (* wide character encoding, TODO L'toto' valid ? what is allowed ? *)
 626   | 'L' "'"
 627       { let info = tokinfo lexbuf in
 628         let s = char lexbuf   in
 629         TChar     ((s,   IsWchar),  (info +> tok_add_s (s ^ "'")))
 630       }
 631   | 'L' '"'
 632       { let info = tokinfo lexbuf in
 633         let s = string lexbuf in
 634         TString   ((s,   IsWchar),  (info +> tok_add_s (s ^ "\"")))
 635       }
 636
 637
 638   (* Take care of the order ? No because lex try the longest match. The
 639    * strange diff between decimal and octal constant semantic is not
 640    * understood too by refman :) refman:11.1.4, and ritchie.
 641    *
 642    * todo: attach type info to constant, like for float
 643    *)
 644
 645   | (( decimal | hexa | octal)
 646         ( ['u' 'U']
 647         | ['l' 'L']
 648         | (['l' 'L'] ['u' 'U'])
 649         | (['u' 'U'] ['l' 'L'])
 650         | (['u' 'U'] ['l' 'L'] ['l' 'L'])
 651         | (['l' 'L'] ['l' 'L'])
 652         )?
 653     ) as x { TInt (x, tokinfo lexbuf) }
 654
 655
 656   | (real ['f' 'F']) as x { TFloat ((x, CFloat),      tokinfo lexbuf) }
 657   | (real ['l' 'L']) as x { TFloat ((x, CLongDouble), tokinfo lexbuf) }
 658   | (real as x)           { TFloat ((x, CDouble),     tokinfo lexbuf) }
 659
 660   | ['0'] ['0'-'9']+
 661       { pr2 ("LEXER: " ^ error_radix "octal" ^ tok lexbuf);
 662         TUnknown (tokinfo lexbuf)
 663       }
 664   | ("0x" |"0X") ['0'-'9' 'a'-'z' 'A'-'Z']+
 665       { pr2 ("LEXER: " ^ error_radix "hexa" ^ tok lexbuf);
 666         TUnknown (tokinfo lexbuf)
 667       }
 668
 669
 670  (* !!! to put after other rules !!! otherwise 0xff
 671   * will be parsed as an ident.
 672   *)
 673   | ['0'-'9']+ letter (letter | digit) *
 674       { pr2 ("LEXER: ZARB integer_string, certainly a macro:" ^ tok lexbuf);
 675         TIdent (tok lexbuf, tokinfo lexbuf)
 676       }
 677
 678 (* gccext: http://gcc.gnu.org/onlinedocs/gcc/Binary-constants.html *)
 679 (*
 680  | "0b" ['0'-'1'] { TInt (((tok lexbuf)<!!>(??,??)) +> int_of_stringbits) }
 681  | ['0'-'1']+'b' { TInt (((tok lexbuf)<!!>(0,-2)) +> int_of_stringbits) }
 682 *)
 683
 684
 685   (*------------------------------------------------------------------------ *)
 686   | eof { EOF (tokinfo lexbuf +> Ast_c.rewrap_str "") }
 687
 688   | _
 689       {
 690         if !Flag_parsing_c.verbose_lexing
 691         then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
 692         TUnknown (tokinfo lexbuf)
 693       }
 694
 695
 696
 697 (*****************************************************************************)
 698 and char = parse
 699   | (_ as x)                                    "'"  { String.make 1 x }
 700   (* todo?: as for octal, do exception  beyond radix exception ? *)
 701   | (("\\" (oct | oct oct | oct oct oct)) as x  "'") { x }
 702   (* this rule must be after the one with octal, lex try first longest
 703    * and when \7  we want an octal, not an exn.
 704    *)
 705   | (("\\x" ((hex | hex hex))) as x        "'")      { x }
 706   | (("\\" (_ as v))           as x        "'")
 707         {
 708           (match v with (* Machine specific ? *)
 709           | 'n' -> ()  | 't' -> ()   | 'v' -> ()  | 'b' -> () | 'r' -> ()
 710           | 'f' -> () | 'a' -> ()
 711           | '\\' -> () | '?'  -> () | '\'' -> ()  | '"' -> ()
 712           | 'e' -> () (* linuxext: ? *)
 713           | _ ->
 714               pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
 715           );
 716           x
 717         }
 718   | _
 719       { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
 720         tok lexbuf
 721       }
 722
 723
 724
 725 (*****************************************************************************)
 726
 727 (* todo? factorise code with char ? but not same ending token so hard. *)
 728 and string  = parse
 729   | '"'                                       { "" }
 730   | (_ as x)                                  { string_of_char x^string lexbuf}
 731   | ("\\" (oct | oct oct | oct oct oct)) as x { x ^ string lexbuf }
 732   | ("\\x" (hex | hex hex)) as x              { x ^ string lexbuf }
 733   | ("\\" (_ as v)) as x
 734        {
 735          (match v with (* Machine specific ? *)
 736          | 'n' -> ()  | 't' -> ()   | 'v' -> ()  | 'b' -> () | 'r' -> ()
 737          | 'f' -> () | 'a' -> ()
 738          | '\\' -> () | '?'  -> () | '\'' -> ()  | '"' -> ()
 739          | 'e' -> () (* linuxext: ? *)
 740
 741          (* old: "x" -> 10 gccext ? todo ugly, I put a fake value *)
 742
 743          (* cppext:  can have   \ for multiline in string too *)
 744          | '\n' -> ()
 745          | _ -> pr2 ("LEXER: unrecognised symbol in string:"^tok lexbuf);
 746          );
 747           x ^ string lexbuf
 748        }
 749
 750   | eof { pr2 "LEXER: WIERD end of file in string"; ""}
 751
 752  (* Bug if add following code, cos match also the '"' that is needed
 753   * to finish the string, and so go until end of file.
 754   *)
 755  (*
 756   | [^ '\\']+
 757     { let cs = lexbuf +> tok +> list_of_string +> List.map Char.code in
 758       cs ++ string lexbuf
 759     }
 760   *)
 761
 762
 763
 764 (*****************************************************************************)
 765
 766 (* less: allow only char-'*' ? *)
 767 and comment = parse
 768   | "*/"     { tok lexbuf }
 769   (* noteopti: *)
 770   | [^ '*']+ { let s = tok lexbuf in s ^ comment lexbuf }
 771   | [ '*']   { let s = tok lexbuf in s ^ comment lexbuf }
 772   | _
 773       { let s = tok lexbuf in
 774         pr2 ("LEXER: unrecognised symbol in comment:"^s);
 775         s ^ comment lexbuf
 776       }
 777
 778
 779
 780 (*****************************************************************************)
 781
 782 (* cpp recognize C comments, so when #define xx (yy) /* comment \n ... */
 783  * then he has already erased the /* comment. So:
 784  * - dont eat the start of the comment otherwise afterwards we are in the middle
 785  *   of a comment and so will problably get a parse error somewhere.
 786  * - have to recognize comments in cpp_eat_until_nl.
 787  *)
 788
 789 and cpp_eat_until_nl = parse
 790   (* bugfix: *)
 791   | "/*"
 792       { let s = tok lexbuf in
 793         let s2 = comment lexbuf in
 794         let s3 = cpp_eat_until_nl lexbuf in
 795         s ^ s2 ^ s3
 796       }
 797   | '\\' "\n" { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
 798
 799   | "\n"      { tok lexbuf }
 800   (* noteopti:
 801    * update: need also deal with comments chars now
 802    *)
 803   | [^ '\n' '\\'      '/' '*'  ]+
 804      { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
 805   | eof { pr2 "LEXER: end of file in cpp_eat_until_nl"; ""}
 806   | _   { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }