[bpt/coccinelle.git] / parsing_c / lexer_c.mll

{
(* Yoann Padioleau
 * 
 * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License (GPL)
 * version 2 as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * file license.txt for more details.
 *)
open Common

open Parser_c

open Ast_c (* to factorise tokens, OpAssign, ... *)

(*****************************************************************************)
(*
 * subtil: ocamllex use side effect on lexbuf, so must take care. 
 * For instance must do   
 * 
 *  let info = tokinfo lexbuf in 
 *  TComment (info +> tok_add_s (comment lexbuf)) 
 * 
 * and not 
 * 
 *   TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf)) 
 * 
 * because of the "wierd" order of evaluation of OCaml.
 * 
 * 
 *
 * note: can't use Lexer_parser._lexer_hint here to do different
 * things, because now we call the lexer to get all the tokens
 * (tokens_all), and then we parse. So we can't have the _lexer_hint
 * info here. We can have it only in parse_c. For the same reason, the
 * typedef handling here is now useless. 
 *)
(*****************************************************************************)

(*****************************************************************************)
(* Wrappers *)
(*****************************************************************************)
let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing 

(*****************************************************************************)


exception Lexical of string

let tok     lexbuf  = Lexing.lexeme lexbuf

let tokinfo lexbuf  = 
  { 
    pinfo = Ast_c.OriginTok {
      Common.charpos = Lexing.lexeme_start lexbuf; 
      Common.str     = Lexing.lexeme lexbuf;
      (* info filled in a post-lexing phase *)
      Common.line = -1; 
      Common.column = -1; 
      Common.file = "";
    };
   (* must generate a new ref each time, otherwise share *)
    cocci_tag = ref Ast_c.emptyAnnot;
    comments_tag = ref Ast_c.emptyComments;
  }

(* cppext: must generate a new ref each time, otherwise share *)
let no_ifdef_mark () = ref (None: (int * int) option)

let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
    

(* opti: less convenient, but using a hash is faster than using a match *)
let keyword_table = Common.hash_of_list [

  (* c: *)
  "void",   (fun ii -> Tvoid ii); 
  "char",   (fun ii -> Tchar ii);    
  "short",  (fun ii -> Tshort ii); 
  "int",    (fun ii -> Tint ii); 
  "long",   (fun ii -> Tlong ii); 
  "float",  (fun ii -> Tfloat ii); 
  "double", (fun ii -> Tdouble ii);  

  "unsigned", (fun ii -> Tunsigned ii);  
  "signed",   (fun ii -> Tsigned ii);
  
  "auto",     (fun ii -> Tauto ii);    
  "register", (fun ii -> Tregister ii);  
  "extern",   (fun ii -> Textern ii); 
  "static",   (fun ii -> Tstatic ii);

  "const",    (fun ii -> Tconst ii);
  "volatile", (fun ii -> Tvolatile ii); 
  
  "struct",  (fun ii -> Tstruct ii); 
  "union",   (fun ii -> Tunion ii); 
  "enum",    (fun ii -> Tenum ii);  
  "typedef", (fun ii -> Ttypedef ii);  
  
  "if",      (fun ii -> Tif ii);      
  "else",     (fun ii -> Telse ii); 
  "break",   (fun ii -> Tbreak ii);   
  "continue", (fun ii -> Tcontinue ii);
  "switch",  (fun ii -> Tswitch ii);  
  "case",     (fun ii -> Tcase ii);  
  "default", (fun ii -> Tdefault ii); 
  "for",     (fun ii -> Tfor ii);  
  "do",      (fun ii -> Tdo ii);      
  "while",   (fun ii -> Twhile ii);  
  "return",  (fun ii -> Treturn ii);
  "goto",    (fun ii -> Tgoto ii); 
  
  "sizeof", (fun ii -> Tsizeof ii);   


  (* gccext: cppext: linuxext: synonyms *)
  "asm",     (fun ii -> Tasm ii);
  "__asm__", (fun ii -> Tasm ii);
  "__asm",   (fun ii -> Tasm ii);

  "inline",     (fun ii -> Tinline ii);
  "__inline__", (fun ii -> Tinline ii);
  "__inline",   (fun ii -> Tinline ii);

  "__attribute__", (fun ii -> Tattribute ii);
  "__attribute", (fun ii -> Tattribute ii);

  "typeof", (fun ii -> Ttypeof ii);
  "__typeof__", (fun ii -> Ttypeof ii);
  "__typeof", (fun ii -> Ttypeof ii);


  (* gccext: alias *)
  "__signed__",     (fun ii -> Tsigned ii);

  "__const__",     (fun ii -> Tconst ii);
  "__const",     (fun ii -> Tconst ii);

  "__volatile__",  (fun ii -> Tvolatile ii); 
  "__volatile",    (fun ii -> Tvolatile ii);  


  (* c99:  *)
  (* no just "restrict" ? maybe for backward compatibility they avoided 
   * to use restrict which people may have used in their program already 
   *)
  "__restrict",    (fun ii -> Trestrict ii);  
  "__restrict__",    (fun ii -> Trestrict ii);  
  
 ]

let error_radix s = 
  ("numeric " ^ s ^ " constant contains digits beyond the radix:")

(* functions for figuring out the type of integers *)

let is_long_dec s int uint long ulong =
  match !Flag_parsing_c.int_thresholds with
    None -> int
  | Some (_,_,uint_threshold,long_threshold,ulong_threshold) ->
      let bn = Big_int.big_int_of_string s in
      if Big_int.ge_big_int bn ulong_threshold
      then ulong
      else
	if Big_int.ge_big_int bn long_threshold
	then long
	else
	  if Big_int.ge_big_int bn uint_threshold
	  then long
	  else int

let is_long_ho s int uint long ulong drop bpd count =
  match !Flag_parsing_c.int_thresholds with
    None -> int
  | Some (uint_sz,ulong_sz,_,_,_) ->
      let len = String.length s in
      (* this assumes that all of the hex/oct digits are significant *)
      (* drop is 2 for hex (0x) and 1 for oct (0) *)
      let s = String.sub s drop (len - drop) in
      let len =
	((len-drop) * bpd) -
	  (count (int_of_string("0x"^(String.sub s 0 1)))) in
      if len < uint_sz
      then int
      else
	if len = uint_sz
	then uint
	else
	  if len < ulong_sz
	  then long
	  else ulong

let is_long_oct s int uint long ulong =
  is_long_ho s int uint long ulong 1 3
    (* stupid, but probably more efficient than taking logs *)
    (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0)
let is_long_hex s int uint long ulong =
  is_long_ho s int uint long ulong 2 4
    (* stupid, but probably more efficient than taking logs *)
    (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1
      | _ -> 0)

let sint = (Signed,CInt)
let uint = (UnSigned,CInt)
let slong = (Signed,CLong)
let ulong = (UnSigned,CLong)

}

(*****************************************************************************)
let letter = ['A'-'Z' 'a'-'z' '_']
let digit  = ['0'-'9']

(* not used for the moment *)
let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
		   ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ] 
(* 7 = \a = bell in C. this is not the only char allowed !! 
 * ex @ and $ ` are valid too 
 *)

let cchar = (letter | digit | punctuation | additionnal) 

let sp = [' ' '\t']+
let spopt = [' ' '\t']*

let dec = ['0'-'9']
let oct = ['0'-'7']
let hex = ['0'-'9' 'a'-'f' 'A'-'F']

let decimal = ('0' | (['1'-'9'] dec*))
let octal   = ['0']        oct+
let hexa    = ("0x" |"0X") hex+ 


let pent   = dec+
let pfract = dec+
let sign = ['-' '+']
let exp  = ['e''E'] sign? dec+
let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)

let id = letter (letter | digit) *

(*****************************************************************************)
rule token = parse

  (* ----------------------------------------------------------------------- *)
  (* spacing/comments *)
  (* ----------------------------------------------------------------------- *)

  (* note: this lexer generate tokens for comments!! so can not give 
   * this lexer as-is to the parsing function. Must preprocess it, hence
   * use techniques like cur_tok ref in parse_c.ml
   *)

  | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
      (* starting a new line; the newline character followed by whitespace *)
      { TCommentNewline (tokinfo lexbuf) }
  | [' ' '\t' '\r' '\011' '\012' ]+  
      { TCommentSpace (tokinfo lexbuf) }
  | "/*" 
      { let info = tokinfo lexbuf in 
        let com = comment lexbuf in

        let info' = info +> tok_add_s com in
        let s = Ast_c.str_of_info info' in
        (* could be more flexible, use [\t ]* instead of hardcoded 
         * single space. *)
        match s with
        | "/* {{coccinelle:skip_start}} */" -> 
            TCommentSkipTagStart (info')
        | "/* {{coccinelle:skip_end}} */" -> 
            TCommentSkipTagEnd (info')
        | _ -> TComment(info') 
      }


  (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
   * So need this here only when dont call cpp before.
   * note that we don't keep the trailing \n; it will be in another token.
   *)
  | "//" [^'\r' '\n' '\011']*    { TComment (tokinfo lexbuf) } 

  (* ----------------------------------------------------------------------- *)
  (* cpp *)
  (* ----------------------------------------------------------------------- *)

  (* old:
   *   | '#'		{ endline lexbuf} // should be line, and not endline 
   *   and endline = parse  | '\n' 	{ token lexbuf}  
   *                        |	_	{ endline lexbuf} 
   *)

  (* less?:
   *  have found a # #else  in "newfile-2.6.c",  legal ?   and also a  #/* ... 
   *    => just "#" -> token {lexbuf} (that is ignore)
   *  il y'a 1 #elif  sans rien  apres
   *  il y'a 1 #error sans rien  apres
   *  il y'a 2  mov dede, #xxx    qui genere du coup exn car
Commit	Line	Data
34e49164	1	{
0708f913 C	2	(* Yoann Padioleau
	3	*
	4	* Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
34e49164 C	5	*
	6	* This program is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU General Public License (GPL)
	8	* version 2 as published by the Free Software Foundation.
	9	*
	10	* This program is distributed in the hope that it will be useful,
	11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	* file license.txt for more details.
	14	*)
	15	open Common
	16
	17	open Parser_c
	18
	19	open Ast_c (* to factorise tokens, OpAssign, ... *)
	20
	21	(*****************************************************************************)
	22	(*
	23	* subtil: ocamllex use side effect on lexbuf, so must take care.
	24	* For instance must do
	25	*
	26	* let info = tokinfo lexbuf in
	27	* TComment (info +> tok_add_s (comment lexbuf))
	28	*
	29	* and not
	30	*
	31	* TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
	32	*
	33	* because of the "wierd" order of evaluation of OCaml.
b1b2de81 C	34	*
b1b2de81 C	35	*
34e49164 C	36	*
	37	* note: can't use Lexer_parser._lexer_hint here to do different
	38	* things, because now we call the lexer to get all the tokens
	39	* (tokens_all), and then we parse. So we can't have the _lexer_hint
	40	* info here. We can have it only in parse_c. For the same reason, the
	41	* typedef handling here is now useless.
	42	*)
	43	(*****************************************************************************)
	44
	45	(*****************************************************************************)
	46	(* Wrappers *)
	47	(*****************************************************************************)
708f4980	48	let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing
34e49164 C	49
	50	(*****************************************************************************)
	51
	52
	53	exception Lexical of string
	54
	55	let tok lexbuf = Lexing.lexeme lexbuf
	56
	57	let tokinfo lexbuf =
	58	{
	59	pinfo = Ast_c.OriginTok {
	60	Common.charpos = Lexing.lexeme_start lexbuf;
	61	Common.str = Lexing.lexeme lexbuf;
	62	(* info filled in a post-lexing phase *)
	63	Common.line = -1;
	64	Common.column = -1;
	65	Common.file = "";
	66	};
	67	(* must generate a new ref each time, otherwise share *)
	68	cocci_tag = ref Ast_c.emptyAnnot;
	69	comments_tag = ref Ast_c.emptyComments;
	70	}
	71
b1b2de81	72	(* cppext: must generate a new ref each time, otherwise share *)
485bce71 C	73	let no_ifdef_mark () = ref (None: (int * int) option)
485bce71 C	74
34e49164 C	75	let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
	76
	77
	78	(* opti: less convenient, but using a hash is faster than using a match *)
	79	let keyword_table = Common.hash_of_list [
	80
485bce71	81	(* c: *)
34e49164 C	82	"void", (fun ii -> Tvoid ii);
	83	"char", (fun ii -> Tchar ii);
	84	"short", (fun ii -> Tshort ii);
	85	"int", (fun ii -> Tint ii);
	86	"long", (fun ii -> Tlong ii);
	87	"float", (fun ii -> Tfloat ii);
	88	"double", (fun ii -> Tdouble ii);
	89
	90	"unsigned", (fun ii -> Tunsigned ii);
	91	"signed", (fun ii -> Tsigned ii);
	92
	93	"auto", (fun ii -> Tauto ii);
	94	"register", (fun ii -> Tregister ii);
	95	"extern", (fun ii -> Textern ii);
	96	"static", (fun ii -> Tstatic ii);
	97
	98	"const", (fun ii -> Tconst ii);
	99	"volatile", (fun ii -> Tvolatile ii);
	100
	101	"struct", (fun ii -> Tstruct ii);
	102	"union", (fun ii -> Tunion ii);
	103	"enum", (fun ii -> Tenum ii);
	104	"typedef", (fun ii -> Ttypedef ii);
	105
	106	"if", (fun ii -> Tif ii);
	107	"else", (fun ii -> Telse ii);
	108	"break", (fun ii -> Tbreak ii);
	109	"continue", (fun ii -> Tcontinue ii);
	110	"switch", (fun ii -> Tswitch ii);
	111	"case", (fun ii -> Tcase ii);
	112	"default", (fun ii -> Tdefault ii);
	113	"for", (fun ii -> Tfor ii);
	114	"do", (fun ii -> Tdo ii);
	115	"while", (fun ii -> Twhile ii);
	116	"return", (fun ii -> Treturn ii);
	117	"goto", (fun ii -> Tgoto ii);
	118
	119	"sizeof", (fun ii -> Tsizeof ii);
	120
	121
	122	(* gccext: cppext: linuxext: synonyms *)
	123	"asm", (fun ii -> Tasm ii);
	124	"__asm__", (fun ii -> Tasm ii);
	125	"__asm", (fun ii -> Tasm ii);
	126
	127	"inline", (fun ii -> Tinline ii);
	128	"__inline__", (fun ii -> Tinline ii);
	129	"__inline", (fun ii -> Tinline ii);
34e49164 C	130
	131	"__attribute__", (fun ii -> Tattribute ii);
	132	"__attribute", (fun ii -> Tattribute ii);
	133
	134	"typeof", (fun ii -> Ttypeof ii);
	135	"__typeof__", (fun ii -> Ttypeof ii);
485bce71 C	136	"__typeof", (fun ii -> Ttypeof ii);
485bce71 C	137
34e49164	138
485bce71	139	(* gccext: alias *)
34e49164 C	140	"__signed__", (fun ii -> Tsigned ii);
	141
	142	"__const__", (fun ii -> Tconst ii);
	143	"__const", (fun ii -> Tconst ii);
	144
	145	"__volatile__", (fun ii -> Tvolatile ii);
	146	"__volatile", (fun ii -> Tvolatile ii);
485bce71 C	147
	148
	149	(* c99: *)
	150	(* no just "restrict" ? maybe for backward compatibility they avoided
	151	* to use restrict which people may have used in their program already
	152	*)
	153	"__restrict", (fun ii -> Trestrict ii);
	154	"__restrict__", (fun ii -> Trestrict ii);
34e49164 C	155
	156	]
	157
	158	let error_radix s =
	159	("numeric " ^ s ^ " constant contains digits beyond the radix:")
	160
708f4980 C	161	(* functions for figuring out the type of integers *)
	162
	163	let is_long_dec s int uint long ulong =
	164	match !Flag_parsing_c.int_thresholds with
	165	None -> int
	166	\| Some (_,_,uint_threshold,long_threshold,ulong_threshold) ->
	167	let bn = Big_int.big_int_of_string s in
	168	if Big_int.ge_big_int bn ulong_threshold
	169	then ulong
	170	else
	171	if Big_int.ge_big_int bn long_threshold
	172	then long
	173	else
	174	if Big_int.ge_big_int bn uint_threshold
	175	then long
	176	else int
	177
	178	let is_long_ho s int uint long ulong drop bpd count =
	179	match !Flag_parsing_c.int_thresholds with
	180	None -> int
	181	\| Some (uint_sz,ulong_sz,_,_,_) ->
	182	let len = String.length s in
	183	(* this assumes that all of the hex/oct digits are significant *)
	184	(* drop is 2 for hex (0x) and 1 for oct (0) *)
	185	let s = String.sub s drop (len - drop) in
	186	let len =
	187	((len-drop) * bpd) -
	188	(count (int_of_string("0x"^(String.sub s 0 1)))) in
	189	if len < uint_sz
	190	then int
	191	else
	192	if len = uint_sz
	193	then uint
	194	else
	195	if len < ulong_sz
	196	then long
	197	else ulong
	198
	199	let is_long_oct s int uint long ulong =
	200	is_long_ho s int uint long ulong 1 3
	201	(* stupid, but probably more efficient than taking logs *)
	202	(function 0 -> 3 \| 1 -> 2 \| n when n < 4 -> 1 \| _ -> 0)
	203	let is_long_hex s int uint long ulong =
	204	is_long_ho s int uint long ulong 2 4
	205	(* stupid, but probably more efficient than taking logs *)
	206	(function 0 -> 4 \| 1 -> 3 \| n when n < 4 -> 2 \| n when n < 8 -> 1
	207	\| _ -> 0)
	208
	209	let sint = (Signed,CInt)
	210	let uint = (UnSigned,CInt)
	211	let slong = (Signed,CLong)
	212	let ulong = (UnSigned,CLong)
	213
34e49164 C	214	}
	215
	216	(*****************************************************************************)
	217	let letter = ['A'-'Z' 'a'-'z' '_']
	218	let digit = ['0'-'9']
	219
	220	(* not used for the moment *)
	221	let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
	222	';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '\|' '}' '~']
	223	let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
	224	let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
	225	(* 7 = \a = bell in C. this is not the only char allowed !!
	226	* ex @ and $ ` are valid too
	227	*)
	228
	229	let cchar = (letter \| digit \| punctuation \| additionnal)
	230
	231	let sp = [' ' '\t']+
	232	let spopt = [' ' '\t']*
	233
	234	let dec = ['0'-'9']
	235	let oct = ['0'-'7']
	236	let hex = ['0'-'9' 'a'-'f' 'A'-'F']
	237
	238	let decimal = ('0' \| (['1'-'9'] dec*))
	239	let octal = ['0'] oct+
	240	let hexa = ("0x" \|"0X") hex+
	241
	242
	243	let pent = dec+
	244	let pfract = dec+
	245	let sign = ['-' '+']
	246	let exp = ['e''E'] sign? dec+
	247	let real = pent exp \| ((pent? '.' pfract \| pent '.' pfract? ) exp?)
	248
	249	let id = letter (letter \| digit) *
	250
	251	(*****************************************************************************)
	252	rule token = parse
	253
	254	(* ----------------------------------------------------------------------- *)
	255	(* spacing/comments *)
	256	(* ----------------------------------------------------------------------- *)
	257
	258	(* note: this lexer generate tokens for comments!! so can not give
	259	* this lexer as-is to the parsing function. Must preprocess it, hence
	260	* use techniques like cur_tok ref in parse_c.ml
	261	*)
	262
	263	\| ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
	264	(* starting a new line; the newline character followed by whitespace *)
	265	{ TCommentNewline (tokinfo lexbuf) }
	266	\| [' ' '\t' '\r' '\011' '\012' ]+
	267	{ TCommentSpace (tokinfo lexbuf) }
	268	\| "/*"
	269	{ let info = tokinfo lexbuf in
	270	let com = comment lexbuf in
0708f913 C	271
	272	let info' = info +> tok_add_s com in
	273	let s = Ast_c.str_of_info info' in
b1b2de81 C	274	(* could be more flexible, use [\t ]* instead of hardcoded
b1b2de81 C	275	* single space. *)
0708f913 C	276	match s with
	277	\| "/* {{coccinelle:skip_start}} */" ->
	278	TCommentSkipTagStart (info')
	279	\| "/* {{coccinelle:skip_end}} */" ->
	280	TCommentSkipTagEnd (info')
	281	\| _ -> TComment(info')
34e49164 C	282	}
	283
	284
	285	(* C++ comment are allowed via gccext, but normally they are deleted by cpp.
	286	* So need this here only when dont call cpp before.
485bce71	287	* note that we don't keep the trailing \n; it will be in another token.
34e49164 C	288	*)
	289	\| "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
	290
	291	(* ----------------------------------------------------------------------- *)
	292	(* cpp *)
	293	(* ----------------------------------------------------------------------- *)
	294
	295	(* old:
	296	* \| '#' { endline lexbuf} // should be line, and not endline
	297	* and endline = parse \| '\n' { token lexbuf}
	298	* \| _ { endline lexbuf}
	299	*)
	300
b1b2de81	301	(* less?:
34e49164 C	302	* have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
	303	* => just "#" -> token {lexbuf} (that is ignore)
	304	* il y'a 1 #elif sans rien apres
	305	* il y'a 1 #error sans rien apres
	306	* il y'a 2 mov dede, #xxx qui genere du coup exn car
	307