[bpt/coccinelle.git] / parsing_c / lexer_c.mll

{
(* Copyright (C) 2002, 2006, 2007, 2008 Yoann Padioleau
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License (GPL)
 * version 2 as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * file license.txt for more details.
 *)
open Common

open Parser_c

open Ast_c (* to factorise tokens, OpAssign, ... *)

(*****************************************************************************)
(*
 * subtle: ocamllex uses side effects on lexbuf, so one must take care.
 * For instance one must write
 * 
 *  let info = tokinfo lexbuf in 
 *  TComment (info +> tok_add_s (comment lexbuf)) 
 * 
 * and not 
 * 
 *   TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf)) 
 * 
 * because of the "weird" (unspecified) order of evaluation of OCaml:
 * in the second form, comment lexbuf may run before tokinfo lexbuf and
 * change the lexeme that tokinfo then reads.
 *
 * note: cannot use Lexer_parser._lexer_hint here to do different
 * things, because now we call the lexer to get all the tokens
 * (tokens_all), and then we parse. So the _lexer_hint info is not
 * available here; it is available only in parse_c. For the same reason,
 * the typedef handling here is now useless. 
 *)
(*****************************************************************************)

(*****************************************************************************)
(* Wrappers *)
(*****************************************************************************)
(* Lexer-local logging wrapper: forwards the message to Common.pr2 only
 * when Flag_parsing_c.verbose_lexing is set; otherwise does nothing.
 *)
let pr2 msg =
  match !Flag_parsing_c.verbose_lexing with
  | true -> Common.pr2 msg
  | false -> ()

(*****************************************************************************)


(* Exception carrying a human-readable message string.
 * NOTE(review): presumably raised by the lexer rules on malformed input;
 * the raise sites are not visible in this part of the file -- confirm.
 *)
exception Lexical of string

(* Text of the lexeme most recently matched on the given lexbuf. *)
let tok = Lexing.lexeme

(* Builds the token info record for the lexeme currently matched on
 * [lexbuf]: its start offset and text are recorded as an OriginTok, while
 * line, column and file get placeholder values that are filled in by a
 * post-lexing phase.
 *)
let tokinfo lexbuf =
  let start_offset = Lexing.lexeme_start lexbuf in
  let matched_text = Lexing.lexeme lexbuf in
  {
    pinfo =
      Ast_c.OriginTok {
        Common.charpos = start_offset;
        Common.str = matched_text;
        (* placeholders: real values are set after lexing *)
        Common.line = -1;
        Common.column = -1;
        Common.file = "";
      };
    (* a fresh ref per token, otherwise the cells would be shared *)
    cocci_tag = ref Ast_c.emptyAnnot;
    comments_tag = ref Ast_c.emptyComments;
  }

(* Allocates a brand-new cell initialised to None on every call, so that
 * no two users ever share the same mutable cell.
 *)
let no_ifdef_mark () : (int * int) option ref = ref None

(* Returns [ii] rewrapped so that its string is the old string followed
 * by [s].
 *)
let tok_add_s s ii =
  let extended = Ast_c.str_of_info ii ^ s in
  Ast_c.rewrap_str extended ii
    

(* Keyword lookup table: associates each reserved word with a function
 * that builds the corresponding parser token from the token info.
 * opti: a hash lookup is less convenient than a match, but it is faster.
 *)
let keyword_table = Common.hash_of_list [

  (* c: base types *)
  "void",   (fun info -> Tvoid info);
  "char",   (fun info -> Tchar info);
  "short",  (fun info -> Tshort info);
  "int",    (fun info -> Tint info);
  "long",   (fun info -> Tlong info);
  "float",  (fun info -> Tfloat info);
  "double", (fun info -> Tdouble info);

  (* c: signedness *)
  "unsigned", (fun info -> Tunsigned info);
  "signed",   (fun info -> Tsigned info);

  (* c: storage classes *)
  "auto",     (fun info -> Tauto info);
  "register", (fun info -> Tregister info);
  "extern",   (fun info -> Textern info);
  "static",   (fun info -> Tstatic info);

  (* c: type qualifiers *)
  "const",    (fun info -> Tconst info);
  "volatile", (fun info -> Tvolatile info);

  (* c: type constructors *)
  "struct",  (fun info -> Tstruct info);
  "union",   (fun info -> Tunion info);
  "enum",    (fun info -> Tenum info);
  "typedef", (fun info -> Ttypedef info);

  (* c: statements *)
  "if",       (fun info -> Tif info);
  "else",     (fun info -> Telse info);
  "break",    (fun info -> Tbreak info);
  "continue", (fun info -> Tcontinue info);
  "switch",   (fun info -> Tswitch info);
  "case",     (fun info -> Tcase info);
  "default",  (fun info -> Tdefault info);
  "for",      (fun info -> Tfor info);
  "do",       (fun info -> Tdo info);
  "while",    (fun info -> Twhile info);
  "return",   (fun info -> Treturn info);
  "goto",     (fun info -> Tgoto info);

  (* c: operators *)
  "sizeof", (fun info -> Tsizeof info);


  (* gccext: cppext: linuxext: synonyms *)
  "asm",     (fun info -> Tasm info);
  "__asm__", (fun info -> Tasm info);
  "__asm",   (fun info -> Tasm info);

  "inline",     (fun info -> Tinline info);
  "__inline__", (fun info -> Tinline info);
  "__inline",   (fun info -> Tinline info);

  "__attribute__", (fun info -> Tattribute info);
  "__attribute",   (fun info -> Tattribute info);

  "typeof",     (fun info -> Ttypeof info);
  "__typeof__", (fun info -> Ttypeof info);
  "__typeof",   (fun info -> Ttypeof info);


  (* gccext: alias *)
  "__signed__", (fun info -> Tsigned info);

  "__const__", (fun info -> Tconst info);
  "__const",   (fun info -> Tconst info);

  "__volatile__", (fun info -> Tvolatile info);
  "__volatile",   (fun info -> Tvolatile info);


  (* c99: *)
  (* plain "restrict" is deliberately absent: maybe for backward
   * compatibility, since existing programs may already use restrict as
   * an ordinary identifier.
   *)
  "__restrict",   (fun info -> Trestrict info);
  "__restrict__", (fun info -> Trestrict info);

 ]

(* Builds the error message for a numeric constant of kind [s] (the kind
 * name is inserted verbatim) that contains digits beyond its radix.
 *)
let error_radix s =
  String.concat "" [
    "numeric ";
    s;
    " constant contains digits beyond the radix:";
  ]

}

(*****************************************************************************)
(* Named regular expressions shared by the lexer rules below. *)

(* ASCII letters plus underscore: the characters that may start an id. *)
let letter = ['A'-'Z' 'a'-'z' '_']
(* A single decimal digit. *)
let digit  = ['0'-'9']

(* not used for the moment *)
let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
		   ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
(* Whitespace characters. The numeric escapes are decimal character codes:
 * 011 is vertical tab and 012 is form feed.
 *)
let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
(* Extra characters accepted by cchar besides letters, digits and
 * punctuation: blank, backspace, tab, vertical tab, newline, carriage
 * return and bell.
 *)
let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ] 
(* 7 = \a = bell in C. this is not the only char allowed !! 
 * ex @ and $ ` are valid too 
 *)

(* Any single character taken from one of the four classes above. *)
let cchar = (letter | digit | punctuation | additionnal) 

(* Horizontal spacing: sp needs at least one blank or tab, spopt accepts
 * none.
 *)
let sp = [' ' '\t']+
let spopt = [' ' '\t']*

(* Single digits in base 10, 8 and 16. *)
let dec = ['0'-'9']
let oct = ['0'-'7']
let hex = ['0'-'9' 'a'-'f' 'A'-'F']

(* Integer literals without suffix:
 *  - decimal: a lone 0, or a nonzero digit followed by any digits
 *  - octal:   a 0 followed by at least one octal digit
 *  - hexa:    a 0x or 0X prefix followed by at least one hex digit
 *)
let decimal = ('0' | (['1'-'9'] dec*))
let octal   = ['0']        oct+
let hexa    = ("0x" |"0X") hex+ 


(* Floating point literals without suffix. pent is the integer part,
 * pfract the fractional part and exp the exponent (e or E, an optional
 * sign, then digits). A real is either an integer part with a mandatory
 * exponent, or a number with a dot (digits may be missing on one side of
 * the dot but not on both) and an optional exponent.
 *)
let pent   = dec+
let pfract = dec+
let sign = ['-' '+']
let exp  = ['e''E'] sign? dec+
let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)

(* Identifier: a letter or underscore followed by letters, digits or
 * underscores.
 *)
let id = letter (letter | digit) *

(*****************************************************************************)
rule token = parse

  (* ----------------------------------------------------------------------- *)
  (* spacing/comments *)
  (* ----------------------------------------------------------------------- *)

  (* note: this lexer generate tokens for comments!! so can not give 
   * this lexer as-is to the parsing function. Must preprocess it, hence
   * use techniques like cur_tok ref in parse_c.ml
   *)

  | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
      (* starting a new line; the newline character followed by whitespace *)
      { TCommentNewline (tokinfo lexbuf) }
  | [' ' '\t' '\r' '\011' '\012' ]+  
      { TCommentSpace (tokinfo lexbuf) }
  | "/*" 
      { let info = tokinfo lexbuf in 
        let com = comment lexbuf in
        TComment(info +> tok_add_s com) 
      }


  (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
   * So need this here only when dont call cpp before.
   * note that we don't keep the trailing \n; it will be in another token.
   *)
  | "//" [^'\r' '\n' '\011']*    { TComment (tokinfo lexbuf) } 

  (* ----------------------------------------------------------------------- *)
  (* cpp *)
  (* ----------------------------------------------------------------------- *)

  (* old:
   *   | '#'		{ endline lexbuf} // should be line, and not endline 
   *   and endline = parse  | '\n' 	{ token lexbuf}  
   *                        |	_	{ endline lexbuf} 
   *)

  (* todo?:
   *  have found a # #else  in "newfile-2.6.c",  legal ?   and also a  #/* ... 
   *    => just "#" -> token {lexbuf} (that is ignore)
   *  il y'a 1 #elif  sans rien  apres
   *  il y'a 1 #error sans rien  apres
   *  il y'a 2  mov dede, #xxx    qui genere du coup exn car
Commit	Line	Data
34e49164	1	{
485bce71	2	(* Copyright (C) 2002, 2006, 2007, 2008 Yoann Padioleau
34e49164 C	3	*
	4	* This program is free software; you can redistribute it and/or
	5	* modify it under the terms of the GNU General Public License (GPL)
	6	* version 2 as published by the Free Software Foundation.
	7	*
	8	* This program is distributed in the hope that it will be useful,
	9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	* file license.txt for more details.
	12	*)
	13	open Common
	14
	15	open Parser_c
	16
	17	open Ast_c (* to factorise tokens, OpAssign, ... *)
	18
	19	(*****************************************************************************)
	20	(*
	21	* subtil: ocamllex use side effect on lexbuf, so must take care.
	22	* For instance must do
	23	*
	24	* let info = tokinfo lexbuf in
	25	* TComment (info +> tok_add_s (comment lexbuf))
	26	*
	27	* and not
	28	*
	29	* TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
	30	*
	31	* because of the "wierd" order of evaluation of OCaml.
	32	*
	33	* note: can't use Lexer_parser._lexer_hint here to do different
	34	* things, because now we call the lexer to get all the tokens
	35	* (tokens_all), and then we parse. So we can't have the _lexer_hint
	36	* info here. We can have it only in parse_c. For the same reason, the
	37	* typedef handling here is now useless.
	38	*)
	39	(*****************************************************************************)
	40
	41	(*****************************************************************************)
	42	(* Wrappers *)
	43	(*****************************************************************************)
	44	let pr2 s =
	45	if !Flag_parsing_c.verbose_lexing
	46	then Common.pr2 s
	47
	48	(*****************************************************************************)
	49
	50
	51	exception Lexical of string
	52
	53	let tok lexbuf = Lexing.lexeme lexbuf
	54
	55	let tokinfo lexbuf =
	56	{
	57	pinfo = Ast_c.OriginTok {
	58	Common.charpos = Lexing.lexeme_start lexbuf;
	59	Common.str = Lexing.lexeme lexbuf;
	60	(* info filled in a post-lexing phase *)
	61	Common.line = -1;
	62	Common.column = -1;
	63	Common.file = "";
	64	};
	65	(* must generate a new ref each time, otherwise share *)
	66	cocci_tag = ref Ast_c.emptyAnnot;
67	comments_tag = ref Ast_c.emptyComments;
68	}
69
485bce71 C	70	(* must generate a new ref each time, otherwise share *)
	71	let no_ifdef_mark () = ref (None: (int * int) option)
	72
34e49164 C	73	let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
	74
	75
	76	(* opti: less convenient, but using a hash is faster than using a match *)
	77	let keyword_table = Common.hash_of_list [
	78
485bce71	79	(* c: *)
34e49164 C	80	"void", (fun ii -> Tvoid ii);
	81	"char", (fun ii -> Tchar ii);
	82	"short", (fun ii -> Tshort ii);
	83	"int", (fun ii -> Tint ii);
	84	"long", (fun ii -> Tlong ii);
	85	"float", (fun ii -> Tfloat ii);
	86	"double", (fun ii -> Tdouble ii);
	87
	88	"unsigned", (fun ii -> Tunsigned ii);
	89	"signed", (fun ii -> Tsigned ii);
	90
	91	"auto", (fun ii -> Tauto ii);
	92	"register", (fun ii -> Tregister ii);
	93	"extern", (fun ii -> Textern ii);
	94	"static", (fun ii -> Tstatic ii);
	95
	96	"const", (fun ii -> Tconst ii);
	97	"volatile", (fun ii -> Tvolatile ii);
	98
	99	"struct", (fun ii -> Tstruct ii);
	100	"union", (fun ii -> Tunion ii);
	101	"enum", (fun ii -> Tenum ii);
	102	"typedef", (fun ii -> Ttypedef ii);
	103
	104	"if", (fun ii -> Tif ii);
	105	"else", (fun ii -> Telse ii);
	106	"break", (fun ii -> Tbreak ii);
	107	"continue", (fun ii -> Tcontinue ii);
	108	"switch", (fun ii -> Tswitch ii);
	109	"case", (fun ii -> Tcase ii);
	110	"default", (fun ii -> Tdefault ii);
	111	"for", (fun ii -> Tfor ii);
	112	"do", (fun ii -> Tdo ii);
	113	"while", (fun ii -> Twhile ii);
	114	"return", (fun ii -> Treturn ii);
	115	"goto", (fun ii -> Tgoto ii);
	116
	117	"sizeof", (fun ii -> Tsizeof ii);
	118
	119
	120	(* gccext: cppext: linuxext: synonyms *)
	121	"asm", (fun ii -> Tasm ii);
	122	"__asm__", (fun ii -> Tasm ii);
	123	"__asm", (fun ii -> Tasm ii);
	124
	125	"inline", (fun ii -> Tinline ii);
	126	"__inline__", (fun ii -> Tinline ii);
	127	"__inline", (fun ii -> Tinline ii);
34e49164 C	128
	129	"__attribute__", (fun ii -> Tattribute ii);
	130	"__attribute", (fun ii -> Tattribute ii);
	131
	132	"typeof", (fun ii -> Ttypeof ii);
	133	"__typeof__", (fun ii -> Ttypeof ii);
485bce71 C	134	"__typeof", (fun ii -> Ttypeof ii);
485bce71 C	135
34e49164	136
485bce71	137	(* gccext: alias *)
34e49164 C	138	"__signed__", (fun ii -> Tsigned ii);
	139
	140	"__const__", (fun ii -> Tconst ii);
	141	"__const", (fun ii -> Tconst ii);
	142
	143	"__volatile__", (fun ii -> Tvolatile ii);
	144	"__volatile", (fun ii -> Tvolatile ii);
485bce71 C	145
	146
	147	(* c99: *)
	148	(* no just "restrict" ? maybe for backward compatibility they avoided
	149	* to use restrict which people may have used in their program already
	150	*)
	151	"__restrict", (fun ii -> Trestrict ii);
	152	"__restrict__", (fun ii -> Trestrict ii);
34e49164 C	153
	154	]
	155
	156	let error_radix s =
	157	("numeric " ^ s ^ " constant contains digits beyond the radix:")
	158
	159	}
	160
	161	(*****************************************************************************)
	162	let letter = ['A'-'Z' 'a'-'z' '_']
	163	let digit = ['0'-'9']
	164
	165	(* not used for the moment *)
	166	let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
	167	';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '\|' '}' '~']
	168	let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
	169	let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
	170	(* 7 = \a = bell in C. this is not the only char allowed !!
	171	* ex @ and $ ` are valid too
	172	*)
	173
	174	let cchar = (letter \| digit \| punctuation \| additionnal)
	175
	176	let sp = [' ' '\t']+
	177	let spopt = [' ' '\t']*
	178
	179	let dec = ['0'-'9']
	180	let oct = ['0'-'7']
	181	let hex = ['0'-'9' 'a'-'f' 'A'-'F']
	182
	183	let decimal = ('0' \| (['1'-'9'] dec*))
	184	let octal = ['0'] oct+
	185	let hexa = ("0x" \|"0X") hex+
	186
	187
	188	let pent = dec+
	189	let pfract = dec+
	190	let sign = ['-' '+']
	191	let exp = ['e''E'] sign? dec+
	192	let real = pent exp \| ((pent? '.' pfract \| pent '.' pfract? ) exp?)
	193
	194	let id = letter (letter \| digit) *
	195
	196	(*****************************************************************************)
	197	rule token = parse
	198
	199	(* ----------------------------------------------------------------------- *)
	200	(* spacing/comments *)
	201	(* ----------------------------------------------------------------------- *)
	202
	203	(* note: this lexer generate tokens for comments!! so can not give
	204	* this lexer as-is to the parsing function. Must preprocess it, hence
	205	* use techniques like cur_tok ref in parse_c.ml
	206	*)
	207
	208	\| ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
	209	(* starting a new line; the newline character followed by whitespace *)
	210	{ TCommentNewline (tokinfo lexbuf) }
	211	\| [' ' '\t' '\r' '\011' '\012' ]+
	212	{ TCommentSpace (tokinfo lexbuf) }
	213	\| "/*"
	214	{ let info = tokinfo lexbuf in
	215	let com = comment lexbuf in
	216	TComment(info +> tok_add_s com)
217	}
218
219
220	(* C++ comment are allowed via gccext, but normally they are deleted by cpp.
221	* So need this here only when dont call cpp before.
485bce71	222	* note that we don't keep the trailing \n; it will be in another token.
34e49164 C	223	*)
	224	\| "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
	225
	226	(* ----------------------------------------------------------------------- *)
	227	(* cpp *)
	228	(* ----------------------------------------------------------------------- *)
	229
	230	(* old:
	231	* \| '#' { endline lexbuf} // should be line, and not endline
	232	* and endline = parse \| '\n' { token lexbuf}
	233	* \| _ { endline lexbuf}
	234	*)
	235
	236	(* todo?:
	237	* have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
	238	* => just "#" -> token {lexbuf} (that is ignore)
	239	* il y'a 1 #elif sans rien apres
	240	* il y'a 1 #error sans rien apres
	241	* il y'a 2 mov dede, #xxx qui genere du coup exn car
	242