Release coccinelle-0.1.8
[bpt/coccinelle.git] / parsing_c / lexer_c.mll
CommitLineData
34e49164 1{
0708f913
C
2(* Yoann Padioleau
3 *
4 * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
34e49164
C
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License (GPL)
8 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * file license.txt for more details.
14 *)
15open Common
16
17open Parser_c
18
19open Ast_c (* to factorise tokens, OpAssign, ... *)
20
21(*****************************************************************************)
22(*
23 * subtil: ocamllex use side effect on lexbuf, so must take care.
24 * For instance must do
25 *
26 * let info = tokinfo lexbuf in
27 * TComment (info +> tok_add_s (comment lexbuf))
28 *
29 * and not
30 *
31 * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
32 *
33 * because of the "wierd" order of evaluation of OCaml.
b1b2de81
C
34 *
35 *
34e49164
C
36 *
37 * note: can't use Lexer_parser._lexer_hint here to do different
38 * things, because now we call the lexer to get all the tokens
39 * (tokens_all), and then we parse. So we can't have the _lexer_hint
40 * info here. We can have it only in parse_c. For the same reason, the
41 * typedef handling here is now useless.
42 *)
43(*****************************************************************************)
44
45(*****************************************************************************)
46(* Wrappers *)
47(*****************************************************************************)
708f4980 48let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing
34e49164
C
49
50(*****************************************************************************)
51
52
53exception Lexical of string
54
55let tok lexbuf = Lexing.lexeme lexbuf
56
57let tokinfo lexbuf =
58 {
59 pinfo = Ast_c.OriginTok {
60 Common.charpos = Lexing.lexeme_start lexbuf;
61 Common.str = Lexing.lexeme lexbuf;
62 (* info filled in a post-lexing phase *)
63 Common.line = -1;
64 Common.column = -1;
65 Common.file = "";
66 };
67 (* must generate a new ref each time, otherwise share *)
68 cocci_tag = ref Ast_c.emptyAnnot;
69 comments_tag = ref Ast_c.emptyComments;
70 }
71
b1b2de81 72(* cppext: must generate a new ref each time, otherwise share *)
485bce71
C
73let no_ifdef_mark () = ref (None: (int * int) option)
74
34e49164
C
75let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
76
77
78(* opti: less convenient, but using a hash is faster than using a match *)
79let keyword_table = Common.hash_of_list [
80
485bce71 81 (* c: *)
34e49164
C
82 "void", (fun ii -> Tvoid ii);
83 "char", (fun ii -> Tchar ii);
84 "short", (fun ii -> Tshort ii);
85 "int", (fun ii -> Tint ii);
86 "long", (fun ii -> Tlong ii);
87 "float", (fun ii -> Tfloat ii);
88 "double", (fun ii -> Tdouble ii);
89
90 "unsigned", (fun ii -> Tunsigned ii);
91 "signed", (fun ii -> Tsigned ii);
92
93 "auto", (fun ii -> Tauto ii);
94 "register", (fun ii -> Tregister ii);
95 "extern", (fun ii -> Textern ii);
96 "static", (fun ii -> Tstatic ii);
97
98 "const", (fun ii -> Tconst ii);
99 "volatile", (fun ii -> Tvolatile ii);
100
101 "struct", (fun ii -> Tstruct ii);
102 "union", (fun ii -> Tunion ii);
103 "enum", (fun ii -> Tenum ii);
104 "typedef", (fun ii -> Ttypedef ii);
105
106 "if", (fun ii -> Tif ii);
107 "else", (fun ii -> Telse ii);
108 "break", (fun ii -> Tbreak ii);
109 "continue", (fun ii -> Tcontinue ii);
110 "switch", (fun ii -> Tswitch ii);
111 "case", (fun ii -> Tcase ii);
112 "default", (fun ii -> Tdefault ii);
113 "for", (fun ii -> Tfor ii);
114 "do", (fun ii -> Tdo ii);
115 "while", (fun ii -> Twhile ii);
116 "return", (fun ii -> Treturn ii);
117 "goto", (fun ii -> Tgoto ii);
118
119 "sizeof", (fun ii -> Tsizeof ii);
120
121
122 (* gccext: cppext: linuxext: synonyms *)
123 "asm", (fun ii -> Tasm ii);
124 "__asm__", (fun ii -> Tasm ii);
125 "__asm", (fun ii -> Tasm ii);
126
127 "inline", (fun ii -> Tinline ii);
128 "__inline__", (fun ii -> Tinline ii);
129 "__inline", (fun ii -> Tinline ii);
34e49164
C
130
131 "__attribute__", (fun ii -> Tattribute ii);
132 "__attribute", (fun ii -> Tattribute ii);
133
134 "typeof", (fun ii -> Ttypeof ii);
135 "__typeof__", (fun ii -> Ttypeof ii);
485bce71
C
136 "__typeof", (fun ii -> Ttypeof ii);
137
34e49164 138
485bce71 139 (* gccext: alias *)
34e49164
C
140 "__signed__", (fun ii -> Tsigned ii);
141
142 "__const__", (fun ii -> Tconst ii);
143 "__const", (fun ii -> Tconst ii);
144
145 "__volatile__", (fun ii -> Tvolatile ii);
146 "__volatile", (fun ii -> Tvolatile ii);
485bce71
C
147
148
149 (* c99: *)
150 (* no just "restrict" ? maybe for backward compatibility they avoided
151 * to use restrict which people may have used in their program already
152 *)
153 "__restrict", (fun ii -> Trestrict ii);
154 "__restrict__", (fun ii -> Trestrict ii);
34e49164
C
155
156 ]
157
158let error_radix s =
159 ("numeric " ^ s ^ " constant contains digits beyond the radix:")
160
708f4980
C
161(* functions for figuring out the type of integers *)
162
163let is_long_dec s int uint long ulong =
164 match !Flag_parsing_c.int_thresholds with
165 None -> int
166 | Some (_,_,uint_threshold,long_threshold,ulong_threshold) ->
167 let bn = Big_int.big_int_of_string s in
168 if Big_int.ge_big_int bn ulong_threshold
169 then ulong
170 else
171 if Big_int.ge_big_int bn long_threshold
172 then long
173 else
174 if Big_int.ge_big_int bn uint_threshold
175 then long
176 else int
177
178let is_long_ho s int uint long ulong drop bpd count =
179 match !Flag_parsing_c.int_thresholds with
180 None -> int
181 | Some (uint_sz,ulong_sz,_,_,_) ->
182 let len = String.length s in
183 (* this assumes that all of the hex/oct digits are significant *)
184 (* drop is 2 for hex (0x) and 1 for oct (0) *)
185 let s = String.sub s drop (len - drop) in
186 let len =
187 ((len-drop) * bpd) -
188 (count (int_of_string("0x"^(String.sub s 0 1)))) in
189 if len < uint_sz
190 then int
191 else
192 if len = uint_sz
193 then uint
194 else
195 if len < ulong_sz
196 then long
197 else ulong
198
199let is_long_oct s int uint long ulong =
200 is_long_ho s int uint long ulong 1 3
201 (* stupid, but probably more efficient than taking logs *)
202 (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0)
203let is_long_hex s int uint long ulong =
204 is_long_ho s int uint long ulong 2 4
205 (* stupid, but probably more efficient than taking logs *)
206 (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1
207 | _ -> 0)
208
209let sint = (Signed,CInt)
210let uint = (UnSigned,CInt)
211let slong = (Signed,CLong)
212let ulong = (UnSigned,CLong)
213
34e49164
C
214}
215
216(*****************************************************************************)
217let letter = ['A'-'Z' 'a'-'z' '_']
218let digit = ['0'-'9']
219
220(* not used for the moment *)
221let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
222 ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
223let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
224let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
225(* 7 = \a = bell in C. this is not the only char allowed !!
226 * ex @ and $ ` are valid too
227 *)
228
229let cchar = (letter | digit | punctuation | additionnal)
230
231let sp = [' ' '\t']+
232let spopt = [' ' '\t']*
233
234let dec = ['0'-'9']
235let oct = ['0'-'7']
236let hex = ['0'-'9' 'a'-'f' 'A'-'F']
237
238let decimal = ('0' | (['1'-'9'] dec*))
239let octal = ['0'] oct+
240let hexa = ("0x" |"0X") hex+
241
242
243let pent = dec+
244let pfract = dec+
245let sign = ['-' '+']
246let exp = ['e''E'] sign? dec+
247let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
248
249let id = letter (letter | digit) *
250
251(*****************************************************************************)
252rule token = parse
253
254 (* ----------------------------------------------------------------------- *)
255 (* spacing/comments *)
256 (* ----------------------------------------------------------------------- *)
257
258 (* note: this lexer generate tokens for comments!! so can not give
259 * this lexer as-is to the parsing function. Must preprocess it, hence
260 * use techniques like cur_tok ref in parse_c.ml
261 *)
262
263 | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
264 (* starting a new line; the newline character followed by whitespace *)
265 { TCommentNewline (tokinfo lexbuf) }
266 | [' ' '\t' '\r' '\011' '\012' ]+
267 { TCommentSpace (tokinfo lexbuf) }
268 | "/*"
269 { let info = tokinfo lexbuf in
270 let com = comment lexbuf in
0708f913
C
271
272 let info' = info +> tok_add_s com in
273 let s = Ast_c.str_of_info info' in
b1b2de81
C
274 (* could be more flexible, use [\t ]* instead of hardcoded
275 * single space. *)
0708f913
C
276 match s with
277 | "/* {{coccinelle:skip_start}} */" ->
278 TCommentSkipTagStart (info')
279 | "/* {{coccinelle:skip_end}} */" ->
280 TCommentSkipTagEnd (info')
281 | _ -> TComment(info')
34e49164
C
282 }
283
284
285 (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
286 * So need this here only when dont call cpp before.
485bce71 287 * note that we don't keep the trailing \n; it will be in another token.
34e49164
C
288 *)
289 | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
290
291 (* ----------------------------------------------------------------------- *)
292 (* cpp *)
293 (* ----------------------------------------------------------------------- *)
294
295 (* old:
296 * | '#' { endline lexbuf} // should be line, and not endline
297 * and endline = parse | '\n' { token lexbuf}
298 * | _ { endline lexbuf}
299 *)
300
b1b2de81 301 (* less?:
34e49164
C
302 * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
303 * => just "#" -> token {lexbuf} (that is ignore)
304 * il y'a 1 #elif sans rien apres
305 * il y'a 1 #error sans rien apres
306 * il y'a 2 mov dede, #xxx qui genere du coup exn car
307