Release coccinelle-0.2.4rc6
[bpt/coccinelle.git] / parsing_c / lexer_c.mll
CommitLineData
34e49164 1{
0708f913 2(* Yoann Padioleau
ae4735db 3 *
0708f913 4 * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
34e49164
C
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License (GPL)
8 * version 2 as published by the Free Software Foundation.
ae4735db 9 *
34e49164
C
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * file license.txt for more details.
14 *)
15open Common
16
17open Parser_c
18
19open Ast_c (* to factorise tokens, OpAssign, ... *)
20
21(*****************************************************************************)
22(*
ae4735db
C
23 * subtil: ocamllex use side effect on lexbuf, so must take care.
24 * For instance must do
25 *
26 * let info = tokinfo lexbuf in
27 * TComment (info +> tok_add_s (comment lexbuf))
28 *
29 * and not
30 *
31 * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
32 *
34e49164 33 * because of the "wierd" order of evaluation of OCaml.
ae4735db
C
34 *
35 *
34e49164
C
36 *
37 * note: can't use Lexer_parser._lexer_hint here to do different
38 * things, because now we call the lexer to get all the tokens
39 * (tokens_all), and then we parse. So we can't have the _lexer_hint
40 * info here. We can have it only in parse_c. For the same reason, the
ae4735db 41 * typedef handling here is now useless.
34e49164
C
42 *)
43(*****************************************************************************)
44
45(*****************************************************************************)
46(* Wrappers *)
47(*****************************************************************************)
ae4735db 48let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing
34e49164
C
49
50(*****************************************************************************)
51
52
53exception Lexical of string
54
55let tok lexbuf = Lexing.lexeme lexbuf
56
ae4735db
C
57let tokinfo lexbuf =
58 {
34e49164 59 pinfo = Ast_c.OriginTok {
ae4735db 60 Common.charpos = Lexing.lexeme_start lexbuf;
34e49164
C
61 Common.str = Lexing.lexeme lexbuf;
62 (* info filled in a post-lexing phase *)
ae4735db
C
63 Common.line = -1;
64 Common.column = -1;
34e49164
C
65 Common.file = "";
66 };
67 (* must generate a new ref each time, otherwise share *)
68 cocci_tag = ref Ast_c.emptyAnnot;
69 comments_tag = ref Ast_c.emptyComments;
70 }
71
b1b2de81 72(* cppext: must generate a new ref each time, otherwise share *)
485bce71
C
73let no_ifdef_mark () = ref (None: (int * int) option)
74
34e49164 75let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
ae4735db 76
34e49164
C
77
78(* opti: less convenient, but using a hash is faster than using a match *)
79let keyword_table = Common.hash_of_list [
80
485bce71 81 (* c: *)
ae4735db
C
82 "void", (fun ii -> Tvoid ii);
83 "char", (fun ii -> Tchar ii);
84 "short", (fun ii -> Tshort ii);
85 "int", (fun ii -> Tint ii);
86 "long", (fun ii -> Tlong ii);
87 "float", (fun ii -> Tfloat ii);
88 "double", (fun ii -> Tdouble ii);
1eddfd50
C
89 "size_t", (fun ii -> Tsize_t ii);
90 "ssize_t", (fun ii -> Tssize_t ii);
91 "ptrdiff_t", (fun ii -> Tptrdiff_t ii);
ae4735db
C
92
93 "unsigned", (fun ii -> Tunsigned ii);
34e49164 94 "signed", (fun ii -> Tsigned ii);
ae4735db
C
95
96 "auto", (fun ii -> Tauto ii);
97 "register", (fun ii -> Tregister ii);
98 "extern", (fun ii -> Textern ii);
34e49164
C
99 "static", (fun ii -> Tstatic ii);
100
101 "const", (fun ii -> Tconst ii);
ae4735db
C
102 "volatile", (fun ii -> Tvolatile ii);
103
104 "struct", (fun ii -> Tstruct ii);
105 "union", (fun ii -> Tunion ii);
106 "enum", (fun ii -> Tenum ii);
107 "typedef", (fun ii -> Ttypedef ii);
108
109 "if", (fun ii -> Tif ii);
110 "else", (fun ii -> Telse ii);
111 "break", (fun ii -> Tbreak ii);
34e49164 112 "continue", (fun ii -> Tcontinue ii);
ae4735db
C
113 "switch", (fun ii -> Tswitch ii);
114 "case", (fun ii -> Tcase ii);
115 "default", (fun ii -> Tdefault ii);
116 "for", (fun ii -> Tfor ii);
117 "do", (fun ii -> Tdo ii);
118 "while", (fun ii -> Twhile ii);
34e49164 119 "return", (fun ii -> Treturn ii);
ae4735db
C
120 "goto", (fun ii -> Tgoto ii);
121
122 "sizeof", (fun ii -> Tsizeof ii);
34e49164
C
123
124
125 (* gccext: cppext: linuxext: synonyms *)
126 "asm", (fun ii -> Tasm ii);
127 "__asm__", (fun ii -> Tasm ii);
128 "__asm", (fun ii -> Tasm ii);
129
130 "inline", (fun ii -> Tinline ii);
131 "__inline__", (fun ii -> Tinline ii);
132 "__inline", (fun ii -> Tinline ii);
34e49164
C
133
134 "__attribute__", (fun ii -> Tattribute ii);
135 "__attribute", (fun ii -> Tattribute ii);
136
137 "typeof", (fun ii -> Ttypeof ii);
138 "__typeof__", (fun ii -> Ttypeof ii);
485bce71
C
139 "__typeof", (fun ii -> Ttypeof ii);
140
978fd7e5 141 (* found a lot in expanded code *)
ae4735db 142 "__extension__", (fun ii -> TattributeNoarg ii);
978fd7e5 143
34e49164 144
485bce71 145 (* gccext: alias *)
34e49164
C
146 "__signed__", (fun ii -> Tsigned ii);
147
148 "__const__", (fun ii -> Tconst ii);
149 "__const", (fun ii -> Tconst ii);
150
ae4735db
C
151 "__volatile__", (fun ii -> Tvolatile ii);
152 "__volatile", (fun ii -> Tvolatile ii);
485bce71 153
978fd7e5
C
154 (* windowsext: *)
155 "__declspec", (fun ii -> Tattribute ii);
156
157 "__stdcall", (fun ii -> TattributeNoarg ii);
158 "__cdecl", (fun ii -> TattributeNoarg ii);
159 "WINAPI", (fun ii -> TattributeNoarg ii);
160 "APIENTRY", (fun ii -> TattributeNoarg ii);
161 "CALLBACK", (fun ii -> TattributeNoarg ii);
485bce71
C
162
163 (* c99: *)
ae4735db
C
164 (* no just "restrict" ? maybe for backward compatibility they avoided
165 * to use restrict which people may have used in their program already
485bce71 166 *)
ae4735db
C
167 "__restrict", (fun ii -> Trestrict ii);
168 "__restrict__", (fun ii -> Trestrict ii);
169
34e49164
C
170 ]
171
ae4735db 172let error_radix s =
34e49164
C
173 ("numeric " ^ s ^ " constant contains digits beyond the radix:")
174
978fd7e5 175(* julia: functions for figuring out the type of integers *)
708f4980
C
176
177let is_long_dec s int uint long ulong =
178 match !Flag_parsing_c.int_thresholds with
179 None -> int
180 | Some (_,_,uint_threshold,long_threshold,ulong_threshold) ->
181 let bn = Big_int.big_int_of_string s in
182 if Big_int.ge_big_int bn ulong_threshold
183 then ulong
184 else
185 if Big_int.ge_big_int bn long_threshold
186 then long
187 else
188 if Big_int.ge_big_int bn uint_threshold
189 then long
190 else int
191
192let is_long_ho s int uint long ulong drop bpd count =
193 match !Flag_parsing_c.int_thresholds with
194 None -> int
195 | Some (uint_sz,ulong_sz,_,_,_) ->
196 let len = String.length s in
197 (* this assumes that all of the hex/oct digits are significant *)
198 (* drop is 2 for hex (0x) and 1 for oct (0) *)
199 let s = String.sub s drop (len - drop) in
200 let len =
201 ((len-drop) * bpd) -
202 (count (int_of_string("0x"^(String.sub s 0 1)))) in
203 if len < uint_sz
204 then int
205 else
206 if len = uint_sz
207 then uint
208 else
209 if len < ulong_sz
210 then long
211 else ulong
212
213let is_long_oct s int uint long ulong =
214 is_long_ho s int uint long ulong 1 3
215 (* stupid, but probably more efficient than taking logs *)
216 (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0)
217let is_long_hex s int uint long ulong =
218 is_long_ho s int uint long ulong 2 4
219 (* stupid, but probably more efficient than taking logs *)
220 (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1
221 | _ -> 0)
222
223let sint = (Signed,CInt)
224let uint = (UnSigned,CInt)
225let slong = (Signed,CLong)
226let ulong = (UnSigned,CLong)
227
34e49164
C
228}
229
230(*****************************************************************************)
231let letter = ['A'-'Z' 'a'-'z' '_']
232let digit = ['0'-'9']
233
234(* not used for the moment *)
235let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
236 ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
237let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
ae4735db
C
238let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
239(* 7 = \a = bell in C. this is not the only char allowed !!
240 * ex @ and $ ` are valid too
34e49164
C
241 *)
242
ae4735db 243let cchar = (letter | digit | punctuation | additionnal)
34e49164
C
244
245let sp = [' ' '\t']+
246let spopt = [' ' '\t']*
247
248let dec = ['0'-'9']
249let oct = ['0'-'7']
250let hex = ['0'-'9' 'a'-'f' 'A'-'F']
251
252let decimal = ('0' | (['1'-'9'] dec*))
253let octal = ['0'] oct+
ae4735db 254let hexa = ("0x" |"0X") hex+
34e49164
C
255
256
257let pent = dec+
258let pfract = dec+
259let sign = ['-' '+']
260let exp = ['e''E'] sign? dec+
261let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
262
263let id = letter (letter | digit) *
264
265(*****************************************************************************)
266rule token = parse
267
268 (* ----------------------------------------------------------------------- *)
269 (* spacing/comments *)
270 (* ----------------------------------------------------------------------- *)
271
ae4735db
C
272 (* note: this lexer generate tokens for comments!! so can not give
273 * this lexer as-is to the parsing function. The caller must preprocess
978fd7e5 274 * it, e.g. by using techniques like cur_tok ref in parse_c.ml.
ae4735db 275 *
978fd7e5
C
276 * update: we now also generate a separate token for newlines, so now
277 * the caller may also have to reagglomerate all those commentspace
278 * tokens if he was assuming that spaces were agglomerate in a single
ae4735db 279 * token.
34e49164
C
280 *)
281
282 | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
283 (* starting a new line; the newline character followed by whitespace *)
284 { TCommentNewline (tokinfo lexbuf) }
ae4735db 285 | [' ' '\t' '\r' '\011' '\012' ]+
34e49164 286 { TCommentSpace (tokinfo lexbuf) }
ae4735db
C
287 | "/*"
288 { let info = tokinfo lexbuf in
34e49164 289 let com = comment lexbuf in
0708f913
C
290
291 let info' = info +> tok_add_s com in
292 let s = Ast_c.str_of_info info' in
ae4735db 293 (* could be more flexible, use [\t ]* instead of hardcoded
b1b2de81 294 * single space. *)
0708f913 295 match s with
ae4735db 296 | "/* {{coccinelle:skip_start}} */" ->
0708f913 297 TCommentSkipTagStart (info')
ae4735db 298 | "/* {{coccinelle:skip_end}} */" ->
0708f913 299 TCommentSkipTagEnd (info')
ae4735db 300 | _ -> TComment(info')
34e49164
C
301 }
302
303
304 (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
305 * So need this here only when dont call cpp before.
485bce71 306 * note that we don't keep the trailing \n; it will be in another token.
34e49164 307 *)
ae4735db 308 | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
34e49164
C
309
310 (* ----------------------------------------------------------------------- *)
311 (* cpp *)
312 (* ----------------------------------------------------------------------- *)
313
314 (* old:
ae4735db
C
315 * | '#' { endline lexbuf} // should be line, and not endline
316 * and endline = parse | '\n' { token lexbuf}
317 * | _ { endline lexbuf}
34e49164
C
318 *)
319
b1b2de81 320 (* less?:
ae4735db 321 * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
34e49164
C
322 * => just "#" -> token {lexbuf} (that is ignore)
323 * il y'a 1 #elif sans rien apres
324 * il y'a 1 #error sans rien apres
325 * il y'a 2 mov dede, #xxx qui genere du coup exn car
326