permit multiline comments and strings in macros
[bpt/coccinelle.git] / parsing_c / lexer_c.mll
CommitLineData
34e49164 1{
0708f913 2(* Yoann Padioleau
ae4735db 3 *
0708f913 4 * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
34e49164
C
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License (GPL)
8 * version 2 as published by the Free Software Foundation.
ae4735db 9 *
34e49164
C
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * file license.txt for more details.
14 *)
15open Common
16
17open Parser_c
18
19open Ast_c (* to factorise tokens, OpAssign, ... *)
20
21(*****************************************************************************)
22(*
ae4735db
C
23 * subtil: ocamllex use side effect on lexbuf, so must take care.
24 * For instance must do
25 *
26 * let info = tokinfo lexbuf in
27 * TComment (info +> tok_add_s (comment lexbuf))
28 *
29 * and not
30 *
31 * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
32 *
34e49164 33 * because of the "wierd" order of evaluation of OCaml.
ae4735db
C
34 *
35 *
34e49164
C
36 *
37 * note: can't use Lexer_parser._lexer_hint here to do different
38 * things, because now we call the lexer to get all the tokens
39 * (tokens_all), and then we parse. So we can't have the _lexer_hint
40 * info here. We can have it only in parse_c. For the same reason, the
ae4735db 41 * typedef handling here is now useless.
34e49164
C
42 *)
43(*****************************************************************************)
44
45(*****************************************************************************)
46(* Wrappers *)
47(*****************************************************************************)
ae4735db 48let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing
34e49164
C
49
50(*****************************************************************************)
51
52
53exception Lexical of string
54
55let tok lexbuf = Lexing.lexeme lexbuf
56
ae4735db
C
57let tokinfo lexbuf =
58 {
34e49164 59 pinfo = Ast_c.OriginTok {
ae4735db 60 Common.charpos = Lexing.lexeme_start lexbuf;
34e49164
C
61 Common.str = Lexing.lexeme lexbuf;
62 (* info filled in a post-lexing phase *)
ae4735db
C
63 Common.line = -1;
64 Common.column = -1;
34e49164
C
65 Common.file = "";
66 };
67 (* must generate a new ref each time, otherwise share *)
68 cocci_tag = ref Ast_c.emptyAnnot;
abad11c5 69 annots_tag = Token_annot.empty;
34e49164
C
70 comments_tag = ref Ast_c.emptyComments;
71 }
72
b1b2de81 73(* cppext: must generate a new ref each time, otherwise share *)
485bce71
C
74let no_ifdef_mark () = ref (None: (int * int) option)
75
34e49164 76let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
ae4735db 77
34e49164
C
78
79(* opti: less convenient, but using a hash is faster than using a match *)
80let keyword_table = Common.hash_of_list [
81
485bce71 82 (* c: *)
ae4735db
C
83 "void", (fun ii -> Tvoid ii);
84 "char", (fun ii -> Tchar ii);
85 "short", (fun ii -> Tshort ii);
86 "int", (fun ii -> Tint ii);
87 "long", (fun ii -> Tlong ii);
88 "float", (fun ii -> Tfloat ii);
89 "double", (fun ii -> Tdouble ii);
1eddfd50
C
90 "size_t", (fun ii -> Tsize_t ii);
91 "ssize_t", (fun ii -> Tssize_t ii);
92 "ptrdiff_t", (fun ii -> Tptrdiff_t ii);
ae4735db
C
93
94 "unsigned", (fun ii -> Tunsigned ii);
34e49164 95 "signed", (fun ii -> Tsigned ii);
ae4735db
C
96
97 "auto", (fun ii -> Tauto ii);
98 "register", (fun ii -> Tregister ii);
99 "extern", (fun ii -> Textern ii);
34e49164
C
100 "static", (fun ii -> Tstatic ii);
101
102 "const", (fun ii -> Tconst ii);
ae4735db
C
103 "volatile", (fun ii -> Tvolatile ii);
104
105 "struct", (fun ii -> Tstruct ii);
106 "union", (fun ii -> Tunion ii);
107 "enum", (fun ii -> Tenum ii);
108 "typedef", (fun ii -> Ttypedef ii);
109
110 "if", (fun ii -> Tif ii);
111 "else", (fun ii -> Telse ii);
112 "break", (fun ii -> Tbreak ii);
34e49164 113 "continue", (fun ii -> Tcontinue ii);
ae4735db
C
114 "switch", (fun ii -> Tswitch ii);
115 "case", (fun ii -> Tcase ii);
116 "default", (fun ii -> Tdefault ii);
117 "for", (fun ii -> Tfor ii);
118 "do", (fun ii -> Tdo ii);
119 "while", (fun ii -> Twhile ii);
34e49164 120 "return", (fun ii -> Treturn ii);
ae4735db
C
121 "goto", (fun ii -> Tgoto ii);
122
123 "sizeof", (fun ii -> Tsizeof ii);
34e49164
C
124
125
126 (* gccext: cppext: linuxext: synonyms *)
127 "asm", (fun ii -> Tasm ii);
128 "__asm__", (fun ii -> Tasm ii);
129 "__asm", (fun ii -> Tasm ii);
130
131 "inline", (fun ii -> Tinline ii);
132 "__inline__", (fun ii -> Tinline ii);
133 "__inline", (fun ii -> Tinline ii);
34e49164
C
134
135 "__attribute__", (fun ii -> Tattribute ii);
136 "__attribute", (fun ii -> Tattribute ii);
137
138 "typeof", (fun ii -> Ttypeof ii);
139 "__typeof__", (fun ii -> Ttypeof ii);
485bce71
C
140 "__typeof", (fun ii -> Ttypeof ii);
141
978fd7e5 142 (* found a lot in expanded code *)
ae4735db 143 "__extension__", (fun ii -> TattributeNoarg ii);
978fd7e5 144
34e49164 145
485bce71 146 (* gccext: alias *)
34e49164
C
147 "__signed__", (fun ii -> Tsigned ii);
148
149 "__const__", (fun ii -> Tconst ii);
150 "__const", (fun ii -> Tconst ii);
151
ae4735db
C
152 "__volatile__", (fun ii -> Tvolatile ii);
153 "__volatile", (fun ii -> Tvolatile ii);
485bce71 154
978fd7e5
C
155 (* windowsext: *)
156 "__declspec", (fun ii -> Tattribute ii);
157
158 "__stdcall", (fun ii -> TattributeNoarg ii);
159 "__cdecl", (fun ii -> TattributeNoarg ii);
160 "WINAPI", (fun ii -> TattributeNoarg ii);
161 "APIENTRY", (fun ii -> TattributeNoarg ii);
162 "CALLBACK", (fun ii -> TattributeNoarg ii);
485bce71
C
163
164 (* c99: *)
ae4735db
C
165 (* no just "restrict" ? maybe for backward compatibility they avoided
166 * to use restrict which people may have used in their program already
485bce71 167 *)
ae4735db
C
168 "__restrict", (fun ii -> Trestrict ii);
169 "__restrict__", (fun ii -> Trestrict ii);
170
34e49164
C
171 ]
172
f59c9fb7 173let cpp_keyword_table = Common.hash_of_list [
1b9ae606
C
174 "namespace", (fun ii -> Tnamespace ii);
175 "new", (fun ii -> Tnew ii);
176 "delete", (fun ii -> Tdelete ii);
177 "using", (fun ii -> TComment ii) ]
f59c9fb7 178
ae4735db 179let error_radix s =
34e49164
C
180 ("numeric " ^ s ^ " constant contains digits beyond the radix:")
181
978fd7e5 182(* julia: functions for figuring out the type of integers *)
708f4980
C
183
184let is_long_dec s int uint long ulong =
185 match !Flag_parsing_c.int_thresholds with
186 None -> int
187 | Some (_,_,uint_threshold,long_threshold,ulong_threshold) ->
188 let bn = Big_int.big_int_of_string s in
189 if Big_int.ge_big_int bn ulong_threshold
190 then ulong
191 else
192 if Big_int.ge_big_int bn long_threshold
193 then long
194 else
195 if Big_int.ge_big_int bn uint_threshold
196 then long
197 else int
198
199let is_long_ho s int uint long ulong drop bpd count =
200 match !Flag_parsing_c.int_thresholds with
201 None -> int
202 | Some (uint_sz,ulong_sz,_,_,_) ->
203 let len = String.length s in
204 (* this assumes that all of the hex/oct digits are significant *)
205 (* drop is 2 for hex (0x) and 1 for oct (0) *)
206 let s = String.sub s drop (len - drop) in
207 let len =
208 ((len-drop) * bpd) -
209 (count (int_of_string("0x"^(String.sub s 0 1)))) in
210 if len < uint_sz
211 then int
212 else
213 if len = uint_sz
214 then uint
215 else
216 if len < ulong_sz
217 then long
218 else ulong
219
220let is_long_oct s int uint long ulong =
221 is_long_ho s int uint long ulong 1 3
222 (* stupid, but probably more efficient than taking logs *)
223 (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0)
224let is_long_hex s int uint long ulong =
225 is_long_ho s int uint long ulong 2 4
226 (* stupid, but probably more efficient than taking logs *)
227 (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1
228 | _ -> 0)
229
230let sint = (Signed,CInt)
231let uint = (UnSigned,CInt)
232let slong = (Signed,CLong)
233let ulong = (UnSigned,CLong)
234
34e49164
C
235}
236
237(*****************************************************************************)
238let letter = ['A'-'Z' 'a'-'z' '_']
3a314143 239let extended_letter = ['A'-'Z' 'a'-'z' '_' ':' '<' '>' '~'](*for c++, not used*)
34e49164
C
240let digit = ['0'-'9']
241
1b9ae606
C
242let cplusplus_ident = (letter | '$') (letter | digit | '$') *
243let cplusplus_ident_ext = (letter | '~' | '$') (letter | digit | '~' | '$') *
244
34e49164 245(* not used for the moment *)
abad11c5
C
246let punctuation = ['!' '\"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
247 ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
34e49164 248let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
ae4735db
C
249let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
250(* 7 = \a = bell in C. this is not the only char allowed !!
251 * ex @ and $ ` are valid too
34e49164
C
252 *)
253
ae4735db 254let cchar = (letter | digit | punctuation | additionnal)
34e49164
C
255
256let sp = [' ' '\t']+
257let spopt = [' ' '\t']*
258
259let dec = ['0'-'9']
260let oct = ['0'-'7']
261let hex = ['0'-'9' 'a'-'f' 'A'-'F']
262
263let decimal = ('0' | (['1'-'9'] dec*))
264let octal = ['0'] oct+
ae4735db 265let hexa = ("0x" |"0X") hex+
34e49164
C
266
267
268let pent = dec+
269let pfract = dec+
270let sign = ['-' '+']
271let exp = ['e''E'] sign? dec+
272let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
273
274let id = letter (letter | digit) *
275
276(*****************************************************************************)
277rule token = parse
278
279 (* ----------------------------------------------------------------------- *)
280 (* spacing/comments *)
281 (* ----------------------------------------------------------------------- *)
282
ae4735db
C
283 (* note: this lexer generate tokens for comments!! so can not give
284 * this lexer as-is to the parsing function. The caller must preprocess
978fd7e5 285 * it, e.g. by using techniques like cur_tok ref in parse_c.ml.
ae4735db 286 *
978fd7e5
C
287 * update: we now also generate a separate token for newlines, so now
288 * the caller may also have to reagglomerate all those commentspace
289 * tokens if he was assuming that spaces were agglomerate in a single
ae4735db 290 * token.
34e49164
C
291 *)
292
293 | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
294 (* starting a new line; the newline character followed by whitespace *)
295 { TCommentNewline (tokinfo lexbuf) }
ae4735db 296 | [' ' '\t' '\r' '\011' '\012' ]+
34e49164 297 { TCommentSpace (tokinfo lexbuf) }
ae4735db
C
298 | "/*"
299 { let info = tokinfo lexbuf in
34e49164 300 let com = comment lexbuf in
0708f913
C
301
302 let info' = info +> tok_add_s com in
303 let s = Ast_c.str_of_info info' in
ae4735db 304 (* could be more flexible, use [\t ]* instead of hardcoded
b1b2de81 305 * single space. *)
0708f913 306 match s with
ae4735db 307 | "/* {{coccinelle:skip_start}} */" ->
0708f913 308 TCommentSkipTagStart (info')
ae4735db 309 | "/* {{coccinelle:skip_end}} */" ->
0708f913 310 TCommentSkipTagEnd (info')
ae4735db 311 | _ -> TComment(info')
34e49164
C
312 }
313
314
315 (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
316 * So need this here only when dont call cpp before.
485bce71 317 * note that we don't keep the trailing \n; it will be in another token.
34e49164 318 *)
ae4735db 319 | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
34e49164
C
320
321 (* ----------------------------------------------------------------------- *)
322 (* cpp *)
323 (* ----------------------------------------------------------------------- *)
324
325 (* old:
ae4735db
C
326 * | '#' { endline lexbuf} // should be line, and not endline
327 * and endline = parse | '\n' { token lexbuf}
328 * | _ { endline lexbuf}
34e49164
C
329 *)
330
b1b2de81 331 (* less?:
ae4735db 332 * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
34e49164
C
333 * => just "#" -> token {lexbuf} (that is ignore)
334 * il y'a 1 #elif sans rien apres
335 * il y'a 1 #error sans rien apres
336 * il y'a 2 mov dede, #xxx qui genere du coup exn car
337