Commit | Line | Data |
---|---|---|
34e49164 | 1 | { |
0708f913 | 2 | (* Yoann Padioleau |
ae4735db | 3 | * |
0708f913 | 4 | * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes |
34e49164 C |
5 | * |
6 | * This program is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU General Public License (GPL) | |
8 | * version 2 as published by the Free Software Foundation. | |
ae4735db | 9 | * |
34e49164 C |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | * file license.txt for more details. | |
14 | *) | |
15 | open Common | |
16 | ||
17 | open Parser_c | |
18 | ||
19 | open Ast_c (* to factorise tokens, OpAssign, ... *) | |
20 | ||
21 | (*****************************************************************************) | |
22 | (* | |
ae4735db C |
23 | * subtil: ocamllex use side effect on lexbuf, so must take care. |
24 | * For instance must do | |
25 | * | |
26 | * let info = tokinfo lexbuf in | |
27 | * TComment (info +> tok_add_s (comment lexbuf)) | |
28 | * | |
29 | * and not | |
30 | * | |
31 | * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf)) | |
32 | * | |
34e49164 | 33 | * because of the "wierd" order of evaluation of OCaml. |
ae4735db C |
34 | * |
35 | * | |
34e49164 C |
36 | * |
37 | * note: can't use Lexer_parser._lexer_hint here to do different | |
38 | * things, because now we call the lexer to get all the tokens | |
39 | * (tokens_all), and then we parse. So we can't have the _lexer_hint | |
40 | * info here. We can have it only in parse_c. For the same reason, the | |
ae4735db | 41 | * typedef handling here is now useless. |
34e49164 C |
42 | *) |
43 | (*****************************************************************************) | |
44 | ||
45 | (*****************************************************************************) | |
46 | (* Wrappers *) | |
47 | (*****************************************************************************) | |
ae4735db | 48 | let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing |
34e49164 C |
49 | |
50 | (*****************************************************************************) | |
51 | ||
52 | ||
53 | exception Lexical of string | |
54 | ||
55 | let tok lexbuf = Lexing.lexeme lexbuf | |
56 | ||
ae4735db C |
57 | let tokinfo lexbuf = |
58 | { | |
34e49164 | 59 | pinfo = Ast_c.OriginTok { |
ae4735db | 60 | Common.charpos = Lexing.lexeme_start lexbuf; |
34e49164 C |
61 | Common.str = Lexing.lexeme lexbuf; |
62 | (* info filled in a post-lexing phase *) | |
ae4735db C |
63 | Common.line = -1; |
64 | Common.column = -1; | |
34e49164 C |
65 | Common.file = ""; |
66 | }; | |
67 | (* must generate a new ref each time, otherwise share *) | |
68 | cocci_tag = ref Ast_c.emptyAnnot; | |
69 | comments_tag = ref Ast_c.emptyComments; | |
70 | } | |
71 | ||
b1b2de81 | 72 | (* cppext: must generate a new ref each time, otherwise share *) |
485bce71 C |
73 | let no_ifdef_mark () = ref (None: (int * int) option) |
74 | ||
34e49164 | 75 | let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii |
ae4735db | 76 | |
34e49164 C |
77 | |
78 | (* opti: less convenient, but using a hash is faster than using a match *) | |
79 | let keyword_table = Common.hash_of_list [ | |
80 | ||
485bce71 | 81 | (* c: *) |
ae4735db C |
82 | "void", (fun ii -> Tvoid ii); |
83 | "char", (fun ii -> Tchar ii); | |
84 | "short", (fun ii -> Tshort ii); | |
85 | "int", (fun ii -> Tint ii); | |
86 | "long", (fun ii -> Tlong ii); | |
87 | "float", (fun ii -> Tfloat ii); | |
88 | "double", (fun ii -> Tdouble ii); | |
1eddfd50 C |
89 | "size_t", (fun ii -> Tsize_t ii); |
90 | "ssize_t", (fun ii -> Tssize_t ii); | |
91 | "ptrdiff_t", (fun ii -> Tptrdiff_t ii); | |
ae4735db C |
92 | |
93 | "unsigned", (fun ii -> Tunsigned ii); | |
34e49164 | 94 | "signed", (fun ii -> Tsigned ii); |
ae4735db C |
95 | |
96 | "auto", (fun ii -> Tauto ii); | |
97 | "register", (fun ii -> Tregister ii); | |
98 | "extern", (fun ii -> Textern ii); | |
34e49164 C |
99 | "static", (fun ii -> Tstatic ii); |
100 | ||
101 | "const", (fun ii -> Tconst ii); | |
ae4735db C |
102 | "volatile", (fun ii -> Tvolatile ii); |
103 | ||
104 | "struct", (fun ii -> Tstruct ii); | |
105 | "union", (fun ii -> Tunion ii); | |
106 | "enum", (fun ii -> Tenum ii); | |
107 | "typedef", (fun ii -> Ttypedef ii); | |
108 | ||
109 | "if", (fun ii -> Tif ii); | |
110 | "else", (fun ii -> Telse ii); | |
111 | "break", (fun ii -> Tbreak ii); | |
34e49164 | 112 | "continue", (fun ii -> Tcontinue ii); |
ae4735db C |
113 | "switch", (fun ii -> Tswitch ii); |
114 | "case", (fun ii -> Tcase ii); | |
115 | "default", (fun ii -> Tdefault ii); | |
116 | "for", (fun ii -> Tfor ii); | |
117 | "do", (fun ii -> Tdo ii); | |
118 | "while", (fun ii -> Twhile ii); | |
34e49164 | 119 | "return", (fun ii -> Treturn ii); |
ae4735db C |
120 | "goto", (fun ii -> Tgoto ii); |
121 | ||
122 | "sizeof", (fun ii -> Tsizeof ii); | |
34e49164 C |
123 | |
124 | ||
125 | (* gccext: cppext: linuxext: synonyms *) | |
126 | "asm", (fun ii -> Tasm ii); | |
127 | "__asm__", (fun ii -> Tasm ii); | |
128 | "__asm", (fun ii -> Tasm ii); | |
129 | ||
130 | "inline", (fun ii -> Tinline ii); | |
131 | "__inline__", (fun ii -> Tinline ii); | |
132 | "__inline", (fun ii -> Tinline ii); | |
34e49164 C |
133 | |
134 | "__attribute__", (fun ii -> Tattribute ii); | |
135 | "__attribute", (fun ii -> Tattribute ii); | |
136 | ||
137 | "typeof", (fun ii -> Ttypeof ii); | |
138 | "__typeof__", (fun ii -> Ttypeof ii); | |
485bce71 C |
139 | "__typeof", (fun ii -> Ttypeof ii); |
140 | ||
978fd7e5 | 141 | (* found a lot in expanded code *) |
ae4735db | 142 | "__extension__", (fun ii -> TattributeNoarg ii); |
978fd7e5 | 143 | |
34e49164 | 144 | |
485bce71 | 145 | (* gccext: alias *) |
34e49164 C |
146 | "__signed__", (fun ii -> Tsigned ii); |
147 | ||
148 | "__const__", (fun ii -> Tconst ii); | |
149 | "__const", (fun ii -> Tconst ii); | |
150 | ||
ae4735db C |
151 | "__volatile__", (fun ii -> Tvolatile ii); |
152 | "__volatile", (fun ii -> Tvolatile ii); | |
485bce71 | 153 | |
978fd7e5 C |
154 | (* windowsext: *) |
155 | "__declspec", (fun ii -> Tattribute ii); | |
156 | ||
157 | "__stdcall", (fun ii -> TattributeNoarg ii); | |
158 | "__cdecl", (fun ii -> TattributeNoarg ii); | |
159 | "WINAPI", (fun ii -> TattributeNoarg ii); | |
160 | "APIENTRY", (fun ii -> TattributeNoarg ii); | |
161 | "CALLBACK", (fun ii -> TattributeNoarg ii); | |
485bce71 C |
162 | |
163 | (* c99: *) | |
ae4735db C |
164 | (* no just "restrict" ? maybe for backward compatibility they avoided |
165 | * to use restrict which people may have used in their program already | |
485bce71 | 166 | *) |
ae4735db C |
167 | "__restrict", (fun ii -> Trestrict ii); |
168 | "__restrict__", (fun ii -> Trestrict ii); | |
169 | ||
34e49164 C |
170 | ] |
171 | ||
f59c9fb7 | 172 | let cpp_keyword_table = Common.hash_of_list [ |
4dfbc1c2 | 173 | "new", (fun ii -> Tnew ii); |
5427db06 C |
174 | "delete",(fun ii -> Tdelete ii); |
175 | "using", (fun ii -> TComment ii) ] | |
f59c9fb7 | 176 | |
ae4735db | 177 | let error_radix s = |
34e49164 C |
178 | ("numeric " ^ s ^ " constant contains digits beyond the radix:") |
179 | ||
978fd7e5 | 180 | (* julia: functions for figuring out the type of integers *) |
708f4980 C |
181 | |
182 | let is_long_dec s int uint long ulong = | |
183 | match !Flag_parsing_c.int_thresholds with | |
184 | None -> int | |
185 | | Some (_,_,uint_threshold,long_threshold,ulong_threshold) -> | |
186 | let bn = Big_int.big_int_of_string s in | |
187 | if Big_int.ge_big_int bn ulong_threshold | |
188 | then ulong | |
189 | else | |
190 | if Big_int.ge_big_int bn long_threshold | |
191 | then long | |
192 | else | |
193 | if Big_int.ge_big_int bn uint_threshold | |
194 | then long | |
195 | else int | |
196 | ||
197 | let is_long_ho s int uint long ulong drop bpd count = | |
198 | match !Flag_parsing_c.int_thresholds with | |
199 | None -> int | |
200 | | Some (uint_sz,ulong_sz,_,_,_) -> | |
201 | let len = String.length s in | |
202 | (* this assumes that all of the hex/oct digits are significant *) | |
203 | (* drop is 2 for hex (0x) and 1 for oct (0) *) | |
204 | let s = String.sub s drop (len - drop) in | |
205 | let len = | |
206 | ((len-drop) * bpd) - | |
207 | (count (int_of_string("0x"^(String.sub s 0 1)))) in | |
208 | if len < uint_sz | |
209 | then int | |
210 | else | |
211 | if len = uint_sz | |
212 | then uint | |
213 | else | |
214 | if len < ulong_sz | |
215 | then long | |
216 | else ulong | |
217 | ||
218 | let is_long_oct s int uint long ulong = | |
219 | is_long_ho s int uint long ulong 1 3 | |
220 | (* stupid, but probably more efficient than taking logs *) | |
221 | (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0) | |
222 | let is_long_hex s int uint long ulong = | |
223 | is_long_ho s int uint long ulong 2 4 | |
224 | (* stupid, but probably more efficient than taking logs *) | |
225 | (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1 | |
226 | | _ -> 0) | |
227 | ||
228 | let sint = (Signed,CInt) | |
229 | let uint = (UnSigned,CInt) | |
230 | let slong = (Signed,CLong) | |
231 | let ulong = (UnSigned,CLong) | |
232 | ||
34e49164 C |
233 | } |
234 | ||
235 | (*****************************************************************************) | |
236 | let letter = ['A'-'Z' 'a'-'z' '_'] | |
3a314143 | 237 | let extended_letter = ['A'-'Z' 'a'-'z' '_' ':' '<' '>' '~'](*for c++, not used*) |
34e49164 C |
238 | let digit = ['0'-'9'] |
239 | ||
240 | (* not used for the moment *) | |
241 | let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':' | |
242 | ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~'] | |
243 | let space = [' ' '\t' '\n' '\r' '\011' '\012' ] | |
ae4735db C |
244 | let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ] |
245 | (* 7 = \a = bell in C. this is not the only char allowed !! | |
246 | * ex @ and $ ` are valid too | |
34e49164 C |
247 | *) |
248 | ||
ae4735db | 249 | let cchar = (letter | digit | punctuation | additionnal) |
34e49164 C |
250 | |
251 | let sp = [' ' '\t']+ | |
252 | let spopt = [' ' '\t']* | |
253 | ||
254 | let dec = ['0'-'9'] | |
255 | let oct = ['0'-'7'] | |
256 | let hex = ['0'-'9' 'a'-'f' 'A'-'F'] | |
257 | ||
258 | let decimal = ('0' | (['1'-'9'] dec*)) | |
259 | let octal = ['0'] oct+ | |
ae4735db | 260 | let hexa = ("0x" |"0X") hex+ |
34e49164 C |
261 | |
262 | ||
263 | let pent = dec+ | |
264 | let pfract = dec+ | |
265 | let sign = ['-' '+'] | |
266 | let exp = ['e''E'] sign? dec+ | |
267 | let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?) | |
268 | ||
269 | let id = letter (letter | digit) * | |
270 | ||
271 | (*****************************************************************************) | |
272 | rule token = parse | |
273 | ||
274 | (* ----------------------------------------------------------------------- *) | |
275 | (* spacing/comments *) | |
276 | (* ----------------------------------------------------------------------- *) | |
277 | ||
ae4735db C |
278 | (* note: this lexer generate tokens for comments!! so can not give |
279 | * this lexer as-is to the parsing function. The caller must preprocess | |
978fd7e5 | 280 | * it, e.g. by using techniques like cur_tok ref in parse_c.ml. |
ae4735db | 281 | * |
978fd7e5 C |
282 | * update: we now also generate a separate token for newlines, so now |
283 | * the caller may also have to reagglomerate all those commentspace | |
284 | * tokens if he was assuming that spaces were agglomerate in a single | |
ae4735db | 285 | * token. |
34e49164 C |
286 | *) |
287 | ||
288 | | ['\n'] [' ' '\t' '\r' '\011' '\012' ]* | |
289 | (* starting a new line; the newline character followed by whitespace *) | |
290 | { TCommentNewline (tokinfo lexbuf) } | |
ae4735db | 291 | | [' ' '\t' '\r' '\011' '\012' ]+ |
34e49164 | 292 | { TCommentSpace (tokinfo lexbuf) } |
ae4735db C |
293 | | "/*" |
294 | { let info = tokinfo lexbuf in | |
34e49164 | 295 | let com = comment lexbuf in |
0708f913 C |
296 | |
297 | let info' = info +> tok_add_s com in | |
298 | let s = Ast_c.str_of_info info' in | |
ae4735db | 299 | (* could be more flexible, use [\t ]* instead of hardcoded |
b1b2de81 | 300 | * single space. *) |
0708f913 | 301 | match s with |
ae4735db | 302 | | "/* {{coccinelle:skip_start}} */" -> |
0708f913 | 303 | TCommentSkipTagStart (info') |
ae4735db | 304 | | "/* {{coccinelle:skip_end}} */" -> |
0708f913 | 305 | TCommentSkipTagEnd (info') |
ae4735db | 306 | | _ -> TComment(info') |
34e49164 C |
307 | } |
308 | ||
309 | ||
310 | (* C++ comment are allowed via gccext, but normally they are deleted by cpp. | |
311 | * So need this here only when dont call cpp before. | |
485bce71 | 312 | * note that we don't keep the trailing \n; it will be in another token. |
34e49164 | 313 | *) |
ae4735db | 314 | | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) } |
34e49164 C |
315 | |
316 | (* ----------------------------------------------------------------------- *) | |
317 | (* cpp *) | |
318 | (* ----------------------------------------------------------------------- *) | |
319 | ||
320 | (* old: | |
ae4735db C |
321 | * | '#' { endline lexbuf} // should be line, and not endline |
322 | * and endline = parse | '\n' { token lexbuf} | |
323 | * | _ { endline lexbuf} | |
34e49164 C |
324 | *) |
325 | ||
b1b2de81 | 326 | (* less?: |
ae4735db | 327 | * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ... |
34e49164 C |
328 | * => just "#" -> token {lexbuf} (that is ignore) |
329 | * il y'a 1 #elif sans rien apres | |
330 | * il y'a 1 #error sans rien apres | |
331 | * il y'a 2 mov dede, #xxx qui genere du coup exn car | |
332 |