Commit | Line | Data |
---|---|---|
34e49164 | 1 | { |
0708f913 | 2 | (* Yoann Padioleau |
ae4735db | 3 | * |
0708f913 | 4 | * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes |
34e49164 C |
5 | * |
6 | * This program is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU General Public License (GPL) | |
8 | * version 2 as published by the Free Software Foundation. | |
ae4735db | 9 | * |
34e49164 C |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | * file license.txt for more details. | |
14 | *) | |
15 | open Common | |
16 | ||
17 | open Parser_c | |
18 | ||
19 | open Ast_c (* to factorise tokens, OpAssign, ... *) | |
20 | ||
21 | (*****************************************************************************) | |
22 | (* | |
ae4735db C |
23 | * subtil: ocamllex use side effect on lexbuf, so must take care. |
24 | * For instance must do | |
25 | * | |
26 | * let info = tokinfo lexbuf in | |
27 | * TComment (info +> tok_add_s (comment lexbuf)) | |
28 | * | |
29 | * and not | |
30 | * | |
31 | * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf)) | |
32 | * | |
34e49164 | 33 | * because of the "wierd" order of evaluation of OCaml. |
ae4735db C |
34 | * |
35 | * | |
34e49164 C |
36 | * |
37 | * note: can't use Lexer_parser._lexer_hint here to do different | |
38 | * things, because now we call the lexer to get all the tokens | |
39 | * (tokens_all), and then we parse. So we can't have the _lexer_hint | |
40 | * info here. We can have it only in parse_c. For the same reason, the | |
ae4735db | 41 | * typedef handling here is now useless. |
34e49164 C |
42 | *) |
43 | (*****************************************************************************) | |
44 | ||
45 | (*****************************************************************************) | |
46 | (* Wrappers *) | |
47 | (*****************************************************************************) | |
ae4735db | 48 | let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing |
34e49164 C |
49 | |
50 | (*****************************************************************************) | |
51 | ||
52 | ||
53 | exception Lexical of string | |
54 | ||
55 | let tok lexbuf = Lexing.lexeme lexbuf | |
56 | ||
ae4735db C |
57 | let tokinfo lexbuf = |
58 | { | |
34e49164 | 59 | pinfo = Ast_c.OriginTok { |
ae4735db | 60 | Common.charpos = Lexing.lexeme_start lexbuf; |
34e49164 C |
61 | Common.str = Lexing.lexeme lexbuf; |
62 | (* info filled in a post-lexing phase *) | |
ae4735db C |
63 | Common.line = -1; |
64 | Common.column = -1; | |
34e49164 C |
65 | Common.file = ""; |
66 | }; | |
67 | (* must generate a new ref each time, otherwise share *) | |
68 | cocci_tag = ref Ast_c.emptyAnnot; | |
abad11c5 | 69 | annots_tag = Token_annot.empty; |
34e49164 C |
70 | comments_tag = ref Ast_c.emptyComments; |
71 | } | |
72 | ||
b1b2de81 | 73 | (* cppext: must generate a new ref each time, otherwise share *) |
485bce71 C |
74 | let no_ifdef_mark () = ref (None: (int * int) option) |
75 | ||
34e49164 | 76 | let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii |
ae4735db | 77 | |
34e49164 C |
78 | |
79 | (* opti: less convenient, but using a hash is faster than using a match *) | |
80 | let keyword_table = Common.hash_of_list [ | |
81 | ||
485bce71 | 82 | (* c: *) |
ae4735db C |
83 | "void", (fun ii -> Tvoid ii); |
84 | "char", (fun ii -> Tchar ii); | |
85 | "short", (fun ii -> Tshort ii); | |
86 | "int", (fun ii -> Tint ii); | |
87 | "long", (fun ii -> Tlong ii); | |
88 | "float", (fun ii -> Tfloat ii); | |
89 | "double", (fun ii -> Tdouble ii); | |
1eddfd50 C |
90 | "size_t", (fun ii -> Tsize_t ii); |
91 | "ssize_t", (fun ii -> Tssize_t ii); | |
92 | "ptrdiff_t", (fun ii -> Tptrdiff_t ii); | |
ae4735db C |
93 | |
94 | "unsigned", (fun ii -> Tunsigned ii); | |
34e49164 | 95 | "signed", (fun ii -> Tsigned ii); |
ae4735db C |
96 | |
97 | "auto", (fun ii -> Tauto ii); | |
98 | "register", (fun ii -> Tregister ii); | |
99 | "extern", (fun ii -> Textern ii); | |
34e49164 C |
100 | "static", (fun ii -> Tstatic ii); |
101 | ||
102 | "const", (fun ii -> Tconst ii); | |
ae4735db C |
103 | "volatile", (fun ii -> Tvolatile ii); |
104 | ||
105 | "struct", (fun ii -> Tstruct ii); | |
106 | "union", (fun ii -> Tunion ii); | |
107 | "enum", (fun ii -> Tenum ii); | |
108 | "typedef", (fun ii -> Ttypedef ii); | |
109 | ||
110 | "if", (fun ii -> Tif ii); | |
111 | "else", (fun ii -> Telse ii); | |
112 | "break", (fun ii -> Tbreak ii); | |
34e49164 | 113 | "continue", (fun ii -> Tcontinue ii); |
ae4735db C |
114 | "switch", (fun ii -> Tswitch ii); |
115 | "case", (fun ii -> Tcase ii); | |
116 | "default", (fun ii -> Tdefault ii); | |
117 | "for", (fun ii -> Tfor ii); | |
118 | "do", (fun ii -> Tdo ii); | |
119 | "while", (fun ii -> Twhile ii); | |
34e49164 | 120 | "return", (fun ii -> Treturn ii); |
ae4735db C |
121 | "goto", (fun ii -> Tgoto ii); |
122 | ||
123 | "sizeof", (fun ii -> Tsizeof ii); | |
34e49164 C |
124 | |
125 | ||
126 | (* gccext: cppext: linuxext: synonyms *) | |
127 | "asm", (fun ii -> Tasm ii); | |
128 | "__asm__", (fun ii -> Tasm ii); | |
129 | "__asm", (fun ii -> Tasm ii); | |
130 | ||
131 | "inline", (fun ii -> Tinline ii); | |
132 | "__inline__", (fun ii -> Tinline ii); | |
133 | "__inline", (fun ii -> Tinline ii); | |
34e49164 C |
134 | |
135 | "__attribute__", (fun ii -> Tattribute ii); | |
136 | "__attribute", (fun ii -> Tattribute ii); | |
137 | ||
138 | "typeof", (fun ii -> Ttypeof ii); | |
139 | "__typeof__", (fun ii -> Ttypeof ii); | |
485bce71 C |
140 | "__typeof", (fun ii -> Ttypeof ii); |
141 | ||
978fd7e5 | 142 | (* found a lot in expanded code *) |
ae4735db | 143 | "__extension__", (fun ii -> TattributeNoarg ii); |
978fd7e5 | 144 | |
34e49164 | 145 | |
485bce71 | 146 | (* gccext: alias *) |
34e49164 C |
147 | "__signed__", (fun ii -> Tsigned ii); |
148 | ||
149 | "__const__", (fun ii -> Tconst ii); | |
150 | "__const", (fun ii -> Tconst ii); | |
151 | ||
ae4735db C |
152 | "__volatile__", (fun ii -> Tvolatile ii); |
153 | "__volatile", (fun ii -> Tvolatile ii); | |
485bce71 | 154 | |
978fd7e5 C |
155 | (* windowsext: *) |
156 | "__declspec", (fun ii -> Tattribute ii); | |
157 | ||
158 | "__stdcall", (fun ii -> TattributeNoarg ii); | |
159 | "__cdecl", (fun ii -> TattributeNoarg ii); | |
160 | "WINAPI", (fun ii -> TattributeNoarg ii); | |
161 | "APIENTRY", (fun ii -> TattributeNoarg ii); | |
162 | "CALLBACK", (fun ii -> TattributeNoarg ii); | |
485bce71 C |
163 | |
164 | (* c99: *) | |
ae4735db C |
165 | (* no just "restrict" ? maybe for backward compatibility they avoided |
166 | * to use restrict which people may have used in their program already | |
485bce71 | 167 | *) |
ae4735db C |
168 | "__restrict", (fun ii -> Trestrict ii); |
169 | "__restrict__", (fun ii -> Trestrict ii); | |
170 | ||
34e49164 C |
171 | ] |
172 | ||
f59c9fb7 | 173 | let cpp_keyword_table = Common.hash_of_list [ |
1b9ae606 C |
174 | "namespace", (fun ii -> Tnamespace ii); |
175 | "new", (fun ii -> Tnew ii); | |
176 | "delete", (fun ii -> Tdelete ii); | |
177 | "using", (fun ii -> TComment ii) ] | |
f59c9fb7 | 178 | |
ae4735db | 179 | let error_radix s = |
34e49164 C |
180 | ("numeric " ^ s ^ " constant contains digits beyond the radix:") |
181 | ||
978fd7e5 | 182 | (* julia: functions for figuring out the type of integers *) |
708f4980 C |
183 | |
184 | let is_long_dec s int uint long ulong = | |
185 | match !Flag_parsing_c.int_thresholds with | |
186 | None -> int | |
187 | | Some (_,_,uint_threshold,long_threshold,ulong_threshold) -> | |
188 | let bn = Big_int.big_int_of_string s in | |
189 | if Big_int.ge_big_int bn ulong_threshold | |
190 | then ulong | |
191 | else | |
192 | if Big_int.ge_big_int bn long_threshold | |
193 | then long | |
194 | else | |
195 | if Big_int.ge_big_int bn uint_threshold | |
196 | then long | |
197 | else int | |
198 | ||
199 | let is_long_ho s int uint long ulong drop bpd count = | |
200 | match !Flag_parsing_c.int_thresholds with | |
201 | None -> int | |
202 | | Some (uint_sz,ulong_sz,_,_,_) -> | |
203 | let len = String.length s in | |
204 | (* this assumes that all of the hex/oct digits are significant *) | |
205 | (* drop is 2 for hex (0x) and 1 for oct (0) *) | |
206 | let s = String.sub s drop (len - drop) in | |
207 | let len = | |
208 | ((len-drop) * bpd) - | |
209 | (count (int_of_string("0x"^(String.sub s 0 1)))) in | |
210 | if len < uint_sz | |
211 | then int | |
212 | else | |
213 | if len = uint_sz | |
214 | then uint | |
215 | else | |
216 | if len < ulong_sz | |
217 | then long | |
218 | else ulong | |
219 | ||
220 | let is_long_oct s int uint long ulong = | |
221 | is_long_ho s int uint long ulong 1 3 | |
222 | (* stupid, but probably more efficient than taking logs *) | |
223 | (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0) | |
224 | let is_long_hex s int uint long ulong = | |
225 | is_long_ho s int uint long ulong 2 4 | |
226 | (* stupid, but probably more efficient than taking logs *) | |
227 | (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1 | |
228 | | _ -> 0) | |
229 | ||
230 | let sint = (Signed,CInt) | |
231 | let uint = (UnSigned,CInt) | |
232 | let slong = (Signed,CLong) | |
233 | let ulong = (UnSigned,CLong) | |
234 | ||
34e49164 C |
235 | } |
236 | ||
237 | (*****************************************************************************) | |
238 | let letter = ['A'-'Z' 'a'-'z' '_'] | |
3a314143 | 239 | let extended_letter = ['A'-'Z' 'a'-'z' '_' ':' '<' '>' '~'](*for c++, not used*) |
34e49164 C |
240 | let digit = ['0'-'9'] |
241 | ||
1b9ae606 C |
242 | let cplusplus_ident = (letter | '$') (letter | digit | '$') * |
243 | let cplusplus_ident_ext = (letter | '~' | '$') (letter | digit | '~' | '$') * | |
244 | ||
34e49164 | 245 | (* not used for the moment *) |
abad11c5 C |
246 | let punctuation = ['!' '\"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':' |
247 | ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~'] | |
34e49164 | 248 | let space = [' ' '\t' '\n' '\r' '\011' '\012' ] |
ae4735db C |
249 | let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ] |
250 | (* 7 = \a = bell in C. this is not the only char allowed !! | |
251 | * ex @ and $ ` are valid too | |
34e49164 C |
252 | *) |
253 | ||
ae4735db | 254 | let cchar = (letter | digit | punctuation | additionnal) |
34e49164 C |
255 | |
256 | let sp = [' ' '\t']+ | |
257 | let spopt = [' ' '\t']* | |
258 | ||
259 | let dec = ['0'-'9'] | |
260 | let oct = ['0'-'7'] | |
261 | let hex = ['0'-'9' 'a'-'f' 'A'-'F'] | |
262 | ||
263 | let decimal = ('0' | (['1'-'9'] dec*)) | |
264 | let octal = ['0'] oct+ | |
ae4735db | 265 | let hexa = ("0x" |"0X") hex+ |
34e49164 C |
266 | |
267 | ||
268 | let pent = dec+ | |
269 | let pfract = dec+ | |
270 | let sign = ['-' '+'] | |
271 | let exp = ['e''E'] sign? dec+ | |
272 | let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?) | |
273 | ||
274 | let id = letter (letter | digit) * | |
275 | ||
276 | (*****************************************************************************) | |
277 | rule token = parse | |
278 | ||
279 | (* ----------------------------------------------------------------------- *) | |
280 | (* spacing/comments *) | |
281 | (* ----------------------------------------------------------------------- *) | |
282 | ||
ae4735db C |
283 | (* note: this lexer generate tokens for comments!! so can not give |
284 | * this lexer as-is to the parsing function. The caller must preprocess | |
978fd7e5 | 285 | * it, e.g. by using techniques like cur_tok ref in parse_c.ml. |
ae4735db | 286 | * |
978fd7e5 C |
287 | * update: we now also generate a separate token for newlines, so now |
288 | * the caller may also have to reagglomerate all those commentspace | |
289 | * tokens if he was assuming that spaces were agglomerate in a single | |
ae4735db | 290 | * token. |
34e49164 C |
291 | *) |
292 | ||
293 | | ['\n'] [' ' '\t' '\r' '\011' '\012' ]* | |
294 | (* starting a new line; the newline character followed by whitespace *) | |
295 | { TCommentNewline (tokinfo lexbuf) } | |
ae4735db | 296 | | [' ' '\t' '\r' '\011' '\012' ]+ |
34e49164 | 297 | { TCommentSpace (tokinfo lexbuf) } |
ae4735db C |
298 | | "/*" |
299 | { let info = tokinfo lexbuf in | |
34e49164 | 300 | let com = comment lexbuf in |
0708f913 C |
301 | |
302 | let info' = info +> tok_add_s com in | |
303 | let s = Ast_c.str_of_info info' in | |
ae4735db | 304 | (* could be more flexible, use [\t ]* instead of hardcoded |
b1b2de81 | 305 | * single space. *) |
0708f913 | 306 | match s with |
ae4735db | 307 | | "/* {{coccinelle:skip_start}} */" -> |
0708f913 | 308 | TCommentSkipTagStart (info') |
ae4735db | 309 | | "/* {{coccinelle:skip_end}} */" -> |
0708f913 | 310 | TCommentSkipTagEnd (info') |
ae4735db | 311 | | _ -> TComment(info') |
34e49164 C |
312 | } |
313 | ||
314 | ||
315 | (* C++ comment are allowed via gccext, but normally they are deleted by cpp. | |
316 | * So need this here only when dont call cpp before. | |
485bce71 | 317 | * note that we don't keep the trailing \n; it will be in another token. |
34e49164 | 318 | *) |
ae4735db | 319 | | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) } |
34e49164 C |
320 | |
321 | (* ----------------------------------------------------------------------- *) | |
322 | (* cpp *) | |
323 | (* ----------------------------------------------------------------------- *) | |
324 | ||
325 | (* old: | |
ae4735db C |
326 | * | '#' { endline lexbuf} // should be line, and not endline |
327 | * and endline = parse | '\n' { token lexbuf} | |
328 | * | _ { endline lexbuf} | |
34e49164 C |
329 | *) |
330 | ||
b1b2de81 | 331 | (* less?: |
ae4735db | 332 | * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ... |
34e49164 C |
333 | * => just "#" -> token {lexbuf} (that is ignore) |
334 | * il y'a 1 #elif sans rien apres | |
335 | * il y'a 1 #error sans rien apres | |
336 | * il y'a 2 mov dede, #xxx qui genere du coup exn car | |
337 |