Commit | Line | Data |
---|---|---|
34e49164 | 1 | { |
0708f913 | 2 | (* Yoann Padioleau |
ae4735db | 3 | * |
0708f913 | 4 | * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes |
34e49164 C |
5 | * |
6 | * This program is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU General Public License (GPL) | |
8 | * version 2 as published by the Free Software Foundation. | |
ae4735db | 9 | * |
34e49164 C |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | * file license.txt for more details. | |
14 | *) | |
15 | open Common | |
16 | ||
17 | open Parser_c | |
18 | ||
19 | open Ast_c (* to factorise tokens, OpAssign, ... *) | |
20 | ||
21 | (*****************************************************************************) | |
22 | (* | |
ae4735db C |
23 | * subtil: ocamllex use side effect on lexbuf, so must take care. |
24 | * For instance must do | |
25 | * | |
26 | * let info = tokinfo lexbuf in | |
27 | * TComment (info +> tok_add_s (comment lexbuf)) | |
28 | * | |
29 | * and not | |
30 | * | |
31 | * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf)) | |
32 | * | |
34e49164 | 33 | * because of the "wierd" order of evaluation of OCaml. |
ae4735db C |
34 | * |
35 | * | |
34e49164 C |
36 | * |
37 | * note: can't use Lexer_parser._lexer_hint here to do different | |
38 | * things, because now we call the lexer to get all the tokens | |
39 | * (tokens_all), and then we parse. So we can't have the _lexer_hint | |
40 | * info here. We can have it only in parse_c. For the same reason, the | |
ae4735db | 41 | * typedef handling here is now useless. |
34e49164 C |
42 | *) |
43 | (*****************************************************************************) | |
44 | ||
45 | (*****************************************************************************) | |
46 | (* Wrappers *) | |
47 | (*****************************************************************************) | |
ae4735db | 48 | let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing |
34e49164 C |
49 | |
50 | (*****************************************************************************) | |
51 | ||
52 | ||
53 | exception Lexical of string | |
54 | ||
55 | let tok lexbuf = Lexing.lexeme lexbuf | |
56 | ||
ae4735db C |
57 | let tokinfo lexbuf = |
58 | { | |
34e49164 | 59 | pinfo = Ast_c.OriginTok { |
ae4735db | 60 | Common.charpos = Lexing.lexeme_start lexbuf; |
34e49164 C |
61 | Common.str = Lexing.lexeme lexbuf; |
62 | (* info filled in a post-lexing phase *) | |
ae4735db C |
63 | Common.line = -1; |
64 | Common.column = -1; | |
34e49164 C |
65 | Common.file = ""; |
66 | }; | |
67 | (* must generate a new ref each time, otherwise share *) | |
68 | cocci_tag = ref Ast_c.emptyAnnot; | |
69 | comments_tag = ref Ast_c.emptyComments; | |
70 | } | |
71 | ||
b1b2de81 | 72 | (* cppext: must generate a new ref each time, otherwise share *) |
485bce71 C |
73 | let no_ifdef_mark () = ref (None: (int * int) option) |
74 | ||
34e49164 | 75 | let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii |
ae4735db | 76 | |
34e49164 C |
77 | |
78 | (* opti: less convenient, but using a hash is faster than using a match *) | |
79 | let keyword_table = Common.hash_of_list [ | |
80 | ||
485bce71 | 81 | (* c: *) |
ae4735db C |
82 | "void", (fun ii -> Tvoid ii); |
83 | "char", (fun ii -> Tchar ii); | |
84 | "short", (fun ii -> Tshort ii); | |
85 | "int", (fun ii -> Tint ii); | |
86 | "long", (fun ii -> Tlong ii); | |
87 | "float", (fun ii -> Tfloat ii); | |
88 | "double", (fun ii -> Tdouble ii); | |
1eddfd50 C |
89 | "size_t", (fun ii -> Tsize_t ii); |
90 | "ssize_t", (fun ii -> Tssize_t ii); | |
91 | "ptrdiff_t", (fun ii -> Tptrdiff_t ii); | |
ae4735db C |
92 | |
93 | "unsigned", (fun ii -> Tunsigned ii); | |
34e49164 | 94 | "signed", (fun ii -> Tsigned ii); |
ae4735db C |
95 | |
96 | "auto", (fun ii -> Tauto ii); | |
97 | "register", (fun ii -> Tregister ii); | |
98 | "extern", (fun ii -> Textern ii); | |
34e49164 C |
99 | "static", (fun ii -> Tstatic ii); |
100 | ||
101 | "const", (fun ii -> Tconst ii); | |
ae4735db C |
102 | "volatile", (fun ii -> Tvolatile ii); |
103 | ||
104 | "struct", (fun ii -> Tstruct ii); | |
105 | "union", (fun ii -> Tunion ii); | |
106 | "enum", (fun ii -> Tenum ii); | |
107 | "typedef", (fun ii -> Ttypedef ii); | |
108 | ||
109 | "if", (fun ii -> Tif ii); | |
110 | "else", (fun ii -> Telse ii); | |
111 | "break", (fun ii -> Tbreak ii); | |
34e49164 | 112 | "continue", (fun ii -> Tcontinue ii); |
ae4735db C |
113 | "switch", (fun ii -> Tswitch ii); |
114 | "case", (fun ii -> Tcase ii); | |
115 | "default", (fun ii -> Tdefault ii); | |
116 | "for", (fun ii -> Tfor ii); | |
117 | "do", (fun ii -> Tdo ii); | |
118 | "while", (fun ii -> Twhile ii); | |
34e49164 | 119 | "return", (fun ii -> Treturn ii); |
ae4735db C |
120 | "goto", (fun ii -> Tgoto ii); |
121 | ||
122 | "sizeof", (fun ii -> Tsizeof ii); | |
34e49164 C |
123 | |
124 | ||
125 | (* gccext: cppext: linuxext: synonyms *) | |
126 | "asm", (fun ii -> Tasm ii); | |
127 | "__asm__", (fun ii -> Tasm ii); | |
128 | "__asm", (fun ii -> Tasm ii); | |
129 | ||
130 | "inline", (fun ii -> Tinline ii); | |
131 | "__inline__", (fun ii -> Tinline ii); | |
132 | "__inline", (fun ii -> Tinline ii); | |
34e49164 C |
133 | |
134 | "__attribute__", (fun ii -> Tattribute ii); | |
135 | "__attribute", (fun ii -> Tattribute ii); | |
136 | ||
137 | "typeof", (fun ii -> Ttypeof ii); | |
138 | "__typeof__", (fun ii -> Ttypeof ii); | |
485bce71 C |
139 | "__typeof", (fun ii -> Ttypeof ii); |
140 | ||
978fd7e5 | 141 | (* found a lot in expanded code *) |
ae4735db | 142 | "__extension__", (fun ii -> TattributeNoarg ii); |
978fd7e5 | 143 | |
34e49164 | 144 | |
485bce71 | 145 | (* gccext: alias *) |
34e49164 C |
146 | "__signed__", (fun ii -> Tsigned ii); |
147 | ||
148 | "__const__", (fun ii -> Tconst ii); | |
149 | "__const", (fun ii -> Tconst ii); | |
150 | ||
ae4735db C |
151 | "__volatile__", (fun ii -> Tvolatile ii); |
152 | "__volatile", (fun ii -> Tvolatile ii); | |
485bce71 | 153 | |
978fd7e5 C |
154 | (* windowsext: *) |
155 | "__declspec", (fun ii -> Tattribute ii); | |
156 | ||
157 | "__stdcall", (fun ii -> TattributeNoarg ii); | |
158 | "__cdecl", (fun ii -> TattributeNoarg ii); | |
159 | "WINAPI", (fun ii -> TattributeNoarg ii); | |
160 | "APIENTRY", (fun ii -> TattributeNoarg ii); | |
161 | "CALLBACK", (fun ii -> TattributeNoarg ii); | |
485bce71 C |
162 | |
163 | (* c99: *) | |
ae4735db C |
164 | (* no just "restrict" ? maybe for backward compatibility they avoided |
165 | * to use restrict which people may have used in their program already | |
485bce71 | 166 | *) |
ae4735db C |
167 | "__restrict", (fun ii -> Trestrict ii); |
168 | "__restrict__", (fun ii -> Trestrict ii); | |
169 | ||
34e49164 C |
170 | ] |
171 | ||
ae4735db | 172 | let error_radix s = |
34e49164 C |
173 | ("numeric " ^ s ^ " constant contains digits beyond the radix:") |
174 | ||
978fd7e5 | 175 | (* julia: functions for figuring out the type of integers *) |
708f4980 C |
176 | |
177 | let is_long_dec s int uint long ulong = | |
178 | match !Flag_parsing_c.int_thresholds with | |
179 | None -> int | |
180 | | Some (_,_,uint_threshold,long_threshold,ulong_threshold) -> | |
181 | let bn = Big_int.big_int_of_string s in | |
182 | if Big_int.ge_big_int bn ulong_threshold | |
183 | then ulong | |
184 | else | |
185 | if Big_int.ge_big_int bn long_threshold | |
186 | then long | |
187 | else | |
188 | if Big_int.ge_big_int bn uint_threshold | |
189 | then long | |
190 | else int | |
191 | ||
192 | let is_long_ho s int uint long ulong drop bpd count = | |
193 | match !Flag_parsing_c.int_thresholds with | |
194 | None -> int | |
195 | | Some (uint_sz,ulong_sz,_,_,_) -> | |
196 | let len = String.length s in | |
197 | (* this assumes that all of the hex/oct digits are significant *) | |
198 | (* drop is 2 for hex (0x) and 1 for oct (0) *) | |
199 | let s = String.sub s drop (len - drop) in | |
200 | let len = | |
201 | ((len-drop) * bpd) - | |
202 | (count (int_of_string("0x"^(String.sub s 0 1)))) in | |
203 | if len < uint_sz | |
204 | then int | |
205 | else | |
206 | if len = uint_sz | |
207 | then uint | |
208 | else | |
209 | if len < ulong_sz | |
210 | then long | |
211 | else ulong | |
212 | ||
213 | let is_long_oct s int uint long ulong = | |
214 | is_long_ho s int uint long ulong 1 3 | |
215 | (* stupid, but probably more efficient than taking logs *) | |
216 | (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0) | |
217 | let is_long_hex s int uint long ulong = | |
218 | is_long_ho s int uint long ulong 2 4 | |
219 | (* stupid, but probably more efficient than taking logs *) | |
220 | (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1 | |
221 | | _ -> 0) | |
222 | ||
223 | let sint = (Signed,CInt) | |
224 | let uint = (UnSigned,CInt) | |
225 | let slong = (Signed,CLong) | |
226 | let ulong = (UnSigned,CLong) | |
227 | ||
34e49164 C |
228 | } |
229 | ||
230 | (*****************************************************************************) | |
231 | let letter = ['A'-'Z' 'a'-'z' '_'] | |
232 | let digit = ['0'-'9'] | |
233 | ||
234 | (* not used for the moment *) | |
235 | let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':' | |
236 | ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~'] | |
237 | let space = [' ' '\t' '\n' '\r' '\011' '\012' ] | |
ae4735db C |
238 | let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ] |
239 | (* 7 = \a = bell in C. this is not the only char allowed !! | |
240 | * ex @ and $ ` are valid too | |
34e49164 C |
241 | *) |
242 | ||
ae4735db | 243 | let cchar = (letter | digit | punctuation | additionnal) |
34e49164 C |
244 | |
245 | let sp = [' ' '\t']+ | |
246 | let spopt = [' ' '\t']* | |
247 | ||
248 | let dec = ['0'-'9'] | |
249 | let oct = ['0'-'7'] | |
250 | let hex = ['0'-'9' 'a'-'f' 'A'-'F'] | |
251 | ||
252 | let decimal = ('0' | (['1'-'9'] dec*)) | |
253 | let octal = ['0'] oct+ | |
ae4735db | 254 | let hexa = ("0x" |"0X") hex+ |
34e49164 C |
255 | |
256 | ||
257 | let pent = dec+ | |
258 | let pfract = dec+ | |
259 | let sign = ['-' '+'] | |
260 | let exp = ['e''E'] sign? dec+ | |
261 | let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?) | |
262 | ||
263 | let id = letter (letter | digit) * | |
264 | ||
265 | (*****************************************************************************) | |
266 | rule token = parse | |
267 | ||
268 | (* ----------------------------------------------------------------------- *) | |
269 | (* spacing/comments *) | |
270 | (* ----------------------------------------------------------------------- *) | |
271 | ||
ae4735db C |
272 | (* note: this lexer generate tokens for comments!! so can not give |
273 | * this lexer as-is to the parsing function. The caller must preprocess | |
978fd7e5 | 274 | * it, e.g. by using techniques like cur_tok ref in parse_c.ml. |
ae4735db | 275 | * |
978fd7e5 C |
276 | * update: we now also generate a separate token for newlines, so now |
277 | * the caller may also have to reagglomerate all those commentspace | |
278 | * tokens if he was assuming that spaces were agglomerate in a single | |
ae4735db | 279 | * token. |
34e49164 C |
280 | *) |
281 | ||
282 | | ['\n'] [' ' '\t' '\r' '\011' '\012' ]* | |
283 | (* starting a new line; the newline character followed by whitespace *) | |
284 | { TCommentNewline (tokinfo lexbuf) } | |
ae4735db | 285 | | [' ' '\t' '\r' '\011' '\012' ]+ |
34e49164 | 286 | { TCommentSpace (tokinfo lexbuf) } |
ae4735db C |
287 | | "/*" |
288 | { let info = tokinfo lexbuf in | |
34e49164 | 289 | let com = comment lexbuf in |
0708f913 C |
290 | |
291 | let info' = info +> tok_add_s com in | |
292 | let s = Ast_c.str_of_info info' in | |
ae4735db | 293 | (* could be more flexible, use [\t ]* instead of hardcoded |
b1b2de81 | 294 | * single space. *) |
0708f913 | 295 | match s with |
ae4735db | 296 | | "/* {{coccinelle:skip_start}} */" -> |
0708f913 | 297 | TCommentSkipTagStart (info') |
ae4735db | 298 | | "/* {{coccinelle:skip_end}} */" -> |
0708f913 | 299 | TCommentSkipTagEnd (info') |
ae4735db | 300 | | _ -> TComment(info') |
34e49164 C |
301 | } |
302 | ||
303 | ||
304 | (* C++ comment are allowed via gccext, but normally they are deleted by cpp. | |
305 | * So need this here only when dont call cpp before. | |
485bce71 | 306 | * note that we don't keep the trailing \n; it will be in another token. |
34e49164 | 307 | *) |
ae4735db | 308 | | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) } |
34e49164 C |
309 | |
310 | (* ----------------------------------------------------------------------- *) | |
311 | (* cpp *) | |
312 | (* ----------------------------------------------------------------------- *) | |
313 | ||
314 | (* old: | |
ae4735db C |
315 | * | '#' { endline lexbuf} // should be line, and not endline |
316 | * and endline = parse | '\n' { token lexbuf} | |
317 | * | _ { endline lexbuf} | |
34e49164 C |
318 | *) |
319 | ||
b1b2de81 | 320 | (* less?: |
ae4735db | 321 | * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ... |
34e49164 C |
322 | * => just "#" -> token {lexbuf} (that is ignore) |
323 | * il y'a 1 #elif sans rien apres | |
324 | * il y'a 1 #error sans rien apres | |
325 | * il y'a 2 mov dede, #xxx qui genere du coup exn car | |
326 |