Commit | Line | Data |
---|---|---|
34e49164 | 1 | { |
0708f913 C |
2 | (* Yoann Padioleau |
3 | * | |
4 | * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes | |
34e49164 C |
5 | * |
6 | * This program is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU General Public License (GPL) | |
8 | * version 2 as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, | |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | * file license.txt for more details. | |
14 | *) | |
15 | open Common | |
16 | ||
17 | open Parser_c | |
18 | ||
19 | open Ast_c (* to factorise tokens, OpAssign, ... *) | |
20 | ||
21 | (*****************************************************************************) | |
22 | (* | |
23 | * subtil: ocamllex use side effect on lexbuf, so must take care. | |
24 | * For instance must do | |
25 | * | |
26 | * let info = tokinfo lexbuf in | |
27 | * TComment (info +> tok_add_s (comment lexbuf)) | |
28 | * | |
29 | * and not | |
30 | * | |
31 | * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf)) | |
32 | * | |
33 | * because of the "wierd" order of evaluation of OCaml. | |
b1b2de81 C |
34 | * |
35 | * | |
34e49164 C |
36 | * |
37 | * note: can't use Lexer_parser._lexer_hint here to do different | |
38 | * things, because now we call the lexer to get all the tokens | |
39 | * (tokens_all), and then we parse. So we can't have the _lexer_hint | |
40 | * info here. We can have it only in parse_c. For the same reason, the | |
41 | * typedef handling here is now useless. | |
42 | *) | |
43 | (*****************************************************************************) | |
44 | ||
45 | (*****************************************************************************) | |
46 | (* Wrappers *) | |
47 | (*****************************************************************************) | |
708f4980 | 48 | let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing |
34e49164 C |
49 | |
50 | (*****************************************************************************) | |
51 | ||
52 | ||
53 | exception Lexical of string | |
54 | ||
55 | let tok lexbuf = Lexing.lexeme lexbuf | |
56 | ||
57 | let tokinfo lexbuf = | |
58 | { | |
59 | pinfo = Ast_c.OriginTok { | |
60 | Common.charpos = Lexing.lexeme_start lexbuf; | |
61 | Common.str = Lexing.lexeme lexbuf; | |
62 | (* info filled in a post-lexing phase *) | |
63 | Common.line = -1; | |
64 | Common.column = -1; | |
65 | Common.file = ""; | |
66 | }; | |
67 | (* must generate a new ref each time, otherwise share *) | |
68 | cocci_tag = ref Ast_c.emptyAnnot; | |
69 | comments_tag = ref Ast_c.emptyComments; | |
70 | } | |
71 | ||
b1b2de81 | 72 | (* cppext: must generate a new ref each time, otherwise share *) |
485bce71 C |
73 | let no_ifdef_mark () = ref (None: (int * int) option) |
74 | ||
34e49164 C |
75 | let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii |
76 | ||
77 | ||
78 | (* opti: less convenient, but using a hash is faster than using a match *) | |
79 | let keyword_table = Common.hash_of_list [ | |
80 | ||
485bce71 | 81 | (* c: *) |
34e49164 C |
82 | "void", (fun ii -> Tvoid ii); |
83 | "char", (fun ii -> Tchar ii); | |
84 | "short", (fun ii -> Tshort ii); | |
85 | "int", (fun ii -> Tint ii); | |
86 | "long", (fun ii -> Tlong ii); | |
87 | "float", (fun ii -> Tfloat ii); | |
88 | "double", (fun ii -> Tdouble ii); | |
89 | ||
90 | "unsigned", (fun ii -> Tunsigned ii); | |
91 | "signed", (fun ii -> Tsigned ii); | |
92 | ||
93 | "auto", (fun ii -> Tauto ii); | |
94 | "register", (fun ii -> Tregister ii); | |
95 | "extern", (fun ii -> Textern ii); | |
96 | "static", (fun ii -> Tstatic ii); | |
97 | ||
98 | "const", (fun ii -> Tconst ii); | |
99 | "volatile", (fun ii -> Tvolatile ii); | |
100 | ||
101 | "struct", (fun ii -> Tstruct ii); | |
102 | "union", (fun ii -> Tunion ii); | |
103 | "enum", (fun ii -> Tenum ii); | |
104 | "typedef", (fun ii -> Ttypedef ii); | |
105 | ||
106 | "if", (fun ii -> Tif ii); | |
107 | "else", (fun ii -> Telse ii); | |
108 | "break", (fun ii -> Tbreak ii); | |
109 | "continue", (fun ii -> Tcontinue ii); | |
110 | "switch", (fun ii -> Tswitch ii); | |
111 | "case", (fun ii -> Tcase ii); | |
112 | "default", (fun ii -> Tdefault ii); | |
113 | "for", (fun ii -> Tfor ii); | |
114 | "do", (fun ii -> Tdo ii); | |
115 | "while", (fun ii -> Twhile ii); | |
116 | "return", (fun ii -> Treturn ii); | |
117 | "goto", (fun ii -> Tgoto ii); | |
118 | ||
119 | "sizeof", (fun ii -> Tsizeof ii); | |
120 | ||
121 | ||
122 | (* gccext: cppext: linuxext: synonyms *) | |
123 | "asm", (fun ii -> Tasm ii); | |
124 | "__asm__", (fun ii -> Tasm ii); | |
125 | "__asm", (fun ii -> Tasm ii); | |
126 | ||
127 | "inline", (fun ii -> Tinline ii); | |
128 | "__inline__", (fun ii -> Tinline ii); | |
129 | "__inline", (fun ii -> Tinline ii); | |
34e49164 C |
130 | |
131 | "__attribute__", (fun ii -> Tattribute ii); | |
132 | "__attribute", (fun ii -> Tattribute ii); | |
133 | ||
134 | "typeof", (fun ii -> Ttypeof ii); | |
135 | "__typeof__", (fun ii -> Ttypeof ii); | |
485bce71 C |
136 | "__typeof", (fun ii -> Ttypeof ii); |
137 | ||
34e49164 | 138 | |
485bce71 | 139 | (* gccext: alias *) |
34e49164 C |
140 | "__signed__", (fun ii -> Tsigned ii); |
141 | ||
142 | "__const__", (fun ii -> Tconst ii); | |
143 | "__const", (fun ii -> Tconst ii); | |
144 | ||
145 | "__volatile__", (fun ii -> Tvolatile ii); | |
146 | "__volatile", (fun ii -> Tvolatile ii); | |
485bce71 C |
147 | |
148 | ||
149 | (* c99: *) | |
150 | (* no just "restrict" ? maybe for backward compatibility they avoided | |
151 | * to use restrict which people may have used in their program already | |
152 | *) | |
153 | "__restrict", (fun ii -> Trestrict ii); | |
154 | "__restrict__", (fun ii -> Trestrict ii); | |
34e49164 C |
155 | |
156 | ] | |
157 | ||
158 | let error_radix s = | |
159 | ("numeric " ^ s ^ " constant contains digits beyond the radix:") | |
160 | ||
708f4980 C |
161 | (* functions for figuring out the type of integers *) |
162 | ||
163 | let is_long_dec s int uint long ulong = | |
164 | match !Flag_parsing_c.int_thresholds with | |
165 | None -> int | |
166 | | Some (_,_,uint_threshold,long_threshold,ulong_threshold) -> | |
167 | let bn = Big_int.big_int_of_string s in | |
168 | if Big_int.ge_big_int bn ulong_threshold | |
169 | then ulong | |
170 | else | |
171 | if Big_int.ge_big_int bn long_threshold | |
172 | then long | |
173 | else | |
174 | if Big_int.ge_big_int bn uint_threshold | |
175 | then long | |
176 | else int | |
177 | ||
178 | let is_long_ho s int uint long ulong drop bpd count = | |
179 | match !Flag_parsing_c.int_thresholds with | |
180 | None -> int | |
181 | | Some (uint_sz,ulong_sz,_,_,_) -> | |
182 | let len = String.length s in | |
183 | (* this assumes that all of the hex/oct digits are significant *) | |
184 | (* drop is 2 for hex (0x) and 1 for oct (0) *) | |
185 | let s = String.sub s drop (len - drop) in | |
186 | let len = | |
187 | ((len-drop) * bpd) - | |
188 | (count (int_of_string("0x"^(String.sub s 0 1)))) in | |
189 | if len < uint_sz | |
190 | then int | |
191 | else | |
192 | if len = uint_sz | |
193 | then uint | |
194 | else | |
195 | if len < ulong_sz | |
196 | then long | |
197 | else ulong | |
198 | ||
199 | let is_long_oct s int uint long ulong = | |
200 | is_long_ho s int uint long ulong 1 3 | |
201 | (* stupid, but probably more efficient than taking logs *) | |
202 | (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0) | |
203 | let is_long_hex s int uint long ulong = | |
204 | is_long_ho s int uint long ulong 2 4 | |
205 | (* stupid, but probably more efficient than taking logs *) | |
206 | (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1 | |
207 | | _ -> 0) | |
208 | ||
209 | let sint = (Signed,CInt) | |
210 | let uint = (UnSigned,CInt) | |
211 | let slong = (Signed,CLong) | |
212 | let ulong = (UnSigned,CLong) | |
213 | ||
34e49164 C |
214 | } |
215 | ||
216 | (*****************************************************************************) | |
217 | let letter = ['A'-'Z' 'a'-'z' '_'] | |
218 | let digit = ['0'-'9'] | |
219 | ||
220 | (* not used for the moment *) | |
221 | let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':' | |
222 | ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~'] | |
223 | let space = [' ' '\t' '\n' '\r' '\011' '\012' ] | |
224 | let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ] | |
225 | (* 7 = \a = bell in C. this is not the only char allowed !! | |
226 | * ex @ and $ ` are valid too | |
227 | *) | |
228 | ||
229 | let cchar = (letter | digit | punctuation | additionnal) | |
230 | ||
231 | let sp = [' ' '\t']+ | |
232 | let spopt = [' ' '\t']* | |
233 | ||
234 | let dec = ['0'-'9'] | |
235 | let oct = ['0'-'7'] | |
236 | let hex = ['0'-'9' 'a'-'f' 'A'-'F'] | |
237 | ||
238 | let decimal = ('0' | (['1'-'9'] dec*)) | |
239 | let octal = ['0'] oct+ | |
240 | let hexa = ("0x" |"0X") hex+ | |
241 | ||
242 | ||
243 | let pent = dec+ | |
244 | let pfract = dec+ | |
245 | let sign = ['-' '+'] | |
246 | let exp = ['e''E'] sign? dec+ | |
247 | let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?) | |
248 | ||
249 | let id = letter (letter | digit) * | |
250 | ||
251 | (*****************************************************************************) | |
252 | rule token = parse | |
253 | ||
254 | (* ----------------------------------------------------------------------- *) | |
255 | (* spacing/comments *) | |
256 | (* ----------------------------------------------------------------------- *) | |
257 | ||
258 | (* note: this lexer generate tokens for comments!! so can not give | |
259 | * this lexer as-is to the parsing function. Must preprocess it, hence | |
260 | * use techniques like cur_tok ref in parse_c.ml | |
261 | *) | |
262 | ||
263 | | ['\n'] [' ' '\t' '\r' '\011' '\012' ]* | |
264 | (* starting a new line; the newline character followed by whitespace *) | |
265 | { TCommentNewline (tokinfo lexbuf) } | |
266 | | [' ' '\t' '\r' '\011' '\012' ]+ | |
267 | { TCommentSpace (tokinfo lexbuf) } | |
268 | | "/*" | |
269 | { let info = tokinfo lexbuf in | |
270 | let com = comment lexbuf in | |
0708f913 C |
271 | |
272 | let info' = info +> tok_add_s com in | |
273 | let s = Ast_c.str_of_info info' in | |
b1b2de81 C |
274 | (* could be more flexible, use [\t ]* instead of hardcoded |
275 | * single space. *) | |
0708f913 C |
276 | match s with |
277 | | "/* {{coccinelle:skip_start}} */" -> | |
278 | TCommentSkipTagStart (info') | |
279 | | "/* {{coccinelle:skip_end}} */" -> | |
280 | TCommentSkipTagEnd (info') | |
281 | | _ -> TComment(info') | |
34e49164 C |
282 | } |
283 | ||
284 | ||
285 | (* C++ comment are allowed via gccext, but normally they are deleted by cpp. | |
286 | * So need this here only when dont call cpp before. | |
485bce71 | 287 | * note that we don't keep the trailing \n; it will be in another token. |
34e49164 C |
288 | *) |
289 | | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) } | |
290 | ||
291 | (* ----------------------------------------------------------------------- *) | |
292 | (* cpp *) | |
293 | (* ----------------------------------------------------------------------- *) | |
294 | ||
295 | (* old: | |
296 | * | '#' { endline lexbuf} // should be line, and not endline | |
297 | * and endline = parse | '\n' { token lexbuf} | |
298 | * | _ { endline lexbuf} | |
299 | *) | |
300 | ||
b1b2de81 | 301 | (* less?: |
34e49164 C |
302 | * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ... |
303 | * => just "#" -> token {lexbuf} (that is ignore) | |
304 | * il y'a 1 #elif sans rien apres | |
305 | * il y'a 1 #error sans rien apres | |
306 | * il y'a 2 mov dede, #xxx qui genere du coup exn car | |
307 |