Commit | Line | Data |
---|---|---|
34e49164 | 1 | { |
485bce71 | 2 | (* Copyright (C) 2002, 2006, 2007, 2008 Yoann Padioleau |
34e49164 C |
3 | * |
4 | * This program is free software; you can redistribute it and/or | |
5 | * modify it under the terms of the GNU General Public License (GPL) | |
6 | * version 2 as published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, | |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
11 | * file license.txt for more details. | |
12 | *) | |
13 | open Common | |
14 | ||
15 | open Parser_c | |
16 | ||
17 | open Ast_c (* to factorise tokens, OpAssign, ... *) | |
18 | ||
19 | (*****************************************************************************) | |
20 | (* | |
21 | * subtle: ocamllex uses side effects on lexbuf, so we must take care. | |
22 | * For instance must do | |
23 | * | |
24 | * let info = tokinfo lexbuf in | |
25 | * TComment (info +> tok_add_s (comment lexbuf)) | |
26 | * | |
27 | * and not | |
28 | * | |
29 | * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf)) | |
30 | * | |
31 | * because of the "weird" order of evaluation of OCaml. | |
32 | * | |
33 | * note: can't use Lexer_parser._lexer_hint here to do different | |
34 | * things, because now we call the lexer to get all the tokens | |
35 | * (tokens_all), and then we parse. So we can't have the _lexer_hint | |
36 | * info here. We can have it only in parse_c. For the same reason, the | |
37 | * typedef handling here is now useless. | |
38 | *) | |
39 | (*****************************************************************************) | |
40 | ||
41 | (*****************************************************************************) | |
42 | (* Wrappers *) | |
43 | (*****************************************************************************) | |
44 | let pr2 s = (* forward s to Common.pr2 only when Flag_parsing_c.verbose_lexing is set; otherwise do nothing *) | |
45 | if !Flag_parsing_c.verbose_lexing | |
46 | then Common.pr2 s | |
47 | ||
48 | (*****************************************************************************) | |
49 | ||
50 | ||
51 | exception Lexical of string (* lexer exception carrying a string payload; presumably the error message - confirm at raise sites *) | |
52 | ||
53 | let tok lexbuf = Lexing.lexeme lexbuf (* shorthand: the string matched by the current lexeme of lexbuf *) | |
54 | ||
55 | let tokinfo lexbuf = (* build the info record describing the lexeme just matched by lexbuf *) | |
56 | { | |
57 | pinfo = Ast_c.OriginTok { | |
58 | Common.charpos = Lexing.lexeme_start lexbuf; (* offset at which the lexeme starts *) | |
59 | Common.str = Lexing.lexeme lexbuf; (* text of the lexeme *) | |
60 | (* placeholders: line, column and file are filled in by a post-lexing phase *) | |
61 | Common.line = -1; | |
62 | Common.column = -1; | |
63 | Common.file = ""; | |
64 | }; | |
65 | (* allocate fresh refs on every call; a single shared ref would be aliased by every record returned *) | |
66 | cocci_tag = ref Ast_c.emptyAnnot; | |
67 | comments_tag = ref Ast_c.emptyComments; | |
68 | } | |
69 | ||
485bce71 C |
70 | (* takes unit so that each call allocates a fresh ref, initially None; a single shared ref would be aliased by every user *) |
71 | let no_ifdef_mark () = ref (None: (int * int) option) | |
72 | ||
34e49164 C |
73 | let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii (* rewrap ii with its current string, read via Ast_c.str_of_info, extended by s *) |
74 | ||
75 | ||
76 | (* opti: less convenient, but using a hash is faster than using a match. Each entry pairs a keyword spelling with a function that wraps the token info ii in the matching token constructor; several spellings may share one constructor. *) | |
77 | let keyword_table = Common.hash_of_list [ | |
78 | ||
485bce71 | 79 | (* c: standard C keywords *) |
34e49164 C |
80 | "void", (fun ii -> Tvoid ii); |
81 | "char", (fun ii -> Tchar ii); | |
82 | "short", (fun ii -> Tshort ii); | |
83 | "int", (fun ii -> Tint ii); | |
84 | "long", (fun ii -> Tlong ii); | |
85 | "float", (fun ii -> Tfloat ii); | |
86 | "double", (fun ii -> Tdouble ii); | |
87 | ||
88 | "unsigned", (fun ii -> Tunsigned ii); | |
89 | "signed", (fun ii -> Tsigned ii); | |
90 | ||
91 | "auto", (fun ii -> Tauto ii); | |
92 | "register", (fun ii -> Tregister ii); | |
93 | "extern", (fun ii -> Textern ii); | |
94 | "static", (fun ii -> Tstatic ii); | |
95 | ||
96 | "const", (fun ii -> Tconst ii); | |
97 | "volatile", (fun ii -> Tvolatile ii); | |
98 | ||
99 | "struct", (fun ii -> Tstruct ii); | |
100 | "union", (fun ii -> Tunion ii); | |
101 | "enum", (fun ii -> Tenum ii); | |
102 | "typedef", (fun ii -> Ttypedef ii); | |
103 | ||
104 | "if", (fun ii -> Tif ii); | |
105 | "else", (fun ii -> Telse ii); | |
106 | "break", (fun ii -> Tbreak ii); | |
107 | "continue", (fun ii -> Tcontinue ii); | |
108 | "switch", (fun ii -> Tswitch ii); | |
109 | "case", (fun ii -> Tcase ii); | |
110 | "default", (fun ii -> Tdefault ii); | |
111 | "for", (fun ii -> Tfor ii); | |
112 | "do", (fun ii -> Tdo ii); | |
113 | "while", (fun ii -> Twhile ii); | |
114 | "return", (fun ii -> Treturn ii); | |
115 | "goto", (fun ii -> Tgoto ii); | |
116 | ||
117 | "sizeof", (fun ii -> Tsizeof ii); | |
118 | ||
119 | ||
120 | (* gccext: cppext: linuxext: synonyms; every spelling in a group below yields the same token *) | |
121 | "asm", (fun ii -> Tasm ii); | |
122 | "__asm__", (fun ii -> Tasm ii); | |
123 | "__asm", (fun ii -> Tasm ii); | |
124 | ||
125 | "inline", (fun ii -> Tinline ii); | |
126 | "__inline__", (fun ii -> Tinline ii); | |
127 | "__inline", (fun ii -> Tinline ii); | |
34e49164 C |
128 | |
129 | "__attribute__", (fun ii -> Tattribute ii); | |
130 | "__attribute", (fun ii -> Tattribute ii); | |
131 | ||
132 | "typeof", (fun ii -> Ttypeof ii); | |
133 | "__typeof__", (fun ii -> Ttypeof ii); | |
485bce71 C |
134 | "__typeof", (fun ii -> Ttypeof ii); |
135 | ||
34e49164 | 136 | |
485bce71 | 137 | (* gccext: aliases of the signed, const and volatile keywords listed above *) |
34e49164 C |
138 | "__signed__", (fun ii -> Tsigned ii); |
139 | ||
140 | "__const__", (fun ii -> Tconst ii); | |
141 | "__const", (fun ii -> Tconst ii); | |
142 | ||
143 | "__volatile__", (fun ii -> Tvolatile ii); | |
144 | "__volatile", (fun ii -> Tvolatile ii); | |
485bce71 C |
145 | |
146 | ||
147 | (* c99: *) | |
148 | (* why not plain "restrict" ? maybe it was avoided for backward | |
149 | * compatibility, since existing programs may already use restrict | |
150 | * as an ordinary identifier; only the underscore spellings are listed *) | |
151 | "__restrict", (fun ii -> Trestrict ii); | |
152 | "__restrict__", (fun ii -> Trestrict ii); | |
34e49164 C |
153 | |
154 | ] | |
155 | ||
156 | let error_radix s = (* build the message for a numeric constant with digits beyond its radix; s is spliced between the words numeric and constant, presumably the radix name - confirm at call sites *) | |
157 | ("numeric " ^ s ^ " constant contains digits beyond the radix:") | |
158 | ||
159 | } | |
160 | ||
161 | (*****************************************************************************) | |
162 | let letter = ['A'-'Z' 'a'-'z' '_'] (* ASCII letter or underscore; the first character of an id *) | |
163 | let digit = ['0'-'9'] | |
164 | ||
165 | (* not used for the moment. NOTE(review): punctuation is referenced by cchar below; confirm whether cchar itself is used *) | |
166 | let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':' | |
167 | ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~'] | |
168 | let space = [' ' '\t' '\n' '\r' '\011' '\012' ] (* '\011' and '\012' are decimal escapes: vertical tab and form feed *) | |
169 | let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ] | |
170 | (* '\007' is \a, the bell character in C. These are not the only characters allowed !! | |
171 | * e.g. @ and $ ` are valid too | |
172 | *) | |
173 | ||
174 | let cchar = (letter | digit | punctuation | additionnal) (* union of the four character classes above *) | |
175 | ||
176 | let sp = [' ' '\t']+ (* one or more spaces or tabs *) | |
177 | let spopt = [' ' '\t']* (* zero or more spaces or tabs *) | |
178 | ||
179 | let dec = ['0'-'9'] | |
180 | let oct = ['0'-'7'] | |
181 | let hex = ['0'-'9' 'a'-'f' 'A'-'F'] | |
182 | ||
183 | let decimal = ('0' | (['1'-'9'] dec*)) (* a lone 0, or a non-zero digit followed by any decimal digits *) | |
184 | let octal = ['0'] oct+ (* a leading 0 followed by one or more octal digits *) | |
185 | let hexa = ("0x" |"0X") hex+ (* 0x or 0X followed by one or more hex digits *) | |
186 | ||
187 | ||
188 | let pent = dec+ (* integer part of a real, as used in real below *) | |
189 | let pfract = dec+ (* fractional part of a real, as used in real below *) | |
190 | let sign = ['-' '+'] | |
191 | let exp = ['e''E'] sign? dec+ (* exponent: e or E, an optional sign, then one or more digits *) | |
192 | let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?) (* digits with an exponent, or digits around a dot where one side may be missing but not both, with an optional exponent *) | |
193 | ||
194 | let id = letter (letter | digit) * (* a letter or underscore followed by any letters, underscores or digits *) | |
195 | ||
196 | (*****************************************************************************) | |
197 | rule token = parse | |
198 | ||
199 | (* ----------------------------------------------------------------------- *) | |
200 | (* spacing/comments *) | |
201 | (* ----------------------------------------------------------------------- *) | |
202 | ||
203 | (* note: this lexer generates tokens for comments!! so we cannot give | |
204 | * this lexer as-is to the parsing function. Must preprocess it, hence | |
205 | * use techniques like cur_tok ref in parse_c.ml | |
206 | *) | |
207 | ||
208 | | ['\n'] [' ' '\t' '\r' '\011' '\012' ]* | |
209 | (* starting a new line; the newline character followed by whitespace *) | |
210 | { TCommentNewline (tokinfo lexbuf) } | |
211 | | [' ' '\t' '\r' '\011' '\012' ]+ | |
212 | { TCommentSpace (tokinfo lexbuf) } | |
213 | | "/*" | |
214 | { let info = tokinfo lexbuf in | |
215 | let com = comment lexbuf in | |
216 | TComment(info +> tok_add_s com) | |
217 | } | |
218 | ||
219 | ||
220 | (* C++ comments are allowed via gccext, but normally they are deleted by cpp. | |
221 | * So we need this here only when cpp has not been run beforehand. | |
485bce71 | 222 | * note that we don't keep the trailing \n; it will be in another token. |
34e49164 C |
223 | *) |
224 | | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) } | |
225 | ||
226 | (* ----------------------------------------------------------------------- *) | |
227 | (* cpp *) | |
228 | (* ----------------------------------------------------------------------- *) | |
229 | ||
230 | (* old: | |
231 | * | '#' { endline lexbuf} // should be line, and not endline | |
232 | * and endline = parse | '\n' { token lexbuf} | |
233 | * | _ { endline lexbuf} | |
234 | *) | |
235 | ||
236 | (* todo?: | |
237 | * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ... | |
238 | * => just "#" -> token {lexbuf} (that is ignore) | |
239 | * there is 1 #elif with nothing after it | |
240 | * there is 1 #error with nothing after it | |
241 | * il y'a 2 mov dede, #xxx qui genere du coup exn car | |
242 |