Release coccinelle-0.1.2
[bpt/coccinelle.git] / parsing_c / lexer_c.mll
CommitLineData
34e49164 1{
485bce71 2(* Copyright (C) 2002, 2006, 2007, 2008 Yoann Padioleau
34e49164
C
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License (GPL)
6 * version 2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * file license.txt for more details.
12 *)
13open Common
14
15open Parser_c
16
17open Ast_c (* to factorise tokens, OpAssign, ... *)
18
19(*****************************************************************************)
(*
 * Subtle: ocamllex works by side effect on lexbuf, so take care with
 * evaluation order. For instance, one must write
 *
 *  let info = tokinfo lexbuf in
 *  TComment (info +> tok_add_s (comment lexbuf))
 *
 * and not
 *
 *  TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
 *
 * because of the "weird" order of evaluation of OCaml.
 *
 * Note: we can't use Lexer_parser._lexer_hint here to do different
 * things, because we now call the lexer to get all the tokens
 * (tokens_all) and only then parse. So the _lexer_hint info is not
 * available here; it is available only in parse_c. For the same reason,
 * the typedef handling here is now useless.
 *)
39(*****************************************************************************)
40
41(*****************************************************************************)
42(* Wrappers *)
43(*****************************************************************************)
(* Lexer-local [pr2]: forwards [s] to [Common.pr2] only when the
 * [Flag_parsing_c.verbose_lexing] flag is set; otherwise does nothing. *)
let pr2 s =
  match !Flag_parsing_c.verbose_lexing with
  | true -> Common.pr2 s
  | false -> ()
47
48(*****************************************************************************)
49
50
(* Exception with a string payload. NOTE(review): presumably raised on
 * lexing errors; no raise site is visible in this part of the file. *)
exception Lexical of string

(* [tok lexbuf] is the text of the lexeme currently matched on [lexbuf];
 * it is a plain alias of [Lexing.lexeme]. *)
let tok = Lexing.lexeme
54
(* [tokinfo lexbuf] builds the info record for the lexeme currently
 * matched on [lexbuf]. Only the start offset and the lexeme text are
 * recorded here; line, column and file get placeholder values because
 * they are filled in by a post-lexing phase. *)
let tokinfo lexbuf =
  let origin = {
    Common.charpos = Lexing.lexeme_start lexbuf;
    Common.str = Lexing.lexeme lexbuf;
    (* placeholders, overwritten after lexing *)
    Common.line = -1;
    Common.column = -1;
    Common.file = "";
  }
  in
  {
    pinfo = Ast_c.OriginTok origin;
    (* allocate fresh refs on every call so that tokens never share them *)
    cocci_tag = ref Ast_c.emptyAnnot;
    comments_tag = ref Ast_c.emptyComments;
  }
69
485bce71
C
(* Returns a brand-new reference holding [None] on every call. It is a
 * function rather than a constant so that callers never share one ref. *)
let no_ifdef_mark () : (int * int) option ref = ref None
72
34e49164
C
(* [tok_add_s s ii] is [ii] with [s] appended to the string it carries:
 * the current string is read with [Ast_c.str_of_info] and the extended
 * one is stored back with [Ast_c.rewrap_str]. *)
let tok_add_s s ii =
  let extended = Ast_c.str_of_info ii ^ s in
  Ast_c.rewrap_str extended ii
74
75
(* Table mapping each reserved spelling to a function that builds the
 * corresponding parser token from a token info.
 * opti: less convenient than a match, but a hash lookup is faster. *)
let keyword_table = Common.hash_of_list [

  (* c: *)
  "void",          (fun info -> Tvoid info);
  "char",          (fun info -> Tchar info);
  "short",         (fun info -> Tshort info);
  "int",           (fun info -> Tint info);
  "long",          (fun info -> Tlong info);
  "float",         (fun info -> Tfloat info);
  "double",        (fun info -> Tdouble info);

  "unsigned",      (fun info -> Tunsigned info);
  "signed",        (fun info -> Tsigned info);

  "auto",          (fun info -> Tauto info);
  "register",      (fun info -> Tregister info);
  "extern",        (fun info -> Textern info);
  "static",        (fun info -> Tstatic info);

  "const",         (fun info -> Tconst info);
  "volatile",      (fun info -> Tvolatile info);

  "struct",        (fun info -> Tstruct info);
  "union",         (fun info -> Tunion info);
  "enum",          (fun info -> Tenum info);
  "typedef",       (fun info -> Ttypedef info);

  "if",            (fun info -> Tif info);
  "else",          (fun info -> Telse info);
  "break",         (fun info -> Tbreak info);
  "continue",      (fun info -> Tcontinue info);
  "switch",        (fun info -> Tswitch info);
  "case",          (fun info -> Tcase info);
  "default",       (fun info -> Tdefault info);
  "for",           (fun info -> Tfor info);
  "do",            (fun info -> Tdo info);
  "while",         (fun info -> Twhile info);
  "return",        (fun info -> Treturn info);
  "goto",          (fun info -> Tgoto info);

  "sizeof",        (fun info -> Tsizeof info);

  (* gccext: cppext: linuxext: alternative spellings of one keyword *)
  "asm",           (fun info -> Tasm info);
  "__asm__",       (fun info -> Tasm info);
  "__asm",         (fun info -> Tasm info);

  "inline",        (fun info -> Tinline info);
  "__inline__",    (fun info -> Tinline info);
  "__inline",      (fun info -> Tinline info);

  "__attribute__", (fun info -> Tattribute info);
  "__attribute",   (fun info -> Tattribute info);

  "typeof",        (fun info -> Ttypeof info);
  "__typeof__",    (fun info -> Ttypeof info);
  "__typeof",      (fun info -> Ttypeof info);

  (* gccext: aliases of standard C keywords *)
  "__signed__",    (fun info -> Tsigned info);

  "__const__",     (fun info -> Tconst info);
  "__const",       (fun info -> Tconst info);

  "__volatile__",  (fun info -> Tvolatile info);
  "__volatile",    (fun info -> Tvolatile info);

  (* c99: only the underscore-prefixed spellings are listed; plain
   * "restrict" is not. The original author wondered whether it was left
   * out for backward compatibility with programs that already use
   * "restrict" as an ordinary name. *)
  "__restrict",    (fun info -> Trestrict info);
  "__restrict__",  (fun info -> Trestrict info);

 ]
155
(* [error_radix s] is the diagnostic text for a numeric constant of kind
 * [s] that contains a digit not allowed in its radix. The result ends
 * with a colon. *)
let error_radix s =
  String.concat ""
    ["numeric "; s; " constant contains digits beyond the radix:"]
158
159}
160
161(*****************************************************************************)
(* Basic character classes. *)
let letter = ['a'-'z' 'A'-'Z' '_']
let digit = ['0'-'9']

(* Marked by the original author as not used for the moment; within this
 * section it is referenced only by cchar below. *)
let punctuation =
  ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':' ';'
   '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']

(* Three-digit escapes are decimal character codes, so '\011' is vertical
 * tab and '\012' is form feed. *)
let space = [' ' '\t' '\n' '\r' '\011' '\012']

(* '\007' is the bell character (\a in C). These are not the only extra
 * characters allowed: for example @, $ and ` are valid too. *)
let additionnal = [' ' '\b' '\t' '\011' '\n' '\r' '\007']

let cchar = letter | digit | punctuation | additionnal

(* sp: one or more blanks (space or tab); spopt: zero or more. *)
let sp = (' ' | '\t')+
let spopt = (' ' | '\t')*
178
(* Digit classes, one per radix. *)
let dec = digit
let oct = ['0'-'7']
let hex = ['0'-'9' 'A'-'F' 'a'-'f']

(* Integer literal bodies (no suffix handled here). *)
let decimal = '0' | ['1'-'9'] dec*
let octal = '0' oct+
let hexa = '0' ['x' 'X'] hex+

(* Building blocks of a floating-point literal: digits before the dot,
 * digits after the dot, and an optionally signed exponent. *)
let pent = dec+
let pfract = dec+
let sign = ['+' '-']
let exp = ['e' 'E'] sign? dec+

(* Either digits with a mandatory exponent, or a form containing a dot
 * (digits required on at least one side of it) with an optional
 * exponent. *)
let real =
    pent exp
  | pent? '.' pfract exp?
  | pent '.' pfract? exp?

(* Identifier: a letter or underscore, then letters, digits, underscores. *)
let id = letter (digit | letter)*
195
196(*****************************************************************************)
197rule token = parse
198
199 (* ----------------------------------------------------------------------- *)
200 (* spacing/comments *)
201 (* ----------------------------------------------------------------------- *)
202
203 (* note: this lexer generate tokens for comments!! so can not give
204 * this lexer as-is to the parsing function. Must preprocess it, hence
205 * use techniques like cur_tok ref in parse_c.ml
206 *)
207
208 | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
209 (* starting a new line; the newline character followed by whitespace *)
210 { TCommentNewline (tokinfo lexbuf) }
211 | [' ' '\t' '\r' '\011' '\012' ]+
212 { TCommentSpace (tokinfo lexbuf) }
213 | "/*"
214 { let info = tokinfo lexbuf in
215 let com = comment lexbuf in
216 TComment(info +> tok_add_s com)
217 }
218
219
220 (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
221 * So need this here only when dont call cpp before.
485bce71 222 * note that we don't keep the trailing \n; it will be in another token.
34e49164
C
223 *)
224 | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
225
226 (* ----------------------------------------------------------------------- *)
227 (* cpp *)
228 (* ----------------------------------------------------------------------- *)
229
230 (* old:
231 * | '#' { endline lexbuf} // should be line, and not endline
232 * and endline = parse | '\n' { token lexbuf}
233 * | _ { endline lexbuf}
234 *)
235
236 (* todo?:
237 * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
238 * => just "#" -> token {lexbuf} (that is ignore)
239 * il y'a 1 #elif sans rien apres
240 * il y'a 1 #error sans rien apres
241 * il y'a 2 mov dede, #xxx qui genere du coup exn car
242