Coccinelle release 0.2.5-rc7.
[bpt/coccinelle.git] / parsing_c / lexer_c.mll
1 {
2 (* Yoann Padioleau
3 *
4 * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License (GPL)
8 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * file license.txt for more details.
14 *)
15 open Common
16
17 open Parser_c
18
19 open Ast_c (* to factorise tokens, OpAssign, ... *)
20
21 (*****************************************************************************)
22 (*
23  * subtle: ocamllex uses side effects on lexbuf, so care must be taken.
24 * For instance must do
25 *
26 * let info = tokinfo lexbuf in
27 * TComment (info +> tok_add_s (comment lexbuf))
28 *
29 * and not
30 *
31 * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
32 *
33  * because of the "weird" order of evaluation of OCaml.
34 *
35 *
36 *
37 * note: can't use Lexer_parser._lexer_hint here to do different
38 * things, because now we call the lexer to get all the tokens
39 * (tokens_all), and then we parse. So we can't have the _lexer_hint
40 * info here. We can have it only in parse_c. For the same reason, the
41 * typedef handling here is now useless.
42 *)
43 (*****************************************************************************)
44
45 (*****************************************************************************)
46 (* Wrappers *)
47 (*****************************************************************************)
(* Diagnostic printers for the lexer; output is gated by
 * Flag_parsing_c.verbose_lexing (pr2_once presumably de-duplicates
 * repeated messages -- see Common.mk_pr2_wrappers). *)
48 let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing
49
50 (*****************************************************************************)
51
52
(* Raised on input the lexer cannot handle, e.g. C++ constructs seen
 * without the -c++ option (see the identifier rules below). *)
53 exception Lexical of string
54
(* Text of the current lexeme. *)
55 let tok lexbuf = Lexing.lexeme lexbuf
56
(* Builds the parse-info record attached to every token: character
 * position and text of the current lexeme.  Line/column/file are left
 * as dummy values here and filled in by a post-lexing phase. *)
57 let tokinfo lexbuf  =
58   {
59     pinfo = Ast_c.OriginTok {
60       Common.charpos = Lexing.lexeme_start lexbuf;
61       Common.str     = Lexing.lexeme lexbuf;
62       (* info filled in a post-lexing phase *)
63       Common.line = -1;
64       Common.column = -1;
65       Common.file = "";
66     };
67    (* must generate a new ref each time, otherwise share *)
68     cocci_tag = ref Ast_c.emptyAnnot;
69     comments_tag = ref Ast_c.emptyComments;
70   }
71
72 (* cppext: must generate a new ref each time, otherwise share *)
73 let no_ifdef_mark () = ref (None: (int * int) option)
74
(* Appends [s] to the string stored in token info [ii].  Used when a rule
 * hands control to a sub-lexer (comment, char, string, ...): the extra
 * text consumed by the sub-lexer is folded back into the token. *)
75 let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
76
77
78 (* opti: less convenient, but using a hash is faster than using a match *)
(* Maps a C keyword -- or a gcc/windows extension spelling -- to the
 * function building its token from a parse-info.  Consulted by the
 * identifier rule below; identifiers not found here become TIdent.
 * Note that the alternate gcc spellings map to the same token as the
 * standard form. *)
79 let keyword_table = Common.hash_of_list [
80
81   (* c: *)
82   "void",   (fun ii -> Tvoid ii);
83   "char",   (fun ii -> Tchar ii);
84   "short",  (fun ii -> Tshort ii);
85   "int",    (fun ii -> Tint ii);
86   "long",   (fun ii -> Tlong ii);
87   "float",  (fun ii -> Tfloat ii);
88   "double", (fun ii -> Tdouble ii);
89   "size_t", (fun ii -> Tsize_t ii);
90   "ssize_t", (fun ii -> Tssize_t ii);
91   "ptrdiff_t", (fun ii -> Tptrdiff_t ii);
92
93   "unsigned", (fun ii -> Tunsigned ii);
94   "signed",   (fun ii -> Tsigned ii);
95
96   "auto",     (fun ii -> Tauto ii);
97   "register", (fun ii -> Tregister ii);
98   "extern",   (fun ii -> Textern ii);
99   "static",   (fun ii -> Tstatic ii);
100
101   "const",    (fun ii -> Tconst ii);
102   "volatile", (fun ii -> Tvolatile ii);
103
104   "struct",  (fun ii -> Tstruct ii);
105   "union",   (fun ii -> Tunion ii);
106   "enum",    (fun ii -> Tenum ii);
107   "typedef", (fun ii -> Ttypedef ii);
108
109   "if",      (fun ii -> Tif ii);
110   "else",     (fun ii -> Telse ii);
111   "break",   (fun ii -> Tbreak ii);
112   "continue", (fun ii -> Tcontinue ii);
113   "switch",  (fun ii -> Tswitch ii);
114   "case",     (fun ii -> Tcase ii);
115   "default", (fun ii -> Tdefault ii);
116   "for",     (fun ii -> Tfor ii);
117   "do",      (fun ii -> Tdo ii);
118   "while",   (fun ii -> Twhile ii);
119   "return",  (fun ii -> Treturn ii);
120   "goto",    (fun ii -> Tgoto ii);
121
122   "sizeof", (fun ii -> Tsizeof ii);
123
124
125   (* gccext: cppext: linuxext: synonyms *)
126   "asm",     (fun ii -> Tasm ii);
127   "__asm__", (fun ii -> Tasm ii);
128   "__asm",   (fun ii -> Tasm ii);
129
130   "inline",     (fun ii -> Tinline ii);
131   "__inline__", (fun ii -> Tinline ii);
132   "__inline",   (fun ii -> Tinline ii);
133
134   "__attribute__", (fun ii -> Tattribute ii);
135   "__attribute", (fun ii -> Tattribute ii);
136
137   "typeof", (fun ii -> Ttypeof ii);
138   "__typeof__", (fun ii -> Ttypeof ii);
139   "__typeof", (fun ii -> Ttypeof ii);
140
141   (* found a lot in expanded code *)
142   "__extension__", (fun ii -> TattributeNoarg ii);
143
144
145   (* gccext: alias *)
146   "__signed__",     (fun ii -> Tsigned ii);
147
148   "__const__",     (fun ii -> Tconst ii);
149   "__const",     (fun ii -> Tconst ii);
150
151   "__volatile__",  (fun ii -> Tvolatile ii);
152   "__volatile",    (fun ii -> Tvolatile ii);
153
154   (* windowsext: *)
155   "__declspec", (fun ii -> Tattribute ii);
156
157   "__stdcall", (fun ii -> TattributeNoarg ii);
158   "__cdecl", (fun ii -> TattributeNoarg ii);
159   "WINAPI", (fun ii -> TattributeNoarg ii);
160   "APIENTRY", (fun ii -> TattributeNoarg ii);
161   "CALLBACK", (fun ii -> TattributeNoarg ii);
162
163   (* c99:  *)
164   (* no just "restrict" ? maybe for backward compatibility they avoided
165    * to use restrict which people may have used in their program already
166    *)
167   "__restrict", (fun ii -> Trestrict ii);
168   "__restrict__", (fun ii -> Trestrict ii);
169
170  ]
171
(* C++-only keywords; looked up only when Flag.c_plus_plus is set
 * (see the identifier rule below). *)
172 let cpp_keyword_table = Common.hash_of_list [
173   "new",   (fun ii -> Tnew ii);
174   "delete",(fun ii -> Tdelete ii) ]
175
(* [error_radix kind] builds the diagnostic prefix reported when an
 * integer literal of the given radix [kind] (e.g. octal, hexa) contains
 * digits that are not valid for that radix. *)
let error_radix s =
  Printf.sprintf "numeric %s constant contains digits beyond the radix:" s
178
179 (* julia: functions for figuring out the type of integers *)
180
(* [is_long_dec s int uint long ulong] chooses the C type of the decimal
 * literal [s] by comparing its value (as a big int) against the
 * configured thresholds, returning one of the candidate types supplied
 * by the caller.  With no thresholds configured, defaults to [int].
 * NOTE(review): the [uint] argument is never used -- a decimal literal
 * past the uint threshold is typed [long], which matches C99's rule that
 * unsuffixed decimal constants never acquire an unsigned type; the
 * parameter seems kept only for signature symmetry with
 * is_long_oct/is_long_hex.  Confirm intent before changing. *)
181 let is_long_dec s int uint long ulong =
182   match !Flag_parsing_c.int_thresholds with
183     None -> int
184   | Some (_,_,uint_threshold,long_threshold,ulong_threshold) ->
185       let bn = Big_int.big_int_of_string s in
186       if Big_int.ge_big_int bn ulong_threshold
187       then ulong
188       else
189 	if Big_int.ge_big_int bn long_threshold
190 	then long
191 	else
192 	  if Big_int.ge_big_int bn uint_threshold
193 	  then long
194 	  else int
195
(* Shared type-chooser for octal/hex literals: computes the number of
 * significant bits in [s] and compares it against the configured uint
 * and ulong bit widths.  [drop] is the prefix length to skip, [bpd] the
 * bits per digit, and [count] maps the value of the leading digit to its
 * number of unused high bits.  A literal of exactly uint_sz bits is
 * typed [uint]: unlike decimal constants, hex/oct constants may become
 * unsigned in C. *)
196 let is_long_ho s int uint long ulong drop bpd count =
197   match !Flag_parsing_c.int_thresholds with
198     None -> int
199   | Some (uint_sz,ulong_sz,_,_,_) ->
200       let len = String.length s in
201       (* this assumes that all of the hex/oct digits are significant *)
202       (* drop is 2 for hex (0x) and 1 for oct (0) *)
203       let s = String.sub s drop (len - drop) in
(* significant bits = digit count * bits-per-digit, minus the unused
 * high bits of the first digit.  That digit is parsed with a 0x prefix,
 * which is also valid for octal digits since 0-7 read the same in both
 * radixes. *)
204       let len =
205 	((len-drop) * bpd) -
206 	  (count (int_of_string("0x"^(String.sub s 0 1)))) in
207       if len < uint_sz
208       then int
209       else
210 	if len = uint_sz
211 	then uint
212 	else
213 	  if len < ulong_sz
214 	  then long
215 	  else ulong
216
(* Octal literal: skip the leading 0 marker (drop = 1), 3 bits per digit;
 * the function argument maps the leading digit to its unused high bits
 * within those 3 bits (0 -> 3, 1 -> 2, 2..3 -> 1, 4..7 -> 0). *)
217 let is_long_oct s int uint long ulong =
218   is_long_ho s int uint long ulong 1 3
219     (* stupid, but probably more efficient than taking logs *)
220     (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0)
(* Hexadecimal literal: skip the 0x marker (drop = 2), 4 bits per digit. *)
221 let is_long_hex s int uint long ulong =
222   is_long_ho s int uint long ulong 2 4
223     (* stupid, but probably more efficient than taking logs *)
224     (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1
225       | _ -> 0)
226
(* Shorthand (sign, size) pairs passed to the is_long_* helpers above
 * and used directly by the integer-literal rules below. *)
227 let sint = (Signed,CInt)
228 let uint = (UnSigned,CInt)
229 let slong = (Signed,CLong)
230 let ulong = (UnSigned,CLong)
231
232 }
233
234 (*****************************************************************************)
(* Named character classes and regular expressions shared by the lexing
 * rules below. *)
235 let letter = ['A'-'Z' 'a'-'z' '_']
236 let extended_letter = ['A'-'Z' 'a'-'z' '_' ':' '<' '>' '~'](*for c++, not used*)
237 let digit  = ['0'-'9']
238
239 (* not used for the moment *)
240 let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
241 		   ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
242 let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
243 let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
244 (* 7 = \a = bell in C. this is not the only char allowed !!
245  * ex @ and $ ` are valid too
246  *)
247
248 let cchar = (letter | digit | punctuation | additionnal)
249
(* mandatory vs optional horizontal whitespace *)
250 let sp = [' ' '\t']+
251 let spopt = [' ' '\t']*
252
253 let dec = ['0'-'9']
254 let oct = ['0'-'7']
255 let hex = ['0'-'9' 'a'-'f' 'A'-'F']
256
(* integer literal bodies, following the C convention: a leading 0
 * selects octal, a 0x/0X prefix selects hexadecimal *)
257 let decimal = ('0' | (['1'-'9'] dec*))
258 let octal   = ['0'] oct+
259 let hexa    = ("0x" |"0X") hex+
260
261
(* floating literals: integer part, fractional part, optional exponent *)
262 let pent   = dec+
263 let pfract = dec+
264 let sign = ['-' '+']
265 let exp  = ['e''E'] sign? dec+
266 let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
267
(* used by the cpp stringification rule (# followed by an id) above *)
268 let id = letter (letter | digit) *
269
270 (*****************************************************************************)
271 rule token = parse
272
273 (* ----------------------------------------------------------------------- *)
274 (* spacing/comments *)
275 (* ----------------------------------------------------------------------- *)
276
277 (* note: this lexer generate tokens for comments!! so can not give
278 * this lexer as-is to the parsing function. The caller must preprocess
279 * it, e.g. by using techniques like cur_tok ref in parse_c.ml.
280 *
281 * update: we now also generate a separate token for newlines, so now
282 * the caller may also have to reagglomerate all those commentspace
283 * tokens if he was assuming that spaces were agglomerate in a single
284 * token.
285 *)
286
287 | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
288 (* starting a new line; the newline character followed by whitespace *)
289 { TCommentNewline (tokinfo lexbuf) }
290 | [' ' '\t' '\r' '\011' '\012' ]+
291 { TCommentSpace (tokinfo lexbuf) }
292 | "/*"
293 { let info = tokinfo lexbuf in
294 let com = comment lexbuf in
295
296 let info' = info +> tok_add_s com in
297 let s = Ast_c.str_of_info info' in
298 (* could be more flexible, use [\t ]* instead of hardcoded
299 * single space. *)
300 match s with
301 | "/* {{coccinelle:skip_start}} */" ->
302 TCommentSkipTagStart (info')
303 | "/* {{coccinelle:skip_end}} */" ->
304 TCommentSkipTagEnd (info')
305 | _ -> TComment(info')
306 }
307
308
309 (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
310 * So need this here only when dont call cpp before.
311 * note that we don't keep the trailing \n; it will be in another token.
312 *)
313 | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
314
315 (* ----------------------------------------------------------------------- *)
316 (* cpp *)
317 (* ----------------------------------------------------------------------- *)
318
319 (* old:
320 * | '#' { endline lexbuf} // should be line, and not endline
321 * and endline = parse | '\n' { token lexbuf}
322 * | _ { endline lexbuf}
323 *)
324
325 (* less?:
326 * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
327 * => just "#" -> token {lexbuf} (that is ignore)
328 * il y'a 1 #elif sans rien apres
329 * il y'a 1 #error sans rien apres
330 * il y'a 2 mov dede, #xxx qui genere du coup exn car
331 * entouré par des #if 0
332 * => make as for comment, call a comment_cpp that when #endif finish the
333 * comment and if other cpp stuff raise exn
334 * il y'a environ 10 #if(xxx) ou le ( est collé direct
335 * il y'a des include"" et include<
336 * il y'a 1 ` (derriere un #ifndef linux)
337 *)
338
339
340
341 (* ---------------------- *)
342 (* misc *)
343 (* ---------------------- *)
344
345 (* bugfix: I want now to keep comments for the cComment study
346 * so cant do: sp [^'\n']+ '\n'
347 * http://gcc.gnu.org/onlinedocs/gcc/Pragmas.html
348 *)
349
350 | "#" spopt "pragma" sp [^'\n']* '\n'
351 | "#" spopt "ident" sp [^'\n']* '\n'
352 | "#" spopt "line" sp [^'\n']* '\n'
353 | "#" spopt "error" sp [^'\n']* '\n'
354 | "#" spopt "warning" sp [^'\n']* '\n'
355 | "#" spopt "abort" sp [^'\n']* '\n'
356 { TCppDirectiveOther (tokinfo lexbuf) }
357
358 | "#" [' ' '\t']* '\n'
359 { TCppDirectiveOther (tokinfo lexbuf) }
360
361 (* only after cpp, ex: # 1 "include/linux/module.h" 1 *)
362 | "#" sp pent sp '"' [^ '"']* '"' (spopt pent)* spopt '\n'
363 { TCppDirectiveOther (tokinfo lexbuf) }
364
365
366
367 (* ---------------------- *)
368 (* #define, #undef *)
369 (* ---------------------- *)
370
371 (* the rest of the lexing/parsing of define is done in fix_tokens_define
372 * where we parse until a TCppEscapedNewline and generate a TDefEol
373 *)
374 | "#" [' ' '\t']* "define" { TDefine (tokinfo lexbuf) }
375
376 (* note: in some cases can have stuff after the ident as in #undef XXX 50,
377 * but I currently don't handle it cos I think it's bad code.
378 *)
379 | "#" [' ' '\t']* "undef" { TUndef (tokinfo lexbuf) }
380
381 (* ---------------------- *)
382 (* #include *)
383 (* ---------------------- *)
384
385 (* The difference between a local "" and standard <> include is computed
386 * later in parser_c.mly. So redo a little bit of lexing there; ugly but
387 * simpler to generate a single token here. *)
388 | (("#" [' ''\t']* "include" [' ' '\t']*) as includes)
389 (('"' ([^ '"']+) '"' |
390 '<' [^ '>']+ '>' |
391 ['A'-'Z''_']+
392 ) as filename)
393 { let info = tokinfo lexbuf in
394 TInclude (includes, filename, Ast_c.noInIfdef(), info)
395 }
396 (* gccext: found in glibc *)
397 | (("#" [' ''\t']* "include_next" [' ' '\t']*) as includes)
398 (('"' ([^ '"']+) '"' |
399 '<' [^ '>']+ '>' |
400 ['A'-'Z''_']+
401 ) as filename)
402 { let info = tokinfo lexbuf in
403 TInclude (includes, filename, Ast_c.noInIfdef(), info)
404 }
405
406 (* ---------------------- *)
407 (* #ifdef *)
408 (* ---------------------- *)
409
410 (* The ifdef_mark will be set later in Parsing_hacks.set_ifdef_parenthize_info
411 * when working on the ifdef view.
412 *)
413
414 (* '0'+ because sometimes it is a #if 000 *)
415 | "#" [' ' '\t']* "if" [' ' '\t']* '0'+ (* [^'\n']* '\n' *)
416 { let info = tokinfo lexbuf in
417 TIfdefBool (false, no_ifdef_mark(), info)
418 (* +> tok_add_s (cpp_eat_until_nl lexbuf)*)
419 }
420
421 | "#" [' ' '\t']* "if" [' ' '\t']* '1' (* [^'\n']* '\n' *)
422 { let info = tokinfo lexbuf in
423 TIfdefBool (true, no_ifdef_mark(), info)
424
425 }
426
427 (* DO NOT cherry pick to lexer_cplusplus !!! often used for the extern "C" { *)
428 | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n']* '\n'
429 { let info = tokinfo lexbuf in
430 TIfdefMisc (false, no_ifdef_mark(), info)
431 }
432
433 (* DO NOT cherry pick to lexer_cplusplus !!! *)
434 | "#" [' ' '\t']* "ifdef" [' ' '\t']* "__cplusplus" [^'\n']* '\n'
435 { let info = tokinfo lexbuf in
436 TIfdefMisc (false, no_ifdef_mark(), info)
437 }
438
439 (* in glibc *)
440 | "#" spopt ("ifdef"|"if") sp "__STDC__"
441 { let info = tokinfo lexbuf in
442 TIfdefVersion (true, no_ifdef_mark(),
443 info +> tok_add_s (cpp_eat_until_nl lexbuf))
444 }
445
446
447 (* linuxext: different possible variations (we do not manage all of them):
448
449 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
450 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,2)
451 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
452 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,3,0)
453 #if LINUX_VERSION_CODE < 0x020600
454 #if LINUX_VERSION_CODE >= 0x2051c
455 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
456 #if !(LINUX_VERSION_CODE > KERNEL_VERSION(2,5,73))
457 #if STREAMER_IOCTL && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
458 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
459 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && \
460 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
461 #if LINUX_VERSION_CODE > LinuxVersionCode(2,3,12)
462 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,93)
463 #ifndef LINUX_VERSION_CODE
464 #if LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,2,0) || \
465 (LINUX_VERSION_CODE > ASC_LINUX_VERSION(2,3,0) && \
466 LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,4,0))
467 #if (KERNEL_VERSION(2,4,0) > LINUX_VERSION_CODE)
468 #if LINUX_VERSION_CODE >= ASC_LINUX_VERSION(1,3,0)
469 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
470
471 *)
472
473 (* linuxext: must be before the generic rules for if and ifdef *)
474 | "#" spopt "if" sp "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
475 { let info = tokinfo lexbuf in
476 TIfdefVersion (true, no_ifdef_mark(),
477 info +> tok_add_s (cpp_eat_until_nl lexbuf))
478 }
479 (* linuxext: *)
480 | "#" spopt "if" sp "!" "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
481 | "#" spopt "if" sp ['(']? "LINUX_VERSION_CODE" sp ("<=" | "<") sp
482
483 { let info = tokinfo lexbuf in
484 TIfdefVersion (false, no_ifdef_mark(),
485 info +> tok_add_s (cpp_eat_until_nl lexbuf))
486 }
487
488
489
490
491 (* can have some ifdef 0 hence the letter|digit even at beginning of word *)
492 | "#" [' ''\t']* "ifdef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
493 { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
494 | "#" [' ''\t']* "ifndef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
495 { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
496 | "#" [' ''\t']* "if" [' ' '\t']+
497 { let info = tokinfo lexbuf in
498 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
499 }
500 | "#" [' ' '\t']* "if" '('
501 { let info = tokinfo lexbuf in
502 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
503 }
504
505 | "#" [' ' '\t']* "elif" [' ' '\t']+
506 { let info = tokinfo lexbuf in
507 TIfdefelif (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
508 }
509
510
511 | "#" [' ''\t']* "endif" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
512 { TEndif (no_ifdef_mark(), tokinfo lexbuf) }
513 (* bugfix: can have #endif LINUX but at the same time if I eat everything
514 * until next line, I may miss some TComment which for some tools
515 * are important such as aComment
516 *)
517 | "#" [' ' '\t']* "endif" (*[^'\n']* '\n'*) {
518 TEndif (no_ifdef_mark(), tokinfo lexbuf)
519 }
520 (* can be at eof *)
521 (*| "#" [' ' '\t']* "endif" { TEndif (tokinfo lexbuf) }*)
522
523 | "#" [' ' '\t']* "else" [' ' '\t' '\n']
524 { TIfdefelse (no_ifdef_mark(), tokinfo lexbuf) }
525
526
527
528
529 (* ---------------------- *)
530 (* #define body *)
531 (* ---------------------- *)
532
533 (* only in cpp directives normally *)
534 | "\\" '\n' { TCppEscapedNewline (tokinfo lexbuf) }
535
536 (* We must generate separate tokens for #, ## and extend the grammar.
537 * Note there can be "elaborated" idents in many different places, in
538 * expression but also in declaration, in function name. So having 3 tokens
539 * for an ident does not work well with how we add info in
540 * ast_c. Was easier to generate just one token, just one info,
541 * even if have later to reanalyse those tokens and unsplit. But then,
542 * handling C++ lead to having not just a string for ident but something
543 * more complex. Also when we want to parse elaborated function headers
544 * (e.g. void METH(foo)(int x)), we need anyway to go from a string
545 * to something more. So having also for C something more than just
546 * string for ident is natural.
547 *
548 * todo: our heuristics in parsing_hacks rely on TIdent. So maybe
549 * an easier solution would be to augment the TIdent type such as
550 * TIdent of string * info * cpp_ident_additionnal_info
551 *
552 * old:
553 * | id ([' ''\t']* "##" [' ''\t']* id)+
554 * { let info = tokinfo lexbuf in
555 * TIdent (tok lexbuf, info)
556 * }
557 * | "##" spopt id
558 * { let info = tokinfo lexbuf in
559 * TIdent (tok lexbuf, info)
560 * }
561 *
562 *)
563 (* cppext: string concatenation of idents, also ##args for variadic macro. *)
564 | "##" { TCppConcatOp (tokinfo lexbuf) }
565
566 (* cppext: stringification.
567 * bugfix: this case must be after the other cases such as #endif
568 * otherwise take precedent.
569 *)
570 | "#" spopt id
571 { let info = tokinfo lexbuf in
572 TIdent (tok lexbuf, info)
573 }
574 (* the ... next to id, e.g. arg..., works with ##, e.g. ##arg *)
575 | ((id as s) "...")
576 { TDefParamVariadic (s, tokinfo lexbuf) }
577
578
579
580
581
582 (* ----------------------------------------------------------------------- *)
583 (* C symbols *)
584 (* ----------------------------------------------------------------------- *)
585 (* stdC:
586 ... && -= >= ~ + ; ]
587 <<= &= -> >> % , < ^
588 >>= *= /= ^= & - = {
589 != ++ << |= ( . > |
590 %= += <= || ) / ? }
591 -- == ! * : [
592 recent addition: <: :> <% %>
593 only at processing: %: %:%: # ##
594 *)
595
596
597 | '[' { TOCro(tokinfo lexbuf) } | ']' { TCCro(tokinfo lexbuf) }
598 | '(' { TOPar(tokinfo lexbuf) } | ')' { TCPar(tokinfo lexbuf) }
599 | '{' { TOBrace(tokinfo lexbuf) } | '}' { TCBrace(tokinfo lexbuf) }
600
601 | '+' { TPlus(tokinfo lexbuf) } | '*' { TMul(tokinfo lexbuf) }
602 | '-' { TMinus(tokinfo lexbuf) } | '/' { TDiv(tokinfo lexbuf) }
603 | '%' { TMod(tokinfo lexbuf) }
604
605 | "++"{ TInc(tokinfo lexbuf) } | "--"{ TDec(tokinfo lexbuf) }
606
607 | "=" { TEq(tokinfo lexbuf) }
608
609 | "-=" { TAssign (OpAssign Minus, (tokinfo lexbuf))}
610 | "+=" { TAssign (OpAssign Plus, (tokinfo lexbuf))}
611 | "*=" { TAssign (OpAssign Mul, (tokinfo lexbuf))}
612 | "/=" { TAssign (OpAssign Div, (tokinfo lexbuf))}
613 | "%=" { TAssign (OpAssign Mod, (tokinfo lexbuf))}
614 | "&=" { TAssign (OpAssign And, (tokinfo lexbuf))}
615 | "|=" { TAssign (OpAssign Or, (tokinfo lexbuf)) }
616 | "^=" { TAssign (OpAssign Xor, (tokinfo lexbuf))}
617 | "<<=" {TAssign (OpAssign DecLeft, (tokinfo lexbuf)) }
618 | ">>=" {TAssign (OpAssign DecRight, (tokinfo lexbuf))}
619
620 | "==" { TEqEq(tokinfo lexbuf) } | "!=" { TNotEq(tokinfo lexbuf) }
621 | ">=" { TSupEq(tokinfo lexbuf) } | "<=" { TInfEq(tokinfo lexbuf) }
622 | "<" { TInf(tokinfo lexbuf) } | ">" { TSup(tokinfo lexbuf) }
623
624 | "&&" { TAndLog(tokinfo lexbuf) } | "||" { TOrLog(tokinfo lexbuf) }
625 | ">>" { TShr(tokinfo lexbuf) } | "<<" { TShl(tokinfo lexbuf) }
626 | "&" { TAnd(tokinfo lexbuf) } | "|" { TOr(tokinfo lexbuf) }
627 | "^" { TXor(tokinfo lexbuf) }
628 | "..." { TEllipsis(tokinfo lexbuf) }
629 | "->" { TPtrOp(tokinfo lexbuf) } | '.' { TDot(tokinfo lexbuf) }
630 | ',' { TComma(tokinfo lexbuf) }
631 | ";" { TPtVirg(tokinfo lexbuf) }
632 | "?" { TWhy(tokinfo lexbuf) } | ":" { TDotDot(tokinfo lexbuf) }
633 | "!" { TBang(tokinfo lexbuf) } | "~" { TTilde(tokinfo lexbuf) }
634
635 | "<:" { TOCro(tokinfo lexbuf) } | ":>" { TCCro(tokinfo lexbuf) }
636 | "<%" { TOBrace(tokinfo lexbuf) } | "%>" { TCBrace(tokinfo lexbuf) }
637
638
639
640 (* ----------------------------------------------------------------------- *)
641 (* C keywords and ident *)
642 (* ----------------------------------------------------------------------- *)
643
644 (* StdC: must handle at least name of length > 509, but can
645 * truncate to 31 when compare and truncate to 6 and even lowerise
646 * in the external linkage phase
647 *)
648 | letter (letter | digit) *
649 { let info = tokinfo lexbuf in
650 let s = tok lexbuf in
651 Common.profile_code "C parsing.lex_ident" (fun () ->
652 let tok =
653 if !Flag.c_plus_plus
654 then Common.optionise (fun () -> Hashtbl.find cpp_keyword_table s)
655 else None in
656 match tok with
657 Some f -> f info
658 | None ->
659 match Common.optionise (fun () -> Hashtbl.find keyword_table s)
660 with
661 | Some f -> f info
662
663 (* parse_typedef_fix.
664 * if Lexer_parser.is_typedef s
665 * then TypedefIdent (s, info)
666 * else TIdent (s, info)
667 *
668 * update: now this is no more useful, cos
669 * as we use tokens_all, it first parse all as an ident and
670 * later transform an indent in a typedef. so the typedef job is
671 * now done in parse_c.ml.
672 *)
673
674 | None -> TIdent (s, info)
675 )
676 }
677 (* gccext: apparently gcc allows dollar in variable names. found such
678 * thing a few time in linux and in glibc. No need look in keyword_table
679 * here.
680 *)
681 | (letter | '$') (letter | digit | '$') *
682 {
683 let info = tokinfo lexbuf in
684 let s = tok lexbuf in
685 pr2 ("LEXER: identifier with dollar: " ^ s);
686 TIdent (s, info)
687 }
688 | (letter | '$') (letter | digit | '$') *
689 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?
690 ("::~" (letter | '$') (letter | digit | '$') *
691 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?) +
692
693 {
694 if !Flag.c_plus_plus
695 then
696 begin
697 let info = tokinfo lexbuf in
698 let s = tok lexbuf in
699 Tconstructorname (s, info)
700 end
701 else
702 raise
703 (Lexical "~ and :: not allowed in C identifiers, try -c++ option")
704 }
705 | ((letter | '$') (letter | digit | '$') *)
706 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>')
707
708 {
709 if !Flag.c_plus_plus
710 then
711 begin
712 let info = tokinfo lexbuf in
713 let s = tok lexbuf in
714 TypedefIdent (s, info)
715 end
716 else raise (Lexical "<> detected, try -c++ option")
717 }
718
719
720 | (((letter | '$') (letter | digit | '$') *) as first)
721 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?
722 "::" (((letter | '$') (letter | digit | '$') *) as second)
723 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?
724 ("::" ((letter | '$') (letter | digit | '$') *)
725 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?) *
726
727 {
728 if !Flag.c_plus_plus
729 then
730 begin
731 let info = tokinfo lexbuf in
732 let s = tok lexbuf in
733 if first = second
734 then Tconstructorname (s, info)
735 else TIdent (s, info)
736 end
737 else
738 raise
739 (Lexical "~ and :: not allowed in C identifiers, try -c++ option")
740 }
741
742 | "::" ((letter | '$') (letter | digit | '$') *)
743 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?
744 ("::" ((letter | '$') (letter | digit | '$') *)
745 ('<' (letter | '$' | '~') (letter | digit | '$' | '~') * '>') ?) *
746
747 {
748 if !Flag.c_plus_plus
749 then
750 begin
751 let info = tokinfo lexbuf in
752 let s = tok lexbuf in
753 TIdent (s, info)
754 end
755 else
756 raise
757 (Lexical "~ and :: not allowed in C identifiers, try -c++ option")
758 }
759
760
761 (* ----------------------------------------------------------------------- *)
762 (* C constant *)
763 (* ----------------------------------------------------------------------- *)
764
(* Character and string literals.  The sub-lexers [char] and [string]
 * consume up to and including the closing delimiter but return only the
 * contents, so the delimiter is appended back into the stored token
 * text.  [tokinfo] must be captured before running the sub-lexer, since
 * both side-effect the same lexbuf (see the warning at the top). *)
765   | "'"
766       { let info = tokinfo lexbuf in
767         let s = char lexbuf   in
768         TChar     ((s,   IsChar),  (info +> tok_add_s (s ^ "'")))
769       }
770   | '"'
771       { let info = tokinfo lexbuf in
772         let s = string lexbuf in
773         TString   ((s,   IsChar),  (info +> tok_add_s (s ^ "\"")))
774       }
775   (* wide character encoding, TODO L'toto' valid ? what is allowed ? *)
776   | 'L' "'"
777       { let info = tokinfo lexbuf in
778         let s = char lexbuf   in
779         TChar     ((s,   IsWchar),  (info +> tok_add_s (s ^ "'")))
780       }
781   | 'L' '"'
782       { let info = tokinfo lexbuf in
783         let s = string lexbuf in
784         TString   ((s,   IsWchar),  (info +> tok_add_s (s ^ "\"")))
785       }
786
787
788 (* Take care of the order ? No because lex tries the longest match. The
789 * strange diff between decimal and octal constant semantic is not
790 * understood too by refman :) refman:11.1.4, and ritchie.
791 *)
792
(* Integer literals.  The is_long_* helpers above choose the C integer
 * type from the literal's magnitude.  For unsuffixed decimal literals
 * the uint slot is deliberately filled with slong: in C99 an unsuffixed
 * decimal constant never acquires an unsigned type. *)
793   | decimal as x
794       { TInt ((x, is_long_dec x sint slong slong ulong), tokinfo lexbuf) }
795   | hexa as x
796       { TInt ((x, is_long_hex x sint uint slong ulong), tokinfo lexbuf) }
797   | octal as x
798       { TInt ((x, is_long_oct x sint uint slong ulong), tokinfo lexbuf) }
(* u/U suffix: the type is at least unsigned int *)
799   | ((decimal as s) ['u' 'U']) as x
800       { TInt ((x, is_long_dec s uint uint ulong ulong), tokinfo lexbuf) }
801   | ((hexa as s) ['u' 'U']) as x
802       { TInt ((x, is_long_hex s uint uint ulong ulong), tokinfo lexbuf) }
803   | ((octal as s) ['u' 'U']) as x
804       { TInt ((x, is_long_oct s uint uint ulong ulong), tokinfo lexbuf) }
(* l/L suffix: the type is at least long *)
805   | (( decimal as s) ['l' 'L']) as x
806       { TInt ((x, is_long_dec s slong slong slong ulong), tokinfo lexbuf) }
807   | ((hexa as s) ['l' 'L']) as x
808       { TInt ((x, is_long_hex s slong slong slong ulong), tokinfo lexbuf) }
809   | ((octal as s) ['l' 'L']) as x
810       { TInt ((x, is_long_oct s slong slong slong ulong), tokinfo lexbuf) }
(* combined suffixes fully determine the type; no threshold check needed *)
811   | ((( decimal | hexa | octal) ['l' 'L'] ['u' 'U'])
812     | (( decimal | hexa | octal) ['u' 'U'] ['l' 'L'])) as x
813       { TInt ((x, (UnSigned,CLong)), tokinfo lexbuf) }
814   | (( decimal | hexa | octal) ['l' 'L'] ['l' 'L']) as x
815       { TInt ((x, (Signed,CLongLong)), tokinfo lexbuf) }
816   | (( decimal | hexa | octal) ['u' 'U'] ['l' 'L'] ['l' 'L']) as x
817       { TInt ((x, (UnSigned,CLongLong)), tokinfo lexbuf) }
818
819 | (real ['f' 'F']) as x { TFloat ((x, CFloat), tokinfo lexbuf) }
820 | (real ['l' 'L']) as x { TFloat ((x, CLongDouble), tokinfo lexbuf) }
821 | (real as x) { TFloat ((x, CDouble), tokinfo lexbuf) }
822
823 | ['0'] ['0'-'9']+
824 { pr2 ("LEXER: " ^ error_radix "octal" ^ tok lexbuf);
825 TUnknown (tokinfo lexbuf)
826 }
827 | ("0x" |"0X") ['0'-'9' 'a'-'z' 'A'-'Z']+
828 { pr2 ("LEXER: " ^ error_radix "hexa" ^ tok lexbuf);
829 TUnknown (tokinfo lexbuf)
830 }
831
832
833 (* !!! to put after other rules !!! otherwise 0xff
834 * will be parsed as an ident.
835 *)
836 | ['0'-'9']+ letter (letter | digit) *
837 { pr2 ("LEXER: ZARB integer_string, certainly a macro:" ^ tok lexbuf);
838 TIdent (tok lexbuf, tokinfo lexbuf)
839 }
840
841 (* gccext: http://gcc.gnu.org/onlinedocs/gcc/Binary-constants.html *)
842 (*
843 | "0b" ['0'-'1'] { TInt (((tok lexbuf)<!!>(??,??)) +> int_of_stringbits) }
844 | ['0'-'1']+'b' { TInt (((tok lexbuf)<!!>(0,-2)) +> int_of_stringbits) }
845 *)
846
847
848 (*------------------------------------------------------------------------ *)
849 | eof { EOF (tokinfo lexbuf +> Ast_c.rewrap_str "") }
850
851 | _
852 {
853 if !Flag_parsing_c.verbose_lexing
854 then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
855 TUnknown (tokinfo lexbuf)
856 }
857
858
859
860 (*****************************************************************************)
(* Lexes the remainder of a character literal; the opening quote was
 * consumed by the caller.  Consumes the closing quote but returns only
 * the contents, with escape sequences kept verbatim. *)
861 and char = parse
862   | (_ as x) "'"                                     { String.make 1 x }
863   (* todo?: as for octal, do exception  beyond radix exception ? *)
864   | (("\\" (oct | oct oct | oct oct oct)) as x "'") { x }
865   (* this rule must be after the one with octal, lex try first longest
866    * and when \7  we want an octal, not an exn.
867    *)
868   | (("\\x" ((hex | hex hex))) as x "'")       { x }
869   | (("\\" (_ as v)) as x "'")
870 	{
(* known escapes pass silently; anything else is only warned about, and
 * the escape text is returned unchanged either way *)
871           (match v with (* Machine specific ? *)
872           | 'n' -> ()  | 't' -> ()  | 'v' -> ()  | 'b' -> () | 'r' -> ()
873           | 'f' -> () | 'a' -> ()
874 	  | '\\' -> () | '?'  -> () | '\'' -> ()  | '"' -> ()
875           | 'e' -> () (* linuxext: ? *)
876 	  | _ ->
877               pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
878 	  );
879           x
880 	}
(* fallback: note that no closing quote has been consumed here, so the
 * caller may be left in the middle of the literal; only a diagnostic is
 * emitted *)
881   | _
882       { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
883         tok lexbuf
884       }
885
886
887
888 (*****************************************************************************)
889
890 (* todo? factorise code with char ? but not same ending token so hard. *)
(* Lexes the remainder of a string literal; the opening quote was
 * consumed by the caller.  Consumes the closing quote but returns only
 * the contents, with escape sequences kept verbatim.  (Each character
 * costs one string concatenation.) *)
891 and string  = parse
892   | '"'                                       { "" }
(* single character; ocamllex longest-match makes the multi-character
 * escape rules below win over this one when a backslash is seen *)
893   | (_ as x)                                  { string_of_char x^string lexbuf}
894   | ("\\" (oct | oct oct | oct oct oct)) as x { x ^ string lexbuf }
895   | ("\\x" (hex | hex hex)) as x              { x ^ string lexbuf }
896   | ("\\" (_ as v)) as x
897        {
898          (match v with (* Machine specific ? *)
899          | 'n' -> ()  | 't' -> ()  | 'v' -> ()  | 'b' -> () | 'r' -> ()
900          | 'f' -> () | 'a' -> ()
901 	 | '\\' -> () | '?'  -> () | '\'' -> ()  | '"' -> ()
902          | 'e' -> () (* linuxext: ? *)
903
904          (* old: "x" -> 10 gccext ? todo ugly, I put a fake value *)
905
906          (* cppext:  can have   \ for multiline in string too *)
907          | '\n' -> ()
908          | _ -> pr2 ("LEXER: unrecognised symbol in string:"^tok lexbuf);
909 	 );
910           x ^ string lexbuf
911        }
912
913   | eof { pr2 "LEXER: WIERD end of file in string"; ""}
914
915 (* Bug if add following code, cos match also the '"' that is needed
916 * to finish the string, and so go until end of file.
917 *)
918 (*
919 | [^ '\\']+
920 { let cs = lexbuf +> tok +> list_of_string +> List.map Char.code in
921 cs ++ string lexbuf
922 }
923 *)
924
925
926
927 (*****************************************************************************)
928
929 (* less: allow only char-'*' ? *)
(* Lexes the body of a C comment; the opening slash-star was consumed by
 * the caller.  Returns the body text including the closing star-slash;
 * at end of file the comment is force-closed after a diagnostic. *)
930 and comment = parse
931   | "*/"     { tok lexbuf }
932   (* noteopti: *)
933   | [^ '*']+ { let s = tok lexbuf in s ^ comment lexbuf }
934   | [ '*']   { let s = tok lexbuf in s ^ comment lexbuf }
935   | eof { pr2 "LEXER: end of file in comment"; "*/"}
936   | _
937       { let s = tok lexbuf in
938         pr2 ("LEXER: unrecognised symbol in comment:"^s);
939         s ^ comment lexbuf
940       }
941
942
943
944 (*****************************************************************************)
945
946 (* cpp recognize C comments, so when #define xx (yy) /* comment \n ... */
947 * then he has already erased the /* comment. So:
948 * - dont eat the start of the comment otherwise afterwards we are in the middle
949 * of a comment and so will problably get a parse error somewhere.
950 * - have to recognize comments in cpp_eat_until_nl.
951 *)
952
(* Consumes the rest of a cpp directive line -- following
 * backslash-newline continuations and skipping over embedded C comments
 * -- and returns all the text eaten, terminating newline included. *)
953 and cpp_eat_until_nl = parse
954   (* bugfix: *)
955   | "/*"
956       { let s = tok lexbuf in
957         let s2 = comment lexbuf in
958         let s3 = cpp_eat_until_nl lexbuf in
959         s ^ s2 ^ s3
960       }
961   | '\\' "\n" { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
962
963   | "\n"      { tok lexbuf }
964   (* noteopti:
965    * update: need also deal with comments chars now
966    *)
967   | [^ '\n' '\\'      '/' '*'  ]+
968      { let s = tok lexbuf in s ^  cpp_eat_until_nl lexbuf }
969   | eof { pr2 "LEXER: end of file in cpp_eat_until_nl"; ""}
970   | _   { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }