permit multiline comments and strings in macros
[bpt/coccinelle.git] / parsing_c / lexer_c.mll
1 {
2 (* Yoann Padioleau
3 *
4 * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License (GPL)
8 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * file license.txt for more details.
14 *)
15 open Common
16
17 open Parser_c
18
19 open Ast_c (* to factorise tokens, OpAssign, ... *)
20
21 (*****************************************************************************)
22 (*
23 * subtle: ocamllex uses side effects on lexbuf, so one must take care.
24 * For instance must do
25 *
26 * let info = tokinfo lexbuf in
27 * TComment (info +> tok_add_s (comment lexbuf))
28 *
29 * and not
30 *
31 * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
32 *
33 * because of the "weird" (unspecified) order of evaluation of OCaml.
34 *
35 *
36 *
37 * note: can't use Lexer_parser._lexer_hint here to do different
38 * things, because now we call the lexer to get all the tokens
39 * (tokens_all), and then we parse. So we can't have the _lexer_hint
40 * info here. We can have it only in parse_c. For the same reason, the
41 * typedef handling here is now useless.
42 *)
43 (*****************************************************************************)
44
45 (*****************************************************************************)
46 (* Wrappers *)
47 (*****************************************************************************)
(* Diagnostic printers: [pr2] prints a message, [pr2_once] prints each
 * distinct message at most once; both are gated by
 * Flag_parsing_c.verbose_lexing. *)
48 let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing
49
50 (*****************************************************************************)
51
52
(* Raised on unrecoverable lexing errors; carries a diagnostic message. *)
53 exception Lexical of string
54
(* [tok lexbuf] returns the text of the lexeme just matched. *)
55 let tok lexbuf = Lexing.lexeme lexbuf
56
(* [tokinfo lexbuf] builds a fresh token-info record for the lexeme just
 * matched: the byte offset and matched text are filled in now; line,
 * column and file are left as placeholders (-1, -1, "") and completed in
 * a post-lexing phase.  Each mutable tag field gets its own fresh [ref]
 * so that distinct tokens never share state. *)
57 let tokinfo lexbuf =
58 {
59 pinfo = Ast_c.OriginTok {
60 Common.charpos = Lexing.lexeme_start lexbuf;
61 Common.str = Lexing.lexeme lexbuf;
62 (* info filled in a post-lexing phase *)
63 Common.line = -1;
64 Common.column = -1;
65 Common.file = "";
66 };
67 (* must generate a new ref each time, otherwise share *)
68 cocci_tag = ref Ast_c.emptyAnnot;
69 annots_tag = Token_annot.empty;
70 comments_tag = ref Ast_c.emptyComments;
71 }
72
73 (* cppext: must generate a new ref each time, otherwise share *)
(* Fresh, initially-empty mark used to pair up #if/#endif tokens; it is
 * filled in later (see the #ifdef rules below, which pass it to the
 * TIfdef* constructors). *)
74 let no_ifdef_mark () = ref (None: (int * int) option)
75
(* [tok_add_s s ii] appends [s] to the string stored in token info [ii]
 * (used to glue multi-rule lexemes, e.g. a comment body, onto the token
 * that started them). *)
76 let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
77
78
79 (* opti: less convenient, but using a hash is faster than using a match *)
(* Maps C keywords — plus gcc, msvc/windows and c99 extension spellings —
 * to functions building the corresponding token from a token info.
 * Identifiers not present here are lexed as plain TIdent by the rules
 * below. *)
80 let keyword_table = Common.hash_of_list [
81
82 (* c: *)
83 "void", (fun ii -> Tvoid ii);
84 "char", (fun ii -> Tchar ii);
85 "short", (fun ii -> Tshort ii);
86 "int", (fun ii -> Tint ii);
87 "long", (fun ii -> Tlong ii);
88 "float", (fun ii -> Tfloat ii);
89 "double", (fun ii -> Tdouble ii);
90 "size_t", (fun ii -> Tsize_t ii);
91 "ssize_t", (fun ii -> Tssize_t ii);
92 "ptrdiff_t", (fun ii -> Tptrdiff_t ii);
93
94 "unsigned", (fun ii -> Tunsigned ii);
95 "signed", (fun ii -> Tsigned ii);
96
97 "auto", (fun ii -> Tauto ii);
98 "register", (fun ii -> Tregister ii);
99 "extern", (fun ii -> Textern ii);
100 "static", (fun ii -> Tstatic ii);
101
102 "const", (fun ii -> Tconst ii);
103 "volatile", (fun ii -> Tvolatile ii);
104
105 "struct", (fun ii -> Tstruct ii);
106 "union", (fun ii -> Tunion ii);
107 "enum", (fun ii -> Tenum ii);
108 "typedef", (fun ii -> Ttypedef ii);
109
110 "if", (fun ii -> Tif ii);
111 "else", (fun ii -> Telse ii);
112 "break", (fun ii -> Tbreak ii);
113 "continue", (fun ii -> Tcontinue ii);
114 "switch", (fun ii -> Tswitch ii);
115 "case", (fun ii -> Tcase ii);
116 "default", (fun ii -> Tdefault ii);
117 "for", (fun ii -> Tfor ii);
118 "do", (fun ii -> Tdo ii);
119 "while", (fun ii -> Twhile ii);
120 "return", (fun ii -> Treturn ii);
121 "goto", (fun ii -> Tgoto ii);
122
123 "sizeof", (fun ii -> Tsizeof ii);
124
125
126 (* gccext: cppext: linuxext: synonyms *)
127 "asm", (fun ii -> Tasm ii);
128 "__asm__", (fun ii -> Tasm ii);
129 "__asm", (fun ii -> Tasm ii);
130
131 "inline", (fun ii -> Tinline ii);
132 "__inline__", (fun ii -> Tinline ii);
133 "__inline", (fun ii -> Tinline ii);
134
135 "__attribute__", (fun ii -> Tattribute ii);
136 "__attribute", (fun ii -> Tattribute ii);
137
138 "typeof", (fun ii -> Ttypeof ii);
139 "__typeof__", (fun ii -> Ttypeof ii);
140 "__typeof", (fun ii -> Ttypeof ii);
141
142 (* found a lot in expanded code *)
143 "__extension__", (fun ii -> TattributeNoarg ii);
144
145
146 (* gccext: alias *)
147 "__signed__", (fun ii -> Tsigned ii);
148
149 "__const__", (fun ii -> Tconst ii);
150 "__const", (fun ii -> Tconst ii);
151
152 "__volatile__", (fun ii -> Tvolatile ii);
153 "__volatile", (fun ii -> Tvolatile ii);
154
155 (* windowsext: *)
156 "__declspec", (fun ii -> Tattribute ii);
157
158 "__stdcall", (fun ii -> TattributeNoarg ii);
159 "__cdecl", (fun ii -> TattributeNoarg ii);
160 "WINAPI", (fun ii -> TattributeNoarg ii);
161 "APIENTRY", (fun ii -> TattributeNoarg ii);
162 "CALLBACK", (fun ii -> TattributeNoarg ii);
163
164 (* c99: *)
165 (* no just "restrict" ? maybe for backward compatibility they avoided
166 * to use restrict which people may have used in their program already
167 *)
168 "__restrict", (fun ii -> Trestrict ii);
169 "__restrict__", (fun ii -> Trestrict ii);
170
171 ]
172
(* Extra keywords recognised only when the -c++ flag is set (consulted
 * before [keyword_table] in the identifier rule below).  Note the
 * deliberate hack: "using" is mapped to TComment, i.e. the whole
 * directive is effectively discarded by the parser. *)
173 let cpp_keyword_table = Common.hash_of_list [
174 "namespace", (fun ii -> Tnamespace ii);
175 "new", (fun ii -> Tnew ii);
176 "delete", (fun ii -> Tdelete ii);
177 "using", (fun ii -> TComment ii) ]
178
(* [error_radix kind] builds the message prefix for a malformed numeric
 * literal of the given [kind] ("octal" or "hexa"); the offending lexeme
 * is appended by the caller. *)
179 let error_radix s =
180 ("numeric " ^ s ^ " constant contains digits beyond the radix:")
181
182 (* julia: functions for figuring out the type of integers *)
183
(* [is_long_dec s int uint long ulong] selects one of the four candidate
 * results for the decimal literal [s], by comparing its value (as a
 * Big_int) against the configured thresholds; returns [int] when no
 * thresholds are configured.  Note: when the value only exceeds the
 * uint threshold the result is [long], not [uint] — the [uint] argument
 * is never used below.  This matches C's rule that an unsuffixed decimal
 * constant never gets an unsigned type (it goes int -> long -> ...);
 * the parameter is presumably kept for symmetry with the hex/oct
 * variants. *)
184 let is_long_dec s int uint long ulong =
185 match !Flag_parsing_c.int_thresholds with
186 None -> int
187 | Some (_,_,uint_threshold,long_threshold,ulong_threshold) ->
188 let bn = Big_int.big_int_of_string s in
189 if Big_int.ge_big_int bn ulong_threshold
190 then ulong
191 else
192 if Big_int.ge_big_int bn long_threshold
193 then long
194 else
195 if Big_int.ge_big_int bn uint_threshold
196 then long
197 else int
198
(* [is_long_ho s int uint long ulong drop bpd count] classifies a
 * hex/octal literal [s] by its significant bit length rather than its
 * value: [drop] is the prefix length to strip ("0x" = 2, "0" = 1),
 * [bpd] the number of bits per digit (4 for hex, 3 for oct), and
 * [count] gives the number of leading zero bits of the first digit
 * (which is parsed as a hex digit — valid for octal digits too).
 * The bit length is then compared with the configured uint/ulong sizes;
 * returns [int] when no thresholds are configured. *)
199 let is_long_ho s int uint long ulong drop bpd count =
200 match !Flag_parsing_c.int_thresholds with
201 None -> int
202 | Some (uint_sz,ulong_sz,_,_,_) ->
203 let len = String.length s in
204 (* this assumes that all of the hex/oct digits are significant *)
205 (* drop is 2 for hex (0x) and 1 for oct (0) *)
206 let s = String.sub s drop (len - drop) in
207 let len =
208 ((len-drop) * bpd) -
209 (count (int_of_string("0x"^(String.sub s 0 1)))) in
210 if len < uint_sz
211 then int
212 else
213 if len = uint_sz
214 then uint
215 else
216 if len < ulong_sz
217 then long
218 else ulong
219
(* Octal instantiation of [is_long_ho]: strip 1 prefix char ("0"),
 * 3 bits per digit; the function argument maps the leading digit to its
 * number of leading zero bits within those 3 bits. *)
220 let is_long_oct s int uint long ulong =
221 is_long_ho s int uint long ulong 1 3
222 (* stupid, but probably more efficient than taking logs *)
223 (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0)
(* Hex instantiation: strip 2 prefix chars ("0x"/"0X"), 4 bits per digit,
 * leading-zero-bit table over 4 bits. *)
224 let is_long_hex s int uint long ulong =
225 is_long_ho s int uint long ulong 2 4
226 (* stupid, but probably more efficient than taking logs *)
227 (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1
228 | _ -> 0)
229
(* Shorthands for the four (signedness, size) pairs passed to the
 * is_long_* classifiers in the integer-constant rules below. *)
230 let sint = (Signed,CInt)
231 let uint = (UnSigned,CInt)
232 let slong = (Signed,CLong)
233 let ulong = (UnSigned,CLong)
234
235 }
236
237 (*****************************************************************************)
(* Named character classes and regular-expression abbreviations shared by
 * the lexing rules below. *)
238 let letter = ['A'-'Z' 'a'-'z' '_']
239 let extended_letter = ['A'-'Z' 'a'-'z' '_' ':' '<' '>' '~'](*for c++, not used*)
240 let digit = ['0'-'9']
241
(* gccext: '$' is allowed in identifiers; cplusplus_ident_ext also admits
 * '~' (destructor names). *)
242 let cplusplus_ident = (letter | '$') (letter | digit | '$') *
243 let cplusplus_ident_ext = (letter | '~' | '$') (letter | digit | '~' | '$') *
244
245 (* not used for the moment *)
246 let punctuation = ['!' '\"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
247 ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
248 let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
249 let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
250 (* 7 = \a = bell in C. this is not the only char allowed !!
251 * ex @ and $ ` are valid too
252 *)
253
254 let cchar = (letter | digit | punctuation | additionnal)
255
(* sp: mandatory whitespace; spopt: optional whitespace (no newlines). *)
256 let sp = [' ' '\t']+
257 let spopt = [' ' '\t']*
258
259 let dec = ['0'-'9']
260 let oct = ['0'-'7']
261 let hex = ['0'-'9' 'a'-'f' 'A'-'F']
262
263 let decimal = ('0' | (['1'-'9'] dec*))
264 let octal = ['0'] oct+
265 let hexa = ("0x" |"0X") hex+
266
267
(* Components of floating-point literals: integer part, fraction,
 * exponent. *)
268 let pent = dec+
269 let pfract = dec+
270 let sign = ['-' '+']
271 let exp = ['e''E'] sign? dec+
272 let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
273
274 let id = letter (letter | digit) *
275
276 (*****************************************************************************)
(* Main entry point: returns the next token, including explicit tokens
 * for whitespace, newlines, comments and cpp directives (the caller is
 * expected to filter/reagglomerate them).  Case order matters: cpp
 * directive cases must precede the generic "#" stringification case,
 * and the "macro-looking number" case must come after the numeric
 * literal cases. *)
277 rule token = parse
278
279 (* ----------------------------------------------------------------------- *)
280 (* spacing/comments *)
281 (* ----------------------------------------------------------------------- *)
282
283 (* note: this lexer generate tokens for comments!! so can not give
284 * this lexer as-is to the parsing function. The caller must preprocess
285 * it, e.g. by using techniques like cur_tok ref in parse_c.ml.
286 *
287 * update: we now also generate a separate token for newlines, so now
288 * the caller may also have to reagglomerate all those commentspace
289 * tokens if he was assuming that spaces were agglomerate in a single
290 * token.
291 *)
292
293 | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
294 (* starting a new line; the newline character followed by whitespace *)
295 { TCommentNewline (tokinfo lexbuf) }
296 | [' ' '\t' '\r' '\011' '\012' ]+
297 { TCommentSpace (tokinfo lexbuf) }
298 | "/*"
299 { let info = tokinfo lexbuf in
300 let com = comment lexbuf in
301
302 let info' = info +> tok_add_s com in
303 let s = Ast_c.str_of_info info' in
304 (* could be more flexible, use [\t ]* instead of hardcoded
305 * single space. *)
306 match s with
307 | "/* {{coccinelle:skip_start}} */" ->
308 TCommentSkipTagStart (info')
309 | "/* {{coccinelle:skip_end}} */" ->
310 TCommentSkipTagEnd (info')
311 | _ -> TComment(info')
312 }
313
314
315 (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
316 * So need this here only when dont call cpp before.
317 * note that we don't keep the trailing \n; it will be in another token.
318 *)
319 | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
320
321 (* ----------------------------------------------------------------------- *)
322 (* cpp *)
323 (* ----------------------------------------------------------------------- *)
324
325 (* old:
326 * | '#' { endline lexbuf} // should be line, and not endline
327 * and endline = parse | '\n' { token lexbuf}
328 * | _ { endline lexbuf}
329 *)
330
331 (* less?:
332 * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
333 * => just "#" -> token {lexbuf} (that is ignore)
334 * there is 1 #elif with nothing after it
335 * there is 1 #error with nothing after it
336 * there are 2 "mov dede, #xxx" that consequently raise an exception
337 * because they are surrounded by #if 0
338 * => make as for comment, call a comment_cpp that when #endif finish the
339 * comment and if other cpp stuff raise exn
340 * there are about 10 #if(xxx) where the ( is glued right after
341 * there are some include"" and include<
342 * there is 1 ` (behind a #ifndef linux)
343 *)
344
345
346
347 (* ---------------------- *)
348 (* misc *)
349 (* ---------------------- *)
350
351 (* bugfix: I want now to keep comments for the cComment study
352 * so cant do: sp [^'\n']+ '\n'
353 * http://gcc.gnu.org/onlinedocs/gcc/Pragmas.html
354 *)
355
(* Whole-line directives the parser does not interpret further. *)
356 | "#" spopt "pragma" sp [^'\n' '\r']* ('\n' | "\r\n")
357 | "#" spopt "ident" sp [^'\n' '\r']* ('\n' | "\r\n")
358 | "#" spopt "line" sp [^'\n' '\r']* ('\n' | "\r\n")
359 | "#" spopt "error" sp [^'\n' '\r']* ('\n' | "\r\n")
360 | "#" spopt "warning" sp [^'\n' '\r']* ('\n' | "\r\n")
361 | "#" spopt "abort" sp [^'\n' '\r']* ('\n' | "\r\n")
362 { TCppDirectiveOther (tokinfo lexbuf) }
363
364 | "#" [' ' '\t']* ('\n' | "\r\n")
365 { TCppDirectiveOther (tokinfo lexbuf) }
366
367 (* only after cpp, ex: # 1 "include/linux/module.h" 1 *)
368 | "#" sp pent sp '\"' [^ '\"']* '\"' (spopt pent)* spopt ('\n' | "\r\n")
369 { TCppDirectiveOther (tokinfo lexbuf) }
370
371
372
373 (* ---------------------- *)
374 (* #define, #undef *)
375 (* ---------------------- *)
376
377 (* the rest of the lexing/parsing of define is done in fix_tokens_define
378 * where we parse until a TCppEscapedNewline and generate a TDefEol
379 *)
380 | "#" [' ' '\t']* "define" { TDefine (tokinfo lexbuf) }
381
382 (* note: in some cases can have stuff after the ident as in #undef XXX 50,
383 * but I currently don't handle it cos I think it's bad code.
384 *)
385 | "#" [' ' '\t']* "undef" { TUndef (tokinfo lexbuf) }
386
387 (* ---------------------- *)
388 (* #include *)
389 (* ---------------------- *)
390
391 (* The difference between a local "" and standard <> include is computed
392 * later in parser_c.mly. So redo a little bit of lexing there; ugly but
393 * simpler to generate a single token here. *)
394 | (("#" [' ''\t']* "include" [' ' '\t']*) as includes)
395 (('\"' ([^ '\"']+) '\"' |
396 '<' [^ '>']+ '>' |
397 ['A'-'Z''_']+
398 ) as filename)
399 { let info = tokinfo lexbuf in
400 TInclude (includes, filename, Ast_c.noInIfdef(), info)
401 }
402 (* gccext: found in glibc *)
403 | (("#" [' ''\t']* "include_next" [' ' '\t']*) as includes)
404 (('\"' ([^ '\"']+) '\"' |
405 '<' [^ '>']+ '>' |
406 ['A'-'Z''_']+
407 ) as filename)
408 { let info = tokinfo lexbuf in
409 TInclude (includes, filename, Ast_c.noInIfdef(), info)
410 }
411
412 (* ---------------------- *)
413 (* #ifdef *)
414 (* ---------------------- *)
415
416 (* The ifdef_mark will be set later in Parsing_hacks.set_ifdef_parenthize_info
417 * when working on the ifdef view.
418 *)
419
420 (* '0'+ because sometimes it is a #if 000 *)
421 | "#" [' ' '\t']* "if" [' ' '\t']* '0'+ (* [^'\n']* '\n' *)
422 { let info = tokinfo lexbuf in
423 TIfdefBool (false, no_ifdef_mark(), info)
424 (* +> tok_add_s (cpp_eat_until_nl lexbuf)*)
425 }
426
427 | "#" [' ' '\t']* "if" [' ' '\t']* '1' (* [^'\n']* '\n' *)
428 { let info = tokinfo lexbuf in
429 TIfdefBool (true, no_ifdef_mark(), info)
430
431 }
432
433 (* DO NOT cherry pick to lexer_cplusplus !!! often used for the extern "C" { *)
434 | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n' '\r']* ('\n' | "\r\n")
435 { let info = tokinfo lexbuf in
436 TIfdefMisc (false, no_ifdef_mark(), info)
437 }
438
439 (* DO NOT cherry pick to lexer_cplusplus !!! *)
440 | "#" [' ' '\t']* "ifdef" [' ' '\t']* "__cplusplus" [^'\n']* '\n'
441 { let info = tokinfo lexbuf in
442 TIfdefMisc (false, no_ifdef_mark(), info)
443 }
444
445 (* in glibc *)
446 | "#" spopt ("ifdef"|"if") sp "__STDC__"
447 { let info = tokinfo lexbuf in
448 TIfdefVersion (true, no_ifdef_mark(),
449 info +> tok_add_s (cpp_eat_until_nl lexbuf))
450 }
451
452
453 (* linuxext: different possible variations (we do not manage all of them):
454
455 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
456 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,2)
457 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
458 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,3,0)
459 #if LINUX_VERSION_CODE < 0x020600
460 #if LINUX_VERSION_CODE >= 0x2051c
461 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
462 #if !(LINUX_VERSION_CODE > KERNEL_VERSION(2,5,73))
463 #if STREAMER_IOCTL && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
464 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
465 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && \
466 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
467 #if LINUX_VERSION_CODE > LinuxVersionCode(2,3,12)
468 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,93)
469 #ifndef LINUX_VERSION_CODE
470 #if LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,2,0) || \
471 (LINUX_VERSION_CODE > ASC_LINUX_VERSION(2,3,0) && \
472 LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,4,0))
473 #if (KERNEL_VERSION(2,4,0) > LINUX_VERSION_CODE)
474 #if LINUX_VERSION_CODE >= ASC_LINUX_VERSION(1,3,0)
475 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
476
477 *)
478
479 (* linuxext: must be before the generic rules for if and ifdef *)
480 | "#" spopt "if" sp "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
481 { let info = tokinfo lexbuf in
482 TIfdefVersion (true, no_ifdef_mark(),
483 info +> tok_add_s (cpp_eat_until_nl lexbuf))
484 }
485 (* linuxext: *)
486 | "#" spopt "if" sp "!" "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
487 | "#" spopt "if" sp ['(']? "LINUX_VERSION_CODE" sp ("<=" | "<") sp
488
489 { let info = tokinfo lexbuf in
490 TIfdefVersion (false, no_ifdef_mark(),
491 info +> tok_add_s (cpp_eat_until_nl lexbuf))
492 }
493
494
495
496
497 (* can have some ifdef 0 hence the letter|digit even at beginning of word *)
498 | "#" [' ''\t']* "ifdef" [' ''\t']+
499 (((letter|digit) ((letter|digit)*)) as x) [' ''\t']*
500 { if List.mem x !Flag_parsing_c.undefined
501 then TIfdefBool (false, no_ifdef_mark(), tokinfo lexbuf)
502 else if List.mem x !Flag_parsing_c.defined
503 then TIfdefBool (true, no_ifdef_mark(), tokinfo lexbuf)
504 else TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
505 | "#" [' ''\t']* "ifndef" [' ''\t']+
506 (((letter|digit) ((letter|digit)*)) as x) [' ''\t']*
507 { if List.mem x !Flag_parsing_c.defined
508 then TIfdefBool (false, no_ifdef_mark(), tokinfo lexbuf)
509 else if List.mem x !Flag_parsing_c.undefined
510 then TIfdefBool (true, no_ifdef_mark(), tokinfo lexbuf)
511 else TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
512 | "#" [' ''\t']* "if" [' ' '\t']+
513 { let info = tokinfo lexbuf in
514 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
515 }
516 | "#" [' ' '\t']* "if" '('
517 { let info = tokinfo lexbuf in
518 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
519 }
520
521 | "#" [' ' '\t']* "elif" [' ' '\t']+
522 { let info = tokinfo lexbuf in
523 TIfdefelif (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
524 }
525
526
527 | "#" [' ''\t']* "endif" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
528 { TEndif (no_ifdef_mark(), tokinfo lexbuf) }
529 (* bugfix: can have #endif LINUX but at the same time if I eat everything
530 * until next line, I may miss some TComment which for some tools
531 * are important such as aComment
532 *)
533 | "#" [' ' '\t']* "endif" (*[^'\n']* '\n'*) {
534 TEndif (no_ifdef_mark(), tokinfo lexbuf)
535 }
536 (* can be at eof *)
537 (*| "#" [' ' '\t']* "endif" { TEndif (tokinfo lexbuf) }*)
538
539 | "#" [' ' '\t']* "else" ([' ' '\t' '\n'] | "\r\n")
540 { TIfdefelse (no_ifdef_mark(), tokinfo lexbuf) }
541
542
543
544
545 (* ---------------------- *)
546 (* #define body *)
547 (* ---------------------- *)
548
549 (* only in cpp directives normally *)
550 | "\\" ('\n' | "\r\n") { TCppEscapedNewline (tokinfo lexbuf) }
551
552 (* We must generate separate tokens for #, ## and extend the grammar.
553 * Note there can be "elaborated" idents in many different places, in
554 * expression but also in declaration, in function name. So having 3 tokens
555 * for an ident does not work well with how we add info in
556 * ast_c. Was easier to generate just one token, just one info,
557 * even if have later to reanalyse those tokens and unsplit. But then,
558 * handling C++ lead to having not just a string for ident but something
559 * more complex. Also when we want to parse elaborated function headers
560 * (e.g. void METH(foo)(int x)), we need anyway to go from a string
561 * to something more. So having also for C something more than just
562 * string for ident is natural.
563 *
564 * todo: our heuristics in parsing_hacks rely on TIdent. So maybe
565 * an easier solution would be to augment the TIdent type such as
566 * TIdent of string * info * cpp_ident_additionnal_info
567 *
568 * old:
569 * | id ([' ''\t']* "##" [' ''\t']* id)+
570 * { let info = tokinfo lexbuf in
571 * TIdent (tok lexbuf, info)
572 * }
573 * | "##" spopt id
574 * { let info = tokinfo lexbuf in
575 * TIdent (tok lexbuf, info)
576 * }
577 *
578 *)
579 (* cppext: string concatenation of idents, also ##args for variadic macro. *)
580 | "##" { TCppConcatOp (tokinfo lexbuf) }
581
582 (* cppext: stringification.
583 * bugfix: this case must be after the other cases such as #endif
584 * otherwise take precedent.
585 *)
586 | "#" spopt id
587 { let info = tokinfo lexbuf in
588 TIdent (tok lexbuf, info)
589 }
590 (* the ... next to id, e.g. arg..., works with ##, e.g. ##arg *)
591 | ((id as s) "...")
592 { TDefParamVariadic (s, tokinfo lexbuf) }
593
594
595
596
597
598 (* ----------------------------------------------------------------------- *)
599 (* C symbols *)
600 (* ----------------------------------------------------------------------- *)
601 (* stdC:
602 ... && -= >= ~ + ; ]
603 <<= &= -> >> % , < ^
604 >>= *= /= ^= & - = {
605 != ++ << |= ( . > |
606 %= += <= || ) / ? }
607 -- == ! * : [
608 recent addition: <: :> <% %>
609 only at processing: %: %:%: # ##
610 *)
611
612
613 | '[' { TOCro(tokinfo lexbuf) } | ']' { TCCro(tokinfo lexbuf) }
614 | '(' { TOPar(tokinfo lexbuf) } | ')' { TCPar(tokinfo lexbuf) }
615 | '{' { TOBrace(tokinfo lexbuf) } | '}' { TCBrace(tokinfo lexbuf) }
616
(* ">?" / "<?" and their assignment forms below are the gcc max/min
 * operator extension. *)
617 | '+' { TPlus(tokinfo lexbuf) } | '*' { TMul(tokinfo lexbuf) }
618 | '-' { TMinus(tokinfo lexbuf) } | '/' { TDiv(tokinfo lexbuf) }
619 | '%' { TMod(tokinfo lexbuf) } | ">?" { TMax(tokinfo lexbuf) }
620 | "<?" { TMin(tokinfo lexbuf) }
621
622 | "++"{ TInc(tokinfo lexbuf) } | "--"{ TDec(tokinfo lexbuf) }
623
624 | "=" { TEq(tokinfo lexbuf) }
625
626 | "-=" { TAssign (OpAssign Minus, (tokinfo lexbuf))}
627 | "+=" { TAssign (OpAssign Plus, (tokinfo lexbuf))}
628 | "*=" { TAssign (OpAssign Mul, (tokinfo lexbuf))}
629 | "/=" { TAssign (OpAssign Div, (tokinfo lexbuf))}
630 | "%=" { TAssign (OpAssign Mod, (tokinfo lexbuf))}
631 | "&=" { TAssign (OpAssign And, (tokinfo lexbuf))}
632 | "|=" { TAssign (OpAssign Or, (tokinfo lexbuf)) }
633 | "^=" { TAssign (OpAssign Xor, (tokinfo lexbuf))}
634 | "<<=" {TAssign (OpAssign DecLeft, (tokinfo lexbuf)) }
635 | ">>=" {TAssign (OpAssign DecRight, (tokinfo lexbuf))}
636 | ">?=" {TAssign (OpAssign Max, (tokinfo lexbuf))}
637 | "<?=" {TAssign (OpAssign Min, (tokinfo lexbuf))}
638
639 | "==" { TEqEq(tokinfo lexbuf) } | "!=" { TNotEq(tokinfo lexbuf) }
640 | ">=" { TSupEq(tokinfo lexbuf) } | "<=" { TInfEq(tokinfo lexbuf) }
641 | "<" { TInf(tokinfo lexbuf) } | ">" { TSup(tokinfo lexbuf) }
642
643 | "&&" { TAndLog(tokinfo lexbuf) } | "||" { TOrLog(tokinfo lexbuf) }
644 | ">>" { TShr(tokinfo lexbuf) } | "<<" { TShl(tokinfo lexbuf) }
645 | "&" { TAnd(tokinfo lexbuf) } | "|" { TOr(tokinfo lexbuf) }
646 | "^" { TXor(tokinfo lexbuf) }
647 | "..." { TEllipsis(tokinfo lexbuf) }
648 | "->" { TPtrOp(tokinfo lexbuf) } | '.' { TDot(tokinfo lexbuf) }
649 | ',' { TComma(tokinfo lexbuf) }
650 | ";" { TPtVirg(tokinfo lexbuf) }
651 | "?" { TWhy(tokinfo lexbuf) } | ":" { TDotDot(tokinfo lexbuf) }
652 | "!" { TBang(tokinfo lexbuf) } | "~" { TTilde(tokinfo lexbuf) }
653
(* C95 digraphs, mapped to the same tokens as the chars they stand for. *)
654 | "<:" { TOCro(tokinfo lexbuf) } | ":>" { TCCro(tokinfo lexbuf) }
655 | "<%" { TOBrace(tokinfo lexbuf) } | "%>" { TCBrace(tokinfo lexbuf) }
656
657
658
659 (* ----------------------------------------------------------------------- *)
660 (* C keywords and ident *)
661 (* ----------------------------------------------------------------------- *)
662
663 (* StdC: must handle at least name of length > 509, but can
664 * truncate to 31 when compare and truncate to 6 and even lowerise
665 * in the external linkage phase
666 *)
667 | letter (letter | digit) *
668 { let info = tokinfo lexbuf in
669 let s = tok lexbuf in
670 Common.profile_code "C parsing.lex_ident" (fun () ->
671 let tok =
(* C++ keywords are consulted first, and only when -c++ is set. *)
672 if !Flag.c_plus_plus
673 then Common.optionise (fun () -> Hashtbl.find cpp_keyword_table s)
674 else None in
675 match tok with
676 Some f -> f info
677 | None ->
678 match Common.optionise (fun () -> Hashtbl.find keyword_table s)
679 with
680 | Some f -> f info
681
682 (* parse_typedef_fix.
683 * if Lexer_parser.is_typedef s
684 * then TypedefIdent (s, info)
685 * else TIdent (s, info)
686 *
687 * update: now this is no more useful, cos
688 * as we use tokens_all, it first parse all as an ident and
689 * later transform an indent in a typedef. so the typedef job is
690 * now done in parse_c.ml.
691 *)
692
693 | None -> TIdent (s, info)
694 )
695 }
696 (* gccext: apparently gcc allows dollar in variable names. found such
697 * thing a few time in linux and in glibc. No need look in keyword_table
698 * here.
699 *)
700 | (cplusplus_ident "::")+ "operator new"
701 {
702 let info = tokinfo lexbuf in
703 let s = tok lexbuf in
704 TIdent (s, info)
705 }
706 | cplusplus_ident
707 {
708 let info = tokinfo lexbuf in
709 let s = tok lexbuf in
710 pr2 ("LEXER: identifier with dollar: " ^ s);
711 TIdent (s, info)
712 }
713
(* C++ destructor-qualified names (possibly with template arguments):
 * e.g. Foo<T>::~Foo — a constructor name when -c++ is set. *)
714 | cplusplus_ident
715 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
716 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?
717 ("::~" cplusplus_ident
718 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
719 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?) +
720
721 {
722 let info = tokinfo lexbuf in
723 let s = tok lexbuf in
724 if !Flag.c_plus_plus
725 then Tconstructorname (s, info)
726 else
727 begin
728 pr2_once "~ and :: not allowed in C identifiers, try -c++ option";
729 TIdent (s, info)
730 end
731 }
(* A template instantiation alone, e.g. vector<int>: treated as a
 * typedef name in C++ mode. *)
732 | cplusplus_ident
733 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
734 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>')
735
736 {
737 let info = tokinfo lexbuf in
738 let s = tok lexbuf in
739 if !Flag.c_plus_plus
740 then TypedefIdent (s, info)
741 else
742 begin
743 pr2_once "<> detected, try -c++ option";
744 TIdent (s, info)
745 end
746 }
747
748
(* Fully qualified names A::B::...; in C++ mode, A::A is recognised as a
 * constructor name. *)
749 | (cplusplus_ident as first)
750 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
751 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?
752 "::" (cplusplus_ident as second)
753 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
754 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?
755 ("::" cplusplus_ident
756 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
757 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?) *
758
759 {
760 let info = tokinfo lexbuf in
761 let s = tok lexbuf in
762 if !Flag.c_plus_plus
763 then
764 begin
765 if first = second
766 then Tconstructorname (s, info)
767 else TIdent (s, info)
768 end
769 else
770 begin
771 pr2_once "~ and :: not allowed in C identifiers, try -c++ option";
772 TIdent (s, info)
773 end
774 }
775
(* Names qualified from the global namespace: ::A::B... *)
776 | "::" cplusplus_ident
777 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
778 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?
779 ("::" cplusplus_ident
780 ('<' "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'*
781 (", " "const "? cplusplus_ident_ext ("::" cplusplus_ident_ext) * '*'* ) * '>') ?) *
782
783 {
784 let info = tokinfo lexbuf in
785 let s = tok lexbuf in
786 (if not !Flag.c_plus_plus
787 then
788 pr2_once "~ and :: not allowed in C identifiers, try -c++ option");
789 TIdent (s, info)
790 }
791
792 (* ----------------------------------------------------------------------- *)
793 (* C constant *)
794 (* ----------------------------------------------------------------------- *)
795
(* Character and string literals: the opening quote is matched here, the
 * body by the [char]/[string] sub-lexers, and the full text is glued
 * back onto the token info with tok_add_s. *)
796 | "'"
797 { let info = tokinfo lexbuf in
798 let s = char lexbuf in
799 TChar ((s, IsChar), (info +> tok_add_s (s ^ "'")))
800 }
801 | '\"'
802 { let info = tokinfo lexbuf in
803 let s = string lexbuf in
804 TString ((s, IsChar), (info +> tok_add_s (s ^ "\"")))
805 }
806 (* wide character encoding, TODO L'toto' valid ? what is allowed ? *)
807 | 'L' "'"
808 { let info = tokinfo lexbuf in
809 let s = char lexbuf in
810 TChar ((s, IsWchar), (info +> tok_add_s (s ^ "'")))
811 }
812 | 'L' '\"'
813 { let info = tokinfo lexbuf in
814 let s = string lexbuf in
815 TString ((s, IsWchar), (info +> tok_add_s (s ^ "\"")))
816 }
817
818
819 (* Take care of the order ? No because lex tries the longest match. The
820 * strange diff between decimal and octal constant semantic is not
821 * understood too by refman :) refman:11.1.4, and ritchie.
822 *)
823
(* Integer literals; the is_long_* helpers pick the (signedness, size)
 * according to the suffix and the configured thresholds. *)
824 | decimal as x
825 { TInt ((x, is_long_dec x sint slong slong ulong), tokinfo lexbuf) }
826 | hexa as x
827 { TInt ((x, is_long_hex x sint uint slong ulong), tokinfo lexbuf) }
828 | octal as x
829 { TInt ((x, is_long_oct x sint uint slong ulong), tokinfo lexbuf) }
830 | ((decimal as s) ['u' 'U']) as x
831 { TInt ((x, is_long_dec s uint uint ulong ulong), tokinfo lexbuf) }
832 | ((hexa as s) ['u' 'U']) as x
833 { TInt ((x, is_long_hex s uint uint ulong ulong), tokinfo lexbuf) }
834 | ((octal as s) ['u' 'U']) as x
835 { TInt ((x, is_long_oct s uint uint ulong ulong), tokinfo lexbuf) }
836 | (( decimal as s) ['l' 'L']) as x
837 { TInt ((x, is_long_dec s slong slong slong ulong), tokinfo lexbuf) }
838 | ((hexa as s) ['l' 'L']) as x
839 { TInt ((x, is_long_hex s slong slong slong ulong), tokinfo lexbuf) }
840 | ((octal as s) ['l' 'L']) as x
841 { TInt ((x, is_long_oct s slong slong slong ulong), tokinfo lexbuf) }
842 | ((( decimal | hexa | octal) ['l' 'L'] ['u' 'U'])
843 | (( decimal | hexa | octal) ['u' 'U'] ['l' 'L'])) as x
844 { TInt ((x, (UnSigned,CLong)), tokinfo lexbuf) }
845 | (( decimal | hexa | octal) ['l' 'L'] ['l' 'L']) as x
846 { TInt ((x, (Signed,CLongLong)), tokinfo lexbuf) }
847 | (( decimal | hexa | octal) ['u' 'U'] ['l' 'L'] ['l' 'L']) as x
848 { TInt ((x, (UnSigned,CLongLong)), tokinfo lexbuf) }
849
850 | (real ['f' 'F']) as x { TFloat ((x, CFloat), tokinfo lexbuf) }
851 | (real ['l' 'L']) as x { TFloat ((x, CLongDouble), tokinfo lexbuf) }
852 | (real as x) { TFloat ((x, CDouble), tokinfo lexbuf) }
853
(* Malformed literals: digits beyond the radix. *)
854 | ['0'] ['0'-'9']+
855 { pr2 ("LEXER: " ^ error_radix "octal" ^ tok lexbuf);
856 TUnknown (tokinfo lexbuf)
857 }
858 | ("0x" |"0X") ['0'-'9' 'a'-'z' 'A'-'Z']+
859 { pr2 ("LEXER: " ^ error_radix "hexa" ^ tok lexbuf);
860 TUnknown (tokinfo lexbuf)
861 }
862
863
864 (* !!! to put after other rules !!! otherwise 0xff
865 * will be parsed as an ident.
866 *)
867 | ['0'-'9']+ letter (letter | digit) *
868 { pr2 ("LEXER: ZARB integer_string, certainly a macro:" ^ tok lexbuf);
869 TIdent (tok lexbuf, tokinfo lexbuf)
870 }
871
872 (* gccext: http://gcc.gnu.org/onlinedocs/gcc/Binary-constants.html *)
873 (*
874 | "0b" ['0'-'1'] { TInt (((tok lexbuf)<!!>(??,??)) +> int_of_stringbits) }
875 | ['0'-'1']+'b' { TInt (((tok lexbuf)<!!>(0,-2)) +> int_of_stringbits) }
876 *)
877
878
879 (*------------------------------------------------------------------------ *)
880 | eof { EOF (tokinfo lexbuf +> Ast_c.rewrap_str "") }
881
(* Catch-all: any other character becomes TUnknown. *)
882 | _
883 {
884 if !Flag_parsing_c.verbose_lexing
885 then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
886 TUnknown (tokinfo lexbuf)
887 }
888
889
890
891 (*****************************************************************************)
(* Lexes the body of a character literal after the opening quote,
 * returning its text (without the quotes); delegates to [restchars]
 * until the closing quote.  Escape handling: octal and hex escapes are
 * kept verbatim; other backslash escapes are checked against the known
 * set and merely warned about when unknown. *)
892 and char = parse
893 | (_ as x) { String.make 1 x ^ restchars lexbuf }
894 (* todo?: as for octal, do exception beyond radix exception ? *)
895 | (("\\" (oct | oct oct | oct oct oct)) as x ) { x ^ restchars lexbuf }
896 (* this rule must be after the one with octal, lex try first longest
897 * and when \7 we want an octal, not an exn.
898 *)
899 | (("\\x" ((hex | hex hex))) as x ) { x ^ restchars lexbuf }
900 | (("\\" (_ as v)) as x )
901 {
902 (match v with (* Machine specific ? *)
903 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
904 | 'f' -> () | 'a' -> ()
905 | '\\' -> () | '?' -> () | '\'' -> () | '\"' -> ()
906 | 'e' -> () (* linuxext: ? *)
907 | _ ->
908 pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
909 );
910 x ^ restchars lexbuf
911 }
912 | _
913 { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
914 tok lexbuf ^ restchars lexbuf
915 }
916
(* Continues lexing a character literal after [char] has read its first
 * element; stops at (and consumes) the closing single quote, returning
 * the accumulated text of the literal's body. *)
and restchars = parse
  (* closing quote: the literal is finished *)
  | "'"                                         { "" }
  | (_ as x)                                    { String.make 1 x ^ restchars lexbuf }
  (* todo?: as for octal, do exception beyond radix exception ? *)
  | (("\\" (oct | oct oct | oct oct oct)) as x ) { x ^ restchars lexbuf }
  (* this rule must be after the one with octal, lex try first longest
   * and when \7 we want an octal, not an exn.
   *)
  | (("\\x" ((hex | hex hex))) as x ) { x ^ restchars lexbuf }
  | (("\\" (_ as v)) as x )
      {
        (* the standard C escapes are accepted silently; any other escaped
         * character is kept verbatim but reported (Machine specific ?) *)
        (match v with
        | 'n' | 't' | 'v' | 'b' | 'r' | 'f' | 'a'
        | '\\' | '?' | '\'' | '\"' -> ()
        | 'e' -> () (* linuxext: ? *)
        | _ ->
            pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
        );
        x ^ restchars lexbuf
      }
  | _
      { (* bugfix: bind the lexeme BEFORE the recursive call, otherwise
         * [tok lexbuf ^ restchars lexbuf] may evaluate the recursive call
         * first (operand order of (^) is unspecified) and [tok] would then
         * return a stale lexeme -- see the warning in the file header. *)
        let s = tok lexbuf in
        pr2 ("LEXER: unrecognised symbol in char:"^s);
        s ^ restchars lexbuf
      }
942
943
944 (*****************************************************************************)
945
(* todo? factorise code with char ? but not same ending token so hard. *)
(* Lexes the body of a string literal, returning its text; the opening
 * double quote has already been consumed by the caller, and the closing
 * one terminates the rule. *)
and string = parse
  | '\"' { "" }
  | (_ as x) { string_of_char x^string lexbuf}
  | ("\\" (oct | oct oct | oct oct oct)) as x { x ^ string lexbuf }
  | ("\\x" (hex | hex hex)) as x { x ^ string lexbuf }
  | ("\\" (_ as c)) as x
      {
        (* the standard C escapes are accepted silently; anything else is
         * reported but the escaped text is kept verbatim (Machine
         * specific ?) *)
        (match c with
        | 'n' | 't' | 'v' | 'b' | 'r' | 'f' | 'a'
        | '\\' | '?' | '\'' | '\"' -> ()
        | 'e' -> () (* linuxext: ? *)
        (* old: "x" -> 10 gccext ? todo ugly, I put a fake value *)
        (* cppext: can have \ for multiline in string too *)
        | '\n' -> ()
        | _ -> pr2 ("LEXER: unrecognised symbol in string:"^tok lexbuf);
        );
        x ^ string lexbuf
      }

  | eof { pr2 "LEXER: WIERD end of file in string"; ""}

 (* Bug if add following code, cos match also the '"' that is needed
  * to finish the string, and so go until end of file.
  *)
 (*
  | [^ '\\']+
    { let cs = lexbuf +> tok +> list_of_string +> List.map Char.code in
      cs ++ string lexbuf
    }
  *)
980
981
982
983 (*****************************************************************************)
984
985 (* less: allow only char-'*' ? *)
(* Consumes the rest of a C comment (the opening "/*" was already eaten by
 * the caller) and returns the consumed text including the closing "* /". *)
(* less: allow only char-'*' ? *)
and comment = parse
  | "*/" { tok lexbuf }
  (* noteopti: gobble either a run of non-star characters, or a lone star
   * that does not begin the closing "* /", in a single step.  Longest
   * match guarantees "* /" above still wins at the comment's end. *)
  | ([^ '*']+ | '*')
      { let consumed = tok lexbuf in consumed ^ comment lexbuf }
  | eof { pr2 "LEXER: end of file in comment"; "*/"}
  | _
      { let consumed = tok lexbuf in
        pr2 ("LEXER: unrecognised symbol in comment:"^consumed);
        consumed ^ comment lexbuf
      }
997
998
999
1000 (*****************************************************************************)
1001
1002 (* cpp recognize C comments, so when #define xx (yy) /* comment \n ... */
1003 * then he has already erased the /* comment. So:
1004 * - dont eat the start of the comment otherwise afterwards we are in the middle
1005 * of a comment and so will problably get a parse error somewhere.
1006 * - have to recognize comments in cpp_eat_until_nl.
1007 *)
1008
(* Consumes the remainder of a cpp directive line and returns the consumed
 * text.  Handles the two ways a directive can extend beyond plain text:
 * embedded C comments and backslash line-continuations. *)
and cpp_eat_until_nl = parse
  (* bugfix: a comment inside a directive must be skipped as a unit (via
   * the [comment] rule); otherwise a newline inside the comment would
   * wrongly be taken as the end of the directive *)
  | "/*"
      { let s = tok lexbuf in
        let s2 = comment lexbuf in
        let s3 = cpp_eat_until_nl lexbuf in
        s ^ s2 ^ s3
      }
  (* backslash-newline: the directive continues on the next line *)
  | '\\' "\n" { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }

  (* an unescaped newline ends the directive (and is included in the
   * returned text) *)
  | "\n" { tok lexbuf }
  (* noteopti:
   * update: need also deal with comments chars now
   *)
  (* a run of ordinary characters; '\\', '/', '*', '\r' are excluded so the
   * continuation and comment rules above get a chance to fire *)
  | [^ '\n' '\r' '\\' '/' '*' ]+
     { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
  | eof { pr2 "LEXER: end of file in cpp_eat_until_nl"; ""}
  (* any leftover single character ('/', '*', '\r', or a '\\' not followed
   * by a newline) is consumed as-is *)
  | _ { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }