Release coccinelle-0.1.8
[bpt/coccinelle.git] / parsing_c / lexer_c.mll
1 {
(* ocamllex prologue: plain OCaml copied verbatim into the generated
 * lexer module, before the rule definitions below. *)
2 (* Yoann Padioleau
3 *
4 * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License (GPL)
8 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * file license.txt for more details.
14 *)
15 open Common
16
17 open Parser_c
18
19 open Ast_c (* to factorise tokens, OpAssign, ... *)
20
21 (*****************************************************************************)
22 (*
23 * subtle: ocamllex uses side effects on lexbuf, so one must take care.
24 * For instance must do
25 *
26 * let info = tokinfo lexbuf in
27 * TComment (info +> tok_add_s (comment lexbuf))
28 *
29 * and not
30 *
31 * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
32 *
33 * because of the "weird" (unspecified) argument evaluation order of OCaml.
34 *
35 *
36 *
37 * note: can't use Lexer_parser._lexer_hint here to do different
38 * things, because now we call the lexer to get all the tokens
39 * (tokens_all), and then we parse. So we can't have the _lexer_hint
40 * info here. We can have it only in parse_c. For the same reason, the
41 * typedef handling here is now useless.
42 *)
43 (*****************************************************************************)
44
45 (*****************************************************************************)
46 (* Wrappers *)
47 (*****************************************************************************)
(* Diagnostic printers gated by the verbose_lexing flag. *)
48 let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_lexing
49
50 (*****************************************************************************)
51
52
53 exception Lexical of string
54
(* [tok lexbuf] returns the text matched by the current lexer rule. *)
55 let tok lexbuf = Lexing.lexeme lexbuf
56
(* [tokinfo lexbuf] builds the parse-info record attached to every token:
 * matched string and byte offset from the lexbuf; line/column/file are
 * placeholders (-1 / "") filled in by a later post-lexing phase. *)
57 let tokinfo lexbuf =
58 {
59 pinfo = Ast_c.OriginTok {
60 Common.charpos = Lexing.lexeme_start lexbuf;
61 Common.str = Lexing.lexeme lexbuf;
62 (* info filled in a post-lexing phase *)
63 Common.line = -1;
64 Common.column = -1;
65 Common.file = "";
66 };
67 (* must generate a new ref each time, otherwise share *)
68 cocci_tag = ref Ast_c.emptyAnnot;
69 comments_tag = ref Ast_c.emptyComments;
70 }
71
72 (* cppext: must generate a new ref each time, otherwise share *)
73 let no_ifdef_mark () = ref (None: (int * int) option)
74
(* [tok_add_s s ii] appends [s] to the string carried by token info [ii]. *)
75 let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
76
77
78 (* opti: less convenient, but using a hash is faster than using a match *)
(* Maps each C keyword (plus gccext/c99 synonyms such as __asm__ and
 * __const) to its token constructor. Identifiers not present here lex
 * as plain TIdent in the ident rule below. *)
79 let keyword_table = Common.hash_of_list [
80
81 (* c: *)
82 "void", (fun ii -> Tvoid ii);
83 "char", (fun ii -> Tchar ii);
84 "short", (fun ii -> Tshort ii);
85 "int", (fun ii -> Tint ii);
86 "long", (fun ii -> Tlong ii);
87 "float", (fun ii -> Tfloat ii);
88 "double", (fun ii -> Tdouble ii);
89
90 "unsigned", (fun ii -> Tunsigned ii);
91 "signed", (fun ii -> Tsigned ii);
92
93 "auto", (fun ii -> Tauto ii);
94 "register", (fun ii -> Tregister ii);
95 "extern", (fun ii -> Textern ii);
96 "static", (fun ii -> Tstatic ii);
97
98 "const", (fun ii -> Tconst ii);
99 "volatile", (fun ii -> Tvolatile ii);
100
101 "struct", (fun ii -> Tstruct ii);
102 "union", (fun ii -> Tunion ii);
103 "enum", (fun ii -> Tenum ii);
104 "typedef", (fun ii -> Ttypedef ii);
105
106 "if", (fun ii -> Tif ii);
107 "else", (fun ii -> Telse ii);
108 "break", (fun ii -> Tbreak ii);
109 "continue", (fun ii -> Tcontinue ii);
110 "switch", (fun ii -> Tswitch ii);
111 "case", (fun ii -> Tcase ii);
112 "default", (fun ii -> Tdefault ii);
113 "for", (fun ii -> Tfor ii);
114 "do", (fun ii -> Tdo ii);
115 "while", (fun ii -> Twhile ii);
116 "return", (fun ii -> Treturn ii);
117 "goto", (fun ii -> Tgoto ii);
118
119 "sizeof", (fun ii -> Tsizeof ii);
120
121
122 (* gccext: cppext: linuxext: synonyms *)
123 "asm", (fun ii -> Tasm ii);
124 "__asm__", (fun ii -> Tasm ii);
125 "__asm", (fun ii -> Tasm ii);
126
127 "inline", (fun ii -> Tinline ii);
128 "__inline__", (fun ii -> Tinline ii);
129 "__inline", (fun ii -> Tinline ii);
130
131 "__attribute__", (fun ii -> Tattribute ii);
132 "__attribute", (fun ii -> Tattribute ii);
133
134 "typeof", (fun ii -> Ttypeof ii);
135 "__typeof__", (fun ii -> Ttypeof ii);
136 "__typeof", (fun ii -> Ttypeof ii);
137
138
139 (* gccext: alias *)
140 "__signed__", (fun ii -> Tsigned ii);
141
142 "__const__", (fun ii -> Tconst ii);
143 "__const", (fun ii -> Tconst ii);
144
145 "__volatile__", (fun ii -> Tvolatile ii);
146 "__volatile", (fun ii -> Tvolatile ii);
147
148
149 (* c99: *)
150 (* no just "restrict" ? maybe for backward compatibility they avoided
151 * to use restrict which people may have used in their program already
152 *)
153 "__restrict", (fun ii -> Trestrict ii);
154 "__restrict__", (fun ii -> Trestrict ii);
155
156 ]
157
(* [error_radix s] builds the diagnostic prefix used when a numeric
 * literal contains digits invalid for its radix (e.g. 09 for octal). *)
158 let error_radix s =
159 ("numeric " ^ s ^ " constant contains digits beyond the radix:")
160
161 (* functions for figuring out the type of integers *)
162
(* [is_long_dec s int uint long ulong] picks the C type of the unsuffixed
 * decimal constant [s] by comparing its value against the configured
 * thresholds; returns [int] when no thresholds are configured.
 * NOTE(review): the [uint] parameter is never used and the branch at
 * [uint_threshold] returns [long] — presumably deliberate, since C90
 * promotes unsuffixed decimal constants int -> long -> unsigned long,
 * never to unsigned int. TODO confirm against the C90/C99 rules. *)
163 let is_long_dec s int uint long ulong =
164 match !Flag_parsing_c.int_thresholds with
165 None -> int
166 | Some (_,_,uint_threshold,long_threshold,ulong_threshold) ->
167 let bn = Big_int.big_int_of_string s in
168 if Big_int.ge_big_int bn ulong_threshold
169 then ulong
170 else
171 if Big_int.ge_big_int bn long_threshold
172 then long
173 else
174 if Big_int.ge_big_int bn uint_threshold
175 then long
176 else int
177
(* [is_long_ho s int uint long ulong drop bpd count] shared classifier for
 * hex/octal constants: computes the bit width of [s] from its digit count
 * ([bpd] bits per digit, minus [count] leading-zero bits of the most
 * significant digit) and compares it with the configured uint/ulong sizes.
 * [drop] is the prefix length to skip: 2 for hex ("0x"), 1 for octal ("0"). *)
178 let is_long_ho s int uint long ulong drop bpd count =
179 match !Flag_parsing_c.int_thresholds with
180 None -> int
181 | Some (uint_sz,ulong_sz,_,_,_) ->
182 let len = String.length s in
183 (* this assumes that all of the hex/oct digits are significant *)
184 (* drop is 2 for hex (0x) and 1 for oct (0) *)
185 let s = String.sub s drop (len - drop) in
186 let len =
187 ((len-drop) * bpd) -
188 (count (int_of_string("0x"^(String.sub s 0 1)))) in
189 if len < uint_sz
190 then int
191 else
192 if len = uint_sz
193 then uint
194 else
195 if len < ulong_sz
196 then long
197 else ulong
198
(* Octal: 3 bits per digit; the function argument counts how many of the
 * top digit's bits are leading zeros. *)
199 let is_long_oct s int uint long ulong =
200 is_long_ho s int uint long ulong 1 3
201 (* stupid, but probably more efficient than taking logs *)
202 (function 0 -> 3 | 1 -> 2 | n when n < 4 -> 1 | _ -> 0)
(* Hex: 4 bits per digit, same leading-zero-bit counting scheme. *)
203 let is_long_hex s int uint long ulong =
204 is_long_ho s int uint long ulong 2 4
205 (* stupid, but probably more efficient than taking logs *)
206 (function 0 -> 4 | 1 -> 3 | n when n < 4 -> 2 | n when n < 8 -> 1
207 | _ -> 0)
208
(* Shorthand (sign, base-type) pairs passed to the classifiers above. *)
209 let sint = (Signed,CInt)
210 let uint = (UnSigned,CInt)
211 let slong = (Signed,CLong)
212 let ulong = (UnSigned,CLong)
213
214 }
215
216 (*****************************************************************************)
(* Named character classes and regexps shared by the rules below. *)
217 let letter = ['A'-'Z' 'a'-'z' '_']
218 let digit = ['0'-'9']
219
220 (* not used for the moment *)
221 let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
222 ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
223 let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
224 let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
225 (* 7 = \a = bell in C. this is not the only char allowed !!
226 * ex @ and $ ` are valid too
227 *)
228
229 let cchar = (letter | digit | punctuation | additionnal)
230
(* sp: mandatory blank run; spopt: optional blank run. *)
231 let sp = [' ' '\t']+
232 let spopt = [' ' '\t']*
233
234 let dec = ['0'-'9']
235 let oct = ['0'-'7']
236 let hex = ['0'-'9' 'a'-'f' 'A'-'F']
237
238 let decimal = ('0' | (['1'-'9'] dec*))
239 let octal = ['0'] oct+
240 let hexa = ("0x" |"0X") hex+
241
242
(* Floating constants: integer part, fraction, optional signed exponent. *)
243 let pent = dec+
244 let pfract = dec+
245 let sign = ['-' '+']
246 let exp = ['e''E'] sign? dec+
247 let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
248
249 let id = letter (letter | digit) *
250
251 (*****************************************************************************)
(* Main entry point: produces one Parser_c token per call, including
 * tokens for comments and cpp directives (filtered later in parse_c). *)
252 rule token = parse

253
254 (* ----------------------------------------------------------------------- *)
255 (* spacing/comments *)
256 (* ----------------------------------------------------------------------- *)
257
258 (* note: this lexer generates tokens for comments!! so one can not give
259 * this lexer as-is to the parsing function. Must preprocess it, hence
260 * use techniques like cur_tok ref in parse_c.ml
261 *)
262
263 | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
264 (* starting a new line; the newline character followed by whitespace *)
265 { TCommentNewline (tokinfo lexbuf) }
266 | [' ' '\t' '\r' '\011' '\012' ]+
267 { TCommentSpace (tokinfo lexbuf) }
268 | "/*"
269 { let info = tokinfo lexbuf in
(* the helper rule [comment] consumes up to and including the closing */ *)
270 let com = comment lexbuf in
271
272 let info' = info +> tok_add_s com in
273 let s = Ast_c.str_of_info info' in
274 (* could be more flexible, use [\t ]* instead of hardcoded
275 * single space. *)
276 match s with
277 | "/* {{coccinelle:skip_start}} */" ->
278 TCommentSkipTagStart (info')
279 | "/* {{coccinelle:skip_end}} */" ->
280 TCommentSkipTagEnd (info')
281 | _ -> TComment(info')
282 }
283
284
285 (* C++ comments are allowed via gccext, but normally they are deleted by cpp.
286 * So need this here only when we don't call cpp before.
287 * note that we don't keep the trailing \n; it will be in another token.
288 *)
289 | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
290
291 (* ----------------------------------------------------------------------- *)
292 (* cpp *)
293 (* ----------------------------------------------------------------------- *)
294
295 (* old:
296 * | '#' { endline lexbuf} // should be line, and not endline
297 * and endline = parse | '\n' { token lexbuf}
298 * | _ { endline lexbuf}
299 *)
300
301 (* less?:
302 * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
303 * => just "#" -> token {lexbuf} (that is ignore)
304 * there is 1 #elif with nothing after it
305 * there is 1 #error with nothing after it
306 * there are 2 "mov dede, #xxx" which therefore raise an exn because
307 * they are surrounded by #if 0
308 * => make as for comment, call a comment_cpp that when #endif finish the
309 * comment and if other cpp stuff raise exn
310 * there are about 10 #if(xxx) where the ( is glued directly to "if"
311 * there are include"" and include<
312 * there is 1 ` (behind a #ifndef linux)
313 *)
314
315
316
317 (* ---------------------- *)
318 (* misc *)
319 (* ---------------------- *)
320
321 (* bugfix: I want now to keep comments for the cComment study
322 * so can't do: sp [^'\n']+ '\n'
323 * http://gcc.gnu.org/onlinedocs/gcc/Pragmas.html
324 *)
325
(* Directives whose body is irrelevant to parsing: swallowed whole. *)
326 | "#" spopt "pragma" sp [^'\n']* '\n'
327 | "#" spopt "ident" sp [^'\n']* '\n'
328 | "#" spopt "line" sp [^'\n']* '\n'
329 | "#" spopt "error" sp [^'\n']* '\n'
330 | "#" spopt "warning" sp [^'\n']* '\n'
331 | "#" spopt "abort" sp [^'\n']* '\n'
332 { TCppDirectiveOther (tokinfo lexbuf) }
333
334 | "#" [' ' '\t']* '\n'
335 { TCppDirectiveOther (tokinfo lexbuf) }
336
337 (* only after cpp, ex: # 1 "include/linux/module.h" 1 *)
338 | "#" sp pent sp '"' [^ '"']* '"' (spopt pent)* spopt '\n'
339 { TCppDirectiveOther (tokinfo lexbuf) }
340
341
342
343 (* ---------------------- *)
344 (* #define, #undef *)
345 (* ---------------------- *)
346
347 (* the rest of the lexing/parsing of define is done in fix_tokens_define
348 * where we parse until a TCppEscapedNewline and generate a TDefEol
349 *)
350 | "#" [' ' '\t']* "define" { TDefine (tokinfo lexbuf) }
351
352 (* note: in some cases can have stuff after the ident as in #undef XXX 50,
353 * but I currently don't handle it cos I think it's bad code.
354 *)
355 | (("#" [' ' '\t']* "undef" [' ' '\t']+) as _undef) (id as id)
356 { let info = tokinfo lexbuf in
357 TUndef (id, info)
358 (*+> tok_add_s (cpp_eat_until_nl lexbuf))*)
359 }
360
361
362 (* ---------------------- *)
363 (* #include *)
364 (* ---------------------- *)
365
366 (* The difference between a local "" and standard <> include is computed
367 * later in parser_c.mly. So redo a little bit of lexing there; ugly but
368 * simpler to generate a single token here. *)
369 | (("#" [' ''\t']* "include" [' ' '\t']*) as includes)
370 (('"' ([^ '"']+) '"' |
371 '<' [^ '>']+ '>' |
372 ['A'-'Z''_']+
373 ) as filename)
374 { let info = tokinfo lexbuf in
375 TInclude (includes, filename, Ast_c.noInIfdef(), info)
376 }
377 (* gccext: found in glibc *)
378 | (("#" [' ''\t']* "include_next" [' ' '\t']*) as includes)
379 (('"' ([^ '"']+) '"' |
380 '<' [^ '>']+ '>' |
381 ['A'-'Z''_']+
382 ) as filename)
383 { let info = tokinfo lexbuf in
384 TInclude (includes, filename, Ast_c.noInIfdef(), info)
385 }
386
387 (* ---------------------- *)
388 (* #ifdef *)
389 (* ---------------------- *)
390
391 (* The ifdef_mark will be set later in Parsing_hacks.set_ifdef_parenthize_info
392 * when working on the ifdef view.
393 *)
394
395 (* '0'+ because sometimes it is a #if 000 *)
396 | "#" [' ' '\t']* "if" [' ' '\t']* '0'+ (* [^'\n']* '\n' *)
397 { let info = tokinfo lexbuf in
398 TIfdefBool (false, no_ifdef_mark(), info)
399 (* +> tok_add_s (cpp_eat_until_nl lexbuf)*)
400 }
401
402 | "#" [' ' '\t']* "if" [' ' '\t']* '1' (* [^'\n']* '\n' *)
403 { let info = tokinfo lexbuf in
404 TIfdefBool (true, no_ifdef_mark(), info)
405
406 }
407
408 (* DO NOT cherry pick to lexer_cplusplus !!! often used for the extern "C" { *)
409 | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n']* '\n'
410 { let info = tokinfo lexbuf in
411 TIfdefMisc (false, no_ifdef_mark(), info)
412 }
413
414 (* DO NOT cherry pick to lexer_cplusplus !!! *)
415 | "#" [' ' '\t']* "ifdef" [' ' '\t']* "__cplusplus" [^'\n']* '\n'
416 { let info = tokinfo lexbuf in
417 TIfdefMisc (false, no_ifdef_mark(), info)
418 }
419
420 (* in glibc *)
421 | "#" spopt ("ifdef"|"if") sp "__STDC__"
422 { let info = tokinfo lexbuf in
423 TIfdefVersion (true, no_ifdef_mark(),
424 info +> tok_add_s (cpp_eat_until_nl lexbuf))
425 }
426
427
428 (* linuxext: different possible variations (we do not manage all of them):
429
430 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
431 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,2)
432 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
433 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,3,0)
434 #if LINUX_VERSION_CODE < 0x020600
435 #if LINUX_VERSION_CODE >= 0x2051c
436 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
437 #if !(LINUX_VERSION_CODE > KERNEL_VERSION(2,5,73))
438 #if STREAMER_IOCTL && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
439 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
440 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && \
441 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
442 #if LINUX_VERSION_CODE > LinuxVersionCode(2,3,12)
443 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,93)
444 #ifndef LINUX_VERSION_CODE
445 #if LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,2,0) || \
446 (LINUX_VERSION_CODE > ASC_LINUX_VERSION(2,3,0) && \
447 LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,4,0))
448 #if (KERNEL_VERSION(2,4,0) > LINUX_VERSION_CODE)
449 #if LINUX_VERSION_CODE >= ASC_LINUX_VERSION(1,3,0)
450 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
451
452 *)
453
454 (* linuxext: must be before the generic rules for if and ifdef *)
455 | "#" spopt "if" sp "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
456 { let info = tokinfo lexbuf in
457 TIfdefVersion (true, no_ifdef_mark(),
458 info +> tok_add_s (cpp_eat_until_nl lexbuf))
459 }
460 (* linuxext: *)
461 | "#" spopt "if" sp "!" "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
462 | "#" spopt "if" sp ['(']? "LINUX_VERSION_CODE" sp ("<=" | "<") sp
463
464 { let info = tokinfo lexbuf in
465 TIfdefVersion (false, no_ifdef_mark(),
466 info +> tok_add_s (cpp_eat_until_nl lexbuf))
467 }
468
469
470
471
472 (* can have some ifdef 0 hence the letter|digit even at beginning of word *)
473 | "#" [' ''\t']* "ifdef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
474 { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
475 | "#" [' ''\t']* "ifndef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
476 { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
477 | "#" [' ''\t']* "if" [' ' '\t']+
478 { let info = tokinfo lexbuf in
479 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
480 }
481 | "#" [' ' '\t']* "if" '('
482 { let info = tokinfo lexbuf in
483 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
484 }
485
486 | "#" [' ' '\t']* "elif" [' ' '\t']+
487 { let info = tokinfo lexbuf in
488 TIfdefelif (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
489 }
490
491
492 (* bugfix: can have #endif LINUX but at the same time if I eat everything
493 * until next line, I may miss some TComment which for some tools
494 * are important such as aComment
495 *)
496 | "#" [' ' '\t']* "endif" (*[^'\n']* '\n'*) {
497 TEndif (no_ifdef_mark(), tokinfo lexbuf)
498 }
499 (* can be at eof *)
500 (*| "#" [' ' '\t']* "endif" { TEndif (tokinfo lexbuf) }*)
501
502 | "#" [' ' '\t']* "else" [' ' '\t' '\n']
503 { TIfdefelse (no_ifdef_mark(), tokinfo lexbuf) }
504
505
506
507
508 (* ---------------------- *)
509 (* #define body *)
510 (* ---------------------- *)
511
512 (* only in cpp directives normally *)
513 | "\\" '\n' { TCppEscapedNewline (tokinfo lexbuf) }
514
515 (* We must generate separate tokens for #, ## and extend the grammar.
516 * Note there can be "elaborated" idents in many different places, in
517 * expression but also in declaration, in function name. So having 3 tokens
518 * for an ident does not work well with how we add info in
519 * ast_c. Was easier to generate just one token, just one info,
520 * even if have later to reanalyse those tokens and unsplit. But then,
521 * handling C++ lead to having not just a string for ident but something
522 * more complex. Also when we want to parse elaborated function headers
523 * (e.g. void METH(foo)(int x)), we need anyway to go from a string
524 * to something more. So having also for C something more than just
525 * string for ident is natural.
526 *
527 * todo: our heuristics in parsing_hacks rely on TIdent. So maybe
528 * an easier solution would be to augment the TIdent type such as
529 * TIdent of string * info * cpp_ident_additionnal_info
530 *
531 * old:
532 * | id ([' ''\t']* "##" [' ''\t']* id)+
533 * { let info = tokinfo lexbuf in
534 * TIdent (tok lexbuf, info)
535 * }
536 * | "##" spopt id
537 * { let info = tokinfo lexbuf in
538 * TIdent (tok lexbuf, info)
539 * }
540 *
541 *)
542 (* cppext: string concatenation of idents, also ##args for variadic macro. *)
543 | "##" { TCppConcatOp (tokinfo lexbuf) }
544
545 (* cppext: stringification.
546 * bugfix: this case must be after the other cases such as #endif
547 * otherwise take precedent.
548 *)
549 | "#" spopt id
550 { let info = tokinfo lexbuf in
551 TIdent (tok lexbuf, info)
552 }
553 (* the ... next to id, e.g. arg..., works with ##, e.g. ##arg *)
554 | ((id as s) "...")
555 { TDefParamVariadic (s, tokinfo lexbuf) }
556
557
558
559
560
561 (* ----------------------------------------------------------------------- *)
562 (* C symbols *)
563 (* ----------------------------------------------------------------------- *)
564 (* stdC:
565 ... && -= >= ~ + ; ]
566 <<= &= -> >> % , < ^
567 >>= *= /= ^= & - = {
568 != ++ << |= ( . > |
569 %= += <= || ) / ? }
570 -- == ! * : [
571 recent addition: <: :> <% %>
572 only at processing: %: %:%: # ##
573 *)
574
575
576 | '[' { TOCro(tokinfo lexbuf) } | ']' { TCCro(tokinfo lexbuf) }
577 | '(' { TOPar(tokinfo lexbuf) } | ')' { TCPar(tokinfo lexbuf) }
578 | '{' { TOBrace(tokinfo lexbuf) } | '}' { TCBrace(tokinfo lexbuf) }
579
580 | '+' { TPlus(tokinfo lexbuf) } | '*' { TMul(tokinfo lexbuf) }
581 | '-' { TMinus(tokinfo lexbuf) } | '/' { TDiv(tokinfo lexbuf) }
582 | '%' { TMod(tokinfo lexbuf) }
583
584 | "++"{ TInc(tokinfo lexbuf) } | "--"{ TDec(tokinfo lexbuf) }
585
586 | "=" { TEq(tokinfo lexbuf) }
587
588 | "-=" { TAssign (OpAssign Minus, (tokinfo lexbuf))}
589 | "+=" { TAssign (OpAssign Plus, (tokinfo lexbuf))}
590 | "*=" { TAssign (OpAssign Mul, (tokinfo lexbuf))}
591 | "/=" { TAssign (OpAssign Div, (tokinfo lexbuf))}
592 | "%=" { TAssign (OpAssign Mod, (tokinfo lexbuf))}
593 | "&=" { TAssign (OpAssign And, (tokinfo lexbuf))}
594 | "|=" { TAssign (OpAssign Or, (tokinfo lexbuf)) }
595 | "^=" { TAssign (OpAssign Xor, (tokinfo lexbuf))}
596 | "<<=" {TAssign (OpAssign DecLeft, (tokinfo lexbuf)) }
597 | ">>=" {TAssign (OpAssign DecRight, (tokinfo lexbuf))}
598
599 | "==" { TEqEq(tokinfo lexbuf) } | "!=" { TNotEq(tokinfo lexbuf) }
600 | ">=" { TSupEq(tokinfo lexbuf) } | "<=" { TInfEq(tokinfo lexbuf) }
601 | "<" { TInf(tokinfo lexbuf) } | ">" { TSup(tokinfo lexbuf) }
602
603 | "&&" { TAndLog(tokinfo lexbuf) } | "||" { TOrLog(tokinfo lexbuf) }
604 | ">>" { TShr(tokinfo lexbuf) } | "<<" { TShl(tokinfo lexbuf) }
605 | "&" { TAnd(tokinfo lexbuf) } | "|" { TOr(tokinfo lexbuf) }
606 | "^" { TXor(tokinfo lexbuf) }
607 | "..." { TEllipsis(tokinfo lexbuf) }
608 | "->" { TPtrOp(tokinfo lexbuf) } | '.' { TDot(tokinfo lexbuf) }
609 | ',' { TComma(tokinfo lexbuf) }
610 | ";" { TPtVirg(tokinfo lexbuf) }
611 | "?" { TWhy(tokinfo lexbuf) } | ":" { TDotDot(tokinfo lexbuf) }
612 | "!" { TBang(tokinfo lexbuf) } | "~" { TTilde(tokinfo lexbuf) }
613
614 | "<:" { TOCro(tokinfo lexbuf) } | ":>" { TCCro(tokinfo lexbuf) }
615 | "<%" { TOBrace(tokinfo lexbuf) } | "%>" { TCBrace(tokinfo lexbuf) }
616
617
618
619 (* ----------------------------------------------------------------------- *)
620 (* C keywords and ident *)
621 (* ----------------------------------------------------------------------- *)
622
623 (* StdC: must handle at least name of length > 509, but can
624 * truncate to 31 when compare and truncate to 6 and even lowerise
625 * in the external linkage phase
626 *)
(* Keyword or plain identifier: look the lexeme up in keyword_table;
 * on a miss produce TIdent (typedef disambiguation happens later in
 * parse_c, not here). *)
627 | letter (letter | digit) *
628 { let info = tokinfo lexbuf in
629 let s = tok lexbuf in
630 Common.profile_code "C parsing.lex_ident" (fun () ->
631 match Common.optionise (fun () -> Hashtbl.find keyword_table s)
632 with
633 | Some f -> f info
634
635 (* parse_typedef_fix.
636 * if Lexer_parser.is_typedef s
637 * then TypedefIdent (s, info)
638 * else TIdent (s, info)
639 *
640 * update: now this is no more useful, cos
641 * as we use tokens_all, it first parse all as an ident and
642 * later transform an indent in a typedef. so the typedef job is
643 * now done in parse_c.ml.
644 *)
645
646 | None -> TIdent (s, info)
647 )
648 }
649 (* gccext: apparently gcc allows dollar in variable names. found such
650 * thing a few time in linux and in glibc. No need look in keyword_table
651 * here.
652 *)
653 | (letter | '$') (letter | digit | '$') *
654 {
655 let info = tokinfo lexbuf in
656 let s = tok lexbuf in
657 pr2 ("LEXER: identifier with dollar: " ^ s);
658 TIdent (s, info)
659 }
660
661
662 (* ----------------------------------------------------------------------- *)
663 (* C constant *)
664 (* ----------------------------------------------------------------------- *)
665
666 | "'"
667 { let info = tokinfo lexbuf in
668 let s = char lexbuf in
669 TChar ((s, IsChar), (info +> tok_add_s (s ^ "'")))
670 }
671 | '"'
672 { let info = tokinfo lexbuf in
673 let s = string lexbuf in
674 TString ((s, IsChar), (info +> tok_add_s (s ^ "\"")))
675 }
676 (* wide character encoding, TODO L'toto' valid ? what is allowed ? *)
677 | 'L' "'"
678 { let info = tokinfo lexbuf in
679 let s = char lexbuf in
680 TChar ((s, IsWchar), (info +> tok_add_s (s ^ "'")))
681 }
682 | 'L' '"'
683 { let info = tokinfo lexbuf in
684 let s = string lexbuf in
685 TString ((s, IsWchar), (info +> tok_add_s (s ^ "\"")))
686 }
687
688
689 (* Take care of the order ? No because lex tries the longest match. The
690 * strange diff between decimal and octal constant semantic is not
691 * understood too by refman :) refman:11.1.4, and ritchie.
692 *)
693
694 | decimal as x
695 { TInt ((x, is_long_dec x sint slong slong ulong), tokinfo lexbuf) }
696 | hexa as x
697 { TInt ((x, is_long_hex x sint uint slong ulong), tokinfo lexbuf) }
698 | octal as x
699 { TInt ((x, is_long_oct x sint uint slong ulong), tokinfo lexbuf) }
700 | ((decimal as s) ['u' 'U']) as x
701 { TInt ((x, is_long_dec s uint uint ulong ulong), tokinfo lexbuf) }
702 | ((hexa as s) ['u' 'U']) as x
703 { TInt ((x, is_long_hex s uint uint ulong ulong), tokinfo lexbuf) }
704 | ((octal as s) ['u' 'U']) as x
705 { TInt ((x, is_long_oct s uint uint ulong ulong), tokinfo lexbuf) }
706 | (( decimal as s) ['l' 'L']) as x
707 { TInt ((x, is_long_dec s slong slong slong ulong), tokinfo lexbuf) }
708 | ((hexa as s) ['l' 'L']) as x
709 { TInt ((x, is_long_hex s slong slong slong ulong), tokinfo lexbuf) }
710 | ((octal as s) ['l' 'L']) as x
711 { TInt ((x, is_long_oct s slong slong slong ulong), tokinfo lexbuf) }
712 | ((( decimal | hexa | octal) ['l' 'L'] ['u' 'U'])
713 | (( decimal | hexa | octal) ['u' 'U'] ['l' 'L'])) as x
714 { TInt ((x, (UnSigned,CLong)), tokinfo lexbuf) }
715 | (( decimal | hexa | octal) ['l' 'L'] ['l' 'L']) as x
716 { TInt ((x, (Signed,CLongLong)), tokinfo lexbuf) }
717 | (( decimal | hexa | octal) ['u' 'U'] ['l' 'L'] ['l' 'L']) as x
718 { TInt ((x, (UnSigned,CLongLong)), tokinfo lexbuf) }
719
720 | (real ['f' 'F']) as x { TFloat ((x, CFloat), tokinfo lexbuf) }
721 | (real ['l' 'L']) as x { TFloat ((x, CLongDouble), tokinfo lexbuf) }
722 | (real as x) { TFloat ((x, CDouble), tokinfo lexbuf) }
723
724 | ['0'] ['0'-'9']+
725 { pr2 ("LEXER: " ^ error_radix "octal" ^ tok lexbuf);
726 TUnknown (tokinfo lexbuf)
727 }
728 | ("0x" |"0X") ['0'-'9' 'a'-'z' 'A'-'Z']+
729 { pr2 ("LEXER: " ^ error_radix "hexa" ^ tok lexbuf);
730 TUnknown (tokinfo lexbuf)
731 }
732
733
734 (* !!! to put after other rules !!! otherwise 0xff
735 * will be parsed as an ident.
736 *)
737 | ['0'-'9']+ letter (letter | digit) *
738 { pr2 ("LEXER: ZARB integer_string, certainly a macro:" ^ tok lexbuf);
739 TIdent (tok lexbuf, tokinfo lexbuf)
740 }
741
742 (* gccext: http://gcc.gnu.org/onlinedocs/gcc/Binary-constants.html *)
743 (*
744 | "0b" ['0'-'1'] { TInt (((tok lexbuf)<!!>(??,??)) +> int_of_stringbits) }
745 | ['0'-'1']+'b' { TInt (((tok lexbuf)<!!>(0,-2)) +> int_of_stringbits) }
746 *)
747
748
749 (*------------------------------------------------------------------------ *)
750 | eof { EOF (tokinfo lexbuf +> Ast_c.rewrap_str "") }
751
752 | _
753 {
754 if !Flag_parsing_c.verbose_lexing
755 then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
756 TUnknown (tokinfo lexbuf)
757 }
758
759
760
761 (*****************************************************************************)
(* [char] lexes the body of a character literal after the opening quote,
 * consuming the closing quote but returning only the body text. *)
762 and char = parse
763 | (_ as x) "'" { String.make 1 x }
764 (* todo?: as for octal, do exception beyond radix exception ? *)
765 | (("\\" (oct | oct oct | oct oct oct)) as x "'") { x }
766 (* this rule must be after the one with octal, lex try first longest
767 * and when \7 we want an octal, not an exn.
768 *)
769 | (("\\x" ((hex | hex hex))) as x "'") { x }
770 | (("\\" (_ as v)) as x "'")
771 {
772 (match v with (* Machine specific ? *)
773 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
774 | 'f' -> () | 'a' -> ()
775 | '\\' -> () | '?' -> () | '\'' -> () | '"' -> ()
776 | 'e' -> () (* linuxext: ? *)
777 | _ ->
778 pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
779 );
780 x
781 }
(* Fallback: warn and return the raw lexeme (closing quote not consumed). *)
782 | _
783 { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
784 tok lexbuf
785 }
786
787
788
789 (*****************************************************************************)
790
791 (* todo? factorise code with char ? but not same ending token so hard. *)
(* [string] lexes a string literal body recursively until the closing
 * double quote, which is consumed but not included in the result. *)
792 and string = parse
793 | '"' { "" }
794 | (_ as x) { string_of_char x^string lexbuf}
795 | ("\\" (oct | oct oct | oct oct oct)) as x { x ^ string lexbuf }
796 | ("\\x" (hex | hex hex)) as x { x ^ string lexbuf }
797 | ("\\" (_ as v)) as x
798 {
799 (match v with (* Machine specific ? *)
800 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
801 | 'f' -> () | 'a' -> ()
802 | '\\' -> () | '?' -> () | '\'' -> () | '"' -> ()
803 | 'e' -> () (* linuxext: ? *)
804
805 (* old: "x" -> 10 gccext ? todo ugly, I put a fake value *)
806
807 (* cppext: can have \ for multiline in string too *)
808 | '\n' -> ()
809 | _ -> pr2 ("LEXER: unrecognised symbol in string:"^tok lexbuf);
810 );
811 x ^ string lexbuf
812 }
813
814 | eof { pr2 "LEXER: WIERD end of file in string"; ""}
815
816 (* Bug if add following code, cos match also the '"' that is needed
817 * to finish the string, and so go until end of file.
818 *)
819 (*
820 | [^ '\\']+
821 { let cs = lexbuf +> tok +> list_of_string +> List.map Char.code in
822 cs ++ string lexbuf
823 }
824 *)
825
828 (*****************************************************************************)
829
830 (* less: allow only char-'*' ? *)
831 and comment = parse
832 | "*/" { tok lexbuf }
833 (* noteopti: *)
834 | [^ '*']+ { let s = tok lexbuf in s ^ comment lexbuf }
835 | [ '*'] { let s = tok lexbuf in s ^ comment lexbuf }
836 | eof { pr2 "LEXER: end of file in comment"; "*/"}
837 | _
838 { let s = tok lexbuf in
839 pr2 ("LEXER: unrecognised symbol in comment:"^s);
840 s ^ comment lexbuf
841 }
842
843
844
843
844
845 (*****************************************************************************)

846
847 (* cpp recognize C comments, so when #define xx (yy) /* comment \n ... */
848 * then it has already erased the /* comment. So:
849 * - don't eat the start of the comment otherwise afterwards we are in the
850 * middle of a comment and so will probably get a parse error somewhere.
851 * - have to recognize comments in cpp_eat_until_nl.
852 *)

853
(* [cpp_eat_until_nl] swallows the rest of a cpp directive line, honouring
 * backslash-newline continuations and embedded C comments, and returns
 * the consumed text (including the final newline). *)
854 and cpp_eat_until_nl = parse
855 (* bugfix: *)
856 | "/*"
857 { let s = tok lexbuf in
858 let s2 = comment lexbuf in
859 let s3 = cpp_eat_until_nl lexbuf in
860 s ^ s2 ^ s3
861 }
(* Escaped newline: the directive continues on the next line. *)
862 | '\\' "\n" { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }

863
864 | "\n" { tok lexbuf }
865 (* noteopti:
866 * update: need also deal with comments chars now
867 *)
868 | [^ '\n' '\\' '/' '*' ]+
869 { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
870 | eof { pr2 "LEXER: end of file in cpp_eat_until_nl"; ""}
871 | _ { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }