6a18e2e8c210b81a9bad98af7ea0d17f71636ec6
[bpt/coccinelle.git] / parsing_c / lexer_c.mll
1 {
2 (* Yoann Padioleau
3 *
4 * Copyright (C) 2002, 2006, 2007, 2008, 2009, Ecole des Mines de Nantes
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License (GPL)
8 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * file license.txt for more details.
14 *)
15 open Common
16
17 open Parser_c
18
19 open Ast_c (* to factorise tokens, OpAssign, ... *)
20
21 (*****************************************************************************)
22 (*
23 * subtil: ocamllex use side effect on lexbuf, so must take care.
24 * For instance must do
25 *
26 * let info = tokinfo lexbuf in
27 * TComment (info +> tok_add_s (comment lexbuf))
28 *
29 * and not
30 *
31 * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
32 *
33 * because of the "wierd" order of evaluation of OCaml.
34 *
35 *
36 *
37 * note: can't use Lexer_parser._lexer_hint here to do different
38 * things, because now we call the lexer to get all the tokens
39 * (tokens_all), and then we parse. So we can't have the _lexer_hint
40 * info here. We can have it only in parse_c. For the same reason, the
41 * typedef handling here is now useless.
42 *)
43 (*****************************************************************************)
44
45 (*****************************************************************************)
46 (* Wrappers *)
47 (*****************************************************************************)
48 let pr2 s =
49 if !Flag_parsing_c.verbose_lexing
50 then Common.pr2 s
51
52 (*****************************************************************************)
53
54
55 exception Lexical of string
56
57 let tok lexbuf = Lexing.lexeme lexbuf
58
59 let tokinfo lexbuf =
60 {
61 pinfo = Ast_c.OriginTok {
62 Common.charpos = Lexing.lexeme_start lexbuf;
63 Common.str = Lexing.lexeme lexbuf;
64 (* info filled in a post-lexing phase *)
65 Common.line = -1;
66 Common.column = -1;
67 Common.file = "";
68 };
69 (* must generate a new ref each time, otherwise share *)
70 cocci_tag = ref Ast_c.emptyAnnot;
71 comments_tag = ref Ast_c.emptyComments;
72 }
73
74 (* cppext: must generate a new ref each time, otherwise share *)
75 let no_ifdef_mark () = ref (None: (int * int) option)
76
77 let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
78
79
80 (* opti: less convenient, but using a hash is faster than using a match *)
81 let keyword_table = Common.hash_of_list [
82
83 (* c: *)
84 "void", (fun ii -> Tvoid ii);
85 "char", (fun ii -> Tchar ii);
86 "short", (fun ii -> Tshort ii);
87 "int", (fun ii -> Tint ii);
88 "long", (fun ii -> Tlong ii);
89 "float", (fun ii -> Tfloat ii);
90 "double", (fun ii -> Tdouble ii);
91
92 "unsigned", (fun ii -> Tunsigned ii);
93 "signed", (fun ii -> Tsigned ii);
94
95 "auto", (fun ii -> Tauto ii);
96 "register", (fun ii -> Tregister ii);
97 "extern", (fun ii -> Textern ii);
98 "static", (fun ii -> Tstatic ii);
99
100 "const", (fun ii -> Tconst ii);
101 "volatile", (fun ii -> Tvolatile ii);
102
103 "struct", (fun ii -> Tstruct ii);
104 "union", (fun ii -> Tunion ii);
105 "enum", (fun ii -> Tenum ii);
106 "typedef", (fun ii -> Ttypedef ii);
107
108 "if", (fun ii -> Tif ii);
109 "else", (fun ii -> Telse ii);
110 "break", (fun ii -> Tbreak ii);
111 "continue", (fun ii -> Tcontinue ii);
112 "switch", (fun ii -> Tswitch ii);
113 "case", (fun ii -> Tcase ii);
114 "default", (fun ii -> Tdefault ii);
115 "for", (fun ii -> Tfor ii);
116 "do", (fun ii -> Tdo ii);
117 "while", (fun ii -> Twhile ii);
118 "return", (fun ii -> Treturn ii);
119 "goto", (fun ii -> Tgoto ii);
120
121 "sizeof", (fun ii -> Tsizeof ii);
122
123
124 (* gccext: cppext: linuxext: synonyms *)
125 "asm", (fun ii -> Tasm ii);
126 "__asm__", (fun ii -> Tasm ii);
127 "__asm", (fun ii -> Tasm ii);
128
129 "inline", (fun ii -> Tinline ii);
130 "__inline__", (fun ii -> Tinline ii);
131 "__inline", (fun ii -> Tinline ii);
132
133 "__attribute__", (fun ii -> Tattribute ii);
134 "__attribute", (fun ii -> Tattribute ii);
135
136 "typeof", (fun ii -> Ttypeof ii);
137 "__typeof__", (fun ii -> Ttypeof ii);
138 "__typeof", (fun ii -> Ttypeof ii);
139
140
141 (* gccext: alias *)
142 "__signed__", (fun ii -> Tsigned ii);
143
144 "__const__", (fun ii -> Tconst ii);
145 "__const", (fun ii -> Tconst ii);
146
147 "__volatile__", (fun ii -> Tvolatile ii);
148 "__volatile", (fun ii -> Tvolatile ii);
149
150
151 (* c99: *)
152 (* no just "restrict" ? maybe for backward compatibility they avoided
153 * to use restrict which people may have used in their program already
154 *)
155 "__restrict", (fun ii -> Trestrict ii);
156 "__restrict__", (fun ii -> Trestrict ii);
157
158 ]
159
160 let error_radix s =
161 ("numeric " ^ s ^ " constant contains digits beyond the radix:")
162
163 }
164
165 (*****************************************************************************)
166 let letter = ['A'-'Z' 'a'-'z' '_']
167 let digit = ['0'-'9']
168
169 (* not used for the moment *)
170 let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
171 ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
172 let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
173 let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
174 (* 7 = \a = bell in C. this is not the only char allowed !!
175 * ex @ and $ ` are valid too
176 *)
177
178 let cchar = (letter | digit | punctuation | additionnal)
179
180 let sp = [' ' '\t']+
181 let spopt = [' ' '\t']*
182
183 let dec = ['0'-'9']
184 let oct = ['0'-'7']
185 let hex = ['0'-'9' 'a'-'f' 'A'-'F']
186
187 let decimal = ('0' | (['1'-'9'] dec*))
188 let octal = ['0'] oct+
189 let hexa = ("0x" |"0X") hex+
190
191
192 let pent = dec+
193 let pfract = dec+
194 let sign = ['-' '+']
195 let exp = ['e''E'] sign? dec+
196 let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
197
198 let id = letter (letter | digit) *
199
200 (*****************************************************************************)
201 rule token = parse
202
203 (* ----------------------------------------------------------------------- *)
204 (* spacing/comments *)
205 (* ----------------------------------------------------------------------- *)
206
207 (* note: this lexer generate tokens for comments!! so can not give
208 * this lexer as-is to the parsing function. Must preprocess it, hence
209 * use techniques like cur_tok ref in parse_c.ml
210 *)
211
212 | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
213 (* starting a new line; the newline character followed by whitespace *)
214 { TCommentNewline (tokinfo lexbuf) }
215 | [' ' '\t' '\r' '\011' '\012' ]+
216 { TCommentSpace (tokinfo lexbuf) }
217 | "/*"
218 { let info = tokinfo lexbuf in
219 let com = comment lexbuf in
220
221 let info' = info +> tok_add_s com in
222 let s = Ast_c.str_of_info info' in
223 (* could be more flexible, use [\t ]* instead of hardcoded
224 * single space. *)
225 match s with
226 | "/* {{coccinelle:skip_start}} */" ->
227 TCommentSkipTagStart (info')
228 | "/* {{coccinelle:skip_end}} */" ->
229 TCommentSkipTagEnd (info')
230 | _ -> TComment(info')
231 }
232
233
234 (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
235 * So need this here only when dont call cpp before.
236 * note that we don't keep the trailing \n; it will be in another token.
237 *)
238 | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
239
240 (* ----------------------------------------------------------------------- *)
241 (* cpp *)
242 (* ----------------------------------------------------------------------- *)
243
244 (* old:
245 * | '#' { endline lexbuf} // should be line, and not endline
246 * and endline = parse | '\n' { token lexbuf}
247 * | _ { endline lexbuf}
248 *)
249
250 (* less?:
251 * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
252 * => just "#" -> token {lexbuf} (that is ignore)
253 * il y'a 1 #elif sans rien apres
254 * il y'a 1 #error sans rien apres
255 * il y'a 2 mov dede, #xxx qui genere du coup exn car
256 * entouré par des #if 0
257 * => make as for comment, call a comment_cpp that when #endif finish the
258 * comment and if other cpp stuff raise exn
259 * il y'a environ 10 #if(xxx) ou le ( est collé direct
260 * il y'a des include"" et include<
261 * il y'a 1 ` (derriere un #ifndef linux)
262 *)
263
264
265
266 (* ---------------------- *)
267 (* misc *)
268 (* ---------------------- *)
269
270 (* bugfix: I want now to keep comments for the cComment study
271 * so cant do: sp [^'\n']+ '\n'
272 * http://gcc.gnu.org/onlinedocs/gcc/Pragmas.html
273 *)
274
275 | "#" spopt "pragma" sp [^'\n']* '\n'
276 | "#" spopt "ident" sp [^'\n']* '\n'
277 | "#" spopt "line" sp [^'\n']* '\n'
278 | "#" spopt "error" sp [^'\n']* '\n'
279 | "#" spopt "warning" sp [^'\n']* '\n'
280 | "#" spopt "abort" sp [^'\n']* '\n'
281 { TCppDirectiveOther (tokinfo lexbuf) }
282
283 | "#" [' ' '\t']* '\n'
284 { TCppDirectiveOther (tokinfo lexbuf) }
285
286 (* only after cpp, ex: # 1 "include/linux/module.h" 1 *)
287 | "#" sp pent sp '"' [^ '"']* '"' (spopt pent)* spopt '\n'
288 { TCppDirectiveOther (tokinfo lexbuf) }
289
290
291
292 (* ---------------------- *)
293 (* #define, #undef *)
294 (* ---------------------- *)
295
296 (* the rest of the lexing/parsing of define is done in fix_tokens_define
297 * where we parse until a TCppEscapedNewline and generate a TDefEol
298 *)
299 | "#" [' ' '\t']* "define" { TDefine (tokinfo lexbuf) }
300
301 (* note: in some cases can have stuff after the ident as in #undef XXX 50,
302 * but I currently don't handle it cos I think it's bad code.
303 *)
304 | (("#" [' ' '\t']* "undef" [' ' '\t']+) as _undef) (id as id)
305 { let info = tokinfo lexbuf in
306 TUndef (id, info)
307 (*+> tok_add_s (cpp_eat_until_nl lexbuf))*)
308 }
309
310
311 (* ---------------------- *)
312 (* #include *)
313 (* ---------------------- *)
314
315 (* The difference between a local "" and standard <> include is computed
316 * later in parser_c.mly. So redo a little bit of lexing there; ugly but
317 * simpler to generate a single token here. *)
318 | (("#" [' ''\t']* "include" [' ' '\t']*) as includes)
319 (('"' ([^ '"']+) '"' |
320 '<' [^ '>']+ '>' |
321 ['A'-'Z''_']+
322 ) as filename)
323 { let info = tokinfo lexbuf in
324 TInclude (includes, filename, Ast_c.noInIfdef(), info)
325 }
326 (* gccext: found in glibc *)
327 | (("#" [' ''\t']* "include_next" [' ' '\t']*) as includes)
328 (('"' ([^ '"']+) '"' |
329 '<' [^ '>']+ '>' |
330 ['A'-'Z''_']+
331 ) as filename)
332 { let info = tokinfo lexbuf in
333 TInclude (includes, filename, Ast_c.noInIfdef(), info)
334 }
335
336 (* ---------------------- *)
337 (* #ifdef *)
338 (* ---------------------- *)
339
340 (* The ifdef_mark will be set later in Parsing_hacks.set_ifdef_parenthize_info
341 * when working on the ifdef view.
342 *)
343
344 (* '0'+ because sometimes it is a #if 000 *)
345 | "#" [' ' '\t']* "if" [' ' '\t']* '0'+ (* [^'\n']* '\n' *)
346 { let info = tokinfo lexbuf in
347 TIfdefBool (false, no_ifdef_mark(), info)
348 (* +> tok_add_s (cpp_eat_until_nl lexbuf)*)
349 }
350
351 | "#" [' ' '\t']* "if" [' ' '\t']* '1' (* [^'\n']* '\n' *)
352 { let info = tokinfo lexbuf in
353 TIfdefBool (true, no_ifdef_mark(), info)
354
355 }
356
357 (* DO NOT cherry pick to lexer_cplusplus !!! often used for the extern "C" { *)
358 | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n']* '\n'
359 { let info = tokinfo lexbuf in
360 TIfdefMisc (false, no_ifdef_mark(), info)
361 }
362
363 (* DO NOT cherry pick to lexer_cplusplus !!! *)
364 | "#" [' ' '\t']* "ifdef" [' ' '\t']* "__cplusplus" [^'\n']* '\n'
365 { let info = tokinfo lexbuf in
366 TIfdefMisc (false, no_ifdef_mark(), info)
367 }
368
369 (* in glibc *)
370 | "#" spopt ("ifdef"|"if") sp "__STDC__"
371 { let info = tokinfo lexbuf in
372 TIfdefVersion (true, no_ifdef_mark(),
373 info +> tok_add_s (cpp_eat_until_nl lexbuf))
374 }
375
376
377 (* linuxext: different possible variations (we do not manage all of them):
378
379 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
380 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,2)
381 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
382 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,3,0)
383 #if LINUX_VERSION_CODE < 0x020600
384 #if LINUX_VERSION_CODE >= 0x2051c
385 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
386 #if !(LINUX_VERSION_CODE > KERNEL_VERSION(2,5,73))
387 #if STREAMER_IOCTL && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
388 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
389 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && \
390 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
391 #if LINUX_VERSION_CODE > LinuxVersionCode(2,3,12)
392 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,93)
393 #ifndef LINUX_VERSION_CODE
394 #if LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,2,0) || \
395 (LINUX_VERSION_CODE > ASC_LINUX_VERSION(2,3,0) && \
396 LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,4,0))
397 #if (KERNEL_VERSION(2,4,0) > LINUX_VERSION_CODE)
398 #if LINUX_VERSION_CODE >= ASC_LINUX_VERSION(1,3,0)
399 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
400
401 *)
402
403 (* linuxext: must be before the generic rules for if and ifdef *)
404 | "#" spopt "if" sp "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
405 { let info = tokinfo lexbuf in
406 TIfdefVersion (true, no_ifdef_mark(),
407 info +> tok_add_s (cpp_eat_until_nl lexbuf))
408 }
409 (* linuxext: *)
410 | "#" spopt "if" sp "!" "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
411 | "#" spopt "if" sp ['(']? "LINUX_VERSION_CODE" sp ("<=" | "<") sp
412
413 { let info = tokinfo lexbuf in
414 TIfdefVersion (false, no_ifdef_mark(),
415 info +> tok_add_s (cpp_eat_until_nl lexbuf))
416 }
417
418
419
420
421 (* can have some ifdef 0 hence the letter|digit even at beginning of word *)
422 | "#" [' ''\t']* "ifdef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
423 { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
424 | "#" [' ''\t']* "ifndef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
425 { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
426 | "#" [' ''\t']* "if" [' ' '\t']+
427 { let info = tokinfo lexbuf in
428 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
429 }
430 | "#" [' ' '\t']* "if" '('
431 { let info = tokinfo lexbuf in
432 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
433 }
434
435 | "#" [' ' '\t']* "elif" [' ' '\t']+
436 { let info = tokinfo lexbuf in
437 TIfdefelif (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
438 }
439
440
441 (* bugfix: can have #endif LINUX but at the same time if I eat everything
442 * until next line, I may miss some TComment which for some tools
443 * are important such as aComment
444 *)
445 | "#" [' ' '\t']* "endif" (*[^'\n']* '\n'*) {
446 TEndif (no_ifdef_mark(), tokinfo lexbuf)
447 }
448 (* can be at eof *)
449 (*| "#" [' ' '\t']* "endif" { TEndif (tokinfo lexbuf) }*)
450
451 | "#" [' ' '\t']* "else" [' ' '\t' '\n']
452 { TIfdefelse (no_ifdef_mark(), tokinfo lexbuf) }
453
454
455
456
457 (* ---------------------- *)
458 (* #define body *)
459 (* ---------------------- *)
460
461 (* only in cpp directives normally *)
462 | "\\" '\n' { TCppEscapedNewline (tokinfo lexbuf) }
463
464 (* We must generate separate tokens for #, ## and extend the grammar.
465 * Note there can be "elaborated" idents in many different places, in
466 * expression but also in declaration, in function name. So having 3 tokens
467 * for an ident does not work well with how we add info in
468 * ast_c. Was easier to generate just one token, just one info,
469 * even if have later to reanalyse those tokens and unsplit. But then,
470 * handling C++ lead to having not just a string for ident but something
471 * more complex. Also when we want to parse elaborated function headers
472 * (e.g. void METH(foo)(int x)), we need anyway to go from a string
473 * to something more. So having also for C something more than just
474 * string for ident is natural.
475 *
476 * todo: our heuristics in parsing_hacks rely on TIdent. So maybe
477 * an easier solution would be to augment the TIdent type such as
478 * TIdent of string * info * cpp_ident_additionnal_info
479 *
480 * old:
481 * | id ([' ''\t']* "##" [' ''\t']* id)+
482 * { let info = tokinfo lexbuf in
483 * TIdent (tok lexbuf, info)
484 * }
485 * | "##" spopt id
486 * { let info = tokinfo lexbuf in
487 * TIdent (tok lexbuf, info)
488 * }
489 *
490 *)
491 (* cppext: string concatenation of idents, also ##args for variadic macro. *)
492 | "##" { TCppConcatOp (tokinfo lexbuf) }
493
494 (* cppext: stringification.
495 * bugfix: this case must be after the other cases such as #endif
496 * otherwise take precedent.
497 *)
498 | "#" spopt id
499 { let info = tokinfo lexbuf in
500 TIdent (tok lexbuf, info)
501 }
502 (* the ... next to id, e.g. arg..., works with ##, e.g. ##arg *)
503 | ((id as s) "...")
504 { TDefParamVariadic (s, tokinfo lexbuf) }
505
506
507
508
509
510 (* ----------------------------------------------------------------------- *)
511 (* C symbols *)
512 (* ----------------------------------------------------------------------- *)
513 (* stdC:
514 ... && -= >= ~ + ; ]
515 <<= &= -> >> % , < ^
516 >>= *= /= ^= & - = {
517 != ++ << |= ( . > |
518 %= += <= || ) / ? }
519 -- == ! * : [
520 recent addition: <: :> <% %>
521 only at processing: %: %:%: # ##
522 *)
523
524
525 | '[' { TOCro(tokinfo lexbuf) } | ']' { TCCro(tokinfo lexbuf) }
526 | '(' { TOPar(tokinfo lexbuf) } | ')' { TCPar(tokinfo lexbuf) }
527 | '{' { TOBrace(tokinfo lexbuf) } | '}' { TCBrace(tokinfo lexbuf) }
528
529 | '+' { TPlus(tokinfo lexbuf) } | '*' { TMul(tokinfo lexbuf) }
530 | '-' { TMinus(tokinfo lexbuf) } | '/' { TDiv(tokinfo lexbuf) }
531 | '%' { TMod(tokinfo lexbuf) }
532
533 | "++"{ TInc(tokinfo lexbuf) } | "--"{ TDec(tokinfo lexbuf) }
534
535 | "=" { TEq(tokinfo lexbuf) }
536
537 | "-=" { TAssign (OpAssign Minus, (tokinfo lexbuf))}
538 | "+=" { TAssign (OpAssign Plus, (tokinfo lexbuf))}
539 | "*=" { TAssign (OpAssign Mul, (tokinfo lexbuf))}
540 | "/=" { TAssign (OpAssign Div, (tokinfo lexbuf))}
541 | "%=" { TAssign (OpAssign Mod, (tokinfo lexbuf))}
542 | "&=" { TAssign (OpAssign And, (tokinfo lexbuf))}
543 | "|=" { TAssign (OpAssign Or, (tokinfo lexbuf)) }
544 | "^=" { TAssign (OpAssign Xor, (tokinfo lexbuf))}
545 | "<<=" {TAssign (OpAssign DecLeft, (tokinfo lexbuf)) }
546 | ">>=" {TAssign (OpAssign DecRight, (tokinfo lexbuf))}
547
548 | "==" { TEqEq(tokinfo lexbuf) } | "!=" { TNotEq(tokinfo lexbuf) }
549 | ">=" { TSupEq(tokinfo lexbuf) } | "<=" { TInfEq(tokinfo lexbuf) }
550 | "<" { TInf(tokinfo lexbuf) } | ">" { TSup(tokinfo lexbuf) }
551
552 | "&&" { TAndLog(tokinfo lexbuf) } | "||" { TOrLog(tokinfo lexbuf) }
553 | ">>" { TShr(tokinfo lexbuf) } | "<<" { TShl(tokinfo lexbuf) }
554 | "&" { TAnd(tokinfo lexbuf) } | "|" { TOr(tokinfo lexbuf) }
555 | "^" { TXor(tokinfo lexbuf) }
556 | "..." { TEllipsis(tokinfo lexbuf) }
557 | "->" { TPtrOp(tokinfo lexbuf) } | '.' { TDot(tokinfo lexbuf) }
558 | ',' { TComma(tokinfo lexbuf) }
559 | ";" { TPtVirg(tokinfo lexbuf) }
560 | "?" { TWhy(tokinfo lexbuf) } | ":" { TDotDot(tokinfo lexbuf) }
561 | "!" { TBang(tokinfo lexbuf) } | "~" { TTilde(tokinfo lexbuf) }
562
563 | "<:" { TOCro(tokinfo lexbuf) } | ":>" { TCCro(tokinfo lexbuf) }
564 | "<%" { TOBrace(tokinfo lexbuf) } | "%>" { TCBrace(tokinfo lexbuf) }
565
566
567
568 (* ----------------------------------------------------------------------- *)
569 (* C keywords and ident *)
570 (* ----------------------------------------------------------------------- *)
571
572 (* StdC: must handle at least name of length > 509, but can
573 * truncate to 31 when compare and truncate to 6 and even lowerise
574 * in the external linkage phase
575 *)
576 | letter (letter | digit) *
577 { let info = tokinfo lexbuf in
578 let s = tok lexbuf in
579 Common.profile_code "C parsing.lex_ident" (fun () ->
580 match Common.optionise (fun () -> Hashtbl.find keyword_table s)
581 with
582 | Some f -> f info
583
584 (* parse_typedef_fix.
585 * if Lexer_parser.is_typedef s
586 * then TypedefIdent (s, info)
587 * else TIdent (s, info)
588 *
589 * update: now this is no more useful, cos
590 * as we use tokens_all, it first parse all as an ident and
591 * later transform an indent in a typedef. so the typedef job is
592 * now done in parse_c.ml.
593 *)
594
595 | None -> TIdent (s, info)
596 )
597 }
598 (* gccext: apparently gcc allows dollar in variable names. found such
599 * thing a few time in linux and in glibc. No need look in keyword_table
600 * here.
601 *)
602 | (letter | '$') (letter | digit | '$') *
603 {
604 let info = tokinfo lexbuf in
605 let s = tok lexbuf in
606 pr2 ("LEXER: identifier with dollar: " ^ s);
607 TIdent (s, info)
608 }
609
610
611 (* ----------------------------------------------------------------------- *)
612 (* C constant *)
613 (* ----------------------------------------------------------------------- *)
614
615 | "'"
616 { let info = tokinfo lexbuf in
617 let s = char lexbuf in
618 TChar ((s, IsChar), (info +> tok_add_s (s ^ "'")))
619 }
620 | '"'
621 { let info = tokinfo lexbuf in
622 let s = string lexbuf in
623 TString ((s, IsChar), (info +> tok_add_s (s ^ "\"")))
624 }
625 (* wide character encoding, TODO L'toto' valid ? what is allowed ? *)
626 | 'L' "'"
627 { let info = tokinfo lexbuf in
628 let s = char lexbuf in
629 TChar ((s, IsWchar), (info +> tok_add_s (s ^ "'")))
630 }
631 | 'L' '"'
632 { let info = tokinfo lexbuf in
633 let s = string lexbuf in
634 TString ((s, IsWchar), (info +> tok_add_s (s ^ "\"")))
635 }
636
637
638 (* Take care of the order ? No because lex try the longest match. The
639 * strange diff between decimal and octal constant semantic is not
640 * understood too by refman :) refman:11.1.4, and ritchie.
641 *
642 * todo: attach type info to constant, like for float
643 *)
644
645 | (( decimal | hexa | octal)
646 ( ['u' 'U']
647 | ['l' 'L']
648 | (['l' 'L'] ['u' 'U'])
649 | (['u' 'U'] ['l' 'L'])
650 | (['u' 'U'] ['l' 'L'] ['l' 'L'])
651 | (['l' 'L'] ['l' 'L'])
652 )?
653 ) as x { TInt (x, tokinfo lexbuf) }
654
655
656 | (real ['f' 'F']) as x { TFloat ((x, CFloat), tokinfo lexbuf) }
657 | (real ['l' 'L']) as x { TFloat ((x, CLongDouble), tokinfo lexbuf) }
658 | (real as x) { TFloat ((x, CDouble), tokinfo lexbuf) }
659
660 | ['0'] ['0'-'9']+
661 { pr2 ("LEXER: " ^ error_radix "octal" ^ tok lexbuf);
662 TUnknown (tokinfo lexbuf)
663 }
664 | ("0x" |"0X") ['0'-'9' 'a'-'z' 'A'-'Z']+
665 { pr2 ("LEXER: " ^ error_radix "hexa" ^ tok lexbuf);
666 TUnknown (tokinfo lexbuf)
667 }
668
669
670 (* !!! to put after other rules !!! otherwise 0xff
671 * will be parsed as an ident.
672 *)
673 | ['0'-'9']+ letter (letter | digit) *
674 { pr2 ("LEXER: ZARB integer_string, certainly a macro:" ^ tok lexbuf);
675 TIdent (tok lexbuf, tokinfo lexbuf)
676 }
677
678 (* gccext: http://gcc.gnu.org/onlinedocs/gcc/Binary-constants.html *)
679 (*
680 | "0b" ['0'-'1'] { TInt (((tok lexbuf)<!!>(??,??)) +> int_of_stringbits) }
681 | ['0'-'1']+'b' { TInt (((tok lexbuf)<!!>(0,-2)) +> int_of_stringbits) }
682 *)
683
684
685 (*------------------------------------------------------------------------ *)
686 | eof { EOF (tokinfo lexbuf +> Ast_c.rewrap_str "") }
687
688 | _
689 {
690 if !Flag_parsing_c.verbose_lexing
691 then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
692 TUnknown (tokinfo lexbuf)
693 }
694
695
696
697 (*****************************************************************************)
698 and char = parse
699 | (_ as x) "'" { String.make 1 x }
700 (* todo?: as for octal, do exception beyond radix exception ? *)
701 | (("\\" (oct | oct oct | oct oct oct)) as x "'") { x }
702 (* this rule must be after the one with octal, lex try first longest
703 * and when \7 we want an octal, not an exn.
704 *)
705 | (("\\x" ((hex | hex hex))) as x "'") { x }
706 | (("\\" (_ as v)) as x "'")
707 {
708 (match v with (* Machine specific ? *)
709 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
710 | 'f' -> () | 'a' -> ()
711 | '\\' -> () | '?' -> () | '\'' -> () | '"' -> ()
712 | 'e' -> () (* linuxext: ? *)
713 | _ ->
714 pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
715 );
716 x
717 }
718 | _
719 { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
720 tok lexbuf
721 }
722
723
724
725 (*****************************************************************************)
726
727 (* todo? factorise code with char ? but not same ending token so hard. *)
728 and string = parse
729 | '"' { "" }
730 | (_ as x) { string_of_char x^string lexbuf}
731 | ("\\" (oct | oct oct | oct oct oct)) as x { x ^ string lexbuf }
732 | ("\\x" (hex | hex hex)) as x { x ^ string lexbuf }
733 | ("\\" (_ as v)) as x
734 {
735 (match v with (* Machine specific ? *)
736 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
737 | 'f' -> () | 'a' -> ()
738 | '\\' -> () | '?' -> () | '\'' -> () | '"' -> ()
739 | 'e' -> () (* linuxext: ? *)
740
741 (* old: "x" -> 10 gccext ? todo ugly, I put a fake value *)
742
743 (* cppext: can have \ for multiline in string too *)
744 | '\n' -> ()
745 | _ -> pr2 ("LEXER: unrecognised symbol in string:"^tok lexbuf);
746 );
747 x ^ string lexbuf
748 }
749
750 | eof { pr2 "LEXER: WIERD end of file in string"; ""}
751
752 (* Bug if add following code, cos match also the '"' that is needed
753 * to finish the string, and so go until end of file.
754 *)
755 (*
756 | [^ '\\']+
757 { let cs = lexbuf +> tok +> list_of_string +> List.map Char.code in
758 cs ++ string lexbuf
759 }
760 *)
761
762
763
764 (*****************************************************************************)
765
766 (* less: allow only char-'*' ? *)
767 and comment = parse
768 | "*/" { tok lexbuf }
769 (* noteopti: *)
770 | [^ '*']+ { let s = tok lexbuf in s ^ comment lexbuf }
771 | [ '*'] { let s = tok lexbuf in s ^ comment lexbuf }
772 | _
773 { let s = tok lexbuf in
774 pr2 ("LEXER: unrecognised symbol in comment:"^s);
775 s ^ comment lexbuf
776 }
777
778
779
780 (*****************************************************************************)
781
782 (* cpp recognize C comments, so when #define xx (yy) /* comment \n ... */
783 * then he has already erased the /* comment. So:
784 * - dont eat the start of the comment otherwise afterwards we are in the middle
785 * of a comment and so will problably get a parse error somewhere.
786 * - have to recognize comments in cpp_eat_until_nl.
787 *)
788
789 and cpp_eat_until_nl = parse
790 (* bugfix: *)
791 | "/*"
792 { let s = tok lexbuf in
793 let s2 = comment lexbuf in
794 let s3 = cpp_eat_until_nl lexbuf in
795 s ^ s2 ^ s3
796 }
797 | '\\' "\n" { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
798
799 | "\n" { tok lexbuf }
800 (* noteopti:
801 * update: need also deal with comments chars now
802 *)
803 | [^ '\n' '\\' '/' '*' ]+
804 { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
805 | eof { pr2 "LEXER: end of file in cpp_eat_until_nl"; ""}
806 | _ { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }