Release coccinelle-0.1.2
[bpt/coccinelle.git] / parsing_c / lexer_c.mll
1 {
2 (* Copyright (C) 2002, 2006, 2007, 2008 Yoann Padioleau
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License (GPL)
6 * version 2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * file license.txt for more details.
12 *)
13 open Common
14
15 open Parser_c
16
17 open Ast_c (* to factorise tokens, OpAssign, ... *)
18
19 (*****************************************************************************)
20 (*
21 * subtil: ocamllex use side effect on lexbuf, so must take care.
22 * For instance must do
23 *
24 * let info = tokinfo lexbuf in
25 * TComment (info +> tok_add_s (comment lexbuf))
26 *
27 * and not
28 *
29 * TComment (tokinfo lexbuf +> tok_add_s (comment lexbuf))
30 *
31 * because of the "wierd" order of evaluation of OCaml.
32 *
33 * note: can't use Lexer_parser._lexer_hint here to do different
34 * things, because now we call the lexer to get all the tokens
35 * (tokens_all), and then we parse. So we can't have the _lexer_hint
36 * info here. We can have it only in parse_c. For the same reason, the
37 * typedef handling here is now useless.
38 *)
39 (*****************************************************************************)
40
41 (*****************************************************************************)
42 (* Wrappers *)
43 (*****************************************************************************)
44 let pr2 s =
45 if !Flag_parsing_c.verbose_lexing
46 then Common.pr2 s
47
48 (*****************************************************************************)
49
50
51 exception Lexical of string
52
53 let tok lexbuf = Lexing.lexeme lexbuf
54
55 let tokinfo lexbuf =
56 {
57 pinfo = Ast_c.OriginTok {
58 Common.charpos = Lexing.lexeme_start lexbuf;
59 Common.str = Lexing.lexeme lexbuf;
60 (* info filled in a post-lexing phase *)
61 Common.line = -1;
62 Common.column = -1;
63 Common.file = "";
64 };
65 (* must generate a new ref each time, otherwise share *)
66 cocci_tag = ref Ast_c.emptyAnnot;
67 comments_tag = ref Ast_c.emptyComments;
68 }
69
70 (* must generate a new ref each time, otherwise share *)
71 let no_ifdef_mark () = ref (None: (int * int) option)
72
73 let tok_add_s s ii = Ast_c.rewrap_str ((Ast_c.str_of_info ii) ^ s) ii
74
75
76 (* opti: less convenient, but using a hash is faster than using a match *)
77 let keyword_table = Common.hash_of_list [
78
79 (* c: *)
80 "void", (fun ii -> Tvoid ii);
81 "char", (fun ii -> Tchar ii);
82 "short", (fun ii -> Tshort ii);
83 "int", (fun ii -> Tint ii);
84 "long", (fun ii -> Tlong ii);
85 "float", (fun ii -> Tfloat ii);
86 "double", (fun ii -> Tdouble ii);
87
88 "unsigned", (fun ii -> Tunsigned ii);
89 "signed", (fun ii -> Tsigned ii);
90
91 "auto", (fun ii -> Tauto ii);
92 "register", (fun ii -> Tregister ii);
93 "extern", (fun ii -> Textern ii);
94 "static", (fun ii -> Tstatic ii);
95
96 "const", (fun ii -> Tconst ii);
97 "volatile", (fun ii -> Tvolatile ii);
98
99 "struct", (fun ii -> Tstruct ii);
100 "union", (fun ii -> Tunion ii);
101 "enum", (fun ii -> Tenum ii);
102 "typedef", (fun ii -> Ttypedef ii);
103
104 "if", (fun ii -> Tif ii);
105 "else", (fun ii -> Telse ii);
106 "break", (fun ii -> Tbreak ii);
107 "continue", (fun ii -> Tcontinue ii);
108 "switch", (fun ii -> Tswitch ii);
109 "case", (fun ii -> Tcase ii);
110 "default", (fun ii -> Tdefault ii);
111 "for", (fun ii -> Tfor ii);
112 "do", (fun ii -> Tdo ii);
113 "while", (fun ii -> Twhile ii);
114 "return", (fun ii -> Treturn ii);
115 "goto", (fun ii -> Tgoto ii);
116
117 "sizeof", (fun ii -> Tsizeof ii);
118
119
120 (* gccext: cppext: linuxext: synonyms *)
121 "asm", (fun ii -> Tasm ii);
122 "__asm__", (fun ii -> Tasm ii);
123 "__asm", (fun ii -> Tasm ii);
124
125 "inline", (fun ii -> Tinline ii);
126 "__inline__", (fun ii -> Tinline ii);
127 "__inline", (fun ii -> Tinline ii);
128
129 "__attribute__", (fun ii -> Tattribute ii);
130 "__attribute", (fun ii -> Tattribute ii);
131
132 "typeof", (fun ii -> Ttypeof ii);
133 "__typeof__", (fun ii -> Ttypeof ii);
134 "__typeof", (fun ii -> Ttypeof ii);
135
136
137 (* gccext: alias *)
138 "__signed__", (fun ii -> Tsigned ii);
139
140 "__const__", (fun ii -> Tconst ii);
141 "__const", (fun ii -> Tconst ii);
142
143 "__volatile__", (fun ii -> Tvolatile ii);
144 "__volatile", (fun ii -> Tvolatile ii);
145
146
147 (* c99: *)
148 (* no just "restrict" ? maybe for backward compatibility they avoided
149 * to use restrict which people may have used in their program already
150 *)
151 "__restrict", (fun ii -> Trestrict ii);
152 "__restrict__", (fun ii -> Trestrict ii);
153
154 ]
155
156 let error_radix s =
157 ("numeric " ^ s ^ " constant contains digits beyond the radix:")
158
159 }
160
161 (*****************************************************************************)
162 let letter = ['A'-'Z' 'a'-'z' '_']
163 let digit = ['0'-'9']
164
165 (* not used for the moment *)
166 let punctuation = ['!' '"' '#' '%' '&' '\'' '(' ')' '*' '+' ',' '-' '.' '/' ':'
167 ';' '<' '=' '>' '?' '[' '\\' ']' '^' '{' '|' '}' '~']
168 let space = [' ' '\t' '\n' '\r' '\011' '\012' ]
169 let additionnal = [ ' ' '\b' '\t' '\011' '\n' '\r' '\007' ]
170 (* 7 = \a = bell in C. this is not the only char allowed !!
171 * ex @ and $ ` are valid too
172 *)
173
174 let cchar = (letter | digit | punctuation | additionnal)
175
176 let sp = [' ' '\t']+
177 let spopt = [' ' '\t']*
178
179 let dec = ['0'-'9']
180 let oct = ['0'-'7']
181 let hex = ['0'-'9' 'a'-'f' 'A'-'F']
182
183 let decimal = ('0' | (['1'-'9'] dec*))
184 let octal = ['0'] oct+
185 let hexa = ("0x" |"0X") hex+
186
187
188 let pent = dec+
189 let pfract = dec+
190 let sign = ['-' '+']
191 let exp = ['e''E'] sign? dec+
192 let real = pent exp | ((pent? '.' pfract | pent '.' pfract? ) exp?)
193
194 let id = letter (letter | digit) *
195
196 (*****************************************************************************)
197 rule token = parse
198
199 (* ----------------------------------------------------------------------- *)
200 (* spacing/comments *)
201 (* ----------------------------------------------------------------------- *)
202
203 (* note: this lexer generate tokens for comments!! so can not give
204 * this lexer as-is to the parsing function. Must preprocess it, hence
205 * use techniques like cur_tok ref in parse_c.ml
206 *)
207
208 | ['\n'] [' ' '\t' '\r' '\011' '\012' ]*
209 (* starting a new line; the newline character followed by whitespace *)
210 { TCommentNewline (tokinfo lexbuf) }
211 | [' ' '\t' '\r' '\011' '\012' ]+
212 { TCommentSpace (tokinfo lexbuf) }
213 | "/*"
214 { let info = tokinfo lexbuf in
215 let com = comment lexbuf in
216 TComment(info +> tok_add_s com)
217 }
218
219
220 (* C++ comment are allowed via gccext, but normally they are deleted by cpp.
221 * So need this here only when dont call cpp before.
222 * note that we don't keep the trailing \n; it will be in another token.
223 *)
224 | "//" [^'\r' '\n' '\011']* { TComment (tokinfo lexbuf) }
225
226 (* ----------------------------------------------------------------------- *)
227 (* cpp *)
228 (* ----------------------------------------------------------------------- *)
229
230 (* old:
231 * | '#' { endline lexbuf} // should be line, and not endline
232 * and endline = parse | '\n' { token lexbuf}
233 * | _ { endline lexbuf}
234 *)
235
236 (* todo?:
237 * have found a # #else in "newfile-2.6.c", legal ? and also a #/* ...
238 * => just "#" -> token {lexbuf} (that is ignore)
239 * il y'a 1 #elif sans rien apres
240 * il y'a 1 #error sans rien apres
241 * il y'a 2 mov dede, #xxx qui genere du coup exn car
242 * entouré par des #if 0
243 * => make as for comment, call a comment_cpp that when #endif finish the
244 * comment and if other cpp stuff raise exn
245 * il y'a environ 10 #if(xxx) ou le ( est collé direct
246 * il y'a des include"" et include<
247 * il y'a 1 ` (derriere un #ifndef linux)
248 *)
249
250
251
252 (* ---------------------- *)
253 (* misc *)
254 (* ---------------------- *)
255
256 (* bugfix: I want now to keep comments for the cComment study
257 * so cant do: sp [^'\n']+ '\n'
258 * http://gcc.gnu.org/onlinedocs/gcc/Pragmas.html
259 *)
260
261 | "#" spopt "pragma" sp [^'\n']* '\n'
262 | "#" spopt "ident" sp [^'\n']* '\n'
263 | "#" spopt "line" sp [^'\n']* '\n'
264 | "#" spopt "error" sp [^'\n']* '\n'
265 | "#" spopt "warning" sp [^'\n']* '\n'
266 | "#" spopt "abort" sp [^'\n']* '\n'
267 { TCppDirectiveOther (tokinfo lexbuf) }
268
269 | "#" [' ' '\t']* '\n'
270 { TCppDirectiveOther (tokinfo lexbuf) }
271
272 (* only after cpp, ex: # 1 "include/linux/module.h" 1 *)
273 | "#" sp pent sp '"' [^ '"']* '"' (spopt pent)* spopt '\n'
274 { TCppDirectiveOther (tokinfo lexbuf) }
275
276
277
278 (* ---------------------- *)
279 (* #define, #undef *)
280 (* ---------------------- *)
281
282 (* the rest of the lexing/parsing of define is done in fix_tokens_define
283 * where we parse until a TCppEscapedNewline and generate a TDefEol
284 *)
285 | "#" [' ' '\t']* "define" { TDefine (tokinfo lexbuf) }
286
287 (* note: in some cases can have stuff after the ident as in #undef XXX 50,
288 * but I currently don't handle it cos I think it's bad code.
289 *)
290 | (("#" [' ' '\t']* "undef" [' ' '\t']+) as _undef) (id as id)
291 { let info = tokinfo lexbuf in
292 TUndef (id, info)
293 (*+> tok_add_s (cpp_eat_until_nl lexbuf))*)
294 }
295
296
297 (* ---------------------- *)
298 (* #include *)
299 (* ---------------------- *)
300
301 (* The difference between a local "" and standard <> include is computed
302 * later in parser_c.mly. So redo a little bit of lexing there; ugly but
303 * simpler to generate a single token here. *)
304 | (("#" [' ''\t']* "include" [' ' '\t']*) as includes)
305 (('"' ([^ '"']+) '"' |
306 '<' [^ '>']+ '>' |
307 ['A'-'Z''_']+
308 ) as filename)
309 { let info = tokinfo lexbuf in
310 TInclude (includes, filename, Ast_c.noInIfdef(), info)
311 }
312 (* gccext: found in glibc *)
313 | (("#" [' ''\t']* "include_next" [' ' '\t']*) as includes)
314 (('"' ([^ '"']+) '"' |
315 '<' [^ '>']+ '>' |
316 ['A'-'Z''_']+
317 ) as filename)
318 { let info = tokinfo lexbuf in
319 TInclude (includes, filename, Ast_c.noInIfdef(), info)
320 }
321
322 (* ---------------------- *)
323 (* #ifdef *)
324 (* ---------------------- *)
325
326 (* The ifdef_mark will be set later in Parsing_hacks.set_ifdef_parenthize_info
327 * when working on the ifdef view.
328 *)
329
330 (* '0'+ because sometimes it is a #if 000 *)
331 | "#" [' ' '\t']* "if" [' ' '\t']* '0'+ (* [^'\n']* '\n' *)
332 { let info = tokinfo lexbuf in
333 TIfdefBool (false, no_ifdef_mark(), info)
334 (* +> tok_add_s (cpp_eat_until_nl lexbuf)*)
335 }
336
337 | "#" [' ' '\t']* "if" [' ' '\t']* '1' (* [^'\n']* '\n' *)
338 { let info = tokinfo lexbuf in
339 TIfdefBool (true, no_ifdef_mark(), info)
340
341 }
342
343 (* DO NOT cherry pick to lexer_cplusplus !!! often used for the extern "C" { *)
344 | "#" [' ' '\t']* "if" sp "defined" sp "(" spopt "__cplusplus" spopt ")" [^'\n']* '\n'
345 { let info = tokinfo lexbuf in
346 TIfdefMisc (false, no_ifdef_mark(), info)
347 }
348
349 (* DO NOT cherry pick to lexer_cplusplus !!! *)
350 | "#" [' ' '\t']* "ifdef" [' ' '\t']* "__cplusplus" [^'\n']* '\n'
351 { let info = tokinfo lexbuf in
352 TIfdefMisc (false, no_ifdef_mark(), info)
353 }
354
355 (* in glibc *)
356 | "#" spopt ("ifdef"|"if") sp "__STDC__"
357 { let info = tokinfo lexbuf in
358 TIfdefVersion (true, no_ifdef_mark(),
359 info +> tok_add_s (cpp_eat_until_nl lexbuf))
360 }
361
362
363 (* linuxext: different possible variations (we do not manage all of them):
364
365 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
366 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,2)
367 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
368 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,3,0)
369 #if LINUX_VERSION_CODE < 0x020600
370 #if LINUX_VERSION_CODE >= 0x2051c
371 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
372 #if !(LINUX_VERSION_CODE > KERNEL_VERSION(2,5,73))
373 #if STREAMER_IOCTL && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
374 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
375 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) && \
376 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
377 #if LINUX_VERSION_CODE > LinuxVersionCode(2,3,12)
378 #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,93)
379 #ifndef LINUX_VERSION_CODE
380 #if LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,2,0) || \
381 (LINUX_VERSION_CODE > ASC_LINUX_VERSION(2,3,0) && \
382 LINUX_VERSION_CODE < ASC_LINUX_VERSION(2,4,0))
383 #if (KERNEL_VERSION(2,4,0) > LINUX_VERSION_CODE)
384 #if LINUX_VERSION_CODE >= ASC_LINUX_VERSION(1,3,0)
385 # if defined(MODULE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,1,30)
386
387 *)
388
389 (* linuxext: must be before the generic rules for if and ifdef *)
390 | "#" spopt "if" sp "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
391 { let info = tokinfo lexbuf in
392 TIfdefVersion (true, no_ifdef_mark(),
393 info +> tok_add_s (cpp_eat_until_nl lexbuf))
394 }
395 (* linuxext: *)
396 | "#" spopt "if" sp "!" "("? "LINUX_VERSION_CODE" sp (">=" | ">") sp
397 | "#" spopt "if" sp ['(']? "LINUX_VERSION_CODE" sp ("<=" | "<") sp
398
399 { let info = tokinfo lexbuf in
400 TIfdefVersion (false, no_ifdef_mark(),
401 info +> tok_add_s (cpp_eat_until_nl lexbuf))
402 }
403
404
405
406
407 (* can have some ifdef 0 hence the letter|digit even at beginning of word *)
408 | "#" [' ''\t']* "ifdef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
409 { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
410 | "#" [' ''\t']* "ifndef" [' ''\t']+ (letter|digit) ((letter|digit)*) [' ''\t']*
411 { TIfdef (no_ifdef_mark(), tokinfo lexbuf) }
412 | "#" [' ''\t']* "if" [' ' '\t']+
413 { let info = tokinfo lexbuf in
414 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
415 }
416 | "#" [' ' '\t']* "if" '('
417 { let info = tokinfo lexbuf in
418 TIfdef (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
419 }
420
421 | "#" [' ' '\t']* "elif" [' ' '\t']+
422 { let info = tokinfo lexbuf in
423 TIfdefelif (no_ifdef_mark(), info +> tok_add_s (cpp_eat_until_nl lexbuf))
424 }
425
426
427 (* bugfix: can have #endif LINUX but at the same time if I eat everything
428 * until next line, I may miss some TComment which for some tools
429 * are important such as aComment
430 *)
431 | "#" [' ' '\t']* "endif" (*[^'\n']* '\n'*) {
432 TEndif (no_ifdef_mark(), tokinfo lexbuf)
433 }
434 (* can be at eof *)
435 (*| "#" [' ' '\t']* "endif" { TEndif (tokinfo lexbuf) }*)
436
437 | "#" [' ' '\t']* "else" [' ' '\t' '\n']
438 { TIfdefelse (no_ifdef_mark(), tokinfo lexbuf) }
439
440
441
442
443 (* ---------------------- *)
444 (* #define body *)
445 (* ---------------------- *)
446
447 (* only in cpp directives normally *)
448 | "\\" '\n' { TCppEscapedNewline (tokinfo lexbuf) }
449
450
451 | ((id as s) "...")
452 { TDefParamVariadic (s, tokinfo lexbuf) }
453
454
455 (* could generate separate token for #, ## and then extend grammar,
456 * but there can be ident in many different places, in expression
457 * but also in declaration, in function name. So having 3 tokens
458 * for an ident does not work well with how we add info in
459 * ast_c. So better to generate just one token, for now, just one info,
460 * even if have later to reanalyse those tokens and unsplit.
461 *
462 * todo: our heuristics in parsing_hacks rely on TIdent. So maybe
463 * an easier solution would be to augment the TIdent type such as
464 * TIdent of string * info * cpp_ident_additionnal_info
465 *)
466
467
468 (* cppext: string concatenation of idents *)
469 | id ([' ''\t']* "##" [' ''\t']* id)+
470 { let info = tokinfo lexbuf in
471 TIdent (tok lexbuf, info)
472 }
473
474 (* cppext: stringification.
475 * bugfix: this case must be after the other cases such as #endif
476 * otherwise take precedent.
477 *)
478 | "#" spopt id
479 { let info = tokinfo lexbuf in
480 TIdent (tok lexbuf, info)
481 }
482
483 (* cppext: gccext: ##args for variadic macro *)
484 | "##" spopt id
485 { let info = tokinfo lexbuf in
486 TIdent (tok lexbuf, info)
487 }
488
489
490
491 (* ----------------------------------------------------------------------- *)
492 (* C symbols *)
493 (* ----------------------------------------------------------------------- *)
494 (* stdC:
495 ... && -= >= ~ + ; ]
496 <<= &= -> >> % , < ^
497 >>= *= /= ^= & - = {
498 != ++ << |= ( . > |
499 %= += <= || ) / ? }
500 -- == ! * : [
501 recent addition: <: :> <% %>
502 only at processing: %: %:%: # ##
503 *)
504
505
506 | '[' { TOCro(tokinfo lexbuf) } | ']' { TCCro(tokinfo lexbuf) }
507 | '(' { TOPar(tokinfo lexbuf) } | ')' { TCPar(tokinfo lexbuf) }
508 | '{' { TOBrace(tokinfo lexbuf) } | '}' { TCBrace(tokinfo lexbuf) }
509
510 | '+' { TPlus(tokinfo lexbuf) } | '*' { TMul(tokinfo lexbuf) }
511 | '-' { TMinus(tokinfo lexbuf) } | '/' { TDiv(tokinfo lexbuf) }
512 | '%' { TMod(tokinfo lexbuf) }
513
514 | "++"{ TInc(tokinfo lexbuf) } | "--"{ TDec(tokinfo lexbuf) }
515
516 | "=" { TEq(tokinfo lexbuf) }
517
518 | "-=" { TAssign (OpAssign Minus, (tokinfo lexbuf))}
519 | "+=" { TAssign (OpAssign Plus, (tokinfo lexbuf))}
520 | "*=" { TAssign (OpAssign Mul, (tokinfo lexbuf))}
521 | "/=" { TAssign (OpAssign Div, (tokinfo lexbuf))}
522 | "%=" { TAssign (OpAssign Mod, (tokinfo lexbuf))}
523 | "&=" { TAssign (OpAssign And, (tokinfo lexbuf))}
524 | "|=" { TAssign (OpAssign Or, (tokinfo lexbuf)) }
525 | "^=" { TAssign(OpAssign Xor, (tokinfo lexbuf))}
526 | "<<=" {TAssign (OpAssign DecLeft, (tokinfo lexbuf)) }
527 | ">>=" {TAssign (OpAssign DecRight, (tokinfo lexbuf))}
528
529 | "==" { TEqEq(tokinfo lexbuf) } | "!=" { TNotEq(tokinfo lexbuf) }
530 | ">=" { TSupEq(tokinfo lexbuf) } | "<=" { TInfEq(tokinfo lexbuf) }
531 | "<" { TInf(tokinfo lexbuf) } | ">" { TSup(tokinfo lexbuf) }
532
533 | "&&" { TAndLog(tokinfo lexbuf) } | "||" { TOrLog(tokinfo lexbuf) }
534 | ">>" { TShr(tokinfo lexbuf) } | "<<" { TShl(tokinfo lexbuf) }
535 | "&" { TAnd(tokinfo lexbuf) } | "|" { TOr(tokinfo lexbuf) }
536 | "^" { TXor(tokinfo lexbuf) }
537 | "..." { TEllipsis(tokinfo lexbuf) }
538 | "->" { TPtrOp(tokinfo lexbuf) } | '.' { TDot(tokinfo lexbuf) }
539 | ',' { TComma(tokinfo lexbuf) }
540 | ";" { TPtVirg(tokinfo lexbuf) }
541 | "?" { TWhy(tokinfo lexbuf) } | ":" { TDotDot(tokinfo lexbuf) }
542 | "!" { TBang(tokinfo lexbuf) } | "~" { TTilde(tokinfo lexbuf) }
543
544 | "<:" { TOCro(tokinfo lexbuf) } | ":>" { TCCro(tokinfo lexbuf) }
545 | "<%" { TOBrace(tokinfo lexbuf) } | "%>" { TCBrace(tokinfo lexbuf) }
546
547
548
549 (* ----------------------------------------------------------------------- *)
550 (* C keywords and ident *)
551 (* ----------------------------------------------------------------------- *)
552
553 (* StdC: must handle at least name of length > 509, but can
554 * truncate to 31 when compare and truncate to 6 and even lowerise
555 * in the external linkage phase
556 *)
557 | letter (letter | digit) *
558 { let info = tokinfo lexbuf in
559 let s = tok lexbuf in
560 Common.profile_code "C parsing.lex_ident" (fun () ->
561 match Common.optionise (fun () -> Hashtbl.find keyword_table s)
562 with
563 | Some f -> f info
564
565 (* parse_typedef_fix.
566 * if Lexer_parser.is_typedef s
567 * then TypedefIdent (s, info)
568 * else TIdent (s, info)
569 *
570 * update: now this is no more useful, cos
571 * as we use tokens_all, it first parse all as an ident and
572 * later transform an indent in a typedef. so the typedef job is
573 * now done in parse_c.ml.
574 *)
575
576 | None -> TIdent (s, info)
577 )
578 }
579 (* gccext: apparently gcc allows dollar in variable names. found such
580 * thing a few time in linux and in glibc. No need look in keyword_table
581 * here.
582 *)
583 | (letter | '$') (letter | digit | '$') *
584 {
585 let info = tokinfo lexbuf in
586 let s = tok lexbuf in
587 pr2 ("LEXER: identifier with dollar: " ^ s);
588 TIdent (s, info)
589 }
590
591
592 (* ----------------------------------------------------------------------- *)
593 (* C constant *)
594 (* ----------------------------------------------------------------------- *)
595
596 | "'"
597 { let info = tokinfo lexbuf in
598 let s = char lexbuf in
599 TChar ((s, IsChar), (info +> tok_add_s (s ^ "'")))
600 }
601 | '"'
602 { let info = tokinfo lexbuf in
603 let s = string lexbuf in
604 TString ((s, IsChar), (info +> tok_add_s (s ^ "\"")))
605 }
606 (* wide character encoding, TODO L'toto' valid ? what is allowed ? *)
607 | 'L' "'"
608 { let info = tokinfo lexbuf in
609 let s = char lexbuf in
610 TChar ((s, IsWchar), (info +> tok_add_s (s ^ "'")))
611 }
612 | 'L' '"'
613 { let info = tokinfo lexbuf in
614 let s = string lexbuf in
615 TString ((s, IsWchar), (info +> tok_add_s (s ^ "\"")))
616 }
617
618
619 (* Take care of the order ? No because lex try the longest match. The
620 * strange diff between decimal and octal constant semantic is not
621 * understood too by refman :) refman:11.1.4, and ritchie.
622 *)
623
624 | (( decimal | hexa | octal)
625 ( ['u' 'U']
626 | ['l' 'L']
627 | (['l' 'L'] ['u' 'U'])
628 | (['u' 'U'] ['l' 'L'])
629 | (['u' 'U'] ['l' 'L'] ['l' 'L'])
630 | (['l' 'L'] ['l' 'L'])
631 )?
632 ) as x { TInt (x, tokinfo lexbuf) }
633
634
635 | (real ['f' 'F']) as x { TFloat ((x, CFloat), tokinfo lexbuf) }
636 | (real ['l' 'L']) as x { TFloat ((x, CLongDouble), tokinfo lexbuf) }
637 | (real as x) { TFloat ((x, CDouble), tokinfo lexbuf) }
638
639 | ['0'] ['0'-'9']+
640 { pr2 ("LEXER: " ^ error_radix "octal" ^ tok lexbuf);
641 TUnknown (tokinfo lexbuf)
642 }
643 | ("0x" |"0X") ['0'-'9' 'a'-'z' 'A'-'Z']+
644 { pr2 ("LEXER: " ^ error_radix "hexa" ^ tok lexbuf);
645 TUnknown (tokinfo lexbuf)
646 }
647
648
649 (* !!! to put after other rules !!! otherwise 0xff
650 * will be parsed as an ident.
651 *)
652 | ['0'-'9']+ letter (letter | digit) *
653 { pr2 ("LEXER: ZARB integer_string, certainly a macro:" ^ tok lexbuf);
654 TIdent (tok lexbuf, tokinfo lexbuf)
655 }
656
657 (* gccext: http://gcc.gnu.org/onlinedocs/gcc/Binary-constants.html *)
658 (*
659 | "0b" ['0'-'1'] { TInt (((tok lexbuf)<!!>(??,??)) +> int_of_stringbits) }
660 | ['0'-'1']+'b' { TInt (((tok lexbuf)<!!>(0,-2)) +> int_of_stringbits) }
661 *)
662
663
664 (*------------------------------------------------------------------------ *)
665 | eof { EOF (tokinfo lexbuf +> Ast_c.rewrap_str "") }
666
667 | _
668 {
669 if !Flag_parsing_c.verbose_lexing
670 then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
671 TUnknown (tokinfo lexbuf)
672 }
673
674
675
676 (*****************************************************************************)
677 and char = parse
678 | (_ as x) "'" { String.make 1 x }
679 (* todo?: as for octal, do exception beyond radix exception ? *)
680 | (("\\" (oct | oct oct | oct oct oct)) as x "'") { x }
681 (* this rule must be after the one with octal, lex try first longest
682 * and when \7 we want an octal, not an exn.
683 *)
684 | (("\\x" ((hex | hex hex))) as x "'") { x }
685 | (("\\" (_ as v)) as x "'")
686 {
687 (match v with (* Machine specific ? *)
688 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
689 | 'f' -> () | 'a' -> ()
690 | '\\' -> () | '?' -> () | '\'' -> () | '"' -> ()
691 | 'e' -> () (* linuxext: ? *)
692 | _ ->
693 pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
694 );
695 x
696 }
697 | _
698 { pr2 ("LEXER: unrecognised symbol in char:"^tok lexbuf);
699 tok lexbuf
700 }
701
702
703
704 (*****************************************************************************)
705
706 (* todo? factorise code with char ? but not same ending token so hard. *)
707 and string = parse
708 | '"' { "" }
709 | (_ as x) { string_of_char x^string lexbuf}
710 | ("\\" (oct | oct oct | oct oct oct)) as x { x ^ string lexbuf }
711 | ("\\x" (hex | hex hex)) as x { x ^ string lexbuf }
712 | ("\\" (_ as v)) as x
713 {
714 (match v with (* Machine specific ? *)
715 | 'n' -> () | 't' -> () | 'v' -> () | 'b' -> () | 'r' -> ()
716 | 'f' -> () | 'a' -> ()
717 | '\\' -> () | '?' -> () | '\'' -> () | '"' -> ()
718 | 'e' -> () (* linuxext: ? *)
719
720 (* old: "x" -> 10 gccext ? todo ugly, I put a fake value *)
721
722 (* cppext: can have \ for multiline in string too *)
723 | '\n' -> ()
724 | _ -> pr2 ("LEXER: unrecognised symbol in string:"^tok lexbuf);
725 );
726 x ^ string lexbuf
727 }
728
729 | eof { pr2 "LEXER: WIERD end of file in string"; ""}
730
731 (* Bug if add following code, cos match also the '"' that is needed
732 * to finish the string, and so go until end of file.
733 *)
734 (*
735 | [^ '\\']+
736 { let cs = lexbuf +> tok +> list_of_string +> List.map Char.code in
737 cs ++ string lexbuf
738 }
739 *)
740
741
742
743 (*****************************************************************************)
744
745 (* less: allow only char-'*' ? *)
746 and comment = parse
747 | "*/" { tok lexbuf }
748 (* noteopti: *)
749 | [^ '*']+ { let s = tok lexbuf in s ^ comment lexbuf }
750 | [ '*'] { let s = tok lexbuf in s ^ comment lexbuf }
751 | _
752 { let s = tok lexbuf in
753 pr2 ("LEXER: unrecognised symbol in comment:"^s);
754 s ^ comment lexbuf
755 }
756
757
758
759 (*****************************************************************************)
760
761 (* cpp recognize C comments, so when #define xx (yy) /* comment \n ... */
762 * then he has already erased the /* comment. So:
763 * - dont eat the start of the comment otherwise afterwards we are in the middle
764 * of a comment and so will problably get a parse error somewhere.
765 * - have to recognize comments in cpp_eat_until_nl.
766 *)
767
768 and cpp_eat_until_nl = parse
769 (* bugfix: *)
770 | "/*"
771 { let s = tok lexbuf in
772 let s2 = comment lexbuf in
773 let s3 = cpp_eat_until_nl lexbuf in
774 s ^ s2 ^ s3
775 }
776 | '\\' "\n" { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
777
778 | "\n" { tok lexbuf }
779 (* noteopti:
780 * update: need also deal with comments chars now
781 *)
782 | [^ '\n' '\\' '/' '*' ]+
783 { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }
784 | eof { pr2 "LEXER: end of file in cpp_eat_until_nl"; ""}
785 | _ { let s = tok lexbuf in s ^ cpp_eat_until_nl lexbuf }