Release coccinelle-0.1.8
[bpt/coccinelle.git] / parsing_c / cpp_token_c.ml
1 (* Yoann Padioleau
2 *
3 * Copyright (C) 2007, 2008 Ecole des Mines de Nantes
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License (GPL)
7 * version 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * file license.txt for more details.
13 *)
14
15 open Common
16
17 module TH = Token_helpers
18
19 open Parser_c
20 open Token_views_c
21
22 (*****************************************************************************)
23 (* Prelude *)
24 (*****************************************************************************)
25
26 (* cpp functions working at the token level. Cf cpp_ast_c for cpp functions
27 * working at the AST level (which is very unusual but makes sense in
28 * the coccinelle context for instance).
29 *
30 * Note that as I use a single lexer to work both at the C and cpp level
31 * there are some inconveniencies.
32 * For instance 'for' is a valid name for a macro parameter and macro
33 * body, but is interpreted in a special way by our single lexer, and
34 * so at some places where I expect a TIdent I need also to
35 * handle special cases and accept Tfor, Tif, etc at those places.
36 *
37 * There are multiple issues related to those keywords incorrect tokens.
38 * Those keywords can be:
39 * - (1) in the name of the macro as in #define inline
40 * - (2) in a parameter of the macro as in #define foo(char) char x;
41 * - (3) in an argument to a macro call as in IDENT(if);
42 * Case 1 is easy to fix in define_ident.
43 * Case 2 is easy to fix in define_parse where detect such toks in
44 * the parameter and then replace their occurence in the body in a Tident.
45 * Case 3 is only an issue when the expanded token is not really use
46 * as usual but use for instance in concatenation as in a ## if
47 * when expanded. In the case the grammar this time will not be happy
48 * so this is also easy to fix in cpp_engine.
49 *
50 *)
51
52 (*****************************************************************************)
53 (* Wrappers *)
54 (*****************************************************************************)
55 let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_parsing
56
57 (*****************************************************************************)
58 (* Types *)
59 (*****************************************************************************)
60
61
62 (* ------------------------------------------------------------------------- *)
63 (* mimic standard.h *)
64 (* ------------------------------------------------------------------------- *)
65
66 type define_def = string * define_param * define_body
67 and define_param =
68 | NoParam
69 | Params of string list
70 and define_body =
71 | DefineBody of Parser_c.token list
72 | DefineHint of parsinghack_hint
73
74 and parsinghack_hint =
75 | HintIterator
76 | HintDeclarator
77 | HintMacroString
78 | HintMacroStatement
79 | HintAttribute
80 | HintMacroIdentBuilder
81
82
83
84 (* cf also data/test.h *)
85 let assoc_hint_string = [
86 "YACFE_ITERATOR" , HintIterator;
87 "YACFE_DECLARATOR" , HintDeclarator;
88 "YACFE_STRING" , HintMacroString;
89 "YACFE_STATEMENT" , HintMacroStatement;
90 "YACFE_ATTRIBUTE" , HintAttribute;
91 "YACFE_IDENT_BUILDER" , HintMacroIdentBuilder;
92
93 "MACROSTATEMENT" , HintMacroStatement; (* backward compatibility *)
94 ]
95
96
97 let (parsinghack_hint_of_string: string -> parsinghack_hint option) = fun s ->
98 Common.assoc_option s assoc_hint_string
99
100 let (is_parsinghack_hint: string -> bool) = fun s ->
101 parsinghack_hint_of_string s <> None
102
103 let (token_from_parsinghack_hint:
104 (string * Ast_c.info) -> parsinghack_hint -> Parser_c.token) =
105 fun (s,ii) hint ->
106 match hint with
107 | HintIterator ->
108 Parser_c.TMacroIterator (s, ii)
109 | HintDeclarator ->
110 Parser_c.TMacroDecl (s, ii)
111 | HintMacroString ->
112 Parser_c.TMacroString (s, ii)
113 | HintMacroStatement ->
114 Parser_c.TMacroStmt (s, ii)
115 | HintAttribute ->
116 Parser_c.TMacroAttr (s, ii)
117 | HintMacroIdentBuilder ->
118 Parser_c.TMacroIdentBuilder (s, ii)
119
120
121
122
123
124
125 (*****************************************************************************)
126 (* Expansion helpers *)
127 (*****************************************************************************)
128
129 (* In some cases we can have macros like IDENT(if) that expands to some
130 * 'int xxx_if(void)', but as the lexer will currently generate a Tif for
131 * the expanded code, that may not be accepted as a token after a ##
132 * in the grammar. Hence this function to remap some tokens. This is because
133 * we should not use a single lexer for both working at the C level and
134 * cpp level.
135 *
136 * update: it can also rename some TypedefIdent into TIdent, possibly
137 * because of bad interaction with add_typedef_root in parsing_hacks.
138 *)
139 let rec remap_keyword_tokens xs =
140 match xs with
141 | [] -> []
142 | [x] -> [x]
143 | x::y::xs ->
144 (match x, y with
145 | Parser_c.TCppConcatOp _, Parser_c.TIdent _ ->
146 x::y::remap_keyword_tokens xs
147 | Parser_c.TIdent _, Parser_c.TCppConcatOp _ ->
148 x::y::remap_keyword_tokens xs
149
150 | Parser_c.TCppConcatOp (i1), y ->
151
152 let s = TH.str_of_tok y in
153 let ii = TH.info_of_tok y in
154 if s ==~ Common.regexp_alpha
155 then begin
156 pr2 (spf "remaping: %s to an ident in expanded code" s);
157 x::(Parser_c.TIdent (s, ii))::remap_keyword_tokens xs
158 end
159 else
160 x::y::remap_keyword_tokens xs
161
162 | x, Parser_c.TCppConcatOp (i1) ->
163 let s = TH.str_of_tok x in
164 let ii = TH.info_of_tok x in
165 if s ==~ Common.regexp_alpha
166 then begin
167 pr2 (spf "remaping: %s to an ident in expanded code" s);
168 (Parser_c.TIdent (s, ii))::remap_keyword_tokens (y::xs)
169 end
170 else
171 x::y::remap_keyword_tokens xs
172
173 | _, _ ->
174 x::remap_keyword_tokens (y::xs)
175 )
176
177
178
179 (* To expand the parameter of the macro. The env corresponds to the actual
180 * code that is binded to the parameters of the macro.
181 * Recurse ? fixpoint ? the expansion may also contain macro.
182 * Or to macro expansion in a strict manner, that is process first
183 * the parameters, expands macro in params, and then process enclosing
184 * macro call.
185 *
186 * note: do the concatenation job of a##b here ?
187 * normally this should be done in the grammar. Here just expand
188 * tokens. The only thing we handle here is we may have to remap
189 * some tokens.
190 *
191 * todo: handle stringification here ? if #n
192 *
193 * todo? but could parsing_hacks then pass over the remapped tokens,
194 * for instance transform some of the back into some TypedefIdent
195 * so cpp_engine may be fooled?
196 *)
197 let rec (cpp_engine: (string , Parser_c.token list) assoc ->
198 Parser_c.token list -> Parser_c.token list) =
199 fun env xs ->
200 xs +> List.map (fun tok ->
201 (* expand only TIdent ? no cos the parameter of the macro
202 * can actually be some 'register' so may have to look for
203 * any tokens candidates for the expansion.
204 * Only subtelity is maybe dont expand the TDefineIdent.
205 * update: in fact now the caller (define_parse) will have done
206 * the job right and already replaced the macro parameter with a TIdent.
207 *)
208 match tok with
209 | TIdent (s,i1) when List.mem_assoc s env -> Common.assoc s env
210 | x -> [x]
211 )
212 +> List.flatten
213 +> remap_keyword_tokens
214
215
216
217 (* ------------------------------------------------------------------------- *)
218 (* apply macro, using standard.h or other defs *)
219 (* ------------------------------------------------------------------------- *)
220
221 (* Thanks to this function many stuff are not anymore hardcoded in ocaml code.
222 * At some point there were hardcoded in a standard.h file but now I
223 * can even generate them on the fly on demand when there is actually
224 * a parsing problem.
225 *
226 * No need to take care to not substitute the macro name itself
227 * that occurs in the macro definition because the macro name is
228 * after fix_token_define a TDefineIdent, no more a TIdent.
229 *)
230
231 let rec apply_macro_defs
232 ~msg_apply_known_macro
233 ~msg_apply_known_macro_hint
234 defs xs =
235 let rec apply_macro_defs xs =
236 match xs with
237 | [] -> ()
238
239 (* old: "but could do more, could reuse same original token
240 * so that have in the Ast a Dbg, not a MACROSTATEMENT"
241 *
242 * | PToken ({tok = TIdent (s,i1)} as id)::xs
243 * when s = "MACROSTATEMENT" ->
244 *
245 * msg_macro_statement_hint s;
246 * id.tok <- TMacroStmt(TH.info_of_tok id.tok);
247 * find_macro_paren xs
248 *
249 * let msg_macro_statement_hint s =
250 * incr Stat.nMacroHint;
251 * ()
252 *
253 *)
254
255 (* recognized macro of standard.h (or other) *)
256 | PToken ({tok = TIdent (s,i1)} as id)::Parenthised (xxs,info_parens)::xs
257 when Hashtbl.mem defs s ->
258
259 msg_apply_known_macro s;
260 let (s, params, body) = Hashtbl.find defs s in
261
262 (match params with
263 | NoParam ->
264 pr2 ("WEIRD: macro without param used before parenthize: " ^ s);
265 (* ex: PRINTP("NCR53C400 card%s detected\n" ANDP(((struct ... *)
266
267 (match body with
268 | DefineBody bodymacro ->
269 set_as_comment (Token_c.CppMacro) id;
270 id.new_tokens_before <- bodymacro;
271 | DefineHint hint ->
272 msg_apply_known_macro_hint s;
273 id.tok <- token_from_parsinghack_hint (s,i1) hint;
274 )
275 | Params params ->
276 (match body with
277 | DefineBody bodymacro ->
278
279 (* bugfix: better to put this that before the match body,
280 * cos our macrostatement hint can have variable number of
281 * arguments and so it's ok if it does not match exactly
282 * the number of arguments. *)
283 if List.length params != List.length xxs
284 then begin
285 pr2_once ("WEIRD: macro with wrong number of arguments: " ^ s);
286 (* old: id.new_tokens_before <- bodymacro; *)
287
288 (* update: if wrong number, then I just pass this macro *)
289 [Parenthised (xxs, info_parens)] +>
290 iter_token_paren (set_as_comment Token_c.CppMacro);
291 set_as_comment Token_c.CppMacro id;
292
293 ()
294 end
295 else
296
297 let xxs' = xxs +> List.map (fun x ->
298 (tokens_of_paren_ordered x) +> List.map (fun x ->
299 TH.visitor_info_of_tok Ast_c.make_expanded x.tok
300 )
301 ) in
302 id.new_tokens_before <-
303 (* !!! cpp expansion job here !!! *)
304 cpp_engine (Common.zip params xxs') bodymacro;
305
306 (* important to do that after have apply the macro, otherwise
307 * will pass as argument to the macro some tokens that
308 * are all TCommentCpp
309 *)
310 [Parenthised (xxs, info_parens)] +>
311 iter_token_paren (set_as_comment Token_c.CppMacro);
312 set_as_comment Token_c.CppMacro id;
313
314 | DefineHint (HintMacroStatement as hint) ->
315 (* important to do that after have apply the macro, otherwise
316 * will pass as argument to the macro some tokens that
317 * are all TCommentCpp
318 *
319 * note: such macrostatement can have a variable number of
320 * arguments but here we don't care, we just pass all the
321 * parameters.
322 *)
323
324 (match xs with
325 | PToken ({tok = TPtVirg _} as id2)::_ ->
326 pr2_once
327 ("macro stmt with trailing ';', passing also ';' for: "^
328 s);
329 (* sometimes still want pass its params ... as in
330 * DEBUGPOLL(static unsigned int prev_mask = 0);
331 *)
332
333 msg_apply_known_macro_hint s;
334 id.tok <- token_from_parsinghack_hint (s,i1) hint;
335 [Parenthised (xxs, info_parens)] +>
336 iter_token_paren (set_as_comment Token_c.CppMacro);
337 set_as_comment Token_c.CppMacro id2;
338
339 | _ ->
340 msg_apply_known_macro_hint s;
341 id.tok <- token_from_parsinghack_hint (s,i1) hint;
342 [Parenthised (xxs, info_parens)] +>
343 iter_token_paren (set_as_comment Token_c.CppMacro);
344 )
345
346
347 | DefineHint hint ->
348 msg_apply_known_macro_hint s;
349 id.tok <- token_from_parsinghack_hint (s,i1) hint;
350 )
351 );
352 apply_macro_defs xs
353
354 | PToken ({tok = TIdent (s,i1)} as id)::xs
355 when Hashtbl.mem defs s ->
356
357 msg_apply_known_macro s;
358 let (_s, params, body) = Hashtbl.find defs s in
359
360 (match params with
361 | Params params ->
362 pr2 ("WEIRD: macro with params but no parens found: " ^ s);
363 (* dont apply the macro, perhaps a redefinition *)
364 ()
365 | NoParam ->
366 (match body with
367 | DefineBody [newtok] ->
368 (* special case when 1-1 substitution, we reuse the token *)
369 id.tok <- (newtok +> TH.visitor_info_of_tok (fun _ ->
370 TH.info_of_tok id.tok))
371 | DefineBody bodymacro ->
372 set_as_comment Token_c.CppMacro id;
373 id.new_tokens_before <- bodymacro;
374 | DefineHint hint ->
375 msg_apply_known_macro_hint s;
376 id.tok <- token_from_parsinghack_hint (s,i1) hint;
377 )
378 );
379 apply_macro_defs xs
380
381
382
383
384 (* recurse *)
385 | (PToken x)::xs -> apply_macro_defs xs
386 | (Parenthised (xxs, info_parens))::xs ->
387 xxs +> List.iter apply_macro_defs;
388 apply_macro_defs xs
389 in
390 apply_macro_defs xs
391
392
393
394
395 (*****************************************************************************)
396 (* The parsing hack for #define *)
397 (*****************************************************************************)
398
399 (* To parse macro definitions I need to do some tricks
400 * as some information can be get only at the lexing level. For instance
401 * the space after the name of the macro in '#define foo (x)' is meaningful
402 * but the grammar can not get this information. So define_ident below
403 * look at such space and generate a special TOpardefine. In a similar
404 * way macro definitions can contain some antislash and newlines
405 * and the grammar need to know where the macro ends (which is
406 * a line-level and so low token-level information). Hence the
407 * function 'define_line' below and the TDefEol.
408 *
409 * update: TDefEol is handled in a special way at different places,
410 * a little bit like EOF, especially for error recovery, so this
411 * is an important token that should not be retagged!
412 *
413 *
414 * ugly hack, a better solution perhaps would be to erase TDefEOL
415 * from the Ast and list of tokens in parse_c.
416 *
417 * note: I do a +1 somewhere, it's for the unparsing to correctly sync.
418 *
419 * note: can't replace mark_end_define by simply a fakeInfo(). The reason
420 * is where is the \n TCommentSpace. Normally there is always a last token
421 * to synchronize on, either EOF or the token of the next toplevel.
422 * In the case of the #define we got in list of token
423 * [TCommentSpace "\n"; TDefEOL] but if TDefEOL is a fakeinfo then we will
424 * not synchronize on it and so we will not print the "\n".
425 * A solution would be to put the TDefEOL before the "\n".
426 *
427 * todo?: could put a ExpandedTok for that ?
428 *)
429 let mark_end_define ii =
430 let ii' =
431 { Ast_c.pinfo = Ast_c.OriginTok { (Ast_c.parse_info_of_info ii) with
432 Common.str = "";
433 Common.charpos = Ast_c.pos_of_info ii + 1
434 };
435 cocci_tag = ref Ast_c.emptyAnnot;
436 comments_tag = ref Ast_c.emptyComments;
437 }
438 in
439 TDefEOL (ii')
440
441 (* put the TDefEOL at the good place *)
442 let rec define_line_1 acc xs =
443 match xs with
444 | [] -> List.rev acc
445 | TDefine ii::xs ->
446 let line = Ast_c.line_of_info ii in
447 let acc = (TDefine ii) :: acc in
448 define_line_2 acc line ii xs
449 | TCppEscapedNewline ii::xs ->
450 pr2 "WEIRD: a \\ outside a #define";
451 let acc = (TCommentSpace ii) :: acc in
452 define_line_1 acc xs
453 | x::xs -> define_line_1 (x::acc) xs
454
455 and define_line_2 acc line lastinfo xs =
456 match xs with
457 | [] ->
458 (* should not happened, should meet EOF before *)
459 pr2 "PB: WEIRD";
460 List.rev (mark_end_define lastinfo::acc)
461 | x::xs ->
462 let line' = TH.line_of_tok x in
463 let info = TH.info_of_tok x in
464
465 (match x with
466 | EOF ii ->
467 let acc = (mark_end_define lastinfo) :: acc in
468 let acc = (EOF ii) :: acc in
469 define_line_1 acc xs
470 | TCppEscapedNewline ii ->
471 if (line' <> line) then pr2 "PB: WEIRD: not same line number";
472 let acc = (TCommentSpace ii) :: acc in
473 define_line_2 acc (line+1) info xs
474 | x ->
475 if line' =|= line
476 then define_line_2 (x::acc) line info xs
477 else define_line_1 (mark_end_define lastinfo::acc) (x::xs)
478 )
479
480 let rec define_ident acc xs =
481 match xs with
482 | [] -> List.rev acc
483 | TDefine ii::xs ->
484 let acc = TDefine ii :: acc in
485 (match xs with
486 | TCommentSpace i1::TIdent (s,i2)::TOPar (i3)::xs ->
487 (* Change also the kind of TIdent to avoid bad interaction
488 * with other parsing_hack tricks. For instant if keep TIdent then
489 * the stringication algo can believe the TIdent is a string-macro.
490 * So simpler to change the kind of the ident too.
491 *)
492 (* if TOParDefine sticked to the ident, then
493 * it's a macro-function. Change token to avoid ambiguity
494 * between #define foo(x) and #define foo (x)
495 *)
496 let acc = (TCommentSpace i1) :: acc in
497 let acc = (TIdentDefine (s,i2)) :: acc in
498 let acc = (TOParDefine i3) :: acc in
499 define_ident acc xs
500
501 | TCommentSpace i1::TIdent (s,i2)::xs ->
502 let acc = (TCommentSpace i1) :: acc in
503 let acc = (TIdentDefine (s,i2)) :: acc in
504 define_ident acc xs
505
506 (* bugfix: ident of macro (as well as params, cf below) can be tricky
507 * note, do we need to subst in the body of the define ? no cos
508 * here the issue is the name of the macro, as in #define inline,
509 * so obviously the name of this macro will not be used in its
510 * body (it would be a recursive macro, which is forbidden).
511 *)
512
513 | TCommentSpace i1::t::xs ->
514
515 let s = TH.str_of_tok t in
516 let ii = TH.info_of_tok t in
517 if s ==~ Common.regexp_alpha
518 then begin
519 pr2 (spf "remaping: %s to an ident in macro name" s);
520 let acc = (TCommentSpace i1) :: acc in
521 let acc = (TIdentDefine (s,ii)) :: acc in
522 define_ident acc xs
523 end
524 else begin
525 pr2 "WEIRD: weird #define body";
526 define_ident acc xs
527 end
528
529 | _ ->
530 pr2 "WEIRD: weird #define body";
531 define_ident acc xs
532 )
533 | x::xs ->
534 let acc = x :: acc in
535 define_ident acc xs
536
537
538
539 let fix_tokens_define2 xs =
540 define_ident [] (define_line_1 [] xs)
541
542 let fix_tokens_define a =
543 Common.profile_code "C parsing.fix_define" (fun () -> fix_tokens_define2 a)
544
545
546
547 (*****************************************************************************)
548 (* for the cpp-builtin, standard.h, part 0 *)
549 (*****************************************************************************)
550
551 let macro_body_to_maybe_hint body =
552 match body with
553 | [] -> DefineBody body
554 | [TIdent (s,i1)] ->
555 (match parsinghack_hint_of_string s with
556 | Some hint -> DefineHint hint
557 | None -> DefineBody body
558 )
559 | xs -> DefineBody body
560
561
562 let rec define_parse xs =
563 match xs with
564 | [] -> []
565 | TDefine i1::TIdentDefine (s,i2)::TOParDefine i3::xs ->
566 (* note: the macro could be badly written and have no closing ')' for
567 * its param, which would make us go too far away, but I don't think
568 * it's important to handle such an error *)
569 let (tokparams, _, xs) =
570 xs +> Common.split_when (function TCPar _ -> true | _ -> false) in
571 let (body, _, xs) =
572 xs +> Common.split_when (function TDefEOL _ -> true | _ -> false) in
573 let params =
574 tokparams +> Common.map_filter (function
575 | TComma _ -> None
576 | TIdent (s, _) -> Some s
577
578 (* TODO *)
579 | TDefParamVariadic (s, _) -> Some s
580 (* TODO *)
581 | TEllipsis _ -> Some "..."
582
583 | x ->
584 (* bugfix: param of macros can be tricky *)
585 let s = TH.str_of_tok x in
586 if s ==~ Common.regexp_alpha
587 then begin
588 pr2 (spf "remaping: %s to a macro parameter" s);
589 Some s
590 end
591 else
592 error_cant_have x
593 ) in
594 (* bugfix: also substitute to ident in body so cpp_engine will
595 * have an easy job.
596 *)
597 let body = body +> List.map (fun tok ->
598 match tok with
599 | TIdent _ -> tok
600 | _ ->
601 let s = TH.str_of_tok tok in
602 let ii = TH.info_of_tok tok in
603 if s ==~ Common.regexp_alpha && List.mem s params
604 then begin
605 pr2 (spf "remaping: %s to an ident in macro body" s);
606 TIdent (s, ii)
607 end
608 else tok
609 ) +> List.map (TH.visitor_info_of_tok Ast_c.make_expanded) in
610 let def = (s, (s, Params params, macro_body_to_maybe_hint body)) in
611 def::define_parse xs
612
613 | TDefine i1::TIdentDefine (s,i2)::xs ->
614 let (body, _, xs) =
615 xs +> Common.split_when (function TDefEOL _ -> true | _ -> false) in
616 let body = body +> List.map
617 (TH.visitor_info_of_tok Ast_c.make_expanded) in
618 let def = (s, (s, NoParam, macro_body_to_maybe_hint body)) in
619 def::define_parse xs
620
621 (* cf tests-bis/define_plus.c *)
622 | TDefine i1::xs ->
623 let line = Ast_c.line_of_info i1 in
624 pr2 (spf "WEIRD: no ident in define at line %d" line);
625 define_parse xs
626
627 | x::xs -> define_parse xs
628
629
630
631 let extract_cpp_define xs =
632 let cleaner = xs +> List.filter (fun x ->
633 not (TH.is_comment x)
634 ) in
635 define_parse cleaner
636
637
638
639