permit multiline comments and strings in macros
[bpt/coccinelle.git] / parsing_c / cpp_token_c.ml
1 open Common
2
3 module TH = Token_helpers
4
5 open Parser_c
6 open Token_views_c
7
8 (*****************************************************************************)
9 (* Prelude *)
10 (*****************************************************************************)
11
12 (* cpp functions working at the token level. Cf cpp_ast_c for cpp functions
13 * working at the AST level (which is very unusual but makes sense in
14 * the coccinelle context for instance).
15 *
16 * Note that as I use a single lexer to work both at the C and cpp level
17 * there are some inconveniencies.
18 * For instance 'for' is a valid name for a macro parameter and macro
19 * body, but is interpreted in a special way by our single lexer, and
20 * so at some places where I expect a TIdent I need also to
21 * handle special cases and accept Tfor, Tif, etc at those places.
22 *
23 * There are multiple issues related to those keywords incorrect tokens.
24 * Those keywords can be:
25 * - (1) in the name of the macro as in #define inline
26 * - (2) in a parameter of the macro as in #define foo(char) char x;
27 * - (3) in an argument to a macro call as in IDENT(if);
28 * Case 1 is easy to fix in define_ident.
29 * Case 2 is easy to fix in define_parse where detect such toks in
30 * the parameter and then replace their occurence in the body in a Tident.
31 * Case 3 is only an issue when the expanded token is not really use
32 * as usual but use for instance in concatenation as in a ## if
33 * when expanded. In the case the grammar this time will not be happy
34 * so this is also easy to fix in cpp_engine.
35 *
36 *)
37
38 (*****************************************************************************)
39 (* Wrappers *)
40 (*****************************************************************************)
41 let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_parsing
42
43 (*****************************************************************************)
44 (* Types *)
45 (*****************************************************************************)
46
47 (* ------------------------------------------------------------------------- *)
48 (* mimic standard.h *)
49 (* ------------------------------------------------------------------------- *)
50
51 type define_def = string * define_param * define_body
52 and define_param =
53 | NoParam
54 | Params of define_arg list
55 and define_arg = FixedArg of string | VariadicArg of string
56 and define_body =
57 | DefineBody of Parser_c.token list
58 | DefineHint of parsinghack_hint
59
60 and parsinghack_hint =
61 | HintIterator
62 | HintDeclarator
63 | HintMacroString
64 | HintMacroStatement
65 | HintAttribute
66 | HintMacroIdentBuilder
67
68
69 (*****************************************************************************)
70 (* Parsing and helpers of hints *)
71 (*****************************************************************************)
72
73 (* cf also data/test.h *)
74 let assoc_hint_string = [
75 "YACFE_ITERATOR" , HintIterator;
76 "YACFE_DECLARATOR" , HintDeclarator;
77 "YACFE_STRING" , HintMacroString;
78 "YACFE_STATEMENT" , HintMacroStatement;
79 "YACFE_ATTRIBUTE" , HintAttribute;
80 "YACFE_IDENT_BUILDER" , HintMacroIdentBuilder;
81
82 "MACROSTATEMENT" , HintMacroStatement; (* backward compatibility *)
83 ]
84
85
86 let (parsinghack_hint_of_string: string -> parsinghack_hint option) = fun s ->
87 Common.assoc_option s assoc_hint_string
88 let (string_of_parsinghack_hint: parsinghack_hint -> string) = fun hint ->
89 let assoc' = assoc_hint_string +> List.map (fun (a,b) -> (b,a) ) in
90 Common.assoc hint assoc'
91
92
93
94 let (is_parsinghack_hint: string -> bool) = fun s ->
95 parsinghack_hint_of_string s <> None
96
97 let (token_from_parsinghack_hint:
98 (string * Ast_c.info) -> parsinghack_hint -> Parser_c.token) =
99 fun (s,ii) hint ->
100 match hint with
101 | HintIterator ->
102 Parser_c.TMacroIterator (s, ii)
103 | HintDeclarator ->
104 Parser_c.TMacroDecl (s, ii)
105 | HintMacroString ->
106 Parser_c.TMacroString (s, ii)
107 | HintMacroStatement ->
108 Parser_c.TMacroStmt (s, ii)
109 | HintAttribute ->
110 Parser_c.TMacroAttr (s, ii)
111 | HintMacroIdentBuilder ->
112 Parser_c.TMacroIdentBuilder (s, ii)
113
114
115 (* used in extract_macros for example *)
116 let string_of_define_def (s, params, body) =
117
118 let s1 =
119 match params with
120 | NoParam ->
121 spf "#define %s " s
122 | Params xs ->
123 let xs = List.map (function FixedArg s -> s | VariadicArg s -> s) xs in
124 spf "#define %s(%s) " s (Common.join "," xs)
125 in
126 let s2 =
127 match body with
128 | DefineHint hint ->
129 string_of_parsinghack_hint hint
130 | DefineBody xs ->
131 Common.join " " (xs +> List.map Token_helpers.str_of_tok)
132 in
133 s1 ^ s2
134
135
136 (*****************************************************************************)
137 (* Expansion helpers *)
138 (*****************************************************************************)
139
140 (* In some cases we can have macros like IDENT(if) that expands to some
141 * 'int xxx_if(void)', but as the lexer will currently generate a Tif for
142 * the expanded code, that may not be accepted as a token after a ##
143 * in the grammar. Hence this function to remap some tokens. This is because
144 * we should not use a single lexer for both working at the C level and
145 * cpp level.
146 *
147 * update: it can also rename some TypedefIdent into TIdent, possibly
148 * because of bad interaction with add_typedef_root in parsing_hacks.
149 *)
150 let rec remap_keyword_tokens xs =
151 match xs with
152 | [] -> []
153 | [x] -> [x]
154 | x::y::xs ->
155 (match x, y with
156 | Parser_c.TCppConcatOp _, Parser_c.TIdent _ ->
157 x::y::remap_keyword_tokens xs
158 | Parser_c.TIdent _, Parser_c.TCppConcatOp _ ->
159 x::y::remap_keyword_tokens xs
160
161 | Parser_c.TCppConcatOp (i1), y ->
162 let s = TH.str_of_tok y in
163 let ii = TH.info_of_tok y in
164 if s ==~ Common.regexp_alpha
165 then begin
166 pr2 (spf "remapping: %s to an ident in expanded code" s);
167 x::(Parser_c.TIdent (s, ii))::remap_keyword_tokens xs
168 end
169 else
170 x::y::remap_keyword_tokens xs
171
172 | x, Parser_c.TCppConcatOp (i1) ->
173 let s = TH.str_of_tok x in
174 let ii = TH.info_of_tok x in
175 if s ==~ Common.regexp_alpha
176 then begin
177 pr2 (spf "remapping: %s to an ident in expanded code" s);
178 (Parser_c.TIdent (s, ii))::remap_keyword_tokens (y::xs)
179 end
180 else
181 x::y::remap_keyword_tokens xs
182
183 | _, _ ->
184 x::remap_keyword_tokens (y::xs)
185 )
186
187
188 (* works with agglomerate_concat_op_ident below *)
189 let rec get_ident_in_concat_op xs =
190 match xs with
191 | [] ->
192 pr2 "weird: ident after ## operator not found";
193 "", []
194 | [x] ->
195 (match x with
196 | Parser_c.TIdent (s, i1) -> s, []
197 | _ ->
198 pr2 "weird: ident after ## operator not found";
199 "", [x]
200 )
201 | x::y::xs ->
202 (match x, y with
203 | Parser_c.TIdent (s,i1), Parser_c.TCppConcatOp (i2) ->
204 let (s2, rest) = get_ident_in_concat_op xs in
205 s ^ s2, rest
206 | Parser_c.TIdent (s, i1), _ ->
207 s, (y::xs)
208 | _ ->
209 pr2 "weird: ident after ## operator not found";
210 "", x::y::xs
211 )
212
213 (* must be run after the expansion has been done for the parameter so
214 * that all idents are actually ident, not macro parameter names.
215 *)
216 let rec agglomerate_concat_op_ident xs =
217 match xs with
218 | [] -> []
219 | [x] -> [x]
220 | x::y::xs ->
221 (* can we have ## id, and so ## as first token ? yes
222 * but the semantic is different as it represents variadic
223 * names so this must be handled elsewhere.
224 *)
225 (match x, y with
226 | Parser_c.TIdent (s,i1), Parser_c.TCppConcatOp (i2) ->
227 let (all_str_ident, rest_toks) =
228 get_ident_in_concat_op xs
229 in
230 let new_s = s ^ all_str_ident in
231 let i1' = Ast_c.rewrap_str new_s i1 in
232 Parser_c.TIdent (new_s, i1')::agglomerate_concat_op_ident rest_toks
233 | Parser_c.TCppConcatOp _, _ ->
234 pr2 "weird, ## alone";
235 x::agglomerate_concat_op_ident (y::xs)
236 | _ ->
237 x::agglomerate_concat_op_ident (y::xs)
238
239 )
240
241
242
243 (* To expand the parameter of the macro. The env corresponds to the actual
244 * code that is binded to the parameters of the macro.
245 * Recurse ? fixpoint ? the expansion may also contain macro.
246 * Or to macro expansion in a strict manner, that is process first
247 * the parameters, expands macro in params, and then process enclosing
248 * macro call.
249 *
250 * note: do the concatenation job of a##b here ?
251 * normally this should be done in the grammar. Here just expand
252 * tokens. The only thing we handle here is we may have to remap
253 * some tokens.
254 *
255 * todo: handle stringification here ? if #n
256 *
257 * todo? but could parsing_hacks then pass over the remapped tokens,
258 * for instance transform some of the back into some TypedefIdent
259 * so cpp_engine may be fooled?
260 *)
261 let rec (cpp_engine:
262 ?evaluate_concatop:bool ->
263 (string , Parser_c.token list) assoc ->
264 Parser_c.token list -> Parser_c.token list) =
265 fun ?(evaluate_concatop=true) env xs ->
266 xs +> List.map (fun tok ->
267 (* expand only TIdent ? no cos the parameter of the macro
268 * can actually be some 'register' so may have to look for
269 * any tokens candidates for the expansion.
270 * Only subtelity is maybe dont expand the TDefineIdent.
271 *
272 * update: in fact now the caller (define_parse) will have done
273 * the job right and already replaced the macro parameter with a TIdent.
274 *)
275 match tok with
276 | TIdent (s,i1) when List.mem_assoc s env ->
277 Common.assoc s env
278 | x -> [x]
279 )
280 +> List.flatten
281 +> remap_keyword_tokens
282 +> (fun xs ->
283 if evaluate_concatop
284 then agglomerate_concat_op_ident xs
285 else xs
286 )
287
288
289
290 (* ------------------------------------------------------------------------- *)
291 (* apply macro, using standard.h or other defs *)
292 (* ------------------------------------------------------------------------- *)
293
294 (* Thanks to this function many stuff are not anymore hardcoded in ocaml code.
295 * At some point there were hardcoded in a standard.h file but now I
296 * can even generate them on the fly on demand when there is actually
297 * a parsing problem.
298 *
299 * No need to take care to not substitute the macro name itself
300 * that occurs in the macro definition because the macro name is
301 * after fix_token_define a TDefineIdent, no more a TIdent.
302 *)
303
304 let rec apply_macro_defs
305 ~msg_apply_known_macro
306 ~msg_apply_known_macro_hint
307 ?evaluate_concatop
308 ?(inplace_when_single=true)
309 defs xs =
310
311 let rec apply_macro_defs xs =
312 match xs with
313 | [] -> ()
314
315 (* old: "but could do more, could reuse same original token
316 * so that have in the Ast a Dbg, not a MACROSTATEMENT"
317 *
318 * | PToken ({tok = TIdent (s,i1)} as id)::xs
319 * when s = "MACROSTATEMENT" ->
320 *
321 * msg_macro_statement_hint s;
322 * id.tok <- TMacroStmt(TH.info_of_tok id.tok);
323 * find_macro_paren xs
324 *
325 * let msg_macro_statement_hint s =
326 * incr Stat.nMacroHint;
327 * ()
328 *
329 *)
330
331 (* recognized macro of standard.h (or other) *)
332 | PToken ({tok = TIdent (s,i1)} as id)::Parenthised (xxs,info_parens)::xs
333 when Hashtbl.mem defs s ->
334
335 msg_apply_known_macro s;
336 let (s, params, body) = Hashtbl.find defs s in
337
338 (match params with
339 | NoParam ->
340 pr2 ("WEIRD: macro without param used before parenthize: " ^ s);
341 (* ex: PRINTP("NCR53C400 card%s detected\n" ANDP(((struct ... *)
342
343 (match body with
344 | DefineBody bodymacro ->
345 set_as_comment (Token_c.CppMacro) id;
346 id.new_tokens_before <- bodymacro;
347 | DefineHint hint ->
348 msg_apply_known_macro_hint s;
349 id.tok <- token_from_parsinghack_hint (s,i1) hint;
350 )
351 | Params params ->
352 (match body with
353 | DefineBody bodymacro ->
354
355 (* bugfix: better to put this that before the match body,
356 * cos our macrostatement hint can have variable number of
357 * arguments and so it's ok if it does not match exactly
358 * the number of arguments. *)
359 let build_binder params xxs =
360 let rec loop = function
361 ([],[]) -> Some (function [] -> [] | _ -> failwith "bad")
362 | ([],[[]]) -> Some (function [[]] -> [] | _ -> failwith "bad")
363 | ([],l) -> None
364 | ([(VariadicArg s)],l) ->
365 Some (function l -> List.map (function a -> (s,a)) l)
366 | ((VariadicArg _)::_,l) -> None
367 | ((FixedArg _)::_,[]) -> None
368 | ((FixedArg s)::rest,x::xs) ->
369 (match loop (rest,xs) with
370 Some k ->
371 Some (function l -> (s,(List.hd l)) :: k (List.tl l))
372 | None -> None) in
373 loop (params, xxs) in
374 (match build_binder params xxs with
375 None ->
376 pr2_once
377 ("WEIRD: macro with wrong number of arguments: " ^ s);
378 (* old: id.new_tokens_before <- bodymacro; *)
379
380 (* update: if wrong number, then I just pass this macro *)
381 [Parenthised (xxs, info_parens)] +>
382 iter_token_paren (set_as_comment Token_c.CppMacro);
383 set_as_comment Token_c.CppMacro id
384 | Some bind ->
385
386 let xxs' = xxs +> List.map (fun x ->
387 (tokens_of_paren_ordered x) +> List.map (fun x ->
388 TH.visitor_info_of_tok Ast_c.make_expanded x.tok
389 )
390 ) in
391 id.new_tokens_before <-
392 (* !!! cpp expansion job here !!! *)
393 cpp_engine ?evaluate_concatop
394 (bind xxs') bodymacro;
395
396 (* important to do that after have apply the macro,
397 otherwise will pass as argument to the macro some
398 tokens that are all TCommentCpp
399 *)
400 [Parenthised (xxs, info_parens)] +>
401 iter_token_paren (set_as_comment Token_c.CppMacro);
402 set_as_comment Token_c.CppMacro id)
403
404 | DefineHint (HintMacroStatement as hint) ->
405 (* important to do that after have apply the macro, otherwise
406 * will pass as argument to the macro some tokens that
407 * are all TCommentCpp
408 *
409 * note: such macrostatement can have a variable number of
410 * arguments but here we don't care, we just pass all the
411 * parameters.
412 *)
413
414 (match xs with
415 | PToken ({tok = TPtVirg _} as id2)::_ ->
416 pr2_once
417 ("macro stmt with trailing ';', passing also ';' for: "^
418 s);
419 (* sometimes still want pass its params ... as in
420 * DEBUGPOLL(static unsigned int prev_mask = 0);
421 *)
422
423 msg_apply_known_macro_hint s;
424 id.tok <- token_from_parsinghack_hint (s,i1) hint;
425 [Parenthised (xxs, info_parens)] +>
426 iter_token_paren (set_as_comment Token_c.CppMacro);
427 set_as_comment Token_c.CppMacro id2;
428
429 | _ ->
430 msg_apply_known_macro_hint s;
431 id.tok <- token_from_parsinghack_hint (s,i1) hint;
432 [Parenthised (xxs, info_parens)] +>
433 iter_token_paren (set_as_comment Token_c.CppMacro);
434 )
435
436
437 | DefineHint hint ->
438 msg_apply_known_macro_hint s;
439 id.tok <- token_from_parsinghack_hint (s,i1) hint;
440 )
441 );
442 apply_macro_defs xs
443
444 | PToken ({tok = TIdent (s,i1)} as id)::xs
445 when Hashtbl.mem defs s ->
446
447 msg_apply_known_macro s;
448 let (_s, params, body) = Hashtbl.find defs s in
449
450 (match params with
451 | Params _ ->
452 pr2 ("WEIRD: macro with params but no parens found: " ^ s);
453 (* dont apply the macro, perhaps a redefinition *)
454 ()
455 | NoParam ->
456 (match body with
457 (* bugfix: we prefer not using this special case when we come
458 * from extract_macros context
459 *)
460 | DefineBody [newtok] when inplace_when_single ->
461 (* special case when 1-1 substitution, we reuse the token *)
462 id.tok <- (newtok +> TH.visitor_info_of_tok (fun _ ->
463 TH.info_of_tok id.tok))
464 | DefineBody bodymacro ->
465 set_as_comment Token_c.CppMacro id;
466 id.new_tokens_before <- bodymacro;
467 | DefineHint hint ->
468 msg_apply_known_macro_hint s;
469 id.tok <- token_from_parsinghack_hint (s,i1) hint;
470 )
471 );
472 apply_macro_defs xs
473
474
475
476
477 (* recurse *)
478 | (PToken x)::xs -> apply_macro_defs xs
479 | (Parenthised (xxs, info_parens))::xs ->
480 xxs +> List.iter apply_macro_defs;
481 apply_macro_defs xs
482 in
483 apply_macro_defs xs
484
485
486
487 (*****************************************************************************)
488 (* extracting define_def from a standard.h *)
489 (*****************************************************************************)
490 (* was the cpp-builtin, standard.h, part 0 *)
491
492 let macro_body_to_maybe_hint body =
493 match body with
494 | [] -> DefineBody body
495 | [TIdent (s,i1)] ->
496 (match parsinghack_hint_of_string s with
497 | Some hint -> DefineHint hint
498 | None -> DefineBody body
499 )
500 | xs -> DefineBody body
501
502 exception Bad_param
503
504 let rec (define_parse: Parser_c.token list -> (string * define_def) list) =
505 fun xs ->
506 match xs with
507 | [] -> []
508 | TDefine i1::TIdentDefine (s,i2)::TOParDefine i3::xs ->
509 (* note: the macro could be badly written and have no closing ')' for
510 * its param, which would make us go too far away, but I don't think
511 * it's important to handle such an error *)
512 let def =
513 try
514 let (tokparams, _, xs) =
515 xs +> Common.split_when (function TCPar _ -> true | _ -> false) in
516 let (body, _, xs) =
517 xs +> Common.split_when (function TDefEOL _ -> true | _ -> false) in
518 let params =
519 tokparams +> Common.map_filter (function
520 | TComma _ -> None
521 | TIdent (s, _) -> Some (FixedArg s)
522
523 (* TODO *)
524 | TDefParamVariadic (s, _) -> Some (VariadicArg s)
525 (* TODO *)
526 | TEllipsis _ -> Some (VariadicArg "...")
527
528 | x ->
529 (* bugfix: param of macros can be tricky *)
530 let s = TH.str_of_tok x in
531 if s ==~ Common.regexp_alpha
532 then begin
533 pr2 (spf "remapping: %s to a macro parameter" s);
534 Some (FixedArg s)
535 end
536 else
537 begin
538 pr2 (spf "bad character %s in macro parameter list" s);
539 raise Bad_param
540 end) in
541 (* bugfix: also substitute to ident in body so cpp_engine will
542 * have an easy job.
543 *)
544 let body = body +> List.map (fun tok ->
545 match tok with
546 | TIdent _ -> tok
547 | _ ->
548 let s = TH.str_of_tok tok in
549 let ii = TH.info_of_tok tok in
550 let params =
551 List.map
552 (function FixedArg s -> s | VariadicArg s -> s)
553 params in
554 if s ==~ Common.regexp_alpha && List.mem s params
555 then begin
556 pr2 (spf "remapping: %s to an ident in macro body" s);
557 TIdent (s, ii)
558 end
559 else tok) +>
560 List.map (TH.visitor_info_of_tok Ast_c.make_expanded) in
561 Some (s, (s, Params params, macro_body_to_maybe_hint body))
562 with Bad_param -> None in
563 (match def with
564 Some def -> def::define_parse xs
565 | None -> define_parse xs)
566
567 | TDefine i1::TIdentDefine (s,i2)::xs ->
568 let (body, _, xs) =
569 xs +> Common.split_when (function TDefEOL _ -> true | _ -> false) in
570 let body = body +> List.map
571 (TH.visitor_info_of_tok Ast_c.make_expanded) in
572 let def = (s, (s, NoParam, macro_body_to_maybe_hint body)) in
573 def::define_parse xs
574
575 (* cf tests-bis/define_plus.c *)
576 | TDefine i1::xs ->
577 let line = Ast_c.line_of_info i1 in
578 pr2 (spf "WEIRD: no ident in define at line %d" line);
579 define_parse xs
580
581 | x::xs -> define_parse xs
582
583
584
585 let extract_macros xs =
586 let cleaner = xs +> List.filter (fun x ->
587 not (TH.is_comment x)
588 ) in
589 define_parse cleaner
590
591