permit multiline comments and strings in macros
[bpt/coccinelle.git] / parsing_c / token_views_c.ml
1 (* Yoann Padioleau
2 *
3 * Copyright (C) 2010, University of Copenhagen DIKU and INRIA.
4 * Copyright (C) 2007, 2008 Ecole des Mines de Nantes
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License (GPL)
8 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * file license.txt for more details.
14 *)
15
16 open Common
17
18 module TH = Token_helpers
19
20 open Parser_c
21
22 (*****************************************************************************)
23 (* Some debugging functions *)
24 (*****************************************************************************)
25
26 let pr2, pr2_once = Common.mk_pr2_wrappers Flag_parsing_c.verbose_parsing
27
28 (* ------------------------------------------------------------------------- *)
29 (* fuzzy parsing, different "views" over the same program *)
30 (* ------------------------------------------------------------------------- *)
31
32
33 (* Normally I should not use ref/mutable in the token_extended type
34 * and I should have a set of functions taking a list of tokens and
35 * returning a list of tokens. The problem is that to make easier some
36 * functions, it is better to work on better representation, on "views"
37 * over this list of tokens. But then modifying those views and get
38 * back from those views to the original simple list of tokens is
39 * tedious. One way is to maintain next to the view a list of "actions"
40 * (I was using a hash storing the charpos of the token and associating
41 * the action) but it is tedious too. Simpler to use mutable/ref. We
42 * use the same idea that we use when working on the Ast_c. *)
43
44 (* old: when I was using the list of "actions" next to the views, the hash
45 * indexed by the charpos, there could have been some problems:
46 * how my fake_pos interact with the way I tag and adjust token ?
47 * because I base my tagging on the position of the token ! so sometimes
48 * could tag another fakeInfo that should not be tagged ?
49 * fortunately I don't use anymore this technique.
50 *)
51
52 (* update: quite close to the Place_c.Inxxx *)
53 type context =
54 InFunction | InEnum | InStruct | InInitializer | NoContext
55
56 type token_extended = {
57 mutable tok: Parser_c.token;
58 mutable where: context;
59
60 (* less: need also a after ? *)
61 mutable new_tokens_before : Parser_c.token list;
62
63 (* line x col cache, more easily accessible, of the info in the token *)
64 line: int;
65 col : int;
66 }
67
68 (* todo? is it ok to reset as a comment a TDefEOL ? if do that, then
69 * can confuse the parser.
70 *)
71 let set_as_comment cppkind x =
72 if TH.is_eof x.tok
73 then () (* otherwise parse_c will be lost if don't find a EOF token *)
74 else
75 x.tok <- TCommentCpp (cppkind, TH.info_of_tok x.tok)
76
77 let save_as_comment cppkind x =
78 if TH.is_eof x.tok
79 then () (* otherwise parse_c will be lost if don't find a EOF token *)
80 else
81 let t =
82 match x.tok with
83 TIfdef _ | TIfdefMisc _ | TIfdefVersion _ -> Token_c.IfDef
84 | TIfdefBool _ -> Token_c.IfDef0
85 | TIfdefelse _ | TIfdefelif _ -> Token_c.Else
86 | TEndif _ -> Token_c.Endif
87 | _ -> Token_c.Other in
88 x.tok <- TCommentCpp (cppkind t, TH.info_of_tok x.tok)
89
90 let mk_token_extended x =
91 let (line, col) = TH.linecol_of_tok x in
92 { tok = x;
93 line = line; col = col;
94 where = NoContext;
95 new_tokens_before = [];
96 }
97
98
99 let rebuild_tokens_extented toks_ext =
100 let _tokens = ref [] in
101 toks_ext +> List.iter (fun tok ->
102 tok.new_tokens_before +> List.iter (fun x -> push2 x _tokens);
103 push2 tok.tok _tokens
104 );
105 let tokens = List.rev !_tokens in
106 (tokens +> acc_map mk_token_extended)
107
108
109
110 (* x list list, because x list separated by ',' *)
111 type paren_grouped =
112 | Parenthised of paren_grouped list list * token_extended list
113 | PToken of token_extended
114
115 type brace_grouped =
116 | Braceised of
117 brace_grouped list list * token_extended * token_extended option
118 | BToken of token_extended
119
120 (* Far better data structure than doing hacks in the lexer or parser
121 * because in lexer we don't know to which ifdef a endif is related
122 * and so when we want to comment a ifdef, we don't know which endif
123 * we must also comment. Especially true for the #if 0 which sometimes
124 * have a #else part.
125 *
126 * x list list, because x list separated by #else or #elif
127 *)
128 type ifdef_grouped =
129 | Ifdef of ifdef_grouped list list * token_extended list
130 | Ifdefbool of bool * ifdef_grouped list list * token_extended list
131 | NotIfdefLine of token_extended list
132
133
134 type 'a line_grouped =
135 Line of 'a list
136
137
138 type body_function_grouped =
139 | BodyFunction of token_extended list
140 | NotBodyLine of token_extended list
141
142
143 (* ------------------------------------------------------------------------- *)
144 (* view builders *)
145 (* ------------------------------------------------------------------------- *)
146
147 (* todo: synchro ! use more indentation
148 * if paren not closed and same indentation level, certainly because
149 * part of a mid-ifdef-expression.
150 *)
151 let rec mk_parenthised xs =
152 let rec loop acc = function
153 | [] -> acc
154 | x::xs ->
155 (match x.tok with
156 | TOPar _ | TOParDefine _ ->
157 let body, extras, xs = mk_parameters [x] [] xs in
158 loop (Parenthised (body,extras)::acc) xs
159 | _ ->
160 loop (PToken x::acc) xs
161 ) in
162 List.rev(loop [] xs)
163
164 (* return the body of the parenthised expression and the rest of the tokens *)
165 and mk_parameters extras acc_before_sep xs =
166 match xs with
167 | [] ->
168 (* maybe because of #ifdef which "opens" '(' in 2 branches *)
169 pr2 "PB: not found closing paren in fuzzy parsing";
170 [List.rev acc_before_sep], List.rev extras, []
171 | x::xs ->
172 (match x.tok with
173 (* synchro *)
174 | TOBrace _ when x.col =|= 0 ->
175 pr2 "PB: found synchro point } in paren";
176 [List.rev acc_before_sep], List.rev (extras), (x::xs)
177
178 | TCPar _ | TCParEOL _ ->
179 [List.rev acc_before_sep], List.rev (x::extras), xs
180 | TOPar _ | TOParDefine _ ->
181 let body, extrasnest, xs = mk_parameters [x] [] xs in
182 mk_parameters extras
183 (Parenthised (body,extrasnest)::acc_before_sep)
184 xs
185 | TComma _ ->
186 let body, extras, xs = mk_parameters (x::extras) [] xs in
187 (List.rev acc_before_sep)::body, extras, xs
188 | _ ->
189 mk_parameters extras (PToken x::acc_before_sep) xs
190 )
191
192
193
194
195 let rec mk_braceised xs =
196 let rec loop acc = function
197 | [] -> acc
198 | x::xs ->
199 (match x.tok with
200 | TOBrace _ ->
201 let body, endbrace, xs = mk_braceised_aux [] xs in
202 loop (Braceised (body, x, endbrace)::acc) xs
203 | TCBrace _ ->
204 pr2 "PB: found closing brace alone in fuzzy parsing";
205 loop (BToken x::acc) xs
206 | _ ->
207 loop (BToken x::acc) xs) in
208 List.rev(loop [] xs)
209
210 (* return the body of the parenthised expression and the rest of the tokens *)
211 and mk_braceised_aux acc xs =
212 match xs with
213 | [] ->
214 (* maybe because of #ifdef which "opens" '(' in 2 branches *)
215 pr2 "PB: not found closing brace in fuzzy parsing";
216 [List.rev acc], None, []
217 | x::xs ->
218 (match x.tok with
219 | TCBrace _ -> [List.rev acc], Some x, xs
220 | TOBrace _ ->
221 let body, endbrace, xs = mk_braceised_aux [] xs in
222 mk_braceised_aux (Braceised (body,x, endbrace)::acc) xs
223 | _ ->
224 mk_braceised_aux (BToken x::acc) xs
225 )
226
227
228
229
230 let rec mk_ifdef xs =
231 match xs with
232 | [] -> []
233 | x::xs ->
234 (match x.tok with
235 | TIfdef _ ->
236 let body, extra, xs = mk_ifdef_parameters [x] [] xs in
237 Ifdef (body, extra)::mk_ifdef xs
238 | TIfdefBool (b,_, _) ->
239 let body, extra, xs = mk_ifdef_parameters [x] [] xs in
240
241 (* if not passing, then consider a #if 0 as an ordinary #ifdef *)
242 if !Flag_parsing_c.if0_passing
243 then Ifdefbool (b, body, extra)::mk_ifdef xs
244 else Ifdef(body, extra)::mk_ifdef xs
245
246 | TIfdefMisc (b,_,_) | TIfdefVersion (b,_,_) ->
247 let body, extra, xs = mk_ifdef_parameters [x] [] xs in
248 Ifdefbool (b, body, extra)::mk_ifdef xs
249
250
251 | _ ->
252 (* todo? can have some Ifdef in the line ? *)
253 let line, xs = Common.span (fun y -> y.line =|= x.line) (x::xs) in
254 NotIfdefLine line::mk_ifdef xs
255 )
256
257 and mk_ifdef_parameters extras acc_before_sep xs =
258 match xs with
259 | [] ->
260 (* Note that mk_ifdef is assuming that CPP instruction are alone
261 * on their line. Because I do a span (fun x -> is_same_line ...)
262 * I might take with me a #endif if this one is mixed on a line
263 * with some "normal" tokens.
264 *)
265 pr2 "PB: not found closing ifdef in fuzzy parsing";
266 [List.rev acc_before_sep], List.rev extras, []
267 | x::xs ->
268 (match x.tok with
269 | TEndif _ ->
270 [List.rev acc_before_sep], List.rev (x::extras), xs
271 | TIfdef _ ->
272 let body, extrasnest, xs = mk_ifdef_parameters [x] [] xs in
273 mk_ifdef_parameters
274 extras (Ifdef (body, extrasnest)::acc_before_sep) xs
275
276 | TIfdefBool (b,_,_) ->
277 let body, extrasnest, xs = mk_ifdef_parameters [x] [] xs in
278
279 if !Flag_parsing_c.if0_passing
280 then
281 mk_ifdef_parameters
282 extras (Ifdefbool (b, body, extrasnest)::acc_before_sep) xs
283 else
284 mk_ifdef_parameters
285 extras (Ifdef (body, extrasnest)::acc_before_sep) xs
286
287
288 | TIfdefMisc (b,_,_) | TIfdefVersion (b,_,_) ->
289 let body, extrasnest, xs = mk_ifdef_parameters [x] [] xs in
290 mk_ifdef_parameters
291 extras (Ifdefbool (b, body, extrasnest)::acc_before_sep) xs
292
293 | TIfdefelse _
294 | TIfdefelif _ ->
295 let body, extras, xs = mk_ifdef_parameters (x::extras) [] xs in
296 (List.rev acc_before_sep)::body, extras, xs
297 | _ ->
298 let line, xs = Common.span (fun y -> y.line =|= x.line) (x::xs) in
299 mk_ifdef_parameters extras (NotIfdefLine line::acc_before_sep) xs
300 )
301
302 (* --------------------------------------- *)
303
304 let line_of_paren = function
305 | PToken x -> x.line
306 | Parenthised (xxs, info_parens) ->
307 (match info_parens with
308 | [] -> raise (Impossible 121)
309 | x::xs -> x.line
310 )
311
312
313 let rec span_line_paren line = function
314 | [] -> [],[]
315 | x::xs ->
316 (match x with
317 | PToken tok when TH.is_eof tok.tok ->
318 [], x::xs
319 | _ ->
320 if line_of_paren x =|= line
321 then
322 let (l1, l2) = span_line_paren line xs in
323 (x::l1, l2)
324 else ([], x::xs)
325 )
326
327
328 let rec mk_line_parenthised xs =
329 match xs with
330 | [] -> []
331 | x::xs ->
332 let line_no = line_of_paren x in
333 let line, xs = span_line_paren line_no xs in
334 Line (x::line)::mk_line_parenthised xs
335
336
337 (* --------------------------------------- *)
338 let rec mk_body_function_grouped xs =
339 match xs with
340 | [] -> []
341 | x::xs ->
342 (match x with
343 | {tok = TOBrace _; col = 0} ->
344 let is_closing_brace = function
345 | {tok = TCBrace _; col = 0 } -> true
346 | _ -> false
347 in
348 let body, xs = Common.span (fun x -> not (is_closing_brace x)) xs in
349 (match xs with
350 | ({tok = TCBrace _; col = 0 })::xs ->
351 BodyFunction body::mk_body_function_grouped xs
352 | [] ->
353 pr2 "PB:not found closing brace in fuzzy parsing";
354 [NotBodyLine body]
355 | _ -> raise (Impossible 122)
356 )
357
358 | _ ->
359 let line, xs = Common.span (fun y -> y.line =|= x.line) (x::xs) in
360 NotBodyLine line::mk_body_function_grouped xs
361 )
362
363
364 (* ------------------------------------------------------------------------- *)
365 (* view iterators *)
366 (* ------------------------------------------------------------------------- *)
367
368 let rec iter_token_paren f xs =
369 xs +> List.iter (function
370 | PToken tok -> f tok;
371 | Parenthised (xxs, info_parens) ->
372 info_parens +> List.iter f;
373 xxs +> List.iter (fun xs -> iter_token_paren f xs)
374 )
375
376 let rec iter_token_brace f xs =
377 xs +> List.iter (function
378 | BToken tok -> f tok;
379 | Braceised (xxs, tok1, tok2opt) ->
380 f tok1; do_option f tok2opt;
381 xxs +> List.iter (fun xs -> iter_token_brace f xs)
382 )
383
384 let rec iter_token_ifdef f xs =
385 xs +> List.iter (function
386 | NotIfdefLine xs -> xs +> List.iter f;
387 | Ifdefbool (_, xxs, info_ifdef)
388 | Ifdef (xxs, info_ifdef) ->
389 info_ifdef +> List.iter f;
390 xxs +> List.iter (iter_token_ifdef f)
391 )
392
393
394
395
396 let tokens_of_paren xs =
397 let g = ref [] in
398 xs +> iter_token_paren (fun tok -> push2 tok g);
399 List.rev !g
400
401
402 let tokens_of_paren_ordered xs =
403 let g = ref [] in
404
405 let rec aux_tokens_ordered = function
406 | PToken tok -> push2 tok g;
407 | Parenthised (xxs, info_parens) ->
408 let (opar, cpar, commas) =
409 match info_parens with
410 | opar::xs ->
411 (match List.rev xs with
412 | cpar::xs ->
413 opar, cpar, List.rev xs
414 | _ -> raise (Impossible 123)
415 )
416 | _ -> raise (Impossible 124)
417 in
418 push2 opar g;
419 aux_args (xxs,commas);
420 push2 cpar g;
421
422 and aux_args (xxs, commas) =
423 match xxs, commas with
424 | [], [] -> ()
425 | [xs], [] -> xs +> List.iter aux_tokens_ordered
426 | xs::ys::xxs, comma::commas ->
427 xs +> List.iter aux_tokens_ordered;
428 push2 comma g;
429 aux_args (ys::xxs, commas)
430 | _ -> raise (Impossible 125)
431
432 in
433
434 xs +> List.iter aux_tokens_ordered;
435 List.rev !g
436
437
438
439 (* ------------------------------------------------------------------------- *)
440 (* set the context info in token *)
441 (* ------------------------------------------------------------------------- *)
442
443
444 let rec set_in_function_tag xs =
445 (* could try: ) { } but it can be the ) of a if or while, so
446 * better to base the heuristic on the position in column zero.
447 * Note that some struct or enum or init put also their { in first column
448 * but set_in_other will overwrite the previous InFunction tag.
449 *)
450 match xs with
451 | [] -> ()
452 (* ) { and the closing } is in column zero, then certainly a function *)
453 | BToken ({tok = TCPar _ })::(Braceised (body, tok1, Some tok2))::xs
454 when tok1.col <> 0 && tok2.col =|= 0 ->
455 body +> List.iter (iter_token_brace (fun tok ->
456 tok.where <- InFunction
457 ));
458 set_in_function_tag xs
459
460 | (BToken x)::xs -> set_in_function_tag xs
461
462 | (Braceised (body, tok1, Some tok2))::xs
463 when tok1.col =|= 0 && tok2.col =|= 0 ->
464 body +> List.iter (iter_token_brace (fun tok ->
465 tok.where <- InFunction
466 ));
467 set_in_function_tag xs
468 | Braceised (body, tok1, tok2)::xs ->
469 set_in_function_tag xs
470
471
472 let rec set_in_other xs =
473 match xs with
474 | [] -> ()
475 (* enum x { } *)
476 | BToken ({tok = Tenum _})::BToken ({tok = TIdent _})
477 ::Braceised(body, tok1, tok2)::xs
478 | BToken ({tok = Tenum _})
479 ::Braceised(body, tok1, tok2)::xs
480 ->
481 body +> List.iter (iter_token_brace (fun tok ->
482 tok.where <- InEnum;
483 ));
484 set_in_other xs
485
486 (* struct x { } *)
487 | BToken ({tok = Tstruct _})::BToken ({tok = TIdent _})
488 ::Braceised(body, tok1, tok2)::xs ->
489 body +> List.iter (iter_token_brace (fun tok ->
490 tok.where <- InStruct;
491 ));
492 set_in_other xs
493 (* = { } *)
494 | BToken ({tok = TEq _})
495 ::Braceised(body, tok1, tok2)::xs ->
496 body +> List.iter (iter_token_brace (fun tok ->
497 tok.where <- InInitializer;
498 ));
499 set_in_other xs
500
501 | BToken _::xs -> set_in_other xs
502
503 | Braceised(body, tok1, tok2)::xs ->
504 body +> List.iter set_in_other;
505 set_in_other xs
506
507
508
509
510 let set_context_tag xs =
511 begin
512 set_in_function_tag xs;
513 set_in_other xs;
514 end
515