c00b746cbd0425402ddf3ace30cc167c806470fa
[bpt/coccinelle.git] / parsing_c / parse_c.ml
1 (* Copyright (C) 2006, 2007, 2008 Yoann Padioleau
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU General Public License (GPL)
5 * version 2 as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * file license.txt for more details.
11 *)
12
13 open Common
14
15 module TH = Token_helpers
16 module LP = Lexer_parser
17
18 module Stat = Parsing_stat
19
20 (*****************************************************************************)
21 (* Wrappers *)
22 (*****************************************************************************)
23 let pr2 s =
24 if !Flag_parsing_c.verbose_parsing
25 then Common.pr2 s
26
27 let pr2_once s =
28 if !Flag_parsing_c.verbose_parsing
29 then Common.pr2_once s
30
31 (*****************************************************************************)
32 (* Helpers *)
33 (*****************************************************************************)
34
35 let lexbuf_to_strpos lexbuf =
36 (Lexing.lexeme lexbuf, Lexing.lexeme_start lexbuf)
37
38 let token_to_strpos tok =
39 (TH.str_of_tok tok, TH.pos_of_tok tok)
40
41
42 let error_msg_tok tok =
43 let file = TH.file_of_tok tok in
44 if !Flag_parsing_c.verbose_parsing
45 then Common.error_message file (token_to_strpos tok)
46 else ("error in " ^ file ^ "set verbose_parsing for more info")
47
48
49 let print_bad line_error (start_line, end_line) filelines =
50 begin
51 pr2 ("badcount: " ^ i_to_s (end_line - start_line));
52
53 for i = start_line to end_line do
54 let line = filelines.(i) in
55
56 if i = line_error
57 then pr2 ("BAD:!!!!!" ^ " " ^ line)
58 else pr2 ("bad:" ^ " " ^ line)
59 done
60 end
61
62
63
64 let mk_info_item2 filename toks =
65 let buf = Buffer.create 100 in
66 let s =
67 (* old: get_slice_file filename (line1, line2) *)
68 begin
69 toks +> List.iter (fun tok ->
70 match TH.pinfo_of_tok tok with
71 | Ast_c.OriginTok _ ->
72 Buffer.add_string buf (TH.str_of_tok tok)
73 | Ast_c.AbstractLineTok _ ->
74 raise Impossible
75 | _ -> ()
76 );
77 Buffer.contents buf
78 end
79 in
80 (s, toks)
81
82 let mk_info_item a b =
83 Common.profile_code "C parsing.mk_info_item"
84 (fun () -> mk_info_item2 a b)
85
86
87 let info_same_line line xs =
88 xs +> List.filter (fun info -> Ast_c.line_of_info info = line)
89
90
91 (*****************************************************************************)
92 (* Stats on what was passed/commentized *)
93 (*****************************************************************************)
94
95 let commentized xs = xs +> Common.map_filter (function
96 | Parser_c.TCommentCpp (cppkind, ii) ->
97 let s = Ast_c.str_of_info ii in
98 let legal_passing =
99 match !Flag_parsing_c.filter_passed_level with
100 | 0 -> false
101 | 1 ->
102 List.mem cppkind [Ast_c.CppAttr]
103 ||
104 (s =~ "__.*")
105 | 2 ->
106 List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal]
107 ||
108 (s =~ "__.*")
109 | 3 ->
110 List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal;Ast_c.CppDirective]
111 ||
112 (s =~ "__.*")
113 | 4 ->
114 List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal;Ast_c.CppMacro]
115 ||
116 (s =~ "__.*")
117
118
119 | 5 ->
120 List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal;Ast_c.CppDirective;Ast_c.CppMacro]
121 ||
122 (s =~ "__.*")
123
124
125
126
127 | _ -> failwith "not valid level passing number"
128 in
129 if legal_passing then None else Some (ii.Ast_c.pinfo)
130
131 (*
132 | Ast_c.CppOther ->
133 (match s with
134 | s when s =~ "KERN_.*" -> None
135 | s when s =~ "__.*" -> None
136 | _ ->
137 Some (ii.Ast_c.pinfo)
138 )
139 *)
140
141
142 | Parser_c.TCommentMisc ii
143 | Parser_c.TAction ii
144 ->
145 Some (ii.Ast_c.pinfo)
146 | _ ->
147 None
148 )
149
150 let count_lines_commentized xs =
151 let line = ref (-1) in
152 let count = ref 0 in
153 begin
154 commentized xs +>
155 List.iter
156 (function
157 Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
158 let newline = pinfo.Common.line in
159 if newline <> !line
160 then begin
161 line := newline;
162 incr count
163 end
164 | _ -> ());
165 !count
166 end
167
168
169
170 let print_commentized xs =
171 let line = ref (-1) in
172 begin
173 let ys = commentized xs in
174 ys +>
175 List.iter
176 (function
177 Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
178 let newline = pinfo.Common.line in
179 let s = pinfo.Common.str in
180 let s = Str.global_substitute
181 (Str.regexp "\n") (fun s -> "") s
182 in
183 if newline = !line
184 then prerr_string (s ^ " ")
185 else begin
186 if !line = -1
187 then pr2_no_nl "passed:"
188 else pr2_no_nl "\npassed:";
189 line := newline;
190 pr2_no_nl (s ^ " ");
191 end
192 | _ -> ());
193 if not (null ys) then pr2 "";
194 end
195
196
197
198
199 (*****************************************************************************)
200 (* Lexing only *)
201 (*****************************************************************************)
202
203 (* called by parse_print_error_heuristic *)
204 let tokens2 file =
205 let table = Common.full_charpos_to_pos file in
206
207 Common.with_open_infile file (fun chan ->
208 let lexbuf = Lexing.from_channel chan in
209 try
210 let rec tokens_aux acc =
211 let tok = Lexer_c.token lexbuf in
212 (* fill in the line and col information *)
213 let tok = tok +> TH.visitor_info_of_tok (fun ii ->
214 { ii with Ast_c.pinfo=
215 (* could assert pinfo.filename = file ? *)
216 match Ast_c.pinfo_of_info ii with
217 Ast_c.OriginTok pi ->
218 Ast_c.OriginTok (Common.complete_parse_info file table pi)
219 | Ast_c.ExpandedTok (pi,vpi) ->
220 Ast_c.ExpandedTok((Common.complete_parse_info file table pi),vpi)
221 | Ast_c.FakeTok (s,vpi) -> Ast_c.FakeTok (s,vpi)
222 | Ast_c.AbstractLineTok pi -> failwith "should not occur"
223 })
224 in
225
226 if TH.is_eof tok
227 then List.rev (tok::acc)
228 else tokens_aux (tok::acc)
229 in
230 tokens_aux []
231 with
232 | Lexer_c.Lexical s ->
233 failwith ("lexical error " ^ s ^ "\n =" ^
234 (Common.error_message file (lexbuf_to_strpos lexbuf)))
235 | e -> raise e
236 )
237
238 let time_lexing ?(profile=true) a =
239 if profile
240 then Common.profile_code_exclusif "LEXING" (fun () -> tokens2 a)
241 else tokens2 a
242 let tokens ?profile a =
243 Common.profile_code "C parsing.tokens" (fun () -> time_lexing ?profile a)
244
245
246 let tokens_of_string string =
247 let lexbuf = Lexing.from_string string in
248 try
249 let rec tokens_s_aux () =
250 let tok = Lexer_c.token lexbuf in
251 if TH.is_eof tok
252 then [tok]
253 else tok::(tokens_s_aux ())
254 in
255 tokens_s_aux ()
256 with
257 | Lexer_c.Lexical s -> failwith ("lexical error " ^ s ^ "\n =" )
258 | e -> raise e
259
260
261 (*****************************************************************************)
262 (* Parsing, but very basic, no more used *)
263 (*****************************************************************************)
264
265 (*
266 * !!!Those function use refs, and are not reentrant !!! so take care.
267 * It use globals defined in Lexer_parser.
268 *
269 * update: because now lexer return comments tokens, those functions
270 * may not work anymore.
271 *)
272
273 let parse file =
274 let lexbuf = Lexing.from_channel (open_in file) in
275 let result = Parser_c.main Lexer_c.token lexbuf in
276 result
277
278
279 let parse_print_error file =
280 let chan = (open_in file) in
281 let lexbuf = Lexing.from_channel chan in
282
283 let error_msg () = Common.error_message file (lexbuf_to_strpos lexbuf) in
284 try
285 lexbuf +> Parser_c.main Lexer_c.token
286 with
287 | Lexer_c.Lexical s ->
288 failwith ("lexical error " ^s^ "\n =" ^ error_msg ())
289 | Parsing.Parse_error ->
290 failwith ("parse error \n = " ^ error_msg ())
291 | Semantic_c.Semantic (s, i) ->
292 failwith ("semantic error " ^ s ^ "\n =" ^ error_msg ())
293 | e -> raise e
294
295
296
297
298 (*****************************************************************************)
299 (* Parsing subelements, useful to debug parser *)
300 (*****************************************************************************)
301
302 (*
303 * !!!Those function use refs, and are not reentrant !!! so take care.
304 * It use globals defined in Lexer_parser.
305 *)
306
307
308 (* old:
309 * let parse_gen parsefunc s =
310 * let lexbuf = Lexing.from_string s in
311 * let result = parsefunc Lexer_c.token lexbuf in
312 * result
313 *)
314
315 let parse_gen parsefunc s =
316 let toks = tokens_of_string s +> List.filter TH.is_not_comment in
317
318
319 (* Why use this lexing scheme ? Why not classically give lexer func
320 * to parser ? Because I now keep comments in lexer. Could
321 * just do a simple wrapper that when comment ask again for a token,
322 * but maybe simpler to use cur_tok technique.
323 *)
324 let all_tokens = ref toks in
325 let cur_tok = ref (List.hd !all_tokens) in
326
327 let lexer_function =
328 (fun _ ->
329 if TH.is_eof !cur_tok
330 then (pr2 "LEXER: ALREADY AT END"; !cur_tok)
331 else
332 let v = Common.pop2 all_tokens in
333 cur_tok := v;
334 !cur_tok
335 )
336 in
337 let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
338 let result = parsefunc lexer_function lexbuf_fake in
339 result
340
341
342 let type_of_string = parse_gen Parser_c.type_name
343 let statement_of_string = parse_gen Parser_c.statement
344 let expression_of_string = parse_gen Parser_c.expr
345
346 (* ex: statement_of_string "(struct us_data* )psh->hostdata = NULL;" *)
347
348
349
350
351
352 (*****************************************************************************)
353 (* Consistency checking *)
354 (*****************************************************************************)
355
356 type class_ident =
357 | CIdent (* can be var, func, field, tag, enum constant *)
358 | CTypedef
359
360 let str_of_class_ident = function
361 | CIdent -> "Ident"
362 | CTypedef -> "Typedef"
363
364 (*
365 | CMacro
366 | CMacroString
367 | CMacroStmt
368 | CMacroDecl
369 | CMacroIterator
370 | CAttr
371
372 (* but take care that must still be able to use '=' *)
373 type context = InFunction | InEnum | InStruct | InInitializer | InParams
374 type class_token =
375 | CIdent of class_ident
376
377 | CComment
378 | CSpace
379 | CCommentCpp of cppkind
380 | CCommentMisc
381 | CCppDirective
382
383 | COPar
384 | CCPar
385 | COBrace
386 | CCBrace
387
388 | CSymbol
389 | CReservedKwd (type | decl | qualif | flow | misc | attr)
390 *)
391
392 (* parse_typedef_fix4 *)
393 let consistency_checking2 xs =
394
395 (* first phase, gather data *)
396 let stat = Hashtbl.create 101 in
397
398 (* default value for hash *)
399 let v1 () = Hashtbl.create 101 in
400 let v2 () = ref 0 in
401
402 let bigf = { Visitor_c.default_visitor_c with
403
404 Visitor_c.kexpr = (fun (k,bigf) x ->
405 match Ast_c.unwrap_expr x with
406 | Ast_c.Ident s ->
407 stat +>
408 Common.hfind_default s v1 +> Common.hfind_default CIdent v2 +>
409 (fun aref -> incr aref)
410
411 | _ -> k x
412 );
413 Visitor_c.ktype = (fun (k,bigf) t ->
414 match Ast_c.unwrap_typeC t with
415 | Ast_c.TypeName (s,_typ) ->
416 stat +>
417 Common.hfind_default s v1 +> Common.hfind_default CTypedef v2 +>
418 (fun aref -> incr aref)
419
420 | _ -> k t
421 );
422 }
423 in
424 xs +> List.iter (fun (p, info_item) -> Visitor_c.vk_toplevel bigf p);
425
426
427 let ident_to_type = ref [] in
428
429
430 (* second phase, analyze data *)
431 stat +> Hashtbl.iter (fun k v ->
432 let xs = Common.hash_to_list v in
433 if List.length xs >= 2
434 then begin
435 pr2 ("CONFLICT:" ^ k);
436 let sorted = xs +> List.sort (fun (ka,va) (kb,vb) ->
437 if !va = !vb then
438 (match ka, kb with
439 | CTypedef, _ -> 1 (* first is smaller *)
440 | _, CTypedef -> -1
441 | _ -> 0
442 )
443 else compare !va !vb
444 ) in
445 let sorted = List.rev sorted in
446 match sorted with
447 | [CTypedef, i1;CIdent, i2] ->
448 pr2 ("transforming some ident in typedef");
449 push2 k ident_to_type;
450 | _ ->
451 pr2 ("TODO:other transforming?");
452
453 end
454 );
455
456 (* third phase, update ast.
457 * todo? but normally should try to handle correctly scope ? maybe sometime
458 * sizeof(id) and even if id was for a long time an identifier, maybe
459 * a few time, because of the scope it's actually really a type.
460 *)
461 if (null !ident_to_type)
462 then xs
463 else
464 let bigf = { Visitor_c.default_visitor_c_s with
465 Visitor_c.kdefineval_s = (fun (k,bigf) x ->
466 match x with
467 | Ast_c.DefineExpr e ->
468 (match e with
469 | (Ast_c.Ident s, _), ii when List.mem s !ident_to_type ->
470 let t = (Ast_c.nQ,
471 (Ast_c.TypeName (s, Ast_c.noTypedefDef()), ii)) in
472
473 Ast_c.DefineType t
474 | _ -> k x
475 )
476 | _ -> k x
477 );
478 Visitor_c.kexpr_s = (fun (k, bigf) x ->
479 match x with
480 | (Ast_c.SizeOfExpr e, tref), isizeof ->
481 let i1 = tuple_of_list1 isizeof in
482 (match e with
483 | (Ast_c.ParenExpr e, _), iiparen ->
484 (match e with
485 | (Ast_c.Ident s, _), ii when List.mem s !ident_to_type ->
486 let (i2, i3) = tuple_of_list2 iiparen in
487 let t = (Ast_c.nQ,
488 (Ast_c.TypeName (s, Ast_c.noTypedefDef()), ii)) in
489 (Ast_c.SizeOfType t, tref), [i1;i2;i3]
490
491 | _ -> k x
492 )
493 | _ -> k x
494 )
495 | _ -> k x
496 );
497 } in
498 xs +> List.map (fun (p, info_item) ->
499 Visitor_c.vk_toplevel_s bigf p, info_item
500 )
501
502
503 let consistency_checking a =
504 Common.profile_code "C consistencycheck" (fun () -> consistency_checking2 a)
505
506
507
508 (*****************************************************************************)
509 (* Error recovery *)
510 (*****************************************************************************)
511
512 (* todo: do something if find Parser_c.Eof ? *)
513 let rec find_next_synchro next already_passed =
514
515 (* Maybe because not enough }, because for example an ifdef contains
516 * in both branch some opening {, we later eat too much, "on deborde
517 * sur la fonction d'apres". So already_passed may be too big and
518 * looking for next synchro point starting from next may not be the
519 * best. So maybe we can find synchro point inside already_passed
520 * instead of looking in next.
521 *
522 * But take care! must progress. We must not stay in infinite loop!
523 * For instance now I have as a error recovery to look for
524 * a "start of something", corresponding to start of function,
525 * but must go beyond this start otherwise will loop.
526 * So look at premier(external_declaration2) in parser.output and
527 * pass at least those first tokens.
528 *
529 * I have chosen to start search for next synchro point after the
530 * first { I found, so quite sure we will not loop. *)
531
532 let last_round = List.rev already_passed in
533 let is_define =
534 let xs = last_round +> List.filter TH.is_not_comment in
535 match xs with
536 | Parser_c.TDefine _::_ -> true
537 | _ -> false
538 in
539 if is_define
540 then find_next_synchro_define (last_round ++ next) []
541 else
542
543 let (before, after) =
544 last_round +> Common.span (fun tok ->
545 match tok with
546 (* by looking at TOBrace we are sure that the "start of something"
547 * will not arrive too early
548 *)
549 | Parser_c.TOBrace _ -> false
550 | Parser_c.TDefine _ -> false
551 | _ -> true
552 )
553 in
554 find_next_synchro_orig (after ++ next) (List.rev before)
555
556
557
558 and find_next_synchro_define next already_passed =
559 match next with
560 | [] ->
561 pr2 "ERROR-RECOV: end of file while in recovery mode";
562 already_passed, []
563 | (Parser_c.TDefEOL i as v)::xs ->
564 pr2 ("ERROR-RECOV: found sync end of #define, line "^i_to_s(TH.line_of_tok v));
565 v::already_passed, xs
566 | v::xs ->
567 find_next_synchro_define xs (v::already_passed)
568
569
570
571
572 and find_next_synchro_orig next already_passed =
573 match next with
574 | [] ->
575 pr2 "ERROR-RECOV: end of file while in recovery mode";
576 already_passed, []
577
578 | (Parser_c.TCBrace i as v)::xs when TH.col_of_tok v = 0 ->
579 pr2 ("ERROR-RECOV: found sync '}' at line "^i_to_s (TH.line_of_tok v));
580
581 (match xs with
582 | [] -> raise Impossible (* there is a EOF token normally *)
583
584 (* still useful: now parser.mly allow empty ';' so normally no pb *)
585 | Parser_c.TPtVirg iptvirg::xs ->
586 pr2 "ERROR-RECOV: found sync bis, eating } and ;";
587 (Parser_c.TPtVirg iptvirg)::v::already_passed, xs
588
589 | Parser_c.TIdent x::Parser_c.TPtVirg iptvirg::xs ->
590 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
591 (Parser_c.TPtVirg iptvirg)::(Parser_c.TIdent x)::v::already_passed,
592 xs
593
594 | Parser_c.TCommentSpace sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
595 ::xs ->
596 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
597 (Parser_c.TCommentSpace sp)::
598 (Parser_c.TPtVirg iptvirg)::
599 (Parser_c.TIdent x)::
600 v::
601 already_passed,
602 xs
603
604 | Parser_c.TCommentNewline sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
605 ::xs ->
606 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
607 (Parser_c.TCommentNewline sp)::
608 (Parser_c.TPtVirg iptvirg)::
609 (Parser_c.TIdent x)::
610 v::
611 already_passed,
612 xs
613
614 | _ ->
615 v::already_passed, xs
616 )
617 | v::xs when TH.col_of_tok v = 0 && TH.is_start_of_something v ->
618 pr2 ("ERROR-RECOV: found sync col 0 at line "^ i_to_s(TH.line_of_tok v));
619 already_passed, v::xs
620
621 | v::xs ->
622 find_next_synchro_orig xs (v::already_passed)
623
624
625 (*****************************************************************************)
626 (* Include/Define hacks *)
627 (*****************************************************************************)
628
629 (* Sometimes I prefer to generate a single token for a list of things in the
630 * lexer so that if I have to passed them, like for passing TInclude then
631 * it's easy. Also if I don't do a single token, then I need to
632 * parse the rest which may not need special stuff, like detecting
633 * end of line which the parser is not really ready for. So for instance
634 * could I parse a #include <a/b/c/xxx.h> as 2 or more tokens ? just
635 * lex #include ? so then need recognize <a/b/c/xxx.h> as one token ?
636 * but this kind of token is valid only after a #include and the
637 * lexing and parsing rules are different for such tokens so not that
638 * easy to parse such things in parser_c.mly. Hence the following hacks.
639 *
640 * less?: maybe could get rid of this like I get rid of some of fix_define.
641 *)
642
643 (* ------------------------------------------------------------------------- *)
644 (* helpers *)
645 (* ------------------------------------------------------------------------- *)
646
647 (* used to generate new token from existing one *)
648 let new_info posadd str ii =
649 { Ast_c.pinfo =
650 Ast_c.OriginTok { (Ast_c.parse_info_of_info ii) with
651 charpos = Ast_c.pos_of_info ii + posadd;
652 str = str;
653 column = Ast_c.col_of_info ii + posadd;
654 };
655 (* must generate a new ref each time, otherwise share *)
656 cocci_tag = ref Ast_c.emptyAnnot;
657 comments_tag = ref Ast_c.emptyComments;
658 }
659
660
661 let rec comment_until_defeol xs =
662 match xs with
663 | [] -> failwith "cant find end of define token TDefEOL"
664 | x::xs ->
665 (match x with
666 | Parser_c.TDefEOL i ->
667 Parser_c.TCommentCpp (Ast_c.CppDirective, TH.info_of_tok x)
668 ::xs
669 | _ ->
670 let x' =
671 (* bugfix: otherwise may lose a TComment token *)
672 if TH.is_real_comment x
673 then x
674 else Parser_c.TCommentCpp (Ast_c.CppPassingNormal (*good?*), TH.info_of_tok x)
675 in
676 x'::comment_until_defeol xs
677 )
678
679 let drop_until_defeol xs =
680 List.tl
681 (Common.drop_until (function Parser_c.TDefEOL _ -> true | _ -> false) xs)
682
683
684
685 (* ------------------------------------------------------------------------- *)
686 (* returns a pair (replaced token, list of next tokens) *)
687 (* ------------------------------------------------------------------------- *)
688
689 let tokens_include (info, includes, filename, inifdef) =
690 Parser_c.TIncludeStart (Ast_c.rewrap_str includes info, inifdef),
691 [Parser_c.TIncludeFilename
692 (filename, (new_info (String.length includes) filename info))
693 ]
694
695 (*****************************************************************************)
696 (* Parsing default define macros, usually in a standard.h file *)
697 (*****************************************************************************)
698
699 let parse_cpp_define_file2 file =
700 let toks = tokens ~profile:false file in
701 let toks = Parsing_hacks.fix_tokens_define toks in
702 Parsing_hacks.extract_cpp_define toks
703
704 let parse_cpp_define_file a =
705 Common.profile_code_exclusif "HACK" (fun () -> parse_cpp_define_file2 a)
706
707 (* can not be put in parsing_hack, cos then mutually recursive problem as
708 * we also want to parse the standard.h file.
709 *)
710 let init_defs std_h =
711 if not (Common.lfile_exists std_h)
712 then pr2 ("warning: Can't find default macro file: " ^ std_h)
713 else begin
714 pr2 ("init_defs: " ^ std_h);
715 Parsing_hacks._defs := Common.hash_of_list (parse_cpp_define_file std_h);
716 end
717
718
719 (*****************************************************************************)
720 (* Main entry point *)
721 (*****************************************************************************)
722
723 type info_item = string * Parser_c.token list
724
725 type program2 = toplevel2 list
726 and toplevel2 = Ast_c.toplevel * info_item
727
728 let program_of_program2 xs =
729 xs +> List.map fst
730
731 let with_program2 f program2 =
732 program2
733 +> Common.unzip
734 +> (fun (program, infos) ->
735 f program, infos
736 )
737 +> Common.uncurry Common.zip
738
739
740
741 (* The use of local refs (remaining_tokens, passed_tokens, ...) makes
742 * possible error recovery. Indeed, they allow to skip some tokens and
743 * still be able to call again the ocamlyacc parser. It is ugly code
744 * because we cant modify ocamllex and ocamlyacc. As we want some
745 * extended lexing tricks, we have to use such refs.
746 *
747 * Those refs are now also used for my lalr(k) technique. Indeed They
748 * store the futur and previous tokens that were parsed, and so
749 * provide enough context information for powerful lex trick.
750 *
751 * - passed_tokens_last_ckp stores the passed tokens since last
752 * checkpoint. Used for NotParsedCorrectly and also to build the
753 * info_item attached to each program_element.
754 * - passed_tokens_clean is used for lookahead, in fact for lookback.
755 * - remaining_tokens_clean is used for lookahead. Now remaining_tokens
756 * contain some comments and so would make pattern matching difficult
757 * in lookahead. Hence this variable. We would like also to get rid
758 * of cpp instruction because sometimes a cpp instruction is between
759 * two tokens and makes a pattern matching fail. But lookahead also
760 * transform some cpp instruction (in comment) so can't remove them.
761 *
762 * So remaining_tokens, passed_tokens_last_ckp contain comment-tokens,
763 * whereas passed_tokens_clean and remaining_tokens_clean does not contain
764 * comment-tokens.
765 *
766 * Normally we have:
767 * toks = (reverse passed_tok) ++ cur_tok ++ remaining_tokens
768 * after the call to pop2.
769 * toks = (reverse passed_tok) ++ remaining_tokens
770 * at the and of the lexer_function call.
771 * At the very beginning, cur_tok and remaining_tokens overlap, but not after.
772 * At the end of lexer_function call, cur_tok overlap with passed_tok.
773 *
774 * convention: I use "tr" for "tokens refs"
775 *
776 * I now also need this lexing trick because the lexer return comment
777 * tokens.
778 *)
779
780 type tokens_state = {
781 mutable rest : Parser_c.token list;
782 mutable rest_clean : Parser_c.token list;
783 mutable current : Parser_c.token;
784 (* it's passed since last "checkpoint", not passed from the beginning *)
785 mutable passed : Parser_c.token list;
786 mutable passed_clean : Parser_c.token list;
787 }
788 let clone_tokens_stat tr =
789 { rest = tr.rest;
790 rest_clean = tr.rest_clean;
791 current = tr.current;
792 passed = tr.passed;
793 passed_clean = tr.passed_clean;
794 }
795 let copy_tokens_stat ~src ~dst =
796 dst.rest <- src.rest;
797 dst.rest_clean <- src.rest_clean;
798 dst.current <- src.current;
799 dst.passed <- src.passed;
800 dst.passed_clean <- src.passed_clean;
801 ()
802
803 let rec filter_noise n xs =
804 match n, xs with
805 | _, [] -> []
806 | 0, xs -> xs
807 | n, x::xs ->
808 (match x with
809 | Parser_c.TMacroAttr _ ->
810 filter_noise (n-1) xs
811 | _ ->
812 x::filter_noise (n-1) xs
813 )
814
815 let clean_for_lookahead xs =
816 match xs with
817 | [] -> []
818 | [x] -> [x]
819 | x::xs ->
820 x::filter_noise 10 xs
821
822
823
824 (* Hacked lex. This function use refs passed by parse_print_error_heuristic
825 * tr means token refs.
826 *)
827 let rec lexer_function ~pass tr = fun lexbuf ->
828 match tr.rest with
829 | [] -> pr2 "ALREADY AT END"; tr.current
830 | v::xs ->
831 tr.rest <- xs;
832 tr.current <- v;
833
834 if !Flag_parsing_c.debug_lexer then Common.pr2_gen v;
835
836 if TH.is_comment v
837 then begin
838 tr.passed <- v::tr.passed;
839 lexer_function ~pass tr lexbuf
840 end
841 else begin
842 let x = List.hd tr.rest_clean in
843 tr.rest_clean <- List.tl tr.rest_clean;
844 assert (x = v);
845
846 (match v with
847 (* fix_define1. Why not in parsing_hacks lookahead and do passing like
848 * I do for some ifdef directives ? Because here I also need to
849 * generate some tokens sometimes.
850 *)
851 | Parser_c.TDefine (tok) ->
852 if not (LP.current_context () = LP.InTopLevel) &&
853 (!Flag_parsing_c.cpp_directive_passing || (pass = 2))
854 then begin
855 incr Stat.nDefinePassing;
856 pr2_once ("CPP-DEFINE: inside function, I treat it as comment");
857 let v' = Parser_c.TCommentCpp (Ast_c.CppDirective,TH.info_of_tok v)
858 in
859 tr.passed <- v'::tr.passed;
860 tr.rest <- comment_until_defeol tr.rest;
861 tr.rest_clean <- drop_until_defeol tr.rest_clean;
862 lexer_function ~pass tr lexbuf
863 end
864 else begin
865 tr.passed <- v::tr.passed;
866 tr.passed_clean <- v::tr.passed_clean;
867 v
868 end
869
870 | Parser_c.TInclude (includes, filename, inifdef, info) ->
871 if not (LP.current_context () = LP.InTopLevel) &&
872 (!Flag_parsing_c.cpp_directive_passing || (pass = 2))
873 then begin
874 incr Stat.nIncludePassing;
875 pr2_once ("CPP-INCLUDE: inside function, I treat it as comment");
876 let v = Parser_c.TCommentCpp(Ast_c.CppDirective, info) in
877 tr.passed <- v::tr.passed;
878 lexer_function ~pass tr lexbuf
879 end
880 else begin
881 let (v,new_tokens) =
882 tokens_include (info, includes, filename, inifdef) in
883 let new_tokens_clean =
884 new_tokens +> List.filter TH.is_not_comment in
885
886 tr.passed <- v::tr.passed;
887 tr.passed_clean <- v::tr.passed_clean;
888 tr.rest <- new_tokens ++ tr.rest;
889 tr.rest_clean <- new_tokens_clean ++ tr.rest_clean;
890 v
891 end
892
893 | _ ->
894
895 (* typedef_fix1 *)
896 let v = match v with
897 | Parser_c.TIdent (s, ii) ->
898 if
899 LP.is_typedef s &&
900 not (!Flag_parsing_c.disable_add_typedef) &&
901 pass = 1
902 then Parser_c.TypedefIdent (s, ii)
903 else Parser_c.TIdent (s, ii)
904 | x -> x
905 in
906
907 let v = Parsing_hacks.lookahead ~pass
908 (clean_for_lookahead (v::tr.rest_clean))
909 tr.passed_clean in
910
911 tr.passed <- v::tr.passed;
912
913 (* the lookahead may have changed the status of the token and
914 * consider it as a comment, for instance some #include are
915 * turned into comments, hence this code. *)
916 match v with
917 | Parser_c.TCommentCpp _ -> lexer_function ~pass tr lexbuf
918 | v ->
919 tr.passed_clean <- v::tr.passed_clean;
920 v
921 )
922 end
923
924
925
926 let get_one_elem ~pass tr (file, filelines) =
927
928 if not (LP.is_enabled_typedef()) && !Flag_parsing_c.debug_typedef
929 then pr2 "TYPEDEF:_handle_typedef=false. Not normal if dont come from exn";
930
931 (* normally have to do that only when come from an exception in which
932 * case the dt() may not have been done
933 * TODO but if was in scoped scope ? have to let only the last scope
934 * so need do a LP.lexer_reset_typedef ();
935 *)
936 LP.enable_typedef();
937 LP._lexer_hint := (LP.default_hint ());
938 LP.save_typedef_state();
939
940 tr.passed <- [];
941
942 let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
943
944 (try
945 (* -------------------------------------------------- *)
946 (* Call parser *)
947 (* -------------------------------------------------- *)
948 Common.profile_code_exclusif "YACC" (fun () ->
949 Left (Parser_c.celem (lexer_function ~pass tr) lexbuf_fake)
950 )
951 with e -> begin
952 if (pass = 1 && !Flag_parsing_c.disable_two_pass)|| (pass = 2)
953 then begin
954 (match e with
955 (* Lexical is not anymore launched I think *)
956 | Lexer_c.Lexical s ->
957 pr2 ("lexical error " ^s^ "\n =" ^ error_msg_tok tr.current)
958 | Parsing.Parse_error ->
959 pr2 ("parse error \n = " ^ error_msg_tok tr.current)
960 | Semantic_c.Semantic (s, i) ->
961 pr2 ("semantic error " ^s^ "\n ="^ error_msg_tok tr.current)
962 | e -> raise e
963 )
964 end;
965 LP.restore_typedef_state();
966
967 (* must keep here, before the code that adjusts the tr fields *)
968 let line_error = TH.line_of_tok tr.current in
969
970
971 (* error recovery, go to next synchro point *)
972 let (passed', rest') = find_next_synchro tr.rest tr.passed in
973 tr.rest <- rest';
974 tr.passed <- passed';
975
976 tr.current <- List.hd passed';
977 tr.passed_clean <- []; (* enough ? *)
978 (* with error recovery, rest and rest_clean may not be in sync *)
979 tr.rest_clean <- (tr.rest +> List.filter TH.is_not_comment);
980
981
982 let info_of_bads = Common.map_eff_rev TH.info_of_tok tr.passed in
983 Right (info_of_bads, line_error, tr.passed)
984 end
985 )
986
987
988
989
990 (* note: as now we go in 2 passes, there is first all the error message of
991 * the lexer, and then the error of the parser. It is not anymore
992 * interwinded.
993 *
994 * !!!This function use refs, and is not reentrant !!! so take care.
995 * It use globals defined in Lexer_parser and also the _defs global
996 * in parsing_hack.ml.
997 *
998 * This function uses internally some semi globals in the
999 * tokens_stat record and parsing_stat record.
1000 *)
1001
1002 let parse_print_error_heuristic2 file =
1003
1004 let filelines = Common.cat_array file in
1005 let stat = Parsing_stat.default_stat file in
1006
1007 (* -------------------------------------------------- *)
1008 (* call lexer and get all the tokens *)
1009 (* -------------------------------------------------- *)
1010 LP.lexer_reset_typedef();
1011 Parsing_hacks.ifdef_paren_cnt := 0;
1012 let toks_orig = tokens file in
1013
1014 let toks = Parsing_hacks.fix_tokens_define toks_orig in
1015 let toks = Parsing_hacks.fix_tokens_cpp toks in
1016
1017 let tr = {
1018 rest = toks;
1019 rest_clean = (toks +> List.filter TH.is_not_comment);
1020 current = (List.hd toks);
1021 passed = [];
1022 passed_clean = [];
1023 } in
1024
1025
1026
1027
1028 let rec loop tr =
1029
1030 (* todo?: I am not sure that it represents current_line, cos maybe
1031 * tr.current partipated in the previous parsing phase, so maybe tr.current
1032 * is not the first token of the next parsing phase. Same with checkpoint2.
1033 * It would be better to record when we have a } or ; in parser.mly,
1034 * cos we know that they are the last symbols of external_declaration2.
1035 *
1036 * bugfix: may not be equal to 'file' as after macro expansions we can
1037 * start to parse a new entity from the body of a macro, for instance
1038 * when parsing a define_machine() body, cf standard.h
1039 *)
1040 let checkpoint = TH.line_of_tok tr.current in
1041 let checkpoint_file = TH.file_of_tok tr.current in
1042
1043 let tr_save = clone_tokens_stat tr in
1044
1045 (* call the parser *)
1046 let elem =
1047 let pass1 = get_one_elem ~pass:1 tr (file, filelines) in
1048 match pass1 with
1049 | Left e -> Left e
1050 | Right res ->
1051 if !Flag_parsing_c.disable_two_pass
1052 then Right res
1053 else begin
1054 pr2 "parsing pass2: try again";
1055 copy_tokens_stat ~src:tr_save ~dst: tr;
1056 let pass2 = get_one_elem ~pass:2 tr (file, filelines) in
1057 pass2
1058 end
1059 in
1060
1061
1062 (* again not sure if checkpoint2 corresponds to end of bad region *)
1063 let checkpoint2 = TH.line_of_tok tr.current in (* <> line_error *)
1064 let checkpoint2_file = TH.file_of_tok tr.current in
1065
1066 let was_define =
1067 (match elem with
1068 | Left _ -> false
1069 | Right (_, line_error, _) ->
1070 let was_define =
1071 let xs = tr.passed +> List.rev +> List.filter TH.is_not_comment in
1072 if List.length xs >= 2
1073 then
1074 (match Common.head_middle_tail xs with
1075 | Parser_c.TDefine _, _, Parser_c.TDefEOL _ ->
1076 true
1077 | _ -> false
1078 )
1079 else begin
1080 pr2 "WIERD: length list of error recovery tokens < 2 ";
1081 false
1082 end
1083 in
1084 (if was_define && !Flag_parsing_c.filter_msg_define_error
1085 then ()
1086 else
1087 (* bugfix: *)
1088 if (checkpoint_file = checkpoint2_file) && checkpoint_file = file
1089 then print_bad line_error (checkpoint, checkpoint2) filelines
1090 else pr2 "PB: bad: but on tokens not from original file"
1091 );
1092 was_define
1093 ) in
1094
1095
1096 let diffline =
1097 if (checkpoint_file = checkpoint2_file) && (checkpoint_file = file)
1098 then (checkpoint2 - checkpoint)
1099 else 0
1100 (* TODO? so if error come in middle of something ? where the
1101 * start token was from original file but synchro found in body
1102 * of macro ? then can have wrong number of lines stat.
1103 * Maybe simpler just to look at tr.passed and count
1104 * the lines in the token from the correct file ?
1105 *)
1106 in
1107 let info = mk_info_item file (List.rev tr.passed) in
1108
1109 (* some stat updates *)
1110 stat.Stat.commentized <-
1111 stat.Stat.commentized + count_lines_commentized (snd info);
1112
1113 let elem =
1114 match elem with
1115 | Left e ->
1116 stat.Stat.correct <- stat.Stat.correct + diffline;
1117 e
1118 | Right (info_of_bads, line_error, toks_of_bads) ->
1119 if was_define && !Flag_parsing_c.filter_define_error
1120 then stat.Stat.correct <- stat.Stat.correct + diffline
1121 else stat.Stat.bad <- stat.Stat.bad + diffline;
1122
1123 let pbline =
1124 toks_of_bads
1125 +> Common.filter (TH.is_same_line line_error)
1126 +> Common.filter TH.is_ident_like
1127 in
1128 let error_info =
1129 (pbline +> List.map TH.str_of_tok), line_error
1130 in
1131 stat.Stat.problematic_lines <-
1132 error_info::stat.Stat.problematic_lines;
1133
1134 Ast_c.NotParsedCorrectly info_of_bads
1135 in
1136
1137 (match elem with
1138 | Ast_c.FinalDef x -> [(Ast_c.FinalDef x, info)]
1139 | xs -> (xs, info):: loop tr (* recurse *)
1140 )
1141 in
1142 let v = loop tr in
1143 let v = consistency_checking v in
1144 (v, stat)
1145
1146
1147 let time_total_parsing a =
1148 Common.profile_code "TOTAL" (fun () -> parse_print_error_heuristic2 a)
1149
1150 let parse_print_error_heuristic a =
1151 Common.profile_code "C parsing" (fun () -> time_total_parsing a)
1152
1153
1154 (* alias *)
1155 let parse_c_and_cpp a = parse_print_error_heuristic a
1156
1157 (*****************************************************************************)
1158 (* Same but faster cos memoize stuff *)
1159 (*****************************************************************************)
1160 let parse_cache file =
1161 if not !Flag_parsing_c.use_cache then parse_print_error_heuristic file
1162 else
1163 let _ = pr2 "TOFIX" in
1164 let need_no_changed_files =
1165 (* should use Sys.argv.(0), would be safer. *)
1166
1167 [
1168 (* TOFIX
1169 Config.path ^ "/parsing_c/c_parser.cma";
1170 (* we may also depend now on the semantic patch because
1171 the SP may use macro and so we will disable some of the
1172 macro expansions from standard.h.
1173 *)
1174 !Config.std_h;
1175 *)
1176 ]
1177 in
1178 let need_no_changed_variables =
1179 (* could add some of the flags of flag_parsing_c.ml *)
1180 []
1181 in
1182 Common.cache_computation_robust
1183 file ".ast_raw"
1184 (need_no_changed_files, need_no_changed_variables) ".depend_raw"
1185 (fun () -> parse_print_error_heuristic file)
1186
1187
1188
1189 (*****************************************************************************)
1190 (* Some special cases *)
1191 (*****************************************************************************)
1192
1193 let (cstatement_of_string: string -> Ast_c.statement) = fun s ->
1194 Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ "\n}");
1195 let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1196 program +> Common.find_some (fun (e,_) ->
1197 match e with
1198 | Ast_c.Definition ({Ast_c.f_body = [Ast_c.StmtElem st]},_) -> Some st
1199 | _ -> None
1200 )
1201
1202 let (cexpression_of_string: string -> Ast_c.expression) = fun s ->
1203 Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ ";\n}");
1204 let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1205 program +> Common.find_some (fun (e,_) ->
1206 match e with
1207 | Ast_c.Definition ({Ast_c.f_body = compound},_) ->
1208 (match compound with
1209 | [Ast_c.StmtElem (Ast_c.ExprStatement (Some e),ii)] -> Some e
1210 | _ -> None
1211 )
1212 | _ -> None
1213 )