6e0b505de86c04c85686150f6f1f73b3b6f66fd3
[bpt/coccinelle.git] / parsing_c / parse_c.ml
1 (* Yoann Padioleau
2 *
3 * Copyright (C) 2006, 2007, 2008 Ecole des Mines de Nantes
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License (GPL)
7 * version 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * file license.txt for more details.
13 *)
14
15 open Common
16
17 module TH = Token_helpers
18 module LP = Lexer_parser
19
20 module Stat = Parsing_stat
21
22 (*****************************************************************************)
23 (* Wrappers *)
24 (*****************************************************************************)
25 let pr2 s =
26 if !Flag_parsing_c.verbose_parsing
27 then Common.pr2 s
28
29 let pr2_once s =
30 if !Flag_parsing_c.verbose_parsing
31 then Common.pr2_once s
32
33 (*****************************************************************************)
34 (* Helpers *)
35 (*****************************************************************************)
36
37 let lexbuf_to_strpos lexbuf =
38 (Lexing.lexeme lexbuf, Lexing.lexeme_start lexbuf)
39
40 let token_to_strpos tok =
41 (TH.str_of_tok tok, TH.pos_of_tok tok)
42
43
44 let error_msg_tok tok =
45 let file = TH.file_of_tok tok in
46 if !Flag_parsing_c.verbose_parsing
47 then Common.error_message file (token_to_strpos tok)
48 else ("error in " ^ file ^ "set verbose_parsing for more info")
49
50
51 let print_bad line_error (start_line, end_line) filelines =
52 begin
53 pr2 ("badcount: " ^ i_to_s (end_line - start_line));
54
55 for i = start_line to end_line do
56 let line = filelines.(i) in
57
58 if i =|= line_error
59 then pr2 ("BAD:!!!!!" ^ " " ^ line)
60 else pr2 ("bad:" ^ " " ^ line)
61 done
62 end
63
64
65
66 let mk_info_item2 filename toks =
67 let buf = Buffer.create 100 in
68 let s =
69 (* old: get_slice_file filename (line1, line2) *)
70 begin
71 toks +> List.iter (fun tok ->
72 match TH.pinfo_of_tok tok with
73 | Ast_c.OriginTok _ ->
74 Buffer.add_string buf (TH.str_of_tok tok)
75 | Ast_c.AbstractLineTok _ ->
76 raise Impossible
77 | _ -> ()
78 );
79 Buffer.contents buf
80 end
81 in
82 (s, toks)
83
84 let mk_info_item a b =
85 Common.profile_code "C parsing.mk_info_item"
86 (fun () -> mk_info_item2 a b)
87
88
89 let info_same_line line xs =
90 xs +> List.filter (fun info -> Ast_c.line_of_info info =|= line)
91
92
93 (*****************************************************************************)
94 (* Stats on what was passed/commentized *)
95 (*****************************************************************************)
96
97 let commentized xs = xs +> Common.map_filter (function
98 | Parser_c.TCommentCpp (cppkind, ii) ->
99 let s = Ast_c.str_of_info ii in
100 let legal_passing =
101 match !Flag_parsing_c.filter_passed_level with
102 | 0 -> false
103 | 1 ->
104 List.mem cppkind [Token_c.CppAttr]
105 ||
106 (s =~ "__.*")
107 | 2 ->
108 List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal]
109 ||
110 (s =~ "__.*")
111 | 3 ->
112 List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal;Token_c.CppDirective]
113 ||
114 (s =~ "__.*")
115 | 4 ->
116 List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal;Token_c.CppMacro]
117 ||
118 (s =~ "__.*")
119
120
121 | 5 ->
122 List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal;Token_c.CppDirective;Token_c.CppMacro]
123 ||
124 (s =~ "__.*")
125
126
127
128
129 | _ -> failwith "not valid level passing number"
130 in
131 if legal_passing then None else Some (ii.Ast_c.pinfo)
132
133 (*
134 | Ast_c.CppOther ->
135 (match s with
136 | s when s =~ "KERN_.*" -> None
137 | s when s =~ "__.*" -> None
138 | _ ->
139 Some (ii.Ast_c.pinfo)
140 )
141 *)
142
143
144 | Parser_c.TCommentMisc ii
145 | Parser_c.TAction ii
146 ->
147 Some (ii.Ast_c.pinfo)
148 | _ ->
149 None
150 )
151
152 let count_lines_commentized xs =
153 let line = ref (-1) in
154 let count = ref 0 in
155 begin
156 commentized xs +>
157 List.iter
158 (function
159 Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
160 let newline = pinfo.Common.line in
161 if newline <> !line
162 then begin
163 line := newline;
164 incr count
165 end
166 | _ -> ());
167 !count
168 end
169
170
171
172 let print_commentized xs =
173 let line = ref (-1) in
174 begin
175 let ys = commentized xs in
176 ys +>
177 List.iter
178 (function
179 Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
180 let newline = pinfo.Common.line in
181 let s = pinfo.Common.str in
182 let s = Str.global_substitute
183 (Str.regexp "\n") (fun s -> "") s
184 in
185 if newline =|= !line
186 then prerr_string (s ^ " ")
187 else begin
188 if !line =|= -1
189 then pr2_no_nl "passed:"
190 else pr2_no_nl "\npassed:";
191 line := newline;
192 pr2_no_nl (s ^ " ");
193 end
194 | _ -> ());
195 if not (null ys) then pr2 "";
196 end
197
198
199
200
201 (*****************************************************************************)
202 (* Lexing only *)
203 (*****************************************************************************)
204
205 (* called by parse_print_error_heuristic *)
206 let tokens2 file =
207 let table = Common.full_charpos_to_pos file in
208
209 Common.with_open_infile file (fun chan ->
210 let lexbuf = Lexing.from_channel chan in
211 try
212 let rec tokens_aux acc =
213 let tok = Lexer_c.token lexbuf in
214 (* fill in the line and col information *)
215 let tok = tok +> TH.visitor_info_of_tok (fun ii ->
216 { ii with Ast_c.pinfo=
217 (* could assert pinfo.filename = file ? *)
218 match Ast_c.pinfo_of_info ii with
219 Ast_c.OriginTok pi ->
220 Ast_c.OriginTok (Common.complete_parse_info file table pi)
221 | Ast_c.ExpandedTok (pi,vpi) ->
222 Ast_c.ExpandedTok((Common.complete_parse_info file table pi),vpi)
223 | Ast_c.FakeTok (s,vpi) -> Ast_c.FakeTok (s,vpi)
224 | Ast_c.AbstractLineTok pi -> failwith "should not occur"
225 })
226 in
227
228 if TH.is_eof tok
229 then List.rev (tok::acc)
230 else tokens_aux (tok::acc)
231 in
232 tokens_aux []
233 with
234 | Lexer_c.Lexical s ->
235 failwith ("lexical error " ^ s ^ "\n =" ^
236 (Common.error_message file (lexbuf_to_strpos lexbuf)))
237 | e -> raise e
238 )
239
240 let time_lexing ?(profile=true) a =
241 if profile
242 then Common.profile_code_exclusif "LEXING" (fun () -> tokens2 a)
243 else tokens2 a
244 let tokens ?profile a =
245 Common.profile_code "C parsing.tokens" (fun () -> time_lexing ?profile a)
246
247
248 let tokens_of_string string =
249 let lexbuf = Lexing.from_string string in
250 try
251 let rec tokens_s_aux () =
252 let tok = Lexer_c.token lexbuf in
253 if TH.is_eof tok
254 then [tok]
255 else tok::(tokens_s_aux ())
256 in
257 tokens_s_aux ()
258 with
259 | Lexer_c.Lexical s -> failwith ("lexical error " ^ s ^ "\n =" )
260 | e -> raise e
261
262
263 (*****************************************************************************)
264 (* Parsing, but very basic, no more used *)
265 (*****************************************************************************)
266
267 (*
268 * !!!Those function use refs, and are not reentrant !!! so take care.
269 * It use globals defined in Lexer_parser.
270 *
271 * update: because now lexer return comments tokens, those functions
272 * may not work anymore.
273 *)
274
275 let parse file =
276 let lexbuf = Lexing.from_channel (open_in file) in
277 let result = Parser_c.main Lexer_c.token lexbuf in
278 result
279
280
281 let parse_print_error file =
282 let chan = (open_in file) in
283 let lexbuf = Lexing.from_channel chan in
284
285 let error_msg () = Common.error_message file (lexbuf_to_strpos lexbuf) in
286 try
287 lexbuf +> Parser_c.main Lexer_c.token
288 with
289 | Lexer_c.Lexical s ->
290 failwith ("lexical error " ^s^ "\n =" ^ error_msg ())
291 | Parsing.Parse_error ->
292 failwith ("parse error \n = " ^ error_msg ())
293 | Semantic_c.Semantic (s, i) ->
294 failwith ("semantic error " ^ s ^ "\n =" ^ error_msg ())
295 | e -> raise e
296
297
298
299
300 (*****************************************************************************)
301 (* Parsing subelements, useful to debug parser *)
302 (*****************************************************************************)
303
304 (*
305 * !!!Those function use refs, and are not reentrant !!! so take care.
306 * It use globals defined in Lexer_parser.
307 *)
308
309
310 (* old:
311 * let parse_gen parsefunc s =
312 * let lexbuf = Lexing.from_string s in
313 * let result = parsefunc Lexer_c.token lexbuf in
314 * result
315 *)
316
317 let parse_gen parsefunc s =
318 let toks = tokens_of_string s +> List.filter TH.is_not_comment in
319
320
321 (* Why use this lexing scheme ? Why not classically give lexer func
322 * to parser ? Because I now keep comments in lexer. Could
323 * just do a simple wrapper that when comment ask again for a token,
324 * but maybe simpler to use cur_tok technique.
325 *)
326 let all_tokens = ref toks in
327 let cur_tok = ref (List.hd !all_tokens) in
328
329 let lexer_function =
330 (fun _ ->
331 if TH.is_eof !cur_tok
332 then (pr2 "LEXER: ALREADY AT END"; !cur_tok)
333 else
334 let v = Common.pop2 all_tokens in
335 cur_tok := v;
336 !cur_tok
337 )
338 in
339 let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
340 let result = parsefunc lexer_function lexbuf_fake in
341 result
342
343
344 let type_of_string = parse_gen Parser_c.type_name
345 let statement_of_string = parse_gen Parser_c.statement
346 let expression_of_string = parse_gen Parser_c.expr
347
348 (* ex: statement_of_string "(struct us_data* )psh->hostdata = NULL;" *)
349
350
351
352
353
354 (*****************************************************************************)
355 (* Consistency checking *)
356 (*****************************************************************************)
357
358 (* todo:
359 * could check that an ident has always the same class, be it a typedef
360 * (but sometimes do 'acpi_val acpi_val;'), an ident, a TMacroStatement,
361 * etc.
362 *)
363
364 type class_ident =
365 | CIdent (* can be var, func, field, tag, enum constant *)
366 | CTypedef
367
368 let str_of_class_ident = function
369 | CIdent -> "Ident"
370 | CTypedef -> "Typedef"
371
372 (*
373 | CMacro
374 | CMacroString
375 | CMacroStmt
376 | CMacroDecl
377 | CMacroIterator
378 | CAttr
379
380 (* but take care that must still be able to use '=' *)
381 type context = InFunction | InEnum | InStruct | InInitializer | InParams
382 type class_token =
383 | CIdent of class_ident
384
385 | CComment
386 | CSpace
387 | CCommentCpp of cppkind
388 | CCommentMisc
389 | CCppDirective
390
391 | COPar
392 | CCPar
393 | COBrace
394 | CCBrace
395
396 | CSymbol
397 | CReservedKwd (type | decl | qualif | flow | misc | attr)
398 *)
399
400 let ident_to_typename ident =
401 (Ast_c.nQ, (Ast_c.TypeName (ident, Ast_c.noTypedefDef()), Ast_c.noii))
402
403
404 (* parse_typedef_fix4 *)
405 let consistency_checking2 xs =
406
407 (* first phase, gather data *)
408 let stat = Hashtbl.create 101 in
409
410 (* default value for hash *)
411 let v1 () = Hashtbl.create 101 in
412 let v2 () = ref 0 in
413
414 let bigf = { Visitor_c.default_visitor_c with
415
416 Visitor_c.kexpr = (fun (k,bigf) x ->
417 match Ast_c.unwrap_expr x with
418 | Ast_c.Ident (id) ->
419 let s = Ast_c.str_of_name id in
420 stat +>
421 Common.hfind_default s v1 +> Common.hfind_default CIdent v2 +>
422 (fun aref -> incr aref)
423
424 | _ -> k x
425 );
426 Visitor_c.ktype = (fun (k,bigf) t ->
427 match Ast_c.unwrap_typeC t with
428 | Ast_c.TypeName (name,_typ) ->
429 let s = Ast_c.str_of_name name in
430 stat +>
431 Common.hfind_default s v1 +> Common.hfind_default CTypedef v2 +>
432 (fun aref -> incr aref)
433
434 | _ -> k t
435 );
436 }
437 in
438 xs +> List.iter (fun (p, info_item) -> Visitor_c.vk_toplevel bigf p);
439
440
441 let ident_to_type = ref [] in
442
443
444 (* second phase, analyze data *)
445 stat +> Hashtbl.iter (fun k v ->
446 let xs = Common.hash_to_list v in
447 if List.length xs >= 2
448 then begin
449 pr2 ("CONFLICT:" ^ k);
450 let sorted = xs +> List.sort (fun (ka,va) (kb,vb) ->
451 if !va =|= !vb then
452 (match ka, kb with
453 | CTypedef, _ -> 1 (* first is smaller *)
454 | _, CTypedef -> -1
455 | _ -> 0
456 )
457 else compare !va !vb
458 ) in
459 let sorted = List.rev sorted in
460 match sorted with
461 | [CTypedef, i1;CIdent, i2] ->
462 pr2 ("transforming some ident in typedef");
463 push2 k ident_to_type;
464 | _ ->
465 pr2 ("TODO:other transforming?");
466
467 end
468 );
469
470 (* third phase, update ast.
471 * todo? but normally should try to handle correctly scope ? maybe sometime
472 * sizeof(id) and even if id was for a long time an identifier, maybe
473 * a few time, because of the scope it's actually really a type.
474 *)
475 if (null !ident_to_type)
476 then xs
477 else
478 let bigf = { Visitor_c.default_visitor_c_s with
479 Visitor_c.kdefineval_s = (fun (k,bigf) x ->
480 match x with
481 | Ast_c.DefineExpr e ->
482 (match e with
483 | (Ast_c.Ident (ident), _), _ii ->
484 let s = Ast_c.str_of_name ident in
485 if List.mem s !ident_to_type
486 then
487 let t = ident_to_typename ident in
488 Ast_c.DefineType t
489 else k x
490 | _ -> k x
491 )
492 | _ -> k x
493 );
494 Visitor_c.kexpr_s = (fun (k, bigf) x ->
495 match x with
496 | (Ast_c.SizeOfExpr e, tref), isizeof ->
497 let i1 = tuple_of_list1 isizeof in
498 (match e with
499 | (Ast_c.ParenExpr e, _), iiparen ->
500 (match e with
501 | (Ast_c.Ident (ident), _), _ii ->
502
503 let s = Ast_c.str_of_name ident in
504 if List.mem s !ident_to_type
505 then
506 let t = ident_to_typename ident in
507 let (i2, i3) = tuple_of_list2 iiparen in
508 (Ast_c.SizeOfType t, tref), [i1;i2;i3]
509 else k x
510 | _ -> k x
511 )
512 | _ -> k x
513 )
514 | _ -> k x
515 );
516 } in
517 xs +> List.map (fun (p, info_item) ->
518 Visitor_c.vk_toplevel_s bigf p, info_item
519 )
520
521
522 let consistency_checking a =
523 Common.profile_code "C consistencycheck" (fun () -> consistency_checking2 a)
524
525
526
527 (*****************************************************************************)
528 (* Error recovery *)
529 (*****************************************************************************)
530
531 (* todo: do something if find Parser_c.Eof ? *)
532 let rec find_next_synchro next already_passed =
533
534 (* Maybe because not enough }, because for example an ifdef contains
535 * in both branch some opening {, we later eat too much, "on deborde
536 * sur la fonction d'apres". So already_passed may be too big and
537 * looking for next synchro point starting from next may not be the
538 * best. So maybe we can find synchro point inside already_passed
539 * instead of looking in next.
540 *
541 * But take care! must progress. We must not stay in infinite loop!
542 * For instance now I have as a error recovery to look for
543 * a "start of something", corresponding to start of function,
544 * but must go beyond this start otherwise will loop.
545 * So look at premier(external_declaration2) in parser.output and
546 * pass at least those first tokens.
547 *
548 * I have chosen to start search for next synchro point after the
549 * first { I found, so quite sure we will not loop. *)
550
551 let last_round = List.rev already_passed in
552 let is_define =
553 let xs = last_round +> List.filter TH.is_not_comment in
554 match xs with
555 | Parser_c.TDefine _::_ -> true
556 | _ -> false
557 in
558 if is_define
559 then find_next_synchro_define (last_round ++ next) []
560 else
561
562 let (before, after) =
563 last_round +> Common.span (fun tok ->
564 match tok with
565 (* by looking at TOBrace we are sure that the "start of something"
566 * will not arrive too early
567 *)
568 | Parser_c.TOBrace _ -> false
569 | Parser_c.TDefine _ -> false
570 | _ -> true
571 )
572 in
573 find_next_synchro_orig (after ++ next) (List.rev before)
574
575
576
577 and find_next_synchro_define next already_passed =
578 match next with
579 | [] ->
580 pr2 "ERROR-RECOV: end of file while in recovery mode";
581 already_passed, []
582 | (Parser_c.TDefEOL i as v)::xs ->
583 pr2 ("ERROR-RECOV: found sync end of #define, line "^i_to_s(TH.line_of_tok v));
584 v::already_passed, xs
585 | v::xs ->
586 find_next_synchro_define xs (v::already_passed)
587
588
589
590
591 and find_next_synchro_orig next already_passed =
592 match next with
593 | [] ->
594 pr2 "ERROR-RECOV: end of file while in recovery mode";
595 already_passed, []
596
597 | (Parser_c.TCBrace i as v)::xs when TH.col_of_tok v =|= 0 ->
598 pr2 ("ERROR-RECOV: found sync '}' at line "^i_to_s (TH.line_of_tok v));
599
600 (match xs with
601 | [] -> raise Impossible (* there is a EOF token normally *)
602
603 (* still useful: now parser.mly allow empty ';' so normally no pb *)
604 | Parser_c.TPtVirg iptvirg::xs ->
605 pr2 "ERROR-RECOV: found sync bis, eating } and ;";
606 (Parser_c.TPtVirg iptvirg)::v::already_passed, xs
607
608 | Parser_c.TIdent x::Parser_c.TPtVirg iptvirg::xs ->
609 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
610 (Parser_c.TPtVirg iptvirg)::(Parser_c.TIdent x)::v::already_passed,
611 xs
612
613 | Parser_c.TCommentSpace sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
614 ::xs ->
615 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
616 (Parser_c.TCommentSpace sp)::
617 (Parser_c.TPtVirg iptvirg)::
618 (Parser_c.TIdent x)::
619 v::
620 already_passed,
621 xs
622
623 | Parser_c.TCommentNewline sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
624 ::xs ->
625 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
626 (Parser_c.TCommentNewline sp)::
627 (Parser_c.TPtVirg iptvirg)::
628 (Parser_c.TIdent x)::
629 v::
630 already_passed,
631 xs
632
633 | _ ->
634 v::already_passed, xs
635 )
636 | v::xs when TH.col_of_tok v =|= 0 && TH.is_start_of_something v ->
637 pr2 ("ERROR-RECOV: found sync col 0 at line "^ i_to_s(TH.line_of_tok v));
638 already_passed, v::xs
639
640 | v::xs ->
641 find_next_synchro_orig xs (v::already_passed)
642
643
644 (*****************************************************************************)
645 (* Include/Define hacks *)
646 (*****************************************************************************)
647
648 (* Sometimes I prefer to generate a single token for a list of things in the
649 * lexer so that if I have to passed them, like for passing TInclude then
650 * it's easy. Also if I don't do a single token, then I need to
651 * parse the rest which may not need special stuff, like detecting
652 * end of line which the parser is not really ready for. So for instance
653 * could I parse a #include <a/b/c/xxx.h> as 2 or more tokens ? just
654 * lex #include ? so then need recognize <a/b/c/xxx.h> as one token ?
655 * but this kind of token is valid only after a #include and the
656 * lexing and parsing rules are different for such tokens so not that
657 * easy to parse such things in parser_c.mly. Hence the following hacks.
658 *
659 * less?: maybe could get rid of this like I get rid of some of fix_define.
660 *)
661
662 (* ------------------------------------------------------------------------- *)
663 (* helpers *)
664 (* ------------------------------------------------------------------------- *)
665
666 (* used to generate new token from existing one *)
667 let new_info posadd str ii =
668 { Ast_c.pinfo =
669 Ast_c.OriginTok { (Ast_c.parse_info_of_info ii) with
670 charpos = Ast_c.pos_of_info ii + posadd;
671 str = str;
672 column = Ast_c.col_of_info ii + posadd;
673 };
674 (* must generate a new ref each time, otherwise share *)
675 cocci_tag = ref Ast_c.emptyAnnot;
676 comments_tag = ref Ast_c.emptyComments;
677 }
678
679
680 let rec comment_until_defeol xs =
681 match xs with
682 | [] -> failwith "cant find end of define token TDefEOL"
683 | x::xs ->
684 (match x with
685 | Parser_c.TDefEOL i ->
686 Parser_c.TCommentCpp (Token_c.CppDirective, TH.info_of_tok x)
687 ::xs
688 | _ ->
689 let x' =
690 (* bugfix: otherwise may lose a TComment token *)
691 if TH.is_real_comment x
692 then x
693 else Parser_c.TCommentCpp (Token_c.CppPassingNormal (*good?*), TH.info_of_tok x)
694 in
695 x'::comment_until_defeol xs
696 )
697
698 let drop_until_defeol xs =
699 List.tl
700 (Common.drop_until (function Parser_c.TDefEOL _ -> true | _ -> false) xs)
701
702
703
704 (* ------------------------------------------------------------------------- *)
705 (* returns a pair (replaced token, list of next tokens) *)
706 (* ------------------------------------------------------------------------- *)
707
708 let tokens_include (info, includes, filename, inifdef) =
709 Parser_c.TIncludeStart (Ast_c.rewrap_str includes info, inifdef),
710 [Parser_c.TIncludeFilename
711 (filename, (new_info (String.length includes) filename info))
712 ]
713
714 (*****************************************************************************)
715 (* Parsing default define macros, usually in a standard.h file *)
716 (*****************************************************************************)
717
718 let parse_cpp_define_file2 file =
719 let toks = tokens ~profile:false file in
720 let toks = Parsing_hacks.fix_tokens_define toks in
721 Parsing_hacks.extract_cpp_define toks
722
723 let parse_cpp_define_file a =
724 Common.profile_code_exclusif "HACK" (fun () -> parse_cpp_define_file2 a)
725
726 (* can not be put in parsing_hack, cos then mutually recursive problem as
727 * we also want to parse the standard.h file.
728 *)
729 let init_defs std_h =
730 if not (Common.lfile_exists std_h)
731 then pr2 ("warning: Can't find default macro file: " ^ std_h)
732 else begin
733 pr2 ("init_defs: " ^ std_h);
734 Parsing_hacks._defs := Common.hash_of_list (parse_cpp_define_file std_h);
735 end
736
737
738 (*****************************************************************************)
739 (* Main entry point *)
740 (*****************************************************************************)
741
742 type info_item = string * Parser_c.token list
743
744 type program2 = toplevel2 list
745 and toplevel2 = Ast_c.toplevel * info_item
746
747 let program_of_program2 xs =
748 xs +> List.map fst
749
750 let with_program2 f program2 =
751 program2
752 +> Common.unzip
753 +> (fun (program, infos) ->
754 f program, infos
755 )
756 +> Common.uncurry Common.zip
757
758
759
760 (* The use of local refs (remaining_tokens, passed_tokens, ...) makes
761 * possible error recovery. Indeed, they allow to skip some tokens and
762 * still be able to call again the ocamlyacc parser. It is ugly code
763 * because we cant modify ocamllex and ocamlyacc. As we want some
764 * extended lexing tricks, we have to use such refs.
765 *
766 * Those refs are now also used for my lalr(k) technique. Indeed They
767 * store the futur and previous tokens that were parsed, and so
768 * provide enough context information for powerful lex trick.
769 *
770 * - passed_tokens_last_ckp stores the passed tokens since last
771 * checkpoint. Used for NotParsedCorrectly and also to build the
772 * info_item attached to each program_element.
773 * - passed_tokens_clean is used for lookahead, in fact for lookback.
774 * - remaining_tokens_clean is used for lookahead. Now remaining_tokens
775 * contain some comments and so would make pattern matching difficult
776 * in lookahead. Hence this variable. We would like also to get rid
777 * of cpp instruction because sometimes a cpp instruction is between
778 * two tokens and makes a pattern matching fail. But lookahead also
779 * transform some cpp instruction (in comment) so can't remove them.
780 *
781 * So remaining_tokens, passed_tokens_last_ckp contain comment-tokens,
782 * whereas passed_tokens_clean and remaining_tokens_clean does not contain
783 * comment-tokens.
784 *
785 * Normally we have:
786 * toks = (reverse passed_tok) ++ cur_tok ++ remaining_tokens
787 * after the call to pop2.
788 * toks = (reverse passed_tok) ++ remaining_tokens
789 * at the and of the lexer_function call.
790 * At the very beginning, cur_tok and remaining_tokens overlap, but not after.
791 * At the end of lexer_function call, cur_tok overlap with passed_tok.
792 *
793 * convention: I use "tr" for "tokens refs"
794 *
795 * I now also need this lexing trick because the lexer return comment
796 * tokens.
797 *)
798
799 type tokens_state = {
800 mutable rest : Parser_c.token list;
801 mutable rest_clean : Parser_c.token list;
802 mutable current : Parser_c.token;
803 (* it's passed since last "checkpoint", not passed from the beginning *)
804 mutable passed : Parser_c.token list;
805 mutable passed_clean : Parser_c.token list;
806 }
807 let clone_tokens_stat tr =
808 { rest = tr.rest;
809 rest_clean = tr.rest_clean;
810 current = tr.current;
811 passed = tr.passed;
812 passed_clean = tr.passed_clean;
813 }
814 let copy_tokens_stat ~src ~dst =
815 dst.rest <- src.rest;
816 dst.rest_clean <- src.rest_clean;
817 dst.current <- src.current;
818 dst.passed <- src.passed;
819 dst.passed_clean <- src.passed_clean;
820 ()
821
822 let rec filter_noise n xs =
823 match n, xs with
824 | _, [] -> []
825 | 0, xs -> xs
826 | n, x::xs ->
827 (match x with
828 | Parser_c.TMacroAttr _ ->
829 filter_noise (n-1) xs
830 | _ ->
831 x::filter_noise (n-1) xs
832 )
833
834 let clean_for_lookahead xs =
835 match xs with
836 | [] -> []
837 | [x] -> [x]
838 | x::xs ->
839 x::filter_noise 10 xs
840
841
842
843 (* Hacked lex. This function use refs passed by parse_print_error_heuristic
844 * tr means token refs.
845 *)
846 let rec lexer_function ~pass tr = fun lexbuf ->
847 match tr.rest with
848 | [] -> pr2 "ALREADY AT END"; tr.current
849 | v::xs ->
850 tr.rest <- xs;
851 tr.current <- v;
852
853 if !Flag_parsing_c.debug_lexer then Common.pr2_gen v;
854
855 if TH.is_comment v
856 then begin
857 tr.passed <- v::tr.passed;
858 lexer_function ~pass tr lexbuf
859 end
860 else begin
861 let x = List.hd tr.rest_clean in
862 tr.rest_clean <- List.tl tr.rest_clean;
863 assert (x =*= v);
864
865 (match v with
866
867 (* fix_define1.
868 *
869 * Why not in parsing_hacks lookahead and do passing like
870 * I do for some ifdef directives ? Because here I also need to
871 * generate some tokens sometimes and so I need access to the
872 * tr.passed, tr.rest, etc.
873 *)
874 | Parser_c.TDefine (tok) ->
875 if not (LP.current_context () =*= LP.InTopLevel) &&
876 (!Flag_parsing_c.cpp_directive_passing || (pass =|= 2))
877 then begin
878 incr Stat.nDefinePassing;
879 pr2_once ("CPP-DEFINE: inside function, I treat it as comment");
880 let v' = Parser_c.TCommentCpp (Token_c.CppDirective,TH.info_of_tok v)
881 in
882 tr.passed <- v'::tr.passed;
883 tr.rest <- comment_until_defeol tr.rest;
884 tr.rest_clean <- drop_until_defeol tr.rest_clean;
885 lexer_function ~pass tr lexbuf
886 end
887 else begin
888 tr.passed <- v::tr.passed;
889 tr.passed_clean <- v::tr.passed_clean;
890 v
891 end
892
893 | Parser_c.TInclude (includes, filename, inifdef, info) ->
894 if not (LP.current_context () =*= LP.InTopLevel) &&
895 (!Flag_parsing_c.cpp_directive_passing || (pass =|= 2))
896 then begin
897 incr Stat.nIncludePassing;
898 pr2_once ("CPP-INCLUDE: inside function, I treat it as comment");
899 let v = Parser_c.TCommentCpp(Token_c.CppDirective, info) in
900 tr.passed <- v::tr.passed;
901 lexer_function ~pass tr lexbuf
902 end
903 else begin
904 let (v,new_tokens) =
905 tokens_include (info, includes, filename, inifdef) in
906 let new_tokens_clean =
907 new_tokens +> List.filter TH.is_not_comment in
908
909 tr.passed <- v::tr.passed;
910 tr.passed_clean <- v::tr.passed_clean;
911 tr.rest <- new_tokens ++ tr.rest;
912 tr.rest_clean <- new_tokens_clean ++ tr.rest_clean;
913 v
914 end
915
916 | _ ->
917
918 (* typedef_fix1 *)
919 let v = match v with
920 | Parser_c.TIdent (s, ii) ->
921 if
922 LP.is_typedef s &&
923 not (!Flag_parsing_c.disable_add_typedef) &&
924 pass =|= 1
925 then Parser_c.TypedefIdent (s, ii)
926 else Parser_c.TIdent (s, ii)
927 | x -> x
928 in
929
930 let v = Parsing_hacks.lookahead ~pass
931 (clean_for_lookahead (v::tr.rest_clean))
932 tr.passed_clean in
933
934 tr.passed <- v::tr.passed;
935
936 (* the lookahead may have changed the status of the token and
937 * consider it as a comment, for instance some #include are
938 * turned into comments, hence this code. *)
939 match v with
940 | Parser_c.TCommentCpp _ -> lexer_function ~pass tr lexbuf
941 | v ->
942 tr.passed_clean <- v::tr.passed_clean;
943 v
944 )
945 end
946
947
948
949 let get_one_elem ~pass tr (file, filelines) =
950
951 if not (LP.is_enabled_typedef()) && !Flag_parsing_c.debug_typedef
952 then pr2 "TYPEDEF:_handle_typedef=false. Not normal if dont come from exn";
953
954 (* normally have to do that only when come from an exception in which
955 * case the dt() may not have been done
956 * TODO but if was in scoped scope ? have to let only the last scope
957 * so need do a LP.lexer_reset_typedef ();
958 *)
959 LP.enable_typedef();
960 LP._lexer_hint := (LP.default_hint ());
961 LP.save_typedef_state();
962
963 tr.passed <- [];
964
965 let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
966
967 (try
968 (* -------------------------------------------------- *)
969 (* Call parser *)
970 (* -------------------------------------------------- *)
971 Common.profile_code_exclusif "YACC" (fun () ->
972 Left (Parser_c.celem (lexer_function ~pass tr) lexbuf_fake)
973 )
974 with e -> begin
975 if (pass =|= 1 && !Flag_parsing_c.disable_two_pass)|| (pass =|= 2)
976 then begin
977 (match e with
978 (* Lexical is not anymore launched I think *)
979 | Lexer_c.Lexical s ->
980 pr2 ("lexical error " ^s^ "\n =" ^ error_msg_tok tr.current)
981 | Parsing.Parse_error ->
982 pr2 ("parse error \n = " ^ error_msg_tok tr.current)
983 | Semantic_c.Semantic (s, i) ->
984 pr2 ("semantic error " ^s^ "\n ="^ error_msg_tok tr.current)
985 | e -> raise e
986 )
987 end;
988 LP.restore_typedef_state();
989
990 (* must keep here, before the code that adjusts the tr fields *)
991 let line_error = TH.line_of_tok tr.current in
992
993
994 (* error recovery, go to next synchro point *)
995 let (passed', rest') = find_next_synchro tr.rest tr.passed in
996 tr.rest <- rest';
997 tr.passed <- passed';
998
999 tr.current <- List.hd passed';
1000 tr.passed_clean <- []; (* enough ? *)
1001 (* with error recovery, rest and rest_clean may not be in sync *)
1002 tr.rest_clean <- (tr.rest +> List.filter TH.is_not_comment);
1003
1004
1005 let info_of_bads = Common.map_eff_rev TH.info_of_tok tr.passed in
1006 Right (info_of_bads, line_error, tr.passed)
1007 end
1008 )
1009
1010
1011
1012
1013 (* note: as now we go in 2 passes, there is first all the error message of
1014 * the lexer, and then the error of the parser. It is not anymore
1015 * interwinded.
1016 *
1017 * !!!This function use refs, and is not reentrant !!! so take care.
1018 * It use globals defined in Lexer_parser and also the _defs global
1019 * in parsing_hack.ml.
1020 *
1021 * This function uses internally some semi globals in the
1022 * tokens_stat record and parsing_stat record.
1023 *)
1024
1025 let parse_print_error_heuristic2 file =
1026
1027 let filelines = Common.cat_array file in
1028 let stat = Parsing_stat.default_stat file in
1029
1030 (* -------------------------------------------------- *)
1031 (* call lexer and get all the tokens *)
1032 (* -------------------------------------------------- *)
1033 LP.lexer_reset_typedef();
1034 Parsing_hacks.ifdef_paren_cnt := 0;
1035 let toks_orig = tokens file in
1036
1037 let toks = Parsing_hacks.fix_tokens_define toks_orig in
1038 let toks = Parsing_hacks.fix_tokens_cpp toks in
1039
1040 let tr = {
1041 rest = toks;
1042 rest_clean = (toks +> List.filter TH.is_not_comment);
1043 current = (List.hd toks);
1044 passed = [];
1045 passed_clean = [];
1046 } in
1047
1048
1049
1050
1051 let rec loop tr =
1052
1053 (* todo?: I am not sure that it represents current_line, cos maybe
1054 * tr.current partipated in the previous parsing phase, so maybe tr.current
1055 * is not the first token of the next parsing phase. Same with checkpoint2.
1056 * It would be better to record when we have a } or ; in parser.mly,
1057 * cos we know that they are the last symbols of external_declaration2.
1058 *
1059 * bugfix: may not be equal to 'file' as after macro expansions we can
1060 * start to parse a new entity from the body of a macro, for instance
1061 * when parsing a define_machine() body, cf standard.h
1062 *)
1063 let checkpoint = TH.line_of_tok tr.current in
1064 let checkpoint_file = TH.file_of_tok tr.current in
1065
1066 let tr_save = clone_tokens_stat tr in
1067
1068 (* call the parser *)
1069 let elem =
1070 let pass1 = get_one_elem ~pass:1 tr (file, filelines) in
1071 match pass1 with
1072 | Left e -> Left e
1073 | Right res ->
1074 if !Flag_parsing_c.disable_two_pass
1075 then Right res
1076 else begin
1077 pr2 "parsing pass2: try again";
1078 copy_tokens_stat ~src:tr_save ~dst: tr;
1079 let pass2 = get_one_elem ~pass:2 tr (file, filelines) in
1080 pass2
1081 end
1082 in
1083
1084
1085 (* again not sure if checkpoint2 corresponds to end of bad region *)
1086 let checkpoint2 = TH.line_of_tok tr.current in (* <> line_error *)
1087 let checkpoint2_file = TH.file_of_tok tr.current in
1088
1089 let was_define =
1090 (match elem with
1091 | Left _ -> false
1092 | Right (_, line_error, _) ->
1093 let was_define =
1094 let xs = tr.passed +> List.rev +> List.filter TH.is_not_comment in
1095 if List.length xs >= 2
1096 then
1097 (match Common.head_middle_tail xs with
1098 | Parser_c.TDefine _, _, Parser_c.TDefEOL _ ->
1099 true
1100 | _ -> false
1101 )
1102 else begin
1103 pr2 "WEIRD: length list of error recovery tokens < 2 ";
1104 false
1105 end
1106 in
1107 (if was_define && !Flag_parsing_c.filter_msg_define_error
1108 then ()
1109 else
1110 (* bugfix: *)
1111 if (checkpoint_file =$= checkpoint2_file) &&
1112 checkpoint_file =$= file
1113 then print_bad line_error (checkpoint, checkpoint2) filelines
1114 else pr2 "PB: bad: but on tokens not from original file"
1115 );
1116 was_define
1117 ) in
1118
1119
1120 let diffline =
1121 if (checkpoint_file =$= checkpoint2_file) && (checkpoint_file =$= file)
1122 then (checkpoint2 - checkpoint)
1123 else 0
1124 (* TODO? so if error come in middle of something ? where the
1125 * start token was from original file but synchro found in body
1126 * of macro ? then can have wrong number of lines stat.
1127 * Maybe simpler just to look at tr.passed and count
1128 * the lines in the token from the correct file ?
1129 *)
1130 in
1131 let info = mk_info_item file (List.rev tr.passed) in
1132
1133 (* some stat updates *)
1134 stat.Stat.commentized <-
1135 stat.Stat.commentized + count_lines_commentized (snd info);
1136
1137 let elem =
1138 match elem with
1139 | Left e ->
1140 stat.Stat.correct <- stat.Stat.correct + diffline;
1141 e
1142 | Right (info_of_bads, line_error, toks_of_bads) ->
1143 if was_define && !Flag_parsing_c.filter_define_error
1144 then stat.Stat.correct <- stat.Stat.correct + diffline
1145 else stat.Stat.bad <- stat.Stat.bad + diffline;
1146
1147 let pbline =
1148 toks_of_bads
1149 +> Common.filter (TH.is_same_line_or_close line_error)
1150 +> Common.filter TH.is_ident_like
1151 in
1152 let error_info =
1153 (pbline +> List.map TH.str_of_tok), line_error
1154 in
1155 stat.Stat.problematic_lines <-
1156 error_info::stat.Stat.problematic_lines;
1157
1158 Ast_c.NotParsedCorrectly info_of_bads
1159 in
1160
1161 (match elem with
1162 | Ast_c.FinalDef x -> [(Ast_c.FinalDef x, info)]
1163 | xs -> (xs, info):: loop tr (* recurse *)
1164 )
1165 in
1166 let v = loop tr in
1167 let v = consistency_checking v in
1168 (v, stat)
1169
1170
1171 let time_total_parsing a =
1172 Common.profile_code "TOTAL" (fun () -> parse_print_error_heuristic2 a)
1173
1174 let parse_print_error_heuristic a =
1175 Common.profile_code "C parsing" (fun () -> time_total_parsing a)
1176
1177
1178 (* alias *)
1179 let parse_c_and_cpp a = parse_print_error_heuristic a
1180
1181 (*****************************************************************************)
1182 (* Same but faster cos memoize stuff *)
1183 (*****************************************************************************)
1184 let parse_cache file =
1185 if not !Flag_parsing_c.use_cache then parse_print_error_heuristic file
1186 else
1187 let _ = pr2 "TOFIX" in
1188 let need_no_changed_files =
1189 (* should use Sys.argv.(0), would be safer. *)
1190
1191 [
1192 (* TOFIX
1193 Config.path ^ "/parsing_c/c_parser.cma";
1194 (* we may also depend now on the semantic patch because
1195 the SP may use macro and so we will disable some of the
1196 macro expansions from standard.h.
1197 *)
1198 !Config.std_h;
1199 *)
1200 ]
1201 in
1202 let need_no_changed_variables =
1203 (* could add some of the flags of flag_parsing_c.ml *)
1204 []
1205 in
1206 Common.cache_computation_robust
1207 file ".ast_raw"
1208 (need_no_changed_files, need_no_changed_variables) ".depend_raw"
1209 (fun () -> parse_print_error_heuristic file)
1210
1211
1212
1213 (*****************************************************************************)
1214 (* Some special cases *)
1215 (*****************************************************************************)
1216
1217 let (cstatement_of_string: string -> Ast_c.statement) = fun s ->
1218 Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ "\n}");
1219 let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1220 program +> Common.find_some (fun (e,_) ->
1221 match e with
1222 | Ast_c.Definition ({Ast_c.f_body = [Ast_c.StmtElem st]},_) -> Some st
1223 | _ -> None
1224 )
1225
1226 let (cexpression_of_string: string -> Ast_c.expression) = fun s ->
1227 Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ ";\n}");
1228 let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1229 program +> Common.find_some (fun (e,_) ->
1230 match e with
1231 | Ast_c.Definition ({Ast_c.f_body = compound},_) ->
1232 (match compound with
1233 | [Ast_c.StmtElem (Ast_c.ExprStatement (Some e),ii)] -> Some e
1234 | _ -> None
1235 )
1236 | _ -> None
1237 )