Release coccinelle-0.1.6
[bpt/coccinelle.git] / parsing_c / parse_c.ml
CommitLineData
0708f913
C
1(* Yoann Padioleau
2 *
3 * Copyright (C) 2006, 2007, 2008 Ecole des Mines de Nantes
34e49164
C
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License (GPL)
7 * version 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * file license.txt for more details.
13 *)
14
15open Common
16
17module TH = Token_helpers
18module LP = Lexer_parser
19
485bce71
C
20module Stat = Parsing_stat
21
34e49164
C
22(*****************************************************************************)
23(* Wrappers *)
24(*****************************************************************************)
25let pr2 s =
26 if !Flag_parsing_c.verbose_parsing
27 then Common.pr2 s
485bce71
C
28
29let pr2_once s =
30 if !Flag_parsing_c.verbose_parsing
31 then Common.pr2_once s
34e49164
C
32
33(*****************************************************************************)
34(* Helpers *)
35(*****************************************************************************)
36
37let lexbuf_to_strpos lexbuf =
38 (Lexing.lexeme lexbuf, Lexing.lexeme_start lexbuf)
39
40let token_to_strpos tok =
41 (TH.str_of_tok tok, TH.pos_of_tok tok)
42
43
44let error_msg_tok tok =
45 let file = TH.file_of_tok tok in
46 if !Flag_parsing_c.verbose_parsing
47 then Common.error_message file (token_to_strpos tok)
48 else ("error in " ^ file ^ "set verbose_parsing for more info")
49
50
51let print_bad line_error (start_line, end_line) filelines =
34e49164
C
52 begin
53 pr2 ("badcount: " ^ i_to_s (end_line - start_line));
485bce71 54
34e49164 55 for i = start_line to end_line do
485bce71
C
56 let line = filelines.(i) in
57
34e49164 58 if i = line_error
485bce71
C
59 then pr2 ("BAD:!!!!!" ^ " " ^ line)
60 else pr2 ("bad:" ^ " " ^ line)
34e49164
C
61 done
62 end
63
64
65
66let mk_info_item2 filename toks =
67 let buf = Buffer.create 100 in
68 let s =
69 (* old: get_slice_file filename (line1, line2) *)
70 begin
71 toks +> List.iter (fun tok ->
72 match TH.pinfo_of_tok tok with
91eba41f
C
73 | Ast_c.OriginTok _ ->
74 Buffer.add_string buf (TH.str_of_tok tok)
75 | Ast_c.AbstractLineTok _ ->
76 raise Impossible
34e49164
C
77 | _ -> ()
78 );
79 Buffer.contents buf
80 end
81 in
82 (s, toks)
83
84let mk_info_item a b =
85 Common.profile_code "C parsing.mk_info_item"
86 (fun () -> mk_info_item2 a b)
87
88
91eba41f
C
89let info_same_line line xs =
90 xs +> List.filter (fun info -> Ast_c.line_of_info info = line)
34e49164 91
34e49164
C
92
93(*****************************************************************************)
94(* Stats on what was passed/commentized *)
95(*****************************************************************************)
96
97let commentized xs = xs +> Common.map_filter (function
98 | Parser_c.TCommentCpp (cppkind, ii) ->
485bce71
C
99 let s = Ast_c.str_of_info ii in
100 let legal_passing =
101 match !Flag_parsing_c.filter_passed_level with
102 | 0 -> false
103 | 1 ->
0708f913 104 List.mem cppkind [Token_c.CppAttr]
485bce71
C
105 ||
106 (s =~ "__.*")
107 | 2 ->
0708f913 108 List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal]
485bce71
C
109 ||
110 (s =~ "__.*")
111 | 3 ->
0708f913 112 List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal;Token_c.CppDirective]
485bce71
C
113 ||
114 (s =~ "__.*")
115 | 4 ->
0708f913 116 List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal;Token_c.CppMacro]
485bce71
C
117 ||
118 (s =~ "__.*")
119
120
121 | 5 ->
0708f913 122 List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal;Token_c.CppDirective;Token_c.CppMacro]
485bce71
C
123 ||
124 (s =~ "__.*")
125
126
127
128
129 | _ -> failwith "not valid level passing number"
130 in
131 if legal_passing then None else Some (ii.Ast_c.pinfo)
132
133 (*
34e49164 134 | Ast_c.CppOther ->
34e49164
C
135 (match s with
136 | s when s =~ "KERN_.*" -> None
137 | s when s =~ "__.*" -> None
485bce71
C
138 | _ ->
139 Some (ii.Ast_c.pinfo)
34e49164 140 )
485bce71
C
141 *)
142
34e49164
C
143
144 | Parser_c.TCommentMisc ii
145 | Parser_c.TAction ii
146 ->
147 Some (ii.Ast_c.pinfo)
148 | _ ->
149 None
150 )
151
152let count_lines_commentized xs =
153 let line = ref (-1) in
154 let count = ref 0 in
155 begin
156 commentized xs +>
157 List.iter
158 (function
159 Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
160 let newline = pinfo.Common.line in
161 if newline <> !line
162 then begin
163 line := newline;
164 incr count
165 end
166 | _ -> ());
167 !count
168 end
169
170
171
172let print_commentized xs =
173 let line = ref (-1) in
174 begin
175 let ys = commentized xs in
176 ys +>
177 List.iter
178 (function
179 Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
180 let newline = pinfo.Common.line in
181 let s = pinfo.Common.str in
182 let s = Str.global_substitute
183 (Str.regexp "\n") (fun s -> "") s
184 in
185 if newline = !line
186 then prerr_string (s ^ " ")
187 else begin
188 if !line = -1
189 then pr2_no_nl "passed:"
190 else pr2_no_nl "\npassed:";
191 line := newline;
192 pr2_no_nl (s ^ " ");
193 end
194 | _ -> ());
195 if not (null ys) then pr2 "";
196 end
197
198
199
200
201(*****************************************************************************)
202(* Lexing only *)
203(*****************************************************************************)
204
205(* called by parse_print_error_heuristic *)
206let tokens2 file =
207 let table = Common.full_charpos_to_pos file in
208
209 Common.with_open_infile file (fun chan ->
210 let lexbuf = Lexing.from_channel chan in
211 try
212 let rec tokens_aux acc =
213 let tok = Lexer_c.token lexbuf in
214 (* fill in the line and col information *)
215 let tok = tok +> TH.visitor_info_of_tok (fun ii ->
216 { ii with Ast_c.pinfo=
217 (* could assert pinfo.filename = file ? *)
218 match Ast_c.pinfo_of_info ii with
219 Ast_c.OriginTok pi ->
220 Ast_c.OriginTok (Common.complete_parse_info file table pi)
221 | Ast_c.ExpandedTok (pi,vpi) ->
222 Ast_c.ExpandedTok((Common.complete_parse_info file table pi),vpi)
223 | Ast_c.FakeTok (s,vpi) -> Ast_c.FakeTok (s,vpi)
224 | Ast_c.AbstractLineTok pi -> failwith "should not occur"
225 })
226 in
227
228 if TH.is_eof tok
229 then List.rev (tok::acc)
230 else tokens_aux (tok::acc)
231 in
232 tokens_aux []
233 with
234 | Lexer_c.Lexical s ->
235 failwith ("lexical error " ^ s ^ "\n =" ^
236 (Common.error_message file (lexbuf_to_strpos lexbuf)))
237 | e -> raise e
238 )
239
485bce71
C
240let time_lexing ?(profile=true) a =
241 if profile
242 then Common.profile_code_exclusif "LEXING" (fun () -> tokens2 a)
243 else tokens2 a
244let tokens ?profile a =
245 Common.profile_code "C parsing.tokens" (fun () -> time_lexing ?profile a)
34e49164
C
246
247
485bce71 248let tokens_of_string string =
34e49164
C
249 let lexbuf = Lexing.from_string string in
250 try
251 let rec tokens_s_aux () =
252 let tok = Lexer_c.token lexbuf in
253 if TH.is_eof tok
254 then [tok]
255 else tok::(tokens_s_aux ())
256 in
257 tokens_s_aux ()
258 with
259 | Lexer_c.Lexical s -> failwith ("lexical error " ^ s ^ "\n =" )
260 | e -> raise e
261
262
263(*****************************************************************************)
264(* Parsing, but very basic, no more used *)
265(*****************************************************************************)
266
267(*
268 * !!!Those function use refs, and are not reentrant !!! so take care.
269 * It use globals defined in Lexer_parser.
270 *
271 * update: because now lexer return comments tokens, those functions
272 * may not work anymore.
273 *)
274
275let parse file =
276 let lexbuf = Lexing.from_channel (open_in file) in
277 let result = Parser_c.main Lexer_c.token lexbuf in
278 result
279
280
281let parse_print_error file =
282 let chan = (open_in file) in
283 let lexbuf = Lexing.from_channel chan in
284
285 let error_msg () = Common.error_message file (lexbuf_to_strpos lexbuf) in
286 try
287 lexbuf +> Parser_c.main Lexer_c.token
288 with
289 | Lexer_c.Lexical s ->
290 failwith ("lexical error " ^s^ "\n =" ^ error_msg ())
291 | Parsing.Parse_error ->
292 failwith ("parse error \n = " ^ error_msg ())
293 | Semantic_c.Semantic (s, i) ->
294 failwith ("semantic error " ^ s ^ "\n =" ^ error_msg ())
295 | e -> raise e
296
297
298
299
300(*****************************************************************************)
301(* Parsing subelements, useful to debug parser *)
302(*****************************************************************************)
303
304(*
305 * !!!Those function use refs, and are not reentrant !!! so take care.
306 * It use globals defined in Lexer_parser.
307 *)
308
309
310(* old:
311 * let parse_gen parsefunc s =
312 * let lexbuf = Lexing.from_string s in
313 * let result = parsefunc Lexer_c.token lexbuf in
314 * result
315 *)
316
317let parse_gen parsefunc s =
485bce71 318 let toks = tokens_of_string s +> List.filter TH.is_not_comment in
34e49164
C
319
320
321 (* Why use this lexing scheme ? Why not classically give lexer func
322 * to parser ? Because I now keep comments in lexer. Could
323 * just do a simple wrapper that when comment ask again for a token,
324 * but maybe simpler to use cur_tok technique.
325 *)
326 let all_tokens = ref toks in
327 let cur_tok = ref (List.hd !all_tokens) in
328
329 let lexer_function =
330 (fun _ ->
331 if TH.is_eof !cur_tok
332 then (pr2 "LEXER: ALREADY AT END"; !cur_tok)
333 else
334 let v = Common.pop2 all_tokens in
335 cur_tok := v;
336 !cur_tok
337 )
338 in
339 let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
340 let result = parsefunc lexer_function lexbuf_fake in
341 result
342
343
344let type_of_string = parse_gen Parser_c.type_name
345let statement_of_string = parse_gen Parser_c.statement
346let expression_of_string = parse_gen Parser_c.expr
347
348(* ex: statement_of_string "(struct us_data* )psh->hostdata = NULL;" *)
349
350
351
352
353
354(*****************************************************************************)
355(* Consistency checking *)
356(*****************************************************************************)
357
113803cf
C
358(* todo:
359 * could check that an ident has always the same class, be it a typedef
360 * (but sometimes do 'acpi_val acpi_val;'), an ident, a TMacroStatement,
361 * etc.
362 *)
363
34e49164
C
364type class_ident =
365 | CIdent (* can be var, func, field, tag, enum constant *)
366 | CTypedef
367
368let str_of_class_ident = function
369 | CIdent -> "Ident"
370 | CTypedef -> "Typedef"
371
372(*
373 | CMacro
374 | CMacroString
375 | CMacroStmt
376 | CMacroDecl
377 | CMacroIterator
378 | CAttr
379
380(* but take care that must still be able to use '=' *)
381type context = InFunction | InEnum | InStruct | InInitializer | InParams
382type class_token =
383 | CIdent of class_ident
384
385 | CComment
386 | CSpace
387 | CCommentCpp of cppkind
388 | CCommentMisc
389 | CCppDirective
390
391 | COPar
392 | CCPar
393 | COBrace
394 | CCBrace
395
396 | CSymbol
397 | CReservedKwd (type | decl | qualif | flow | misc | attr)
398*)
399
400(* parse_typedef_fix4 *)
401let consistency_checking2 xs =
402
403 (* first phase, gather data *)
404 let stat = Hashtbl.create 101 in
405
406 (* default value for hash *)
407 let v1 () = Hashtbl.create 101 in
408 let v2 () = ref 0 in
409
410 let bigf = { Visitor_c.default_visitor_c with
411
412 Visitor_c.kexpr = (fun (k,bigf) x ->
413 match Ast_c.unwrap_expr x with
414 | Ast_c.Ident s ->
415 stat +>
416 Common.hfind_default s v1 +> Common.hfind_default CIdent v2 +>
417 (fun aref -> incr aref)
418
419 | _ -> k x
420 );
421 Visitor_c.ktype = (fun (k,bigf) t ->
422 match Ast_c.unwrap_typeC t with
423 | Ast_c.TypeName (s,_typ) ->
424 stat +>
425 Common.hfind_default s v1 +> Common.hfind_default CTypedef v2 +>
426 (fun aref -> incr aref)
427
428 | _ -> k t
429 );
430 }
431 in
432 xs +> List.iter (fun (p, info_item) -> Visitor_c.vk_toplevel bigf p);
433
434
435 let ident_to_type = ref [] in
436
437
438 (* second phase, analyze data *)
439 stat +> Hashtbl.iter (fun k v ->
440 let xs = Common.hash_to_list v in
441 if List.length xs >= 2
442 then begin
443 pr2 ("CONFLICT:" ^ k);
444 let sorted = xs +> List.sort (fun (ka,va) (kb,vb) ->
445 if !va = !vb then
446 (match ka, kb with
447 | CTypedef, _ -> 1 (* first is smaller *)
448 | _, CTypedef -> -1
449 | _ -> 0
450 )
451 else compare !va !vb
452 ) in
453 let sorted = List.rev sorted in
454 match sorted with
455 | [CTypedef, i1;CIdent, i2] ->
456 pr2 ("transforming some ident in typedef");
457 push2 k ident_to_type;
458 | _ ->
459 pr2 ("TODO:other transforming?");
460
461 end
462 );
463
485bce71
C
464 (* third phase, update ast.
465 * todo? but normally should try to handle correctly scope ? maybe sometime
466 * sizeof(id) and even if id was for a long time an identifier, maybe
467 * a few time, because of the scope it's actually really a type.
468 *)
34e49164
C
469 if (null !ident_to_type)
470 then xs
471 else
472 let bigf = { Visitor_c.default_visitor_c_s with
473 Visitor_c.kdefineval_s = (fun (k,bigf) x ->
474 match x with
475 | Ast_c.DefineExpr e ->
476 (match e with
477 | (Ast_c.Ident s, _), ii when List.mem s !ident_to_type ->
478 let t = (Ast_c.nQ,
479 (Ast_c.TypeName (s, Ast_c.noTypedefDef()), ii)) in
480
481 Ast_c.DefineType t
482 | _ -> k x
483 )
484 | _ -> k x
485 );
486 Visitor_c.kexpr_s = (fun (k, bigf) x ->
487 match x with
488 | (Ast_c.SizeOfExpr e, tref), isizeof ->
489 let i1 = tuple_of_list1 isizeof in
490 (match e with
491 | (Ast_c.ParenExpr e, _), iiparen ->
492 (match e with
493 | (Ast_c.Ident s, _), ii when List.mem s !ident_to_type ->
494 let (i2, i3) = tuple_of_list2 iiparen in
495 let t = (Ast_c.nQ,
496 (Ast_c.TypeName (s, Ast_c.noTypedefDef()), ii)) in
497 (Ast_c.SizeOfType t, tref), [i1;i2;i3]
498
499 | _ -> k x
500 )
501 | _ -> k x
502 )
503 | _ -> k x
504 );
505 } in
506 xs +> List.map (fun (p, info_item) ->
507 Visitor_c.vk_toplevel_s bigf p, info_item
508 )
509
510
511let consistency_checking a =
512 Common.profile_code "C consistencycheck" (fun () -> consistency_checking2 a)
513
514
515
516(*****************************************************************************)
517(* Error recovery *)
518(*****************************************************************************)
519
520(* todo: do something if find Parser_c.Eof ? *)
521let rec find_next_synchro next already_passed =
522
523 (* Maybe because not enough }, because for example an ifdef contains
524 * in both branch some opening {, we later eat too much, "on deborde
525 * sur la fonction d'apres". So already_passed may be too big and
526 * looking for next synchro point starting from next may not be the
527 * best. So maybe we can find synchro point inside already_passed
528 * instead of looking in next.
529 *
530 * But take care! must progress. We must not stay in infinite loop!
531 * For instance now I have as a error recovery to look for
532 * a "start of something", corresponding to start of function,
533 * but must go beyond this start otherwise will loop.
534 * So look at premier(external_declaration2) in parser.output and
535 * pass at least those first tokens.
536 *
537 * I have chosen to start search for next synchro point after the
538 * first { I found, so quite sure we will not loop. *)
539
540 let last_round = List.rev already_passed in
541 let is_define =
542 let xs = last_round +> List.filter TH.is_not_comment in
543 match xs with
544 | Parser_c.TDefine _::_ -> true
545 | _ -> false
546 in
547 if is_define
548 then find_next_synchro_define (last_round ++ next) []
549 else
550
551 let (before, after) =
552 last_round +> Common.span (fun tok ->
553 match tok with
554 (* by looking at TOBrace we are sure that the "start of something"
555 * will not arrive too early
556 *)
557 | Parser_c.TOBrace _ -> false
558 | Parser_c.TDefine _ -> false
559 | _ -> true
560 )
561 in
562 find_next_synchro_orig (after ++ next) (List.rev before)
563
564
565
566and find_next_synchro_define next already_passed =
567 match next with
568 | [] ->
569 pr2 "ERROR-RECOV: end of file while in recovery mode";
570 already_passed, []
571 | (Parser_c.TDefEOL i as v)::xs ->
485bce71 572 pr2 ("ERROR-RECOV: found sync end of #define, line "^i_to_s(TH.line_of_tok v));
34e49164
C
573 v::already_passed, xs
574 | v::xs ->
575 find_next_synchro_define xs (v::already_passed)
576
577
578
579
580and find_next_synchro_orig next already_passed =
581 match next with
582 | [] ->
583 pr2 "ERROR-RECOV: end of file while in recovery mode";
584 already_passed, []
585
586 | (Parser_c.TCBrace i as v)::xs when TH.col_of_tok v = 0 ->
587 pr2 ("ERROR-RECOV: found sync '}' at line "^i_to_s (TH.line_of_tok v));
588
589 (match xs with
590 | [] -> raise Impossible (* there is a EOF token normally *)
591
592 (* still useful: now parser.mly allow empty ';' so normally no pb *)
593 | Parser_c.TPtVirg iptvirg::xs ->
594 pr2 "ERROR-RECOV: found sync bis, eating } and ;";
595 (Parser_c.TPtVirg iptvirg)::v::already_passed, xs
596
597 | Parser_c.TIdent x::Parser_c.TPtVirg iptvirg::xs ->
598 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
599 (Parser_c.TPtVirg iptvirg)::(Parser_c.TIdent x)::v::already_passed,
600 xs
601
602 | Parser_c.TCommentSpace sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
603 ::xs ->
604 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
605 (Parser_c.TCommentSpace sp)::
606 (Parser_c.TPtVirg iptvirg)::
607 (Parser_c.TIdent x)::
608 v::
609 already_passed,
610 xs
611
612 | Parser_c.TCommentNewline sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
613 ::xs ->
614 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
615 (Parser_c.TCommentNewline sp)::
616 (Parser_c.TPtVirg iptvirg)::
617 (Parser_c.TIdent x)::
618 v::
619 already_passed,
620 xs
621
622 | _ ->
623 v::already_passed, xs
624 )
625 | v::xs when TH.col_of_tok v = 0 && TH.is_start_of_something v ->
626 pr2 ("ERROR-RECOV: found sync col 0 at line "^ i_to_s(TH.line_of_tok v));
627 already_passed, v::xs
628
629 | v::xs ->
630 find_next_synchro_orig xs (v::already_passed)
631
632
633(*****************************************************************************)
634(* Include/Define hacks *)
635(*****************************************************************************)
636
485bce71
C
637(* Sometimes I prefer to generate a single token for a list of things in the
638 * lexer so that if I have to passed them, like for passing TInclude then
639 * it's easy. Also if I don't do a single token, then I need to
640 * parse the rest which may not need special stuff, like detecting
641 * end of line which the parser is not really ready for. So for instance
642 * could I parse a #include <a/b/c/xxx.h> as 2 or more tokens ? just
643 * lex #include ? so then need recognize <a/b/c/xxx.h> as one token ?
644 * but this kind of token is valid only after a #include and the
645 * lexing and parsing rules are different for such tokens so not that
646 * easy to parse such things in parser_c.mly. Hence the following hacks.
647 *
648 * less?: maybe could get rid of this like I get rid of some of fix_define.
649 *)
650
34e49164
C
651(* ------------------------------------------------------------------------- *)
652(* helpers *)
653(* ------------------------------------------------------------------------- *)
654
655(* used to generate new token from existing one *)
656let new_info posadd str ii =
657 { Ast_c.pinfo =
658 Ast_c.OriginTok { (Ast_c.parse_info_of_info ii) with
659 charpos = Ast_c.pos_of_info ii + posadd;
660 str = str;
661 column = Ast_c.col_of_info ii + posadd;
662 };
663 (* must generate a new ref each time, otherwise share *)
664 cocci_tag = ref Ast_c.emptyAnnot;
665 comments_tag = ref Ast_c.emptyComments;
666 }
667
668
669let rec comment_until_defeol xs =
670 match xs with
671 | [] -> failwith "cant find end of define token TDefEOL"
672 | x::xs ->
673 (match x with
674 | Parser_c.TDefEOL i ->
0708f913 675 Parser_c.TCommentCpp (Token_c.CppDirective, TH.info_of_tok x)
34e49164
C
676 ::xs
677 | _ ->
485bce71
C
678 let x' =
679 (* bugfix: otherwise may lose a TComment token *)
680 if TH.is_real_comment x
681 then x
0708f913 682 else Parser_c.TCommentCpp (Token_c.CppPassingNormal (*good?*), TH.info_of_tok x)
485bce71
C
683 in
684 x'::comment_until_defeol xs
34e49164
C
685 )
686
687let drop_until_defeol xs =
688 List.tl
689 (Common.drop_until (function Parser_c.TDefEOL _ -> true | _ -> false) xs)
690
691
692
693(* ------------------------------------------------------------------------- *)
694(* returns a pair (replaced token, list of next tokens) *)
695(* ------------------------------------------------------------------------- *)
696
697let tokens_include (info, includes, filename, inifdef) =
698 Parser_c.TIncludeStart (Ast_c.rewrap_str includes info, inifdef),
699 [Parser_c.TIncludeFilename
700 (filename, (new_info (String.length includes) filename info))
701 ]
702
703(*****************************************************************************)
485bce71 704(* Parsing default define macros, usually in a standard.h file *)
34e49164
C
705(*****************************************************************************)
706
485bce71
C
707let parse_cpp_define_file2 file =
708 let toks = tokens ~profile:false file in
34e49164
C
709 let toks = Parsing_hacks.fix_tokens_define toks in
710 Parsing_hacks.extract_cpp_define toks
711
485bce71
C
712let parse_cpp_define_file a =
713 Common.profile_code_exclusif "HACK" (fun () -> parse_cpp_define_file2 a)
714
715(* can not be put in parsing_hack, cos then mutually recursive problem as
716 * we also want to parse the standard.h file.
717 *)
718let init_defs std_h =
719 if not (Common.lfile_exists std_h)
720 then pr2 ("warning: Can't find default macro file: " ^ std_h)
721 else begin
722 pr2 ("init_defs: " ^ std_h);
723 Parsing_hacks._defs := Common.hash_of_list (parse_cpp_define_file std_h);
724 end
725
34e49164
C
726
727(*****************************************************************************)
728(* Main entry point *)
729(*****************************************************************************)
730
731type info_item = string * Parser_c.token list
732
733type program2 = toplevel2 list
734 and toplevel2 = Ast_c.toplevel * info_item
735
485bce71
C
736let program_of_program2 xs =
737 xs +> List.map fst
738
739let with_program2 f program2 =
740 program2
741 +> Common.unzip
742 +> (fun (program, infos) ->
743 f program, infos
744 )
745 +> Common.uncurry Common.zip
746
747
34e49164
C
748
749(* The use of local refs (remaining_tokens, passed_tokens, ...) makes
750 * possible error recovery. Indeed, they allow to skip some tokens and
751 * still be able to call again the ocamlyacc parser. It is ugly code
752 * because we cant modify ocamllex and ocamlyacc. As we want some
753 * extended lexing tricks, we have to use such refs.
754 *
755 * Those refs are now also used for my lalr(k) technique. Indeed They
756 * store the futur and previous tokens that were parsed, and so
757 * provide enough context information for powerful lex trick.
758 *
759 * - passed_tokens_last_ckp stores the passed tokens since last
485bce71 760 * checkpoint. Used for NotParsedCorrectly and also to build the
34e49164
C
761 * info_item attached to each program_element.
762 * - passed_tokens_clean is used for lookahead, in fact for lookback.
763 * - remaining_tokens_clean is used for lookahead. Now remaining_tokens
764 * contain some comments and so would make pattern matching difficult
765 * in lookahead. Hence this variable. We would like also to get rid
766 * of cpp instruction because sometimes a cpp instruction is between
767 * two tokens and makes a pattern matching fail. But lookahead also
768 * transform some cpp instruction (in comment) so can't remove them.
769 *
770 * So remaining_tokens, passed_tokens_last_ckp contain comment-tokens,
771 * whereas passed_tokens_clean and remaining_tokens_clean does not contain
772 * comment-tokens.
773 *
774 * Normally we have:
775 * toks = (reverse passed_tok) ++ cur_tok ++ remaining_tokens
776 * after the call to pop2.
777 * toks = (reverse passed_tok) ++ remaining_tokens
778 * at the and of the lexer_function call.
779 * At the very beginning, cur_tok and remaining_tokens overlap, but not after.
780 * At the end of lexer_function call, cur_tok overlap with passed_tok.
781 *
782 * convention: I use "tr" for "tokens refs"
485bce71
C
783 *
784 * I now also need this lexing trick because the lexer return comment
785 * tokens.
34e49164
C
786 *)
787
788type tokens_state = {
789 mutable rest : Parser_c.token list;
790 mutable rest_clean : Parser_c.token list;
791 mutable current : Parser_c.token;
792 (* it's passed since last "checkpoint", not passed from the beginning *)
793 mutable passed : Parser_c.token list;
794 mutable passed_clean : Parser_c.token list;
795}
485bce71
C
796let clone_tokens_stat tr =
797 { rest = tr.rest;
798 rest_clean = tr.rest_clean;
799 current = tr.current;
800 passed = tr.passed;
801 passed_clean = tr.passed_clean;
802 }
803let copy_tokens_stat ~src ~dst =
804 dst.rest <- src.rest;
805 dst.rest_clean <- src.rest_clean;
806 dst.current <- src.current;
807 dst.passed <- src.passed;
808 dst.passed_clean <- src.passed_clean;
809 ()
810
811let rec filter_noise n xs =
812 match n, xs with
813 | _, [] -> []
814 | 0, xs -> xs
815 | n, x::xs ->
816 (match x with
817 | Parser_c.TMacroAttr _ ->
818 filter_noise (n-1) xs
819 | _ ->
820 x::filter_noise (n-1) xs
821 )
822
823let clean_for_lookahead xs =
824 match xs with
825 | [] -> []
826 | [x] -> [x]
827 | x::xs ->
828 x::filter_noise 10 xs
829
34e49164 830
485bce71
C
831
832(* Hacked lex. This function use refs passed by parse_print_error_heuristic
833 * tr means token refs.
834 *)
835let rec lexer_function ~pass tr = fun lexbuf ->
34e49164
C
836 match tr.rest with
837 | [] -> pr2 "ALREADY AT END"; tr.current
838 | v::xs ->
839 tr.rest <- xs;
840 tr.current <- v;
841
842 if !Flag_parsing_c.debug_lexer then Common.pr2_gen v;
843
844 if TH.is_comment v
845 then begin
846 tr.passed <- v::tr.passed;
485bce71 847 lexer_function ~pass tr lexbuf
34e49164
C
848 end
849 else begin
850 let x = List.hd tr.rest_clean in
851 tr.rest_clean <- List.tl tr.rest_clean;
852 assert (x = v);
853
854 (match v with
113803cf
C
855
856 (* fix_define1.
857 *
858 * Why not in parsing_hacks lookahead and do passing like
485bce71 859 * I do for some ifdef directives ? Because here I also need to
113803cf
C
860 * generate some tokens sometimes and so I need access to the
861 * tr.passed, tr.rest, etc.
485bce71 862 *)
34e49164 863 | Parser_c.TDefine (tok) ->
485bce71
C
864 if not (LP.current_context () = LP.InTopLevel) &&
865 (!Flag_parsing_c.cpp_directive_passing || (pass = 2))
34e49164 866 then begin
485bce71 867 incr Stat.nDefinePassing;
34e49164 868 pr2_once ("CPP-DEFINE: inside function, I treat it as comment");
0708f913 869 let v' = Parser_c.TCommentCpp (Token_c.CppDirective,TH.info_of_tok v)
34e49164
C
870 in
871 tr.passed <- v'::tr.passed;
872 tr.rest <- comment_until_defeol tr.rest;
873 tr.rest_clean <- drop_until_defeol tr.rest_clean;
485bce71 874 lexer_function ~pass tr lexbuf
34e49164
C
875 end
876 else begin
877 tr.passed <- v::tr.passed;
878 tr.passed_clean <- v::tr.passed_clean;
879 v
880 end
881
882 | Parser_c.TInclude (includes, filename, inifdef, info) ->
485bce71
C
883 if not (LP.current_context () = LP.InTopLevel) &&
884 (!Flag_parsing_c.cpp_directive_passing || (pass = 2))
34e49164 885 then begin
485bce71 886 incr Stat.nIncludePassing;
34e49164 887 pr2_once ("CPP-INCLUDE: inside function, I treat it as comment");
0708f913 888 let v = Parser_c.TCommentCpp(Token_c.CppDirective, info) in
34e49164 889 tr.passed <- v::tr.passed;
485bce71 890 lexer_function ~pass tr lexbuf
34e49164
C
891 end
892 else begin
893 let (v,new_tokens) =
894 tokens_include (info, includes, filename, inifdef) in
895 let new_tokens_clean =
896 new_tokens +> List.filter TH.is_not_comment in
897
898 tr.passed <- v::tr.passed;
899 tr.passed_clean <- v::tr.passed_clean;
900 tr.rest <- new_tokens ++ tr.rest;
901 tr.rest_clean <- new_tokens_clean ++ tr.rest_clean;
902 v
903 end
904
905 | _ ->
906
907 (* typedef_fix1 *)
908 let v = match v with
909 | Parser_c.TIdent (s, ii) ->
485bce71
C
910 if
911 LP.is_typedef s &&
912 not (!Flag_parsing_c.disable_add_typedef) &&
913 pass = 1
34e49164
C
914 then Parser_c.TypedefIdent (s, ii)
915 else Parser_c.TIdent (s, ii)
916 | x -> x
917 in
918
485bce71
C
919 let v = Parsing_hacks.lookahead ~pass
920 (clean_for_lookahead (v::tr.rest_clean))
921 tr.passed_clean in
34e49164
C
922
923 tr.passed <- v::tr.passed;
924
485bce71 925 (* the lookahead may have changed the status of the token and
34e49164 926 * consider it as a comment, for instance some #include are
485bce71 927 * turned into comments, hence this code. *)
34e49164 928 match v with
485bce71 929 | Parser_c.TCommentCpp _ -> lexer_function ~pass tr lexbuf
34e49164
C
930 | v ->
931 tr.passed_clean <- v::tr.passed_clean;
932 v
933 )
934 end
935
936
937
485bce71
C
938let get_one_elem ~pass tr (file, filelines) =
939
940 if not (LP.is_enabled_typedef()) && !Flag_parsing_c.debug_typedef
941 then pr2 "TYPEDEF:_handle_typedef=false. Not normal if dont come from exn";
942
943 (* normally have to do that only when come from an exception in which
944 * case the dt() may not have been done
945 * TODO but if was in scoped scope ? have to let only the last scope
946 * so need do a LP.lexer_reset_typedef ();
947 *)
948 LP.enable_typedef();
949 LP._lexer_hint := (LP.default_hint ());
950 LP.save_typedef_state();
951
952 tr.passed <- [];
953
954 let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
955
956 (try
957 (* -------------------------------------------------- *)
958 (* Call parser *)
959 (* -------------------------------------------------- *)
960 Common.profile_code_exclusif "YACC" (fun () ->
961 Left (Parser_c.celem (lexer_function ~pass tr) lexbuf_fake)
962 )
963 with e -> begin
964 if (pass = 1 && !Flag_parsing_c.disable_two_pass)|| (pass = 2)
965 then begin
966 (match e with
967 (* Lexical is not anymore launched I think *)
968 | Lexer_c.Lexical s ->
969 pr2 ("lexical error " ^s^ "\n =" ^ error_msg_tok tr.current)
970 | Parsing.Parse_error ->
971 pr2 ("parse error \n = " ^ error_msg_tok tr.current)
972 | Semantic_c.Semantic (s, i) ->
973 pr2 ("semantic error " ^s^ "\n ="^ error_msg_tok tr.current)
974 | e -> raise e
975 )
976 end;
977 LP.restore_typedef_state();
978
979 (* must keep here, before the code that adjusts the tr fields *)
980 let line_error = TH.line_of_tok tr.current in
981
982
983 (* error recovery, go to next synchro point *)
984 let (passed', rest') = find_next_synchro tr.rest tr.passed in
985 tr.rest <- rest';
986 tr.passed <- passed';
987
988 tr.current <- List.hd passed';
989 tr.passed_clean <- []; (* enough ? *)
990 (* with error recovery, rest and rest_clean may not be in sync *)
991 tr.rest_clean <- (tr.rest +> List.filter TH.is_not_comment);
992
993
994 let info_of_bads = Common.map_eff_rev TH.info_of_tok tr.passed in
91eba41f 995 Right (info_of_bads, line_error, tr.passed)
485bce71
C
996 end
997 )
998
999
1000
1001
34e49164 1002(* note: as now we go in 2 passes, there is first all the error message of
485bce71 1003 * the lexer, and then the error of the parser. It is not anymore
34e49164
C
1004 * interwinded.
1005 *
1006 * !!!This function use refs, and is not reentrant !!! so take care.
1007 * It use globals defined in Lexer_parser and also the _defs global
485bce71
C
1008 * in parsing_hack.ml.
1009 *
1010 * This function uses internally some semi globals in the
1011 * tokens_stat record and parsing_stat record.
34e49164
C
1012 *)
1013
1014let parse_print_error_heuristic2 file =
1015
91eba41f 1016 let filelines = Common.cat_array file in
485bce71
C
1017 let stat = Parsing_stat.default_stat file in
1018
34e49164
C
1019 (* -------------------------------------------------- *)
1020 (* call lexer and get all the tokens *)
1021 (* -------------------------------------------------- *)
1022 LP.lexer_reset_typedef();
485bce71
C
1023 Parsing_hacks.ifdef_paren_cnt := 0;
1024 let toks_orig = tokens file in
34e49164 1025
485bce71 1026 let toks = Parsing_hacks.fix_tokens_define toks_orig in
34e49164
C
1027 let toks = Parsing_hacks.fix_tokens_cpp toks in
1028
34e49164
C
1029 let tr = {
1030 rest = toks;
1031 rest_clean = (toks +> List.filter TH.is_not_comment);
1032 current = (List.hd toks);
1033 passed = [];
1034 passed_clean = [];
1035 } in
34e49164 1036
34e49164 1037
34e49164 1038
485bce71
C
1039
1040 let rec loop tr =
34e49164
C
1041
1042 (* todo?: I am not sure that it represents current_line, cos maybe
1043 * tr.current partipated in the previous parsing phase, so maybe tr.current
1044 * is not the first token of the next parsing phase. Same with checkpoint2.
1045 * It would be better to record when we have a } or ; in parser.mly,
1046 * cos we know that they are the last symbols of external_declaration2.
485bce71
C
1047 *
1048 * bugfix: may not be equal to 'file' as after macro expansions we can
1049 * start to parse a new entity from the body of a macro, for instance
1050 * when parsing a define_machine() body, cf standard.h
34e49164
C
1051 *)
1052 let checkpoint = TH.line_of_tok tr.current in
485bce71 1053 let checkpoint_file = TH.file_of_tok tr.current in
34e49164 1054
485bce71
C
1055 let tr_save = clone_tokens_stat tr in
1056
1057 (* call the parser *)
34e49164 1058 let elem =
485bce71
C
1059 let pass1 = get_one_elem ~pass:1 tr (file, filelines) in
1060 match pass1 with
1061 | Left e -> Left e
1062 | Right res ->
1063 if !Flag_parsing_c.disable_two_pass
1064 then Right res
1065 else begin
1066 pr2 "parsing pass2: try again";
1067 copy_tokens_stat ~src:tr_save ~dst: tr;
1068 let pass2 = get_one_elem ~pass:2 tr (file, filelines) in
1069 pass2
1070 end
1071 in
1072
1073
1074 (* again not sure if checkpoint2 corresponds to end of bad region *)
1075 let checkpoint2 = TH.line_of_tok tr.current in (* <> line_error *)
1076 let checkpoint2_file = TH.file_of_tok tr.current in
1077
1078 let was_define =
1079 (match elem with
1080 | Left _ -> false
91eba41f 1081 | Right (_, line_error, _) ->
485bce71 1082 let was_define =
34e49164
C
1083 let xs = tr.passed +> List.rev +> List.filter TH.is_not_comment in
1084 if List.length xs >= 2
1085 then
1086 (match Common.head_middle_tail xs with
1087 | Parser_c.TDefine _, _, Parser_c.TDefEOL _ ->
485bce71
C
1088 true
1089 | _ -> false
34e49164 1090 )
485bce71 1091 else begin
0708f913 1092 pr2 "WEIRD: length list of error recovery tokens < 2 ";
485bce71
C
1093 false
1094 end
1095 in
1096 (if was_define && !Flag_parsing_c.filter_msg_define_error
1097 then ()
1098 else
1099 (* bugfix: *)
1100 if (checkpoint_file = checkpoint2_file) && checkpoint_file = file
1101 then print_bad line_error (checkpoint, checkpoint2) filelines
1102 else pr2 "PB: bad: but on tokens not from original file"
1103 );
1104 was_define
1105 ) in
1106
34e49164 1107
485bce71
C
1108 let diffline =
1109 if (checkpoint_file = checkpoint2_file) && (checkpoint_file = file)
1110 then (checkpoint2 - checkpoint)
1111 else 0
1112 (* TODO? so if error come in middle of something ? where the
1113 * start token was from original file but synchro found in body
1114 * of macro ? then can have wrong number of lines stat.
1115 * Maybe simpler just to look at tr.passed and count
1116 * the lines in the token from the correct file ?
1117 *)
34e49164 1118 in
34e49164
C
1119 let info = mk_info_item file (List.rev tr.passed) in
1120
485bce71
C
1121 (* some stat updates *)
1122 stat.Stat.commentized <-
1123 stat.Stat.commentized + count_lines_commentized (snd info);
1124
1125 let elem =
1126 match elem with
91eba41f
C
1127 | Left e ->
1128 stat.Stat.correct <- stat.Stat.correct + diffline;
1129 e
1130 | Right (info_of_bads, line_error, toks_of_bads) ->
1131 if was_define && !Flag_parsing_c.filter_define_error
1132 then stat.Stat.correct <- stat.Stat.correct + diffline
1133 else stat.Stat.bad <- stat.Stat.bad + diffline;
1134
1135 let pbline =
1136 toks_of_bads
0708f913 1137 +> Common.filter (TH.is_same_line_or_close line_error)
91eba41f
C
1138 +> Common.filter TH.is_ident_like
1139 in
1140 let error_info =
1141 (pbline +> List.map TH.str_of_tok), line_error
1142 in
1143 stat.Stat.problematic_lines <-
1144 error_info::stat.Stat.problematic_lines;
1145
485bce71
C
1146 Ast_c.NotParsedCorrectly info_of_bads
1147 in
34e49164
C
1148
1149 (match elem with
1150 | Ast_c.FinalDef x -> [(Ast_c.FinalDef x, info)]
485bce71 1151 | xs -> (xs, info):: loop tr (* recurse *)
34e49164
C
1152 )
1153 in
485bce71 1154 let v = loop tr in
34e49164
C
1155 let v = consistency_checking v in
1156 (v, stat)
1157
1158
485bce71
C
1159let time_total_parsing a =
1160 Common.profile_code "TOTAL" (fun () -> parse_print_error_heuristic2 a)
1161
34e49164 1162let parse_print_error_heuristic a =
485bce71
C
1163 Common.profile_code "C parsing" (fun () -> time_total_parsing a)
1164
34e49164
C
1165
1166(* alias *)
1167let parse_c_and_cpp a = parse_print_error_heuristic a
1168
1169(*****************************************************************************)
1170(* Same but faster cos memoize stuff *)
1171(*****************************************************************************)
1172let parse_cache file =
1173 if not !Flag_parsing_c.use_cache then parse_print_error_heuristic file
1174 else
485bce71 1175 let _ = pr2 "TOFIX" in
34e49164
C
1176 let need_no_changed_files =
1177 (* should use Sys.argv.(0), would be safer. *)
485bce71
C
1178
1179 [
1180 (* TOFIX
1181 Config.path ^ "/parsing_c/c_parser.cma";
1182 (* we may also depend now on the semantic patch because
1183 the SP may use macro and so we will disable some of the
1184 macro expansions from standard.h.
1185 *)
1186 !Config.std_h;
1187 *)
34e49164
C
1188 ]
1189 in
1190 let need_no_changed_variables =
1191 (* could add some of the flags of flag_parsing_c.ml *)
1192 []
1193 in
1194 Common.cache_computation_robust
1195 file ".ast_raw"
1196 (need_no_changed_files, need_no_changed_variables) ".depend_raw"
1197 (fun () -> parse_print_error_heuristic file)
1198
1199
1200
1201(*****************************************************************************)
485bce71 1202(* Some special cases *)
34e49164
C
1203(*****************************************************************************)
1204
485bce71
C
1205let (cstatement_of_string: string -> Ast_c.statement) = fun s ->
1206 Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ "\n}");
1207 let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1208 program +> Common.find_some (fun (e,_) ->
1209 match e with
1210 | Ast_c.Definition ({Ast_c.f_body = [Ast_c.StmtElem st]},_) -> Some st
1211 | _ -> None
1212 )
1213
1214let (cexpression_of_string: string -> Ast_c.expression) = fun s ->
1215 Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ ";\n}");
1216 let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1217 program +> Common.find_some (fun (e,_) ->
1218 match e with
1219 | Ast_c.Definition ({Ast_c.f_body = compound},_) ->
1220 (match compound with
1221 | [Ast_c.StmtElem (Ast_c.ExprStatement (Some e),ii)] -> Some e
1222 | _ -> None
1223 )
1224 | _ -> None
1225 )