Release coccinelle-0.1.2
[bpt/coccinelle.git] / parsing_c / parse_c.ml
CommitLineData
485bce71 1(* Copyright (C) 2006, 2007, 2008 Yoann Padioleau
34e49164
C
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU General Public License (GPL)
5 * version 2 as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * file license.txt for more details.
11 *)
12
13open Common
14
15module TH = Token_helpers
16module LP = Lexer_parser
17
485bce71
C
18module Stat = Parsing_stat
19
34e49164
C
20(*****************************************************************************)
21(* Wrappers *)
22(*****************************************************************************)
23let pr2 s =
24 if !Flag_parsing_c.verbose_parsing
25 then Common.pr2 s
485bce71
C
26
27let pr2_once s =
28 if !Flag_parsing_c.verbose_parsing
29 then Common.pr2_once s
34e49164
C
30
31(*****************************************************************************)
32(* Helpers *)
33(*****************************************************************************)
34
35let lexbuf_to_strpos lexbuf =
36 (Lexing.lexeme lexbuf, Lexing.lexeme_start lexbuf)
37
38let token_to_strpos tok =
39 (TH.str_of_tok tok, TH.pos_of_tok tok)
40
41
42let error_msg_tok tok =
43 let file = TH.file_of_tok tok in
44 if !Flag_parsing_c.verbose_parsing
45 then Common.error_message file (token_to_strpos tok)
46 else ("error in " ^ file ^ "set verbose_parsing for more info")
47
48
49let print_bad line_error (start_line, end_line) filelines =
34e49164
C
50 begin
51 pr2 ("badcount: " ^ i_to_s (end_line - start_line));
485bce71 52
34e49164 53 for i = start_line to end_line do
485bce71
C
54 let line = filelines.(i) in
55
34e49164 56 if i = line_error
485bce71
C
57 then pr2 ("BAD:!!!!!" ^ " " ^ line)
58 else pr2 ("bad:" ^ " " ^ line)
34e49164
C
59 done
60 end
61
62
63
64let mk_info_item2 filename toks =
65 let buf = Buffer.create 100 in
66 let s =
67 (* old: get_slice_file filename (line1, line2) *)
68 begin
69 toks +> List.iter (fun tok ->
70 match TH.pinfo_of_tok tok with
71 | Ast_c.OriginTok _ -> Buffer.add_string buf (TH.str_of_tok tok)
72 | Ast_c.AbstractLineTok _ -> raise Impossible
73 | _ -> ()
74 );
75 Buffer.contents buf
76 end
77 in
78 (s, toks)
79
80let mk_info_item a b =
81 Common.profile_code "C parsing.mk_info_item"
82 (fun () -> mk_info_item2 a b)
83
84
85
34e49164
C
86
87(*****************************************************************************)
88(* Stats on what was passed/commentized *)
89(*****************************************************************************)
90
91let commentized xs = xs +> Common.map_filter (function
92 | Parser_c.TCommentCpp (cppkind, ii) ->
485bce71
C
93 let s = Ast_c.str_of_info ii in
94 let legal_passing =
95 match !Flag_parsing_c.filter_passed_level with
96 | 0 -> false
97 | 1 ->
98 List.mem cppkind [Ast_c.CppAttr]
99 ||
100 (s =~ "__.*")
101 | 2 ->
102 List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal]
103 ||
104 (s =~ "__.*")
105 | 3 ->
106 List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal;Ast_c.CppDirective]
107 ||
108 (s =~ "__.*")
109 | 4 ->
110 List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal;Ast_c.CppMacro]
111 ||
112 (s =~ "__.*")
113
114
115 | 5 ->
116 List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal;Ast_c.CppDirective;Ast_c.CppMacro]
117 ||
118 (s =~ "__.*")
119
120
121
122
123 | _ -> failwith "not valid level passing number"
124 in
125 if legal_passing then None else Some (ii.Ast_c.pinfo)
126
127 (*
34e49164 128 | Ast_c.CppOther ->
34e49164
C
129 (match s with
130 | s when s =~ "KERN_.*" -> None
131 | s when s =~ "__.*" -> None
485bce71
C
132 | _ ->
133 Some (ii.Ast_c.pinfo)
34e49164 134 )
485bce71
C
135 *)
136
34e49164
C
137
138 | Parser_c.TCommentMisc ii
139 | Parser_c.TAction ii
140 ->
141 Some (ii.Ast_c.pinfo)
142 | _ ->
143 None
144 )
145
146let count_lines_commentized xs =
147 let line = ref (-1) in
148 let count = ref 0 in
149 begin
150 commentized xs +>
151 List.iter
152 (function
153 Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
154 let newline = pinfo.Common.line in
155 if newline <> !line
156 then begin
157 line := newline;
158 incr count
159 end
160 | _ -> ());
161 !count
162 end
163
164
165
166let print_commentized xs =
167 let line = ref (-1) in
168 begin
169 let ys = commentized xs in
170 ys +>
171 List.iter
172 (function
173 Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
174 let newline = pinfo.Common.line in
175 let s = pinfo.Common.str in
176 let s = Str.global_substitute
177 (Str.regexp "\n") (fun s -> "") s
178 in
179 if newline = !line
180 then prerr_string (s ^ " ")
181 else begin
182 if !line = -1
183 then pr2_no_nl "passed:"
184 else pr2_no_nl "\npassed:";
185 line := newline;
186 pr2_no_nl (s ^ " ");
187 end
188 | _ -> ());
189 if not (null ys) then pr2 "";
190 end
191
192
193
194
195(*****************************************************************************)
196(* Lexing only *)
197(*****************************************************************************)
198
199(* called by parse_print_error_heuristic *)
200let tokens2 file =
201 let table = Common.full_charpos_to_pos file in
202
203 Common.with_open_infile file (fun chan ->
204 let lexbuf = Lexing.from_channel chan in
205 try
206 let rec tokens_aux acc =
207 let tok = Lexer_c.token lexbuf in
208 (* fill in the line and col information *)
209 let tok = tok +> TH.visitor_info_of_tok (fun ii ->
210 { ii with Ast_c.pinfo=
211 (* could assert pinfo.filename = file ? *)
212 match Ast_c.pinfo_of_info ii with
213 Ast_c.OriginTok pi ->
214 Ast_c.OriginTok (Common.complete_parse_info file table pi)
215 | Ast_c.ExpandedTok (pi,vpi) ->
216 Ast_c.ExpandedTok((Common.complete_parse_info file table pi),vpi)
217 | Ast_c.FakeTok (s,vpi) -> Ast_c.FakeTok (s,vpi)
218 | Ast_c.AbstractLineTok pi -> failwith "should not occur"
219 })
220 in
221
222 if TH.is_eof tok
223 then List.rev (tok::acc)
224 else tokens_aux (tok::acc)
225 in
226 tokens_aux []
227 with
228 | Lexer_c.Lexical s ->
229 failwith ("lexical error " ^ s ^ "\n =" ^
230 (Common.error_message file (lexbuf_to_strpos lexbuf)))
231 | e -> raise e
232 )
233
485bce71
C
234let time_lexing ?(profile=true) a =
235 if profile
236 then Common.profile_code_exclusif "LEXING" (fun () -> tokens2 a)
237 else tokens2 a
238let tokens ?profile a =
239 Common.profile_code "C parsing.tokens" (fun () -> time_lexing ?profile a)
34e49164
C
240
241
485bce71 242let tokens_of_string string =
34e49164
C
243 let lexbuf = Lexing.from_string string in
244 try
245 let rec tokens_s_aux () =
246 let tok = Lexer_c.token lexbuf in
247 if TH.is_eof tok
248 then [tok]
249 else tok::(tokens_s_aux ())
250 in
251 tokens_s_aux ()
252 with
253 | Lexer_c.Lexical s -> failwith ("lexical error " ^ s ^ "\n =" )
254 | e -> raise e
255
256
257(*****************************************************************************)
258(* Parsing, but very basic, no more used *)
259(*****************************************************************************)
260
261(*
262 * !!!Those function use refs, and are not reentrant !!! so take care.
263 * It use globals defined in Lexer_parser.
264 *
265 * update: because now lexer return comments tokens, those functions
266 * may not work anymore.
267 *)
268
269let parse file =
270 let lexbuf = Lexing.from_channel (open_in file) in
271 let result = Parser_c.main Lexer_c.token lexbuf in
272 result
273
274
275let parse_print_error file =
276 let chan = (open_in file) in
277 let lexbuf = Lexing.from_channel chan in
278
279 let error_msg () = Common.error_message file (lexbuf_to_strpos lexbuf) in
280 try
281 lexbuf +> Parser_c.main Lexer_c.token
282 with
283 | Lexer_c.Lexical s ->
284 failwith ("lexical error " ^s^ "\n =" ^ error_msg ())
285 | Parsing.Parse_error ->
286 failwith ("parse error \n = " ^ error_msg ())
287 | Semantic_c.Semantic (s, i) ->
288 failwith ("semantic error " ^ s ^ "\n =" ^ error_msg ())
289 | e -> raise e
290
291
292
293
294(*****************************************************************************)
295(* Parsing subelements, useful to debug parser *)
296(*****************************************************************************)
297
298(*
299 * !!!Those function use refs, and are not reentrant !!! so take care.
300 * It use globals defined in Lexer_parser.
301 *)
302
303
304(* old:
305 * let parse_gen parsefunc s =
306 * let lexbuf = Lexing.from_string s in
307 * let result = parsefunc Lexer_c.token lexbuf in
308 * result
309 *)
310
311let parse_gen parsefunc s =
485bce71 312 let toks = tokens_of_string s +> List.filter TH.is_not_comment in
34e49164
C
313
314
315 (* Why use this lexing scheme ? Why not classically give lexer func
316 * to parser ? Because I now keep comments in lexer. Could
317 * just do a simple wrapper that when comment ask again for a token,
318 * but maybe simpler to use cur_tok technique.
319 *)
320 let all_tokens = ref toks in
321 let cur_tok = ref (List.hd !all_tokens) in
322
323 let lexer_function =
324 (fun _ ->
325 if TH.is_eof !cur_tok
326 then (pr2 "LEXER: ALREADY AT END"; !cur_tok)
327 else
328 let v = Common.pop2 all_tokens in
329 cur_tok := v;
330 !cur_tok
331 )
332 in
333 let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
334 let result = parsefunc lexer_function lexbuf_fake in
335 result
336
337
338let type_of_string = parse_gen Parser_c.type_name
339let statement_of_string = parse_gen Parser_c.statement
340let expression_of_string = parse_gen Parser_c.expr
341
342(* ex: statement_of_string "(struct us_data* )psh->hostdata = NULL;" *)
343
344
345
346
347
348(*****************************************************************************)
349(* Consistency checking *)
350(*****************************************************************************)
351
352type class_ident =
353 | CIdent (* can be var, func, field, tag, enum constant *)
354 | CTypedef
355
356let str_of_class_ident = function
357 | CIdent -> "Ident"
358 | CTypedef -> "Typedef"
359
360(*
361 | CMacro
362 | CMacroString
363 | CMacroStmt
364 | CMacroDecl
365 | CMacroIterator
366 | CAttr
367
368(* but take care that must still be able to use '=' *)
369type context = InFunction | InEnum | InStruct | InInitializer | InParams
370type class_token =
371 | CIdent of class_ident
372
373 | CComment
374 | CSpace
375 | CCommentCpp of cppkind
376 | CCommentMisc
377 | CCppDirective
378
379 | COPar
380 | CCPar
381 | COBrace
382 | CCBrace
383
384 | CSymbol
385 | CReservedKwd (type | decl | qualif | flow | misc | attr)
386*)
387
388(* parse_typedef_fix4 *)
389let consistency_checking2 xs =
390
391 (* first phase, gather data *)
392 let stat = Hashtbl.create 101 in
393
394 (* default value for hash *)
395 let v1 () = Hashtbl.create 101 in
396 let v2 () = ref 0 in
397
398 let bigf = { Visitor_c.default_visitor_c with
399
400 Visitor_c.kexpr = (fun (k,bigf) x ->
401 match Ast_c.unwrap_expr x with
402 | Ast_c.Ident s ->
403 stat +>
404 Common.hfind_default s v1 +> Common.hfind_default CIdent v2 +>
405 (fun aref -> incr aref)
406
407 | _ -> k x
408 );
409 Visitor_c.ktype = (fun (k,bigf) t ->
410 match Ast_c.unwrap_typeC t with
411 | Ast_c.TypeName (s,_typ) ->
412 stat +>
413 Common.hfind_default s v1 +> Common.hfind_default CTypedef v2 +>
414 (fun aref -> incr aref)
415
416 | _ -> k t
417 );
418 }
419 in
420 xs +> List.iter (fun (p, info_item) -> Visitor_c.vk_toplevel bigf p);
421
422
423 let ident_to_type = ref [] in
424
425
426 (* second phase, analyze data *)
427 stat +> Hashtbl.iter (fun k v ->
428 let xs = Common.hash_to_list v in
429 if List.length xs >= 2
430 then begin
431 pr2 ("CONFLICT:" ^ k);
432 let sorted = xs +> List.sort (fun (ka,va) (kb,vb) ->
433 if !va = !vb then
434 (match ka, kb with
435 | CTypedef, _ -> 1 (* first is smaller *)
436 | _, CTypedef -> -1
437 | _ -> 0
438 )
439 else compare !va !vb
440 ) in
441 let sorted = List.rev sorted in
442 match sorted with
443 | [CTypedef, i1;CIdent, i2] ->
444 pr2 ("transforming some ident in typedef");
445 push2 k ident_to_type;
446 | _ ->
447 pr2 ("TODO:other transforming?");
448
449 end
450 );
451
485bce71
C
452 (* third phase, update ast.
453 * todo? but normally should try to handle correctly scope ? maybe sometime
454 * sizeof(id) and even if id was for a long time an identifier, maybe
455 * a few time, because of the scope it's actually really a type.
456 *)
34e49164
C
457 if (null !ident_to_type)
458 then xs
459 else
460 let bigf = { Visitor_c.default_visitor_c_s with
461 Visitor_c.kdefineval_s = (fun (k,bigf) x ->
462 match x with
463 | Ast_c.DefineExpr e ->
464 (match e with
465 | (Ast_c.Ident s, _), ii when List.mem s !ident_to_type ->
466 let t = (Ast_c.nQ,
467 (Ast_c.TypeName (s, Ast_c.noTypedefDef()), ii)) in
468
469 Ast_c.DefineType t
470 | _ -> k x
471 )
472 | _ -> k x
473 );
474 Visitor_c.kexpr_s = (fun (k, bigf) x ->
475 match x with
476 | (Ast_c.SizeOfExpr e, tref), isizeof ->
477 let i1 = tuple_of_list1 isizeof in
478 (match e with
479 | (Ast_c.ParenExpr e, _), iiparen ->
480 (match e with
481 | (Ast_c.Ident s, _), ii when List.mem s !ident_to_type ->
482 let (i2, i3) = tuple_of_list2 iiparen in
483 let t = (Ast_c.nQ,
484 (Ast_c.TypeName (s, Ast_c.noTypedefDef()), ii)) in
485 (Ast_c.SizeOfType t, tref), [i1;i2;i3]
486
487 | _ -> k x
488 )
489 | _ -> k x
490 )
491 | _ -> k x
492 );
493 } in
494 xs +> List.map (fun (p, info_item) ->
495 Visitor_c.vk_toplevel_s bigf p, info_item
496 )
497
498
499let consistency_checking a =
500 Common.profile_code "C consistencycheck" (fun () -> consistency_checking2 a)
501
502
503
504(*****************************************************************************)
505(* Error recovery *)
506(*****************************************************************************)
507
508(* todo: do something if find Parser_c.Eof ? *)
509let rec find_next_synchro next already_passed =
510
511 (* Maybe because not enough }, because for example an ifdef contains
512 * in both branch some opening {, we later eat too much, "on deborde
513 * sur la fonction d'apres". So already_passed may be too big and
514 * looking for next synchro point starting from next may not be the
515 * best. So maybe we can find synchro point inside already_passed
516 * instead of looking in next.
517 *
518 * But take care! must progress. We must not stay in infinite loop!
519 * For instance now I have as a error recovery to look for
520 * a "start of something", corresponding to start of function,
521 * but must go beyond this start otherwise will loop.
522 * So look at premier(external_declaration2) in parser.output and
523 * pass at least those first tokens.
524 *
525 * I have chosen to start search for next synchro point after the
526 * first { I found, so quite sure we will not loop. *)
527
528 let last_round = List.rev already_passed in
529 let is_define =
530 let xs = last_round +> List.filter TH.is_not_comment in
531 match xs with
532 | Parser_c.TDefine _::_ -> true
533 | _ -> false
534 in
535 if is_define
536 then find_next_synchro_define (last_round ++ next) []
537 else
538
539 let (before, after) =
540 last_round +> Common.span (fun tok ->
541 match tok with
542 (* by looking at TOBrace we are sure that the "start of something"
543 * will not arrive too early
544 *)
545 | Parser_c.TOBrace _ -> false
546 | Parser_c.TDefine _ -> false
547 | _ -> true
548 )
549 in
550 find_next_synchro_orig (after ++ next) (List.rev before)
551
552
553
554and find_next_synchro_define next already_passed =
555 match next with
556 | [] ->
557 pr2 "ERROR-RECOV: end of file while in recovery mode";
558 already_passed, []
559 | (Parser_c.TDefEOL i as v)::xs ->
485bce71 560 pr2 ("ERROR-RECOV: found sync end of #define, line "^i_to_s(TH.line_of_tok v));
34e49164
C
561 v::already_passed, xs
562 | v::xs ->
563 find_next_synchro_define xs (v::already_passed)
564
565
566
567
568and find_next_synchro_orig next already_passed =
569 match next with
570 | [] ->
571 pr2 "ERROR-RECOV: end of file while in recovery mode";
572 already_passed, []
573
574 | (Parser_c.TCBrace i as v)::xs when TH.col_of_tok v = 0 ->
575 pr2 ("ERROR-RECOV: found sync '}' at line "^i_to_s (TH.line_of_tok v));
576
577 (match xs with
578 | [] -> raise Impossible (* there is a EOF token normally *)
579
580 (* still useful: now parser.mly allow empty ';' so normally no pb *)
581 | Parser_c.TPtVirg iptvirg::xs ->
582 pr2 "ERROR-RECOV: found sync bis, eating } and ;";
583 (Parser_c.TPtVirg iptvirg)::v::already_passed, xs
584
585 | Parser_c.TIdent x::Parser_c.TPtVirg iptvirg::xs ->
586 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
587 (Parser_c.TPtVirg iptvirg)::(Parser_c.TIdent x)::v::already_passed,
588 xs
589
590 | Parser_c.TCommentSpace sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
591 ::xs ->
592 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
593 (Parser_c.TCommentSpace sp)::
594 (Parser_c.TPtVirg iptvirg)::
595 (Parser_c.TIdent x)::
596 v::
597 already_passed,
598 xs
599
600 | Parser_c.TCommentNewline sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
601 ::xs ->
602 pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
603 (Parser_c.TCommentNewline sp)::
604 (Parser_c.TPtVirg iptvirg)::
605 (Parser_c.TIdent x)::
606 v::
607 already_passed,
608 xs
609
610 | _ ->
611 v::already_passed, xs
612 )
613 | v::xs when TH.col_of_tok v = 0 && TH.is_start_of_something v ->
614 pr2 ("ERROR-RECOV: found sync col 0 at line "^ i_to_s(TH.line_of_tok v));
615 already_passed, v::xs
616
617 | v::xs ->
618 find_next_synchro_orig xs (v::already_passed)
619
620
621(*****************************************************************************)
622(* Include/Define hacks *)
623(*****************************************************************************)
624
485bce71
C
625(* Sometimes I prefer to generate a single token for a list of things in the
626 * lexer so that if I have to passed them, like for passing TInclude then
627 * it's easy. Also if I don't do a single token, then I need to
628 * parse the rest which may not need special stuff, like detecting
629 * end of line which the parser is not really ready for. So for instance
630 * could I parse a #include <a/b/c/xxx.h> as 2 or more tokens ? just
631 * lex #include ? so then need recognize <a/b/c/xxx.h> as one token ?
632 * but this kind of token is valid only after a #include and the
633 * lexing and parsing rules are different for such tokens so not that
634 * easy to parse such things in parser_c.mly. Hence the following hacks.
635 *
636 * less?: maybe could get rid of this like I get rid of some of fix_define.
637 *)
638
34e49164
C
639(* ------------------------------------------------------------------------- *)
640(* helpers *)
641(* ------------------------------------------------------------------------- *)
642
643(* used to generate new token from existing one *)
644let new_info posadd str ii =
645 { Ast_c.pinfo =
646 Ast_c.OriginTok { (Ast_c.parse_info_of_info ii) with
647 charpos = Ast_c.pos_of_info ii + posadd;
648 str = str;
649 column = Ast_c.col_of_info ii + posadd;
650 };
651 (* must generate a new ref each time, otherwise share *)
652 cocci_tag = ref Ast_c.emptyAnnot;
653 comments_tag = ref Ast_c.emptyComments;
654 }
655
656
657let rec comment_until_defeol xs =
658 match xs with
659 | [] -> failwith "cant find end of define token TDefEOL"
660 | x::xs ->
661 (match x with
662 | Parser_c.TDefEOL i ->
663 Parser_c.TCommentCpp (Ast_c.CppDirective, TH.info_of_tok x)
664 ::xs
665 | _ ->
485bce71
C
666 let x' =
667 (* bugfix: otherwise may lose a TComment token *)
668 if TH.is_real_comment x
669 then x
670 else Parser_c.TCommentCpp (Ast_c.CppPassingNormal (*good?*), TH.info_of_tok x)
671 in
672 x'::comment_until_defeol xs
34e49164
C
673 )
674
675let drop_until_defeol xs =
676 List.tl
677 (Common.drop_until (function Parser_c.TDefEOL _ -> true | _ -> false) xs)
678
679
680
681(* ------------------------------------------------------------------------- *)
682(* returns a pair (replaced token, list of next tokens) *)
683(* ------------------------------------------------------------------------- *)
684
685let tokens_include (info, includes, filename, inifdef) =
686 Parser_c.TIncludeStart (Ast_c.rewrap_str includes info, inifdef),
687 [Parser_c.TIncludeFilename
688 (filename, (new_info (String.length includes) filename info))
689 ]
690
691(*****************************************************************************)
485bce71 692(* Parsing default define macros, usually in a standard.h file *)
34e49164
C
693(*****************************************************************************)
694
485bce71
C
695let parse_cpp_define_file2 file =
696 let toks = tokens ~profile:false file in
34e49164
C
697 let toks = Parsing_hacks.fix_tokens_define toks in
698 Parsing_hacks.extract_cpp_define toks
699
485bce71
C
700let parse_cpp_define_file a =
701 Common.profile_code_exclusif "HACK" (fun () -> parse_cpp_define_file2 a)
702
703(* can not be put in parsing_hack, cos then mutually recursive problem as
704 * we also want to parse the standard.h file.
705 *)
706let init_defs std_h =
707 if not (Common.lfile_exists std_h)
708 then pr2 ("warning: Can't find default macro file: " ^ std_h)
709 else begin
710 pr2 ("init_defs: " ^ std_h);
711 Parsing_hacks._defs := Common.hash_of_list (parse_cpp_define_file std_h);
712 end
713
34e49164
C
714
715(*****************************************************************************)
716(* Main entry point *)
717(*****************************************************************************)
718
719type info_item = string * Parser_c.token list
720
721type program2 = toplevel2 list
722 and toplevel2 = Ast_c.toplevel * info_item
723
485bce71
C
724let program_of_program2 xs =
725 xs +> List.map fst
726
727let with_program2 f program2 =
728 program2
729 +> Common.unzip
730 +> (fun (program, infos) ->
731 f program, infos
732 )
733 +> Common.uncurry Common.zip
734
735
34e49164
C
736
737(* The use of local refs (remaining_tokens, passed_tokens, ...) makes
738 * possible error recovery. Indeed, they allow to skip some tokens and
739 * still be able to call again the ocamlyacc parser. It is ugly code
740 * because we cant modify ocamllex and ocamlyacc. As we want some
741 * extended lexing tricks, we have to use such refs.
742 *
743 * Those refs are now also used for my lalr(k) technique. Indeed They
744 * store the futur and previous tokens that were parsed, and so
745 * provide enough context information for powerful lex trick.
746 *
747 * - passed_tokens_last_ckp stores the passed tokens since last
485bce71 748 * checkpoint. Used for NotParsedCorrectly and also to build the
34e49164
C
749 * info_item attached to each program_element.
750 * - passed_tokens_clean is used for lookahead, in fact for lookback.
751 * - remaining_tokens_clean is used for lookahead. Now remaining_tokens
752 * contain some comments and so would make pattern matching difficult
753 * in lookahead. Hence this variable. We would like also to get rid
754 * of cpp instruction because sometimes a cpp instruction is between
755 * two tokens and makes a pattern matching fail. But lookahead also
756 * transform some cpp instruction (in comment) so can't remove them.
757 *
758 * So remaining_tokens, passed_tokens_last_ckp contain comment-tokens,
759 * whereas passed_tokens_clean and remaining_tokens_clean does not contain
760 * comment-tokens.
761 *
762 * Normally we have:
763 * toks = (reverse passed_tok) ++ cur_tok ++ remaining_tokens
764 * after the call to pop2.
765 * toks = (reverse passed_tok) ++ remaining_tokens
766 * at the and of the lexer_function call.
767 * At the very beginning, cur_tok and remaining_tokens overlap, but not after.
768 * At the end of lexer_function call, cur_tok overlap with passed_tok.
769 *
770 * convention: I use "tr" for "tokens refs"
485bce71
C
771 *
772 * I now also need this lexing trick because the lexer return comment
773 * tokens.
34e49164
C
774 *)
775
776type tokens_state = {
777 mutable rest : Parser_c.token list;
778 mutable rest_clean : Parser_c.token list;
779 mutable current : Parser_c.token;
780 (* it's passed since last "checkpoint", not passed from the beginning *)
781 mutable passed : Parser_c.token list;
782 mutable passed_clean : Parser_c.token list;
783}
485bce71
C
784let clone_tokens_stat tr =
785 { rest = tr.rest;
786 rest_clean = tr.rest_clean;
787 current = tr.current;
788 passed = tr.passed;
789 passed_clean = tr.passed_clean;
790 }
791let copy_tokens_stat ~src ~dst =
792 dst.rest <- src.rest;
793 dst.rest_clean <- src.rest_clean;
794 dst.current <- src.current;
795 dst.passed <- src.passed;
796 dst.passed_clean <- src.passed_clean;
797 ()
798
799let rec filter_noise n xs =
800 match n, xs with
801 | _, [] -> []
802 | 0, xs -> xs
803 | n, x::xs ->
804 (match x with
805 | Parser_c.TMacroAttr _ ->
806 filter_noise (n-1) xs
807 | _ ->
808 x::filter_noise (n-1) xs
809 )
810
811let clean_for_lookahead xs =
812 match xs with
813 | [] -> []
814 | [x] -> [x]
815 | x::xs ->
816 x::filter_noise 10 xs
817
34e49164 818
485bce71
C
819
820(* Hacked lex. This function use refs passed by parse_print_error_heuristic
821 * tr means token refs.
822 *)
823let rec lexer_function ~pass tr = fun lexbuf ->
34e49164
C
824 match tr.rest with
825 | [] -> pr2 "ALREADY AT END"; tr.current
826 | v::xs ->
827 tr.rest <- xs;
828 tr.current <- v;
829
830 if !Flag_parsing_c.debug_lexer then Common.pr2_gen v;
831
832 if TH.is_comment v
833 then begin
834 tr.passed <- v::tr.passed;
485bce71 835 lexer_function ~pass tr lexbuf
34e49164
C
836 end
837 else begin
838 let x = List.hd tr.rest_clean in
839 tr.rest_clean <- List.tl tr.rest_clean;
840 assert (x = v);
841
842 (match v with
485bce71
C
843 (* fix_define1. Why not in parsing_hacks lookahead and do passing like
844 * I do for some ifdef directives ? Because here I also need to
845 * generate some tokens sometimes.
846 *)
34e49164 847 | Parser_c.TDefine (tok) ->
485bce71
C
848 if not (LP.current_context () = LP.InTopLevel) &&
849 (!Flag_parsing_c.cpp_directive_passing || (pass = 2))
34e49164 850 then begin
485bce71 851 incr Stat.nDefinePassing;
34e49164
C
852 pr2_once ("CPP-DEFINE: inside function, I treat it as comment");
853 let v' = Parser_c.TCommentCpp (Ast_c.CppDirective,TH.info_of_tok v)
854 in
855 tr.passed <- v'::tr.passed;
856 tr.rest <- comment_until_defeol tr.rest;
857 tr.rest_clean <- drop_until_defeol tr.rest_clean;
485bce71 858 lexer_function ~pass tr lexbuf
34e49164
C
859 end
860 else begin
861 tr.passed <- v::tr.passed;
862 tr.passed_clean <- v::tr.passed_clean;
863 v
864 end
865
866 | Parser_c.TInclude (includes, filename, inifdef, info) ->
485bce71
C
867 if not (LP.current_context () = LP.InTopLevel) &&
868 (!Flag_parsing_c.cpp_directive_passing || (pass = 2))
34e49164 869 then begin
485bce71 870 incr Stat.nIncludePassing;
34e49164
C
871 pr2_once ("CPP-INCLUDE: inside function, I treat it as comment");
872 let v = Parser_c.TCommentCpp(Ast_c.CppDirective, info) in
873 tr.passed <- v::tr.passed;
485bce71 874 lexer_function ~pass tr lexbuf
34e49164
C
875 end
876 else begin
877 let (v,new_tokens) =
878 tokens_include (info, includes, filename, inifdef) in
879 let new_tokens_clean =
880 new_tokens +> List.filter TH.is_not_comment in
881
882 tr.passed <- v::tr.passed;
883 tr.passed_clean <- v::tr.passed_clean;
884 tr.rest <- new_tokens ++ tr.rest;
885 tr.rest_clean <- new_tokens_clean ++ tr.rest_clean;
886 v
887 end
888
889 | _ ->
890
891 (* typedef_fix1 *)
892 let v = match v with
893 | Parser_c.TIdent (s, ii) ->
485bce71
C
894 if
895 LP.is_typedef s &&
896 not (!Flag_parsing_c.disable_add_typedef) &&
897 pass = 1
34e49164
C
898 then Parser_c.TypedefIdent (s, ii)
899 else Parser_c.TIdent (s, ii)
900 | x -> x
901 in
902
485bce71
C
903 let v = Parsing_hacks.lookahead ~pass
904 (clean_for_lookahead (v::tr.rest_clean))
905 tr.passed_clean in
34e49164
C
906
907 tr.passed <- v::tr.passed;
908
485bce71 909 (* the lookahead may have changed the status of the token and
34e49164 910 * consider it as a comment, for instance some #include are
485bce71 911 * turned into comments, hence this code. *)
34e49164 912 match v with
485bce71 913 | Parser_c.TCommentCpp _ -> lexer_function ~pass tr lexbuf
34e49164
C
914 | v ->
915 tr.passed_clean <- v::tr.passed_clean;
916 v
917 )
918 end
919
920
921
485bce71
C
922let get_one_elem ~pass tr (file, filelines) =
923
924 if not (LP.is_enabled_typedef()) && !Flag_parsing_c.debug_typedef
925 then pr2 "TYPEDEF:_handle_typedef=false. Not normal if dont come from exn";
926
927 (* normally have to do that only when come from an exception in which
928 * case the dt() may not have been done
929 * TODO but if was in scoped scope ? have to let only the last scope
930 * so need do a LP.lexer_reset_typedef ();
931 *)
932 LP.enable_typedef();
933 LP._lexer_hint := (LP.default_hint ());
934 LP.save_typedef_state();
935
936 tr.passed <- [];
937
938 let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
939
940 (try
941 (* -------------------------------------------------- *)
942 (* Call parser *)
943 (* -------------------------------------------------- *)
944 Common.profile_code_exclusif "YACC" (fun () ->
945 Left (Parser_c.celem (lexer_function ~pass tr) lexbuf_fake)
946 )
947 with e -> begin
948 if (pass = 1 && !Flag_parsing_c.disable_two_pass)|| (pass = 2)
949 then begin
950 (match e with
951 (* Lexical is not anymore launched I think *)
952 | Lexer_c.Lexical s ->
953 pr2 ("lexical error " ^s^ "\n =" ^ error_msg_tok tr.current)
954 | Parsing.Parse_error ->
955 pr2 ("parse error \n = " ^ error_msg_tok tr.current)
956 | Semantic_c.Semantic (s, i) ->
957 pr2 ("semantic error " ^s^ "\n ="^ error_msg_tok tr.current)
958 | e -> raise e
959 )
960 end;
961 LP.restore_typedef_state();
962
963 (* must keep here, before the code that adjusts the tr fields *)
964 let line_error = TH.line_of_tok tr.current in
965
966
967 (* error recovery, go to next synchro point *)
968 let (passed', rest') = find_next_synchro tr.rest tr.passed in
969 tr.rest <- rest';
970 tr.passed <- passed';
971
972 tr.current <- List.hd passed';
973 tr.passed_clean <- []; (* enough ? *)
974 (* with error recovery, rest and rest_clean may not be in sync *)
975 tr.rest_clean <- (tr.rest +> List.filter TH.is_not_comment);
976
977
978 let info_of_bads = Common.map_eff_rev TH.info_of_tok tr.passed in
979 Right (info_of_bads, line_error)
980 end
981 )
982
983
984
985
34e49164 986(* note: as now we go in 2 passes, there is first all the error message of
485bce71 987 * the lexer, and then the error of the parser. It is not anymore
34e49164
C
988 * interwinded.
989 *
990 * !!!This function use refs, and is not reentrant !!! so take care.
991 * It use globals defined in Lexer_parser and also the _defs global
485bce71
C
992 * in parsing_hack.ml.
993 *
994 * This function uses internally some semi globals in the
995 * tokens_stat record and parsing_stat record.
34e49164
C
996 *)
997
998let parse_print_error_heuristic2 file =
999
485bce71
C
1000 let filelines = (""::Common.cat file) +> Array.of_list in
1001 let stat = Parsing_stat.default_stat file in
1002
34e49164
C
1003 (* -------------------------------------------------- *)
1004 (* call lexer and get all the tokens *)
1005 (* -------------------------------------------------- *)
1006 LP.lexer_reset_typedef();
485bce71
C
1007 Parsing_hacks.ifdef_paren_cnt := 0;
1008 let toks_orig = tokens file in
34e49164 1009
485bce71 1010 let toks = Parsing_hacks.fix_tokens_define toks_orig in
34e49164
C
1011 let toks = Parsing_hacks.fix_tokens_cpp toks in
1012
34e49164
C
1013 let tr = {
1014 rest = toks;
1015 rest_clean = (toks +> List.filter TH.is_not_comment);
1016 current = (List.hd toks);
1017 passed = [];
1018 passed_clean = [];
1019 } in
34e49164 1020
34e49164 1021
34e49164 1022
485bce71
C
1023
1024 let rec loop tr =
34e49164
C
1025
1026 (* todo?: I am not sure that it represents current_line, cos maybe
1027 * tr.current partipated in the previous parsing phase, so maybe tr.current
1028 * is not the first token of the next parsing phase. Same with checkpoint2.
1029 * It would be better to record when we have a } or ; in parser.mly,
1030 * cos we know that they are the last symbols of external_declaration2.
485bce71
C
1031 *
1032 * bugfix: may not be equal to 'file' as after macro expansions we can
1033 * start to parse a new entity from the body of a macro, for instance
1034 * when parsing a define_machine() body, cf standard.h
34e49164
C
1035 *)
1036 let checkpoint = TH.line_of_tok tr.current in
485bce71 1037 let checkpoint_file = TH.file_of_tok tr.current in
34e49164 1038
485bce71
C
1039 let tr_save = clone_tokens_stat tr in
1040
1041 (* call the parser *)
34e49164 1042 let elem =
485bce71
C
1043 let pass1 = get_one_elem ~pass:1 tr (file, filelines) in
1044 match pass1 with
1045 | Left e -> Left e
1046 | Right res ->
1047 if !Flag_parsing_c.disable_two_pass
1048 then Right res
1049 else begin
1050 pr2 "parsing pass2: try again";
1051 copy_tokens_stat ~src:tr_save ~dst: tr;
1052 let pass2 = get_one_elem ~pass:2 tr (file, filelines) in
1053 pass2
1054 end
1055 in
1056
1057
1058 (* again not sure if checkpoint2 corresponds to end of bad region *)
1059 let checkpoint2 = TH.line_of_tok tr.current in (* <> line_error *)
1060 let checkpoint2_file = TH.file_of_tok tr.current in
1061
1062 let was_define =
1063 (match elem with
1064 | Left _ -> false
1065 | Right (_, line_error) ->
1066 let was_define =
34e49164
C
1067 let xs = tr.passed +> List.rev +> List.filter TH.is_not_comment in
1068 if List.length xs >= 2
1069 then
1070 (match Common.head_middle_tail xs with
1071 | Parser_c.TDefine _, _, Parser_c.TDefEOL _ ->
485bce71
C
1072 true
1073 | _ -> false
34e49164 1074 )
485bce71
C
1075 else begin
1076 pr2 "WIERD: length list of error recovery tokens < 2 ";
1077 false
1078 end
1079 in
1080 (if was_define && !Flag_parsing_c.filter_msg_define_error
1081 then ()
1082 else
1083 (* bugfix: *)
1084 if (checkpoint_file = checkpoint2_file) && checkpoint_file = file
1085 then print_bad line_error (checkpoint, checkpoint2) filelines
1086 else pr2 "PB: bad: but on tokens not from original file"
1087 );
1088 was_define
1089 ) in
1090
34e49164 1091
485bce71
C
1092 let diffline =
1093 if (checkpoint_file = checkpoint2_file) && (checkpoint_file = file)
1094 then (checkpoint2 - checkpoint)
1095 else 0
1096 (* TODO? so if error come in middle of something ? where the
1097 * start token was from original file but synchro found in body
1098 * of macro ? then can have wrong number of lines stat.
1099 * Maybe simpler just to look at tr.passed and count
1100 * the lines in the token from the correct file ?
1101 *)
34e49164 1102 in
34e49164
C
1103 let info = mk_info_item file (List.rev tr.passed) in
1104
485bce71
C
1105 (* some stat updates *)
1106 stat.Stat.commentized <-
1107 stat.Stat.commentized + count_lines_commentized (snd info);
1108
1109 let elem =
1110 match elem with
1111 | Left e -> e
1112 | Right (info_of_bads, _line_error) ->
1113 Ast_c.NotParsedCorrectly info_of_bads
1114 in
34e49164
C
1115 (match elem with
1116 | Ast_c.NotParsedCorrectly xs ->
485bce71
C
1117 if was_define && !Flag_parsing_c.filter_define_error
1118 then stat.Stat.correct <- stat.Stat.correct + diffline
1119 else stat.Stat.bad <- stat.Stat.bad + diffline
1120 | _ -> stat.Stat.correct <- stat.Stat.correct + diffline
34e49164
C
1121 );
1122
1123 (match elem with
1124 | Ast_c.FinalDef x -> [(Ast_c.FinalDef x, info)]
485bce71 1125 | xs -> (xs, info):: loop tr (* recurse *)
34e49164
C
1126 )
1127 in
485bce71
C
1128 let v = loop tr in
1129
34e49164
C
1130 let v = consistency_checking v in
1131 (v, stat)
1132
1133
485bce71
C
1134let time_total_parsing a =
1135 Common.profile_code "TOTAL" (fun () -> parse_print_error_heuristic2 a)
1136
34e49164 1137let parse_print_error_heuristic a =
485bce71
C
1138 Common.profile_code "C parsing" (fun () -> time_total_parsing a)
1139
34e49164
C
1140
1141(* alias *)
1142let parse_c_and_cpp a = parse_print_error_heuristic a
1143
1144(*****************************************************************************)
1145(* Same but faster cos memoize stuff *)
1146(*****************************************************************************)
1147let parse_cache file =
1148 if not !Flag_parsing_c.use_cache then parse_print_error_heuristic file
1149 else
485bce71 1150 let _ = pr2 "TOFIX" in
34e49164
C
1151 let need_no_changed_files =
1152 (* should use Sys.argv.(0), would be safer. *)
485bce71
C
1153
1154 [
1155 (* TOFIX
1156 Config.path ^ "/parsing_c/c_parser.cma";
1157 (* we may also depend now on the semantic patch because
1158 the SP may use macro and so we will disable some of the
1159 macro expansions from standard.h.
1160 *)
1161 !Config.std_h;
1162 *)
34e49164
C
1163 ]
1164 in
1165 let need_no_changed_variables =
1166 (* could add some of the flags of flag_parsing_c.ml *)
1167 []
1168 in
1169 Common.cache_computation_robust
1170 file ".ast_raw"
1171 (need_no_changed_files, need_no_changed_variables) ".depend_raw"
1172 (fun () -> parse_print_error_heuristic file)
1173
1174
1175
1176(*****************************************************************************)
485bce71 1177(* Some special cases *)
34e49164
C
1178(*****************************************************************************)
1179
485bce71
C
1180let (cstatement_of_string: string -> Ast_c.statement) = fun s ->
1181 Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ "\n}");
1182 let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1183 program +> Common.find_some (fun (e,_) ->
1184 match e with
1185 | Ast_c.Definition ({Ast_c.f_body = [Ast_c.StmtElem st]},_) -> Some st
1186 | _ -> None
1187 )
1188
1189let (cexpression_of_string: string -> Ast_c.expression) = fun s ->
1190 Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ ";\n}");
1191 let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1192 program +> Common.find_some (fun (e,_) ->
1193 match e with
1194 | Ast_c.Definition ({Ast_c.f_body = compound},_) ->
1195 (match compound with
1196 | [Ast_c.StmtElem (Ast_c.ExprStatement (Some e),ii)] -> Some e
1197 | _ -> None
1198 )
1199 | _ -> None
1200 )