parsing_c/parse_c.ml

   1 (* Copyright (C) 2006, 2007, 2008 Yoann Padioleau
   2  *
   3  * This program is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU General Public License (GPL)
   5  * version 2 as published by the Free Software Foundation.
   6  *
   7  * This program is distributed in the hope that it will be useful,
   8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  10  * file license.txt for more details.
  11  *)
  12
  13 open Common
  14
  15 module TH = Token_helpers
  16 module LP = Lexer_parser
  17
  18 module Stat = Parsing_stat
  19
  20 (*****************************************************************************)
  21 (* Wrappers *)
  22 (*****************************************************************************)
  23 let pr2 s =
  24   if !Flag_parsing_c.verbose_parsing
  25   then Common.pr2 s
  26
  27 let pr2_once s =
  28   if !Flag_parsing_c.verbose_parsing
  29   then Common.pr2_once s
  30
  31 (*****************************************************************************)
  32 (* Helpers *)
  33 (*****************************************************************************)
  34
  35 let lexbuf_to_strpos lexbuf     =
  36   (Lexing.lexeme lexbuf, Lexing.lexeme_start lexbuf)
  37
  38 let token_to_strpos tok =
  39   (TH.str_of_tok tok, TH.pos_of_tok tok)
  40
  41
  42 let error_msg_tok tok =
  43   let file = TH.file_of_tok tok in
  44   if !Flag_parsing_c.verbose_parsing
  45   then Common.error_message file (token_to_strpos tok)
  46   else ("error in " ^ file  ^ "set verbose_parsing for more info")
  47
  48
  49 let print_bad line_error (start_line, end_line) filelines  =
  50   begin
  51     pr2 ("badcount: " ^ i_to_s (end_line - start_line));
  52
  53     for i = start_line to end_line do
  54       let line = filelines.(i) in
  55
  56       if i = line_error
  57       then  pr2 ("BAD:!!!!!" ^ " " ^ line)
  58       else  pr2 ("bad:" ^ " " ^      line)
  59     done
  60   end
  61
  62
  63
  64 let mk_info_item2 filename toks =
  65   let buf = Buffer.create 100 in
  66   let s =
  67     (* old: get_slice_file filename (line1, line2) *)
  68     begin
  69       toks +> List.iter (fun tok ->
  70         match TH.pinfo_of_tok tok with
  71         | Ast_c.OriginTok _ ->
  72             Buffer.add_string buf (TH.str_of_tok tok)
  73         | Ast_c.AbstractLineTok _ ->
  74             raise Impossible
  75         | _ -> ()
  76       );
  77       Buffer.contents buf
  78     end
  79   in
  80   (s, toks)
  81
  82 let mk_info_item a b =
  83   Common.profile_code "C parsing.mk_info_item"
  84     (fun () -> mk_info_item2 a b)
  85
  86
  87 let info_same_line line xs =
  88   xs +> List.filter (fun info -> Ast_c.line_of_info info = line)
  89
  90
  91 (*****************************************************************************)
  92 (* Stats on what was passed/commentized  *)
  93 (*****************************************************************************)
  94
  95 let commentized xs = xs +> Common.map_filter (function
  96   | Parser_c.TCommentCpp (cppkind, ii) ->
  97       let s = Ast_c.str_of_info ii in
  98       let legal_passing =
  99         match !Flag_parsing_c.filter_passed_level with
 100         | 0 -> false
 101         | 1 ->
 102             List.mem cppkind [Ast_c.CppAttr]
 103             ||
 104             (s =~ "__.*")
 105         | 2 ->
 106             List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal]
 107             ||
 108             (s =~ "__.*")
 109         | 3 ->
 110             List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal;Ast_c.CppDirective]
 111             ||
 112             (s =~ "__.*")
 113         | 4 ->
 114             List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal;Ast_c.CppMacro]
 115             ||
 116             (s =~ "__.*")
 117
 118
 119         | 5 ->
 120             List.mem cppkind [Ast_c.CppAttr;Ast_c.CppPassingNormal;Ast_c.CppDirective;Ast_c.CppMacro]
 121             ||
 122             (s =~ "__.*")
 123
 124
 125
 126
 127         | _ -> failwith "not valid level passing number"
 128       in
 129       if legal_passing then None else Some (ii.Ast_c.pinfo)
 130
 131         (*
 132         | Ast_c.CppOther ->
 133             (match s with
 134             | s when s =~ "KERN_.*" -> None
 135             | s when s =~ "__.*" -> None
 136             | _ ->
 137                 Some (ii.Ast_c.pinfo)
 138             )
 139         *)
 140
 141
 142   | Parser_c.TCommentMisc ii
 143   | Parser_c.TAction ii
 144     ->
 145       Some (ii.Ast_c.pinfo)
 146   | _ ->
 147       None
 148  )
 149
 150 let count_lines_commentized xs =
 151   let line = ref (-1) in
 152   let count = ref 0 in
 153   begin
 154     commentized xs +>
 155     List.iter
 156       (function
 157           Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
 158             let newline = pinfo.Common.line in
 159             if newline <> !line
 160             then begin
 161               line := newline;
 162               incr count
 163             end
 164         | _ -> ());
 165     !count
 166   end
 167
 168
 169
 170 let print_commentized xs =
 171   let line = ref (-1) in
 172   begin
 173     let ys = commentized xs in
 174     ys +>
 175     List.iter
 176       (function
 177           Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
 178             let newline = pinfo.Common.line in
 179             let s = pinfo.Common.str in
 180             let s = Str.global_substitute
 181                 (Str.regexp "\n") (fun s -> "") s
 182             in
 183             if newline = !line
 184             then prerr_string (s ^ " ")
 185             else begin
 186               if !line = -1
 187               then pr2_no_nl "passed:"
 188               else pr2_no_nl "\npassed:";
 189               line := newline;
 190               pr2_no_nl (s ^ " ");
 191             end
 192         | _ -> ());
 193     if not (null ys) then pr2 "";
 194   end
 195
 196
 197
 198
 199 (*****************************************************************************)
 200 (* Lexing only *)
 201 (*****************************************************************************)
 202
 203 (* called by parse_print_error_heuristic *)
 204 let tokens2 file =
 205  let table     = Common.full_charpos_to_pos file in
 206
 207  Common.with_open_infile file (fun chan ->
 208   let lexbuf = Lexing.from_channel chan in
 209   try
 210     let rec tokens_aux acc =
 211       let tok = Lexer_c.token lexbuf in
 212       (* fill in the line and col information *)
 213       let tok = tok +> TH.visitor_info_of_tok (fun ii ->
 214         { ii with Ast_c.pinfo=
 215           (* could assert pinfo.filename = file ? *)
 216           match Ast_c.pinfo_of_info ii with
 217             Ast_c.OriginTok pi ->
 218               Ast_c.OriginTok (Common.complete_parse_info file table pi)
 219           | Ast_c.ExpandedTok (pi,vpi) ->
 220               Ast_c.ExpandedTok((Common.complete_parse_info file table pi),vpi)
 221           | Ast_c.FakeTok (s,vpi) -> Ast_c.FakeTok (s,vpi)
 222           | Ast_c.AbstractLineTok pi -> failwith "should not occur"
 223       })
 224       in
 225
 226       if TH.is_eof tok
 227       then List.rev (tok::acc)
 228       else tokens_aux (tok::acc)
 229     in
 230     tokens_aux []
 231   with
 232     | Lexer_c.Lexical s ->
 233         failwith ("lexical error " ^ s ^ "\n =" ^
 234                   (Common.error_message file (lexbuf_to_strpos lexbuf)))
 235     | e -> raise e
 236  )
 237
 238 let time_lexing ?(profile=true) a =
 239   if profile
 240   then Common.profile_code_exclusif "LEXING" (fun () -> tokens2 a)
 241   else tokens2 a
 242 let tokens ?profile a =
 243   Common.profile_code "C parsing.tokens" (fun () -> time_lexing ?profile a)
 244
 245
 246 let tokens_of_string string =
 247   let lexbuf = Lexing.from_string string in
 248   try
 249     let rec tokens_s_aux () =
 250       let tok = Lexer_c.token lexbuf in
 251       if TH.is_eof tok
 252       then [tok]
 253       else tok::(tokens_s_aux ())
 254     in
 255     tokens_s_aux ()
 256   with
 257     | Lexer_c.Lexical s -> failwith ("lexical error " ^ s ^ "\n =" )
 258     | e -> raise e
 259
 260
 261 (*****************************************************************************)
 262 (* Parsing, but very basic, no more used *)
 263 (*****************************************************************************)
 264
 265 (*
 266  * !!!Those function use refs, and are not reentrant !!! so take care.
 267  * It use globals defined in Lexer_parser.
 268  *
 269  * update: because now lexer return comments tokens, those functions
 270  * may not work anymore.
 271  *)
 272
 273 let parse file =
 274   let lexbuf = Lexing.from_channel (open_in file) in
 275   let result = Parser_c.main Lexer_c.token lexbuf in
 276   result
 277
 278
 279 let parse_print_error file =
 280   let chan = (open_in file) in
 281   let lexbuf = Lexing.from_channel chan in
 282
 283   let error_msg () = Common.error_message file (lexbuf_to_strpos lexbuf) in
 284   try
 285     lexbuf +> Parser_c.main Lexer_c.token
 286   with
 287   | Lexer_c.Lexical s ->
 288       failwith ("lexical error " ^s^ "\n =" ^  error_msg ())
 289   | Parsing.Parse_error ->
 290       failwith ("parse error \n = " ^ error_msg ())
 291   | Semantic_c.Semantic (s, i) ->
 292       failwith ("semantic error " ^ s ^ "\n =" ^ error_msg ())
 293   | e -> raise e
 294
 295
 296
 297
 298 (*****************************************************************************)
 299 (* Parsing subelements, useful to debug parser *)
 300 (*****************************************************************************)
 301
 302 (*
 303  * !!!Those function use refs, and are not reentrant !!! so take care.
 304  * It use globals defined in Lexer_parser.
 305  *)
 306
 307
 308 (* old:
 309  *   let parse_gen parsefunc s =
 310  *     let lexbuf = Lexing.from_string s in
 311  *     let result = parsefunc Lexer_c.token lexbuf in
 312  *     result
 313  *)
 314
 315 let parse_gen parsefunc s =
 316   let toks = tokens_of_string s +> List.filter TH.is_not_comment in
 317
 318
 319   (* Why use this lexing scheme ? Why not classically give lexer func
 320    * to parser ? Because I now keep comments in lexer. Could
 321    * just do a simple wrapper that when comment ask again for a token,
 322    * but maybe simpler to use cur_tok technique.
 323    *)
 324   let all_tokens = ref toks in
 325   let cur_tok    = ref (List.hd !all_tokens) in
 326
 327   let lexer_function =
 328     (fun _ ->
 329       if TH.is_eof !cur_tok
 330       then (pr2 "LEXER: ALREADY AT END"; !cur_tok)
 331       else
 332         let v = Common.pop2 all_tokens in
 333         cur_tok := v;
 334         !cur_tok
 335     )
 336   in
 337   let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
 338   let result = parsefunc lexer_function lexbuf_fake in
 339   result
 340
 341
 342 let type_of_string       = parse_gen Parser_c.type_name
 343 let statement_of_string  = parse_gen Parser_c.statement
 344 let expression_of_string = parse_gen Parser_c.expr
 345
 346 (* ex: statement_of_string "(struct us_data* )psh->hostdata = NULL;" *)
 347
 348
 349
 350
 351
 352 (*****************************************************************************)
 353 (* Consistency checking *)
 354 (*****************************************************************************)
 355
 356 type class_ident =
 357   | CIdent (* can be var, func, field, tag, enum constant *)
 358   | CTypedef
 359
 360 let str_of_class_ident = function
 361   | CIdent -> "Ident"
 362   | CTypedef -> "Typedef"
 363
 364 (*
 365   | CMacro
 366   | CMacroString
 367   | CMacroStmt
 368   | CMacroDecl
 369   | CMacroIterator
 370   | CAttr
 371
 372 (* but take care that must still be able to use '=' *)
 373 type context = InFunction | InEnum | InStruct | InInitializer | InParams
 374 type class_token =
 375   | CIdent of class_ident
 376
 377   | CComment
 378   | CSpace
 379   | CCommentCpp of cppkind
 380   | CCommentMisc
 381   | CCppDirective
 382
 383   | COPar
 384   | CCPar
 385   | COBrace
 386   | CCBrace
 387
 388   | CSymbol
 389   | CReservedKwd (type | decl | qualif | flow | misc | attr)
 390 *)
 391
 392 (* parse_typedef_fix4 *)
 393 let consistency_checking2 xs =
 394
 395   (* first phase, gather data *)
 396   let stat = Hashtbl.create 101 in
 397
 398   (* default value for hash *)
 399   let v1 () = Hashtbl.create 101 in
 400   let v2 () = ref 0 in
 401
 402   let bigf = { Visitor_c.default_visitor_c with
 403
 404     Visitor_c.kexpr = (fun (k,bigf) x ->
 405       match Ast_c.unwrap_expr x with
 406       | Ast_c.Ident s ->
 407           stat +>
 408             Common.hfind_default s v1 +> Common.hfind_default CIdent v2 +>
 409             (fun aref -> incr aref)
 410
 411       | _ -> k x
 412     );
 413     Visitor_c.ktype = (fun (k,bigf) t ->
 414       match Ast_c.unwrap_typeC t with
 415       | Ast_c.TypeName (s,_typ) ->
 416           stat +>
 417             Common.hfind_default s v1 +> Common.hfind_default CTypedef v2 +>
 418             (fun aref -> incr aref)
 419
 420       | _ -> k t
 421     );
 422   }
 423   in
 424   xs +> List.iter (fun (p, info_item) -> Visitor_c.vk_toplevel bigf p);
 425
 426
 427   let ident_to_type = ref [] in
 428
 429
 430   (* second phase, analyze data *)
 431   stat +> Hashtbl.iter (fun k v ->
 432     let xs = Common.hash_to_list v in
 433     if List.length xs >= 2
 434     then begin
 435       pr2 ("CONFLICT:" ^ k);
 436       let sorted = xs +> List.sort (fun (ka,va) (kb,vb) ->
 437         if !va = !vb then
 438           (match ka, kb with
 439           | CTypedef, _ -> 1 (* first is smaller *)
 440           | _, CTypedef -> -1
 441           | _ -> 0
 442           )
 443         else compare !va !vb
 444       ) in
 445       let sorted = List.rev sorted in
 446       match sorted with
 447       | [CTypedef, i1;CIdent, i2] ->
 448           pr2 ("transforming some ident in typedef");
 449           push2 k ident_to_type;
 450       | _ ->
 451           pr2 ("TODO:other transforming?");
 452
 453     end
 454   );
 455
 456   (* third phase, update ast.
 457    * todo? but normally should try to handle correctly scope ? maybe sometime
 458    * sizeof(id) and even if id was for a long time an identifier, maybe
 459    * a few time, because of the scope it's actually really a type.
 460    *)
 461   if (null !ident_to_type)
 462   then xs
 463   else
 464     let bigf = { Visitor_c.default_visitor_c_s with
 465       Visitor_c.kdefineval_s = (fun (k,bigf) x ->
 466         match x with
 467         | Ast_c.DefineExpr e ->
 468             (match e with
 469             | (Ast_c.Ident s, _), ii when List.mem s !ident_to_type ->
 470                 let t = (Ast_c.nQ,
 471                         (Ast_c.TypeName  (s, Ast_c.noTypedefDef()), ii)) in
 472
 473                 Ast_c.DefineType t
 474             | _ -> k x
 475             )
 476         | _ -> k x
 477       );
 478       Visitor_c.kexpr_s = (fun (k, bigf) x ->
 479         match x with
 480         | (Ast_c.SizeOfExpr e, tref), isizeof ->
 481             let i1 = tuple_of_list1 isizeof in
 482             (match e with
 483             | (Ast_c.ParenExpr e, _), iiparen ->
 484                 (match e with
 485                 | (Ast_c.Ident s, _), ii when List.mem s !ident_to_type ->
 486                     let (i2, i3) = tuple_of_list2 iiparen in
 487                     let t = (Ast_c.nQ,
 488                             (Ast_c.TypeName  (s, Ast_c.noTypedefDef()), ii)) in
 489                     (Ast_c.SizeOfType t, tref), [i1;i2;i3]
 490
 491                 | _ -> k x
 492                 )
 493             | _ -> k x
 494             )
 495         | _ -> k x
 496       );
 497     } in
 498     xs +> List.map (fun (p, info_item) ->
 499       Visitor_c.vk_toplevel_s bigf p, info_item
 500     )
 501
 502
 503 let consistency_checking a  =
 504   Common.profile_code "C consistencycheck" (fun () -> consistency_checking2 a)
 505
 506
 507
 508 (*****************************************************************************)
 509 (* Error recovery *)
 510 (*****************************************************************************)
 511
 512 (* todo: do something if find Parser_c.Eof ? *)
 513 let rec find_next_synchro next already_passed =
 514
 515   (* Maybe because not enough }, because for example an ifdef contains
 516    * in both branch some opening {, we later eat too much, "on deborde
 517    * sur la fonction d'apres". So already_passed may be too big and
 518    * looking for next synchro point starting from next may not be the
 519    * best. So maybe we can find synchro point inside already_passed
 520    * instead of looking in next.
 521    *
 522    * But take care! must progress. We must not stay in infinite loop!
 523    * For instance now I have as a error recovery to look for
 524    * a "start of something", corresponding to start of function,
 525    * but must go beyond this start otherwise will loop.
 526    * So look at premier(external_declaration2) in parser.output and
 527    * pass at least those first tokens.
 528    *
 529    * I have chosen to start search for next synchro point after the
 530    * first { I found, so quite sure we will not loop. *)
 531
 532   let last_round = List.rev already_passed in
 533   let is_define =
 534     let xs = last_round +> List.filter TH.is_not_comment in
 535     match xs with
 536     | Parser_c.TDefine _::_ -> true
 537     | _ -> false
 538   in
 539   if is_define
 540   then find_next_synchro_define (last_round ++ next) []
 541   else
 542
 543   let (before, after) =
 544     last_round +> Common.span (fun tok ->
 545       match tok with
 546       (* by looking at TOBrace we are sure that the "start of something"
 547        * will not arrive too early
 548        *)
 549       | Parser_c.TOBrace _ -> false
 550       | Parser_c.TDefine _ -> false
 551       | _ -> true
 552     )
 553   in
 554   find_next_synchro_orig (after ++ next)  (List.rev before)
 555
 556
 557
 558 and find_next_synchro_define next already_passed =
 559   match next with
 560   | [] ->
 561       pr2 "ERROR-RECOV: end of file while in recovery mode";
 562       already_passed, []
 563   | (Parser_c.TDefEOL i as v)::xs  ->
 564       pr2 ("ERROR-RECOV: found sync end of #define, line "^i_to_s(TH.line_of_tok v));
 565       v::already_passed, xs
 566   | v::xs ->
 567       find_next_synchro_define xs (v::already_passed)
 568
 569
 570
 571
 572 and find_next_synchro_orig next already_passed =
 573   match next with
 574   | [] ->
 575       pr2 "ERROR-RECOV: end of file while in recovery mode";
 576       already_passed, []
 577
 578   | (Parser_c.TCBrace i as v)::xs when TH.col_of_tok v = 0 ->
 579       pr2 ("ERROR-RECOV: found sync '}' at line "^i_to_s (TH.line_of_tok v));
 580
 581       (match xs with
 582       | [] -> raise Impossible (* there is a EOF token normally *)
 583
 584       (* still useful: now parser.mly allow empty ';' so normally no pb *)
 585       | Parser_c.TPtVirg iptvirg::xs ->
 586           pr2 "ERROR-RECOV: found sync bis, eating } and ;";
 587           (Parser_c.TPtVirg iptvirg)::v::already_passed, xs
 588
 589       | Parser_c.TIdent x::Parser_c.TPtVirg iptvirg::xs ->
 590           pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
 591           (Parser_c.TPtVirg iptvirg)::(Parser_c.TIdent x)::v::already_passed,
 592           xs
 593
 594       | Parser_c.TCommentSpace sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
 595         ::xs ->
 596           pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
 597           (Parser_c.TCommentSpace sp)::
 598             (Parser_c.TPtVirg iptvirg)::
 599             (Parser_c.TIdent x)::
 600             v::
 601             already_passed,
 602           xs
 603
 604       | Parser_c.TCommentNewline sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
 605         ::xs ->
 606           pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
 607           (Parser_c.TCommentNewline sp)::
 608             (Parser_c.TPtVirg iptvirg)::
 609             (Parser_c.TIdent x)::
 610             v::
 611             already_passed,
 612           xs
 613
 614       | _ ->
 615           v::already_passed, xs
 616       )
 617   | v::xs when TH.col_of_tok v = 0 && TH.is_start_of_something v  ->
 618       pr2 ("ERROR-RECOV: found sync col 0 at line "^ i_to_s(TH.line_of_tok v));
 619       already_passed, v::xs
 620
 621   | v::xs ->
 622       find_next_synchro_orig xs (v::already_passed)
 623
 624
 625 (*****************************************************************************)
 626 (* Include/Define hacks *)
 627 (*****************************************************************************)
 628
 629 (* Sometimes I prefer to generate a single token for a list of things in the
 630  * lexer so that if I have to passed them, like for passing TInclude then
 631  * it's easy. Also if I don't do a single token, then I need to
 632  * parse the rest which may not need special stuff, like detecting
 633  * end of line which the parser is not really ready for. So for instance
 634  * could I parse a #include <a/b/c/xxx.h> as 2 or more tokens ? just
 635  * lex #include ? so then need recognize <a/b/c/xxx.h> as one token ?
 636  * but this kind of token is valid only after a #include and the
 637  * lexing and parsing rules are different for such tokens so not that
 638  * easy to parse such things in parser_c.mly. Hence the following hacks.
 639  *
 640  * less?: maybe could get rid of this like I get rid of some of fix_define.
 641  *)
 642
 643 (* ------------------------------------------------------------------------- *)
 644 (* helpers *)
 645 (* ------------------------------------------------------------------------- *)
 646
 647 (* used to generate new token from existing one *)
 648 let new_info posadd str ii =
 649   { Ast_c.pinfo =
 650       Ast_c.OriginTok { (Ast_c.parse_info_of_info ii) with
 651         charpos = Ast_c.pos_of_info ii + posadd;
 652         str     = str;
 653         column = Ast_c.col_of_info ii + posadd;
 654       };
 655     (* must generate a new ref each time, otherwise share *)
 656     cocci_tag = ref Ast_c.emptyAnnot;
 657     comments_tag = ref Ast_c.emptyComments;
 658    }
 659
 660
 661 let rec comment_until_defeol xs =
 662   match xs with
 663   | [] -> failwith "cant find end of define token TDefEOL"
 664   | x::xs ->
 665       (match x with
 666       | Parser_c.TDefEOL i ->
 667           Parser_c.TCommentCpp (Ast_c.CppDirective, TH.info_of_tok x)
 668           ::xs
 669       | _ ->
 670           let x' =
 671             (* bugfix: otherwise may lose a TComment token *)
 672             if TH.is_real_comment x
 673             then x
 674             else Parser_c.TCommentCpp (Ast_c.CppPassingNormal (*good?*), TH.info_of_tok x)
 675           in
 676           x'::comment_until_defeol xs
 677       )
 678
 679 let drop_until_defeol xs =
 680   List.tl
 681     (Common.drop_until (function Parser_c.TDefEOL _ -> true | _ -> false) xs)
 682
 683
 684
 685 (* ------------------------------------------------------------------------- *)
 686 (* returns a pair (replaced token, list of next tokens) *)
 687 (* ------------------------------------------------------------------------- *)
 688
 689 let tokens_include (info, includes, filename, inifdef) =
 690   Parser_c.TIncludeStart (Ast_c.rewrap_str includes info, inifdef),
 691   [Parser_c.TIncludeFilename
 692       (filename, (new_info (String.length includes) filename info))
 693   ]
 694
 695 (*****************************************************************************)
 696 (* Parsing default define macros, usually in a standard.h file *)
 697 (*****************************************************************************)
 698
 699 let parse_cpp_define_file2 file =
 700   let toks = tokens ~profile:false file in
 701   let toks = Parsing_hacks.fix_tokens_define toks in
 702   Parsing_hacks.extract_cpp_define toks
 703
 704 let parse_cpp_define_file a =
 705   Common.profile_code_exclusif "HACK" (fun () -> parse_cpp_define_file2 a)
 706
 707 (* can not be put in parsing_hack, cos then mutually recursive problem as
 708  * we also want to parse the standard.h file.
 709  *)
 710 let init_defs std_h =
 711   if not (Common.lfile_exists std_h)
 712   then pr2 ("warning: Can't find default macro file: " ^ std_h)
 713   else begin
 714     pr2 ("init_defs: " ^ std_h);
 715     Parsing_hacks._defs := Common.hash_of_list (parse_cpp_define_file std_h);
 716   end
 717
 718
 719 (*****************************************************************************)
 720 (* Main entry point *)
 721 (*****************************************************************************)
 722
 723 type info_item =  string * Parser_c.token list
 724
 725 type program2 = toplevel2 list
 726      and toplevel2 = Ast_c.toplevel * info_item
 727
 728 let program_of_program2 xs =
 729   xs +> List.map fst
 730
 731 let with_program2 f program2 =
 732   program2
 733   +> Common.unzip
 734   +> (fun (program, infos) ->
 735     f program, infos
 736   )
 737   +> Common.uncurry Common.zip
 738
 739
 740
 741 (* The use of local refs (remaining_tokens, passed_tokens, ...) makes
 742  * possible error recovery. Indeed, they allow to skip some tokens and
 743  * still be able to call again the ocamlyacc parser. It is ugly code
 744  * because we cant modify ocamllex and ocamlyacc. As we want some
 745  * extended lexing tricks, we have to use such refs.
 746  *
 747  * Those refs are now also used for my lalr(k) technique. Indeed They
 748  * store the futur and previous tokens that were parsed, and so
 749  * provide enough context information for powerful lex trick.
 750  *
 751  * - passed_tokens_last_ckp stores the passed tokens since last
 752  *   checkpoint. Used for NotParsedCorrectly and also to build the
 753  *   info_item attached to each program_element.
 754  * - passed_tokens_clean is used for lookahead, in fact for lookback.
 755  * - remaining_tokens_clean is used for lookahead. Now remaining_tokens
 756  *   contain some comments and so would make pattern matching difficult
 757  *   in lookahead. Hence this variable. We would like also to get rid
 758  *   of cpp instruction because sometimes a cpp instruction is between
 759  *   two tokens and makes a pattern matching fail. But lookahead also
 760  *   transform some cpp instruction (in comment) so can't remove them.
 761  *
 762  * So remaining_tokens, passed_tokens_last_ckp contain comment-tokens,
 763  * whereas passed_tokens_clean and remaining_tokens_clean does not contain
 764  * comment-tokens.
 765  *
 766  * Normally we have:
 767  * toks = (reverse passed_tok) ++ cur_tok ++ remaining_tokens
 768  *    after the call to pop2.
 769  * toks = (reverse passed_tok) ++ remaining_tokens
 770  *     at the and of the lexer_function call.
 771  * At the very beginning, cur_tok and remaining_tokens overlap, but not after.
 772  * At the end of lexer_function call,  cur_tok  overlap  with passed_tok.
 773  *
 774  * convention: I use "tr"  for "tokens refs"
 775  *
 776  * I now also need this lexing trick because the lexer return comment
 777  * tokens.
 778  *)
 779
 780 type tokens_state = {
 781   mutable rest :         Parser_c.token list;
 782   mutable rest_clean :   Parser_c.token list;
 783   mutable current :      Parser_c.token;
 784   (* it's passed since last "checkpoint", not passed from the beginning *)
 785   mutable passed :       Parser_c.token list;
 786   mutable passed_clean : Parser_c.token list;
 787 }
 788 let clone_tokens_stat tr =
 789   { rest = tr.rest;
 790     rest_clean = tr.rest_clean;
 791     current = tr.current;
 792     passed = tr.passed;
 793     passed_clean = tr.passed_clean;
 794   }
 795 let copy_tokens_stat ~src ~dst =
 796   dst.rest <- src.rest;
 797   dst.rest_clean <- src.rest_clean;
 798   dst.current <- src.current;
 799   dst.passed <- src.passed;
 800   dst.passed_clean <-  src.passed_clean;
 801   ()
 802
 803 let rec filter_noise n xs =
 804   match n, xs with
 805   | _, [] -> []
 806   | 0, xs -> xs
 807   | n, x::xs ->
 808       (match x with
 809       | Parser_c.TMacroAttr _ ->
 810           filter_noise (n-1) xs
 811       | _ ->
 812           x::filter_noise (n-1) xs
 813       )
 814
 815 let clean_for_lookahead xs =
 816   match xs with
 817   | [] -> []
 818   | [x] -> [x]
 819   | x::xs ->
 820       x::filter_noise 10 xs
 821
 822
 823
 824 (* Hacked lex. This function use refs passed by parse_print_error_heuristic
 825  * tr means token refs.
 826  *)
 827 let rec lexer_function ~pass tr = fun lexbuf ->
 828   match tr.rest with
 829   | [] -> pr2 "ALREADY AT END"; tr.current
 830   | v::xs ->
 831     tr.rest <- xs;
 832     tr.current <- v;
 833
 834     if !Flag_parsing_c.debug_lexer then Common.pr2_gen v;
 835
 836     if TH.is_comment v
 837     then begin
 838       tr.passed <- v::tr.passed;
 839       lexer_function ~pass tr lexbuf
 840     end
 841     else begin
 842       let x = List.hd tr.rest_clean  in
 843       tr.rest_clean <- List.tl tr.rest_clean;
 844       assert (x = v);
 845
 846       (match v with
 847       (* fix_define1. Why not in parsing_hacks lookahead and do passing like
 848        * I do for some ifdef directives ? Because here I also need to
 849        * generate some tokens sometimes.
 850        *)
 851       | Parser_c.TDefine (tok) ->
 852           if not (LP.current_context () = LP.InTopLevel) &&
 853             (!Flag_parsing_c.cpp_directive_passing || (pass = 2))
 854           then begin
 855             incr Stat.nDefinePassing;
 856             pr2_once ("CPP-DEFINE: inside function, I treat it as comment");
 857             let v' = Parser_c.TCommentCpp (Ast_c.CppDirective,TH.info_of_tok v)
 858             in
 859             tr.passed <- v'::tr.passed;
 860             tr.rest       <- comment_until_defeol tr.rest;
 861             tr.rest_clean <- drop_until_defeol tr.rest_clean;
 862             lexer_function ~pass tr lexbuf
 863           end
 864           else begin
 865             tr.passed <- v::tr.passed;
 866             tr.passed_clean <- v::tr.passed_clean;
 867             v
 868           end
 869
 870       | Parser_c.TInclude (includes, filename, inifdef, info) ->
 871           if not (LP.current_context () = LP.InTopLevel)  &&
 872             (!Flag_parsing_c.cpp_directive_passing || (pass = 2))
 873           then begin
 874             incr Stat.nIncludePassing;
 875             pr2_once ("CPP-INCLUDE: inside function, I treat it as comment");
 876             let v = Parser_c.TCommentCpp(Ast_c.CppDirective, info) in
 877             tr.passed <- v::tr.passed;
 878             lexer_function ~pass tr lexbuf
 879           end
 880           else begin
 881             let (v,new_tokens) =
 882               tokens_include (info, includes, filename, inifdef) in
 883             let new_tokens_clean =
 884               new_tokens +> List.filter TH.is_not_comment  in
 885
 886             tr.passed <- v::tr.passed;
 887             tr.passed_clean <- v::tr.passed_clean;
 888             tr.rest <- new_tokens ++ tr.rest;
 889             tr.rest_clean <- new_tokens_clean ++ tr.rest_clean;
 890             v
 891           end
 892
 893       | _ ->
 894
 895           (* typedef_fix1 *)
 896           let v = match v with
 897             | Parser_c.TIdent (s, ii) ->
 898                 if
 899                   LP.is_typedef s &&
 900                     not (!Flag_parsing_c.disable_add_typedef) &&
 901                     pass = 1
 902                 then Parser_c.TypedefIdent (s, ii)
 903                 else Parser_c.TIdent (s, ii)
 904             | x -> x
 905           in
 906
 907           let v = Parsing_hacks.lookahead ~pass
 908             (clean_for_lookahead (v::tr.rest_clean))
 909             tr.passed_clean in
 910
 911           tr.passed <- v::tr.passed;
 912
 913           (* the lookahead may have changed the status of the token and
 914            * consider it as a comment, for instance some #include are
 915            * turned into comments, hence this code. *)
 916           match v with
 917           | Parser_c.TCommentCpp _ -> lexer_function ~pass tr lexbuf
 918           | v ->
 919               tr.passed_clean <- v::tr.passed_clean;
 920               v
 921       )
 922     end
 923
 924
 925
 926 let get_one_elem ~pass tr (file, filelines) =
 927
 928   if not (LP.is_enabled_typedef()) && !Flag_parsing_c.debug_typedef
 929   then pr2 "TYPEDEF:_handle_typedef=false. Not normal if dont come from exn";
 930
 931   (* normally have to do that only when come from an exception in which
 932    * case the dt() may not have been done
 933    * TODO but if was in scoped scope ? have to let only the last scope
 934    * so need do a LP.lexer_reset_typedef ();
 935    *)
 936   LP.enable_typedef();
 937   LP._lexer_hint := (LP.default_hint ());
 938   LP.save_typedef_state();
 939
 940   tr.passed <- [];
 941
 942   let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
 943
 944   (try
 945       (* -------------------------------------------------- *)
 946       (* Call parser *)
 947       (* -------------------------------------------------- *)
 948       Common.profile_code_exclusif "YACC" (fun () ->
 949         Left (Parser_c.celem (lexer_function ~pass tr) lexbuf_fake)
 950       )
 951     with e -> begin
 952       if (pass = 1 && !Flag_parsing_c.disable_two_pass)|| (pass = 2)
 953       then begin
 954         (match e with
 955         (* Lexical is not anymore launched I think *)
 956         | Lexer_c.Lexical s ->
 957             pr2 ("lexical error " ^s^ "\n =" ^ error_msg_tok tr.current)
 958         | Parsing.Parse_error ->
 959             pr2 ("parse error \n = " ^ error_msg_tok tr.current)
 960         | Semantic_c.Semantic (s, i) ->
 961             pr2 ("semantic error " ^s^ "\n ="^ error_msg_tok tr.current)
 962         | e -> raise e
 963         )
 964       end;
 965       LP.restore_typedef_state();
 966
 967       (* must keep here, before the code that adjusts the tr fields *)
 968       let line_error = TH.line_of_tok tr.current in
 969
 970
 971       (*  error recovery, go to next synchro point *)
 972       let (passed', rest') = find_next_synchro tr.rest tr.passed in
 973       tr.rest <- rest';
 974       tr.passed <- passed';
 975
 976       tr.current <- List.hd passed';
 977       tr.passed_clean <- [];           (* enough ? *)
 978       (* with error recovery, rest and rest_clean may not be in sync *)
 979       tr.rest_clean <- (tr.rest +> List.filter TH.is_not_comment);
 980
 981
 982       let info_of_bads = Common.map_eff_rev TH.info_of_tok tr.passed in
 983       Right (info_of_bads,  line_error, tr.passed)
 984     end
 985   )
 986
 987
 988
 989
 990 (* note: as now we go in 2 passes, there is first all the error message of
 991  * the lexer, and then the error of the parser. It is not anymore
 992  * interwinded.
 993  *
 994  * !!!This function use refs, and is not reentrant !!! so take care.
 995  * It use globals defined in Lexer_parser and also the _defs global
 996  * in parsing_hack.ml.
 997  *
 998  * This function uses internally some semi globals in the
 999  * tokens_stat record and parsing_stat record.
1000  *)
1001
1002 let parse_print_error_heuristic2 file =
1003
1004   let filelines = Common.cat_array file in
1005   let stat = Parsing_stat.default_stat file in
1006
1007   (* -------------------------------------------------- *)
1008   (* call lexer and get all the tokens *)
1009   (* -------------------------------------------------- *)
1010   LP.lexer_reset_typedef();
1011   Parsing_hacks.ifdef_paren_cnt := 0;
1012   let toks_orig = tokens file in
1013
1014   let toks = Parsing_hacks.fix_tokens_define toks_orig in
1015   let toks = Parsing_hacks.fix_tokens_cpp toks in
1016
1017   let tr = {
1018     rest       = toks;
1019     rest_clean = (toks +> List.filter TH.is_not_comment);
1020     current    = (List.hd toks);
1021     passed = [];
1022     passed_clean = [];
1023   } in
1024
1025
1026
1027
1028   let rec loop tr =
1029
1030     (* todo?: I am not sure that it represents current_line, cos maybe
1031      * tr.current partipated in the previous parsing phase, so maybe tr.current
1032      * is not the first token of the next parsing phase. Same with checkpoint2.
1033      * It would be better to record when we have a } or ; in parser.mly,
1034      *  cos we know that they are the last symbols of external_declaration2.
1035      *
1036      * bugfix: may not be equal to 'file' as after macro expansions we can
1037      * start to parse a new entity from the body of a macro, for instance
1038      * when parsing a define_machine() body, cf standard.h
1039      *)
1040     let checkpoint = TH.line_of_tok tr.current in
1041     let checkpoint_file = TH.file_of_tok tr.current in
1042
1043     let tr_save = clone_tokens_stat tr in
1044
1045     (* call the parser *)
1046     let elem =
1047       let pass1 = get_one_elem ~pass:1 tr (file, filelines) in
1048       match pass1 with
1049       | Left e -> Left e
1050       | Right res ->
1051           if !Flag_parsing_c.disable_two_pass
1052           then Right res
1053           else begin
1054             pr2 "parsing pass2: try again";
1055             copy_tokens_stat ~src:tr_save ~dst: tr;
1056             let pass2 = get_one_elem ~pass:2 tr (file, filelines) in
1057             pass2
1058           end
1059     in
1060
1061
1062     (* again not sure if checkpoint2 corresponds to end of bad region *)
1063     let checkpoint2 = TH.line_of_tok tr.current in (* <> line_error *)
1064     let checkpoint2_file = TH.file_of_tok tr.current in
1065
1066     let was_define =
1067       (match elem with
1068       | Left _ -> false
1069       | Right (_, line_error, _) ->
1070           let was_define =
1071             let xs = tr.passed +> List.rev +> List.filter TH.is_not_comment in
1072             if List.length xs >= 2
1073             then
1074               (match Common.head_middle_tail xs with
1075               | Parser_c.TDefine _, _, Parser_c.TDefEOL _ ->
1076                   true
1077               | _ -> false
1078               )
1079             else begin
1080               pr2 "WIERD: length list of error recovery tokens < 2 ";
1081               false
1082             end
1083           in
1084           (if was_define && !Flag_parsing_c.filter_msg_define_error
1085           then ()
1086           else
1087             (* bugfix: *)
1088             if (checkpoint_file = checkpoint2_file) && checkpoint_file = file
1089             then print_bad line_error (checkpoint, checkpoint2) filelines
1090             else pr2 "PB: bad: but on tokens not from original file"
1091           );
1092           was_define
1093       ) in
1094
1095
1096     let diffline =
1097       if (checkpoint_file = checkpoint2_file) && (checkpoint_file = file)
1098       then (checkpoint2 - checkpoint)
1099       else 0
1100         (* TODO? so if error come in middle of something ? where the
1101          * start token was from original file but synchro found in body
1102          * of macro ? then can have wrong number of lines stat.
1103          * Maybe simpler just to look at tr.passed and count
1104          * the lines in the token from the correct file ?
1105          *)
1106     in
1107     let info = mk_info_item file (List.rev tr.passed) in
1108
1109     (* some stat updates *)
1110     stat.Stat.commentized <-
1111       stat.Stat.commentized + count_lines_commentized (snd info);
1112
1113     let elem =
1114       match elem with
1115       | Left e ->
1116           stat.Stat.correct <- stat.Stat.correct + diffline;
1117           e
1118       | Right (info_of_bads, line_error, toks_of_bads) ->
1119           if was_define && !Flag_parsing_c.filter_define_error
1120           then stat.Stat.correct <- stat.Stat.correct + diffline
1121           else stat.Stat.bad     <- stat.Stat.bad     + diffline;
1122
1123           let pbline =
1124             toks_of_bads
1125             +> Common.filter (TH.is_same_line line_error)
1126             +> Common.filter TH.is_ident_like
1127           in
1128           let error_info =
1129             (pbline +> List.map TH.str_of_tok), line_error
1130           in
1131           stat.Stat.problematic_lines <-
1132             error_info::stat.Stat.problematic_lines;
1133
1134           Ast_c.NotParsedCorrectly info_of_bads
1135     in
1136
1137     (match elem with
1138     | Ast_c.FinalDef x -> [(Ast_c.FinalDef x, info)]
1139     | xs -> (xs, info):: loop tr (* recurse *)
1140     )
1141   in
1142   let v = loop tr in
1143   let v = consistency_checking v in
1144   (v, stat)
1145
1146
1147 let time_total_parsing a  =
1148   Common.profile_code "TOTAL" (fun () -> parse_print_error_heuristic2 a)
1149
1150 let parse_print_error_heuristic a  =
1151   Common.profile_code "C parsing" (fun () -> time_total_parsing a)
1152
1153
1154 (* alias *)
1155 let parse_c_and_cpp a = parse_print_error_heuristic a
1156
1157 (*****************************************************************************)
1158 (* Same but faster cos memoize stuff *)
1159 (*****************************************************************************)
1160 let parse_cache file =
1161   if not !Flag_parsing_c.use_cache then parse_print_error_heuristic file
1162   else
1163   let _ = pr2 "TOFIX" in
1164   let need_no_changed_files =
1165     (* should use Sys.argv.(0), would be safer. *)
1166
1167     [
1168       (* TOFIX
1169       Config.path ^ "/parsing_c/c_parser.cma";
1170       (* we may also depend now on the semantic patch because
1171          the SP may use macro and so we will disable some of the
1172          macro expansions from standard.h.
1173       *)
1174       !Config.std_h;
1175       *)
1176     ]
1177   in
1178   let need_no_changed_variables =
1179     (* could add some of the flags of flag_parsing_c.ml *)
1180     []
1181   in
1182   Common.cache_computation_robust
1183     file ".ast_raw"
1184     (need_no_changed_files, need_no_changed_variables) ".depend_raw"
1185     (fun () -> parse_print_error_heuristic file)
1186
1187
1188
1189 (*****************************************************************************)
1190 (* Some special cases *)
1191 (*****************************************************************************)
1192
1193 let (cstatement_of_string: string -> Ast_c.statement) = fun s ->
1194   Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ "\n}");
1195   let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1196   program +> Common.find_some (fun (e,_) ->
1197     match e with
1198     | Ast_c.Definition ({Ast_c.f_body = [Ast_c.StmtElem st]},_) -> Some st
1199     | _ -> None
1200   )
1201
1202 let (cexpression_of_string: string -> Ast_c.expression) = fun s ->
1203   Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ ";\n}");
1204   let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1205   program +> Common.find_some (fun (e,_) ->
1206     match e with
1207     | Ast_c.Definition ({Ast_c.f_body = compound},_) ->
1208         (match compound with
1209         | [Ast_c.StmtElem (Ast_c.ExprStatement (Some e),ii)] -> Some e
1210         | _ -> None
1211         )
1212     | _ -> None
1213   )