parsing_c/parse_c.ml

   1 (* Yoann Padioleau
   2  *
   3  * Copyright (C) 2006, 2007, 2008 Ecole des Mines de Nantes
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License (GPL)
   7  * version 2 as published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * file license.txt for more details.
  13  *)
  14
  15 open Common
  16
  17 module TH = Token_helpers
  18 module LP = Lexer_parser
  19
  20 module Stat = Parsing_stat
  21
  22 (*****************************************************************************)
  23 (* Wrappers *)
  24 (*****************************************************************************)
  25 let pr2 s =
  26   if !Flag_parsing_c.verbose_parsing
  27   then Common.pr2 s
  28
  29 let pr2_once s =
  30   if !Flag_parsing_c.verbose_parsing
  31   then Common.pr2_once s
  32
  33 (*****************************************************************************)
  34 (* Helpers *)
  35 (*****************************************************************************)
  36
  37 let lexbuf_to_strpos lexbuf     =
  38   (Lexing.lexeme lexbuf, Lexing.lexeme_start lexbuf)
  39
  40 let token_to_strpos tok =
  41   (TH.str_of_tok tok, TH.pos_of_tok tok)
  42
  43
  44 let error_msg_tok tok =
  45   let file = TH.file_of_tok tok in
  46   if !Flag_parsing_c.verbose_parsing
  47   then Common.error_message file (token_to_strpos tok)
  48   else ("error in " ^ file  ^ "set verbose_parsing for more info")
  49
  50
  51 let print_bad line_error (start_line, end_line) filelines  =
  52   begin
  53     pr2 ("badcount: " ^ i_to_s (end_line - start_line));
  54
  55     for i = start_line to end_line do
  56       let line = filelines.(i) in
  57
  58       if i =|= line_error
  59       then  pr2 ("BAD:!!!!!" ^ " " ^ line)
  60       else  pr2 ("bad:" ^ " " ^      line)
  61     done
  62   end
  63
  64
  65
  66 let mk_info_item2 filename toks =
  67   let buf = Buffer.create 100 in
  68   let s =
  69     (* old: get_slice_file filename (line1, line2) *)
  70     begin
  71       toks +> List.iter (fun tok ->
  72         match TH.pinfo_of_tok tok with
  73         | Ast_c.OriginTok _ ->
  74             Buffer.add_string buf (TH.str_of_tok tok)
  75         | Ast_c.AbstractLineTok _ ->
  76             raise Impossible
  77         | _ -> ()
  78       );
  79       Buffer.contents buf
  80     end
  81   in
  82   (s, toks)
  83
  84 let mk_info_item a b =
  85   Common.profile_code "C parsing.mk_info_item"
  86     (fun () -> mk_info_item2 a b)
  87
  88
  89 let info_same_line line xs =
  90   xs +> List.filter (fun info -> Ast_c.line_of_info info =|= line)
  91
  92
  93 (*****************************************************************************)
  94 (* Stats on what was passed/commentized  *)
  95 (*****************************************************************************)
  96
  97 let commentized xs = xs +> Common.map_filter (function
  98   | Parser_c.TCommentCpp (cppkind, ii) ->
  99       let s = Ast_c.str_of_info ii in
 100       let legal_passing =
 101         match !Flag_parsing_c.filter_passed_level with
 102         | 0 -> false
 103         | 1 ->
 104             List.mem cppkind [Token_c.CppAttr]
 105             ||
 106             (s =~ "__.*")
 107         | 2 ->
 108             List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal]
 109             ||
 110             (s =~ "__.*")
 111         | 3 ->
 112             List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal;Token_c.CppDirective]
 113             ||
 114             (s =~ "__.*")
 115         | 4 ->
 116             List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal;Token_c.CppMacro]
 117             ||
 118             (s =~ "__.*")
 119
 120
 121         | 5 ->
 122             List.mem cppkind [Token_c.CppAttr;Token_c.CppPassingNormal;Token_c.CppDirective;Token_c.CppMacro]
 123             ||
 124             (s =~ "__.*")
 125
 126
 127
 128
 129         | _ -> failwith "not valid level passing number"
 130       in
 131       if legal_passing then None else Some (ii.Ast_c.pinfo)
 132
 133         (*
 134         | Ast_c.CppOther ->
 135             (match s with
 136             | s when s =~ "KERN_.*" -> None
 137             | s when s =~ "__.*" -> None
 138             | _ ->
 139                 Some (ii.Ast_c.pinfo)
 140             )
 141         *)
 142
 143
 144   | Parser_c.TCommentMisc ii
 145   | Parser_c.TAction ii
 146     ->
 147       Some (ii.Ast_c.pinfo)
 148   | _ ->
 149       None
 150  )
 151
 152 let count_lines_commentized xs =
 153   let line = ref (-1) in
 154   let count = ref 0 in
 155   begin
 156     commentized xs +>
 157     List.iter
 158       (function
 159           Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
 160             let newline = pinfo.Common.line in
 161             if newline <> !line
 162             then begin
 163               line := newline;
 164               incr count
 165             end
 166         | _ -> ());
 167     !count
 168   end
 169
 170
 171
 172 let print_commentized xs =
 173   let line = ref (-1) in
 174   begin
 175     let ys = commentized xs in
 176     ys +>
 177     List.iter
 178       (function
 179           Ast_c.OriginTok pinfo | Ast_c.ExpandedTok (_,(pinfo,_)) ->
 180             let newline = pinfo.Common.line in
 181             let s = pinfo.Common.str in
 182             let s = Str.global_substitute
 183                 (Str.regexp "\n") (fun s -> "") s
 184             in
 185             if newline =|= !line
 186             then prerr_string (s ^ " ")
 187             else begin
 188               if !line =|= -1
 189               then pr2_no_nl "passed:"
 190               else pr2_no_nl "\npassed:";
 191               line := newline;
 192               pr2_no_nl (s ^ " ");
 193             end
 194         | _ -> ());
 195     if not (null ys) then pr2 "";
 196   end
 197
 198
 199
 200
 201 (*****************************************************************************)
 202 (* Lexing only *)
 203 (*****************************************************************************)
 204
 205 (* called by parse_print_error_heuristic *)
 206 let tokens2 file =
 207  let table     = Common.full_charpos_to_pos file in
 208
 209  Common.with_open_infile file (fun chan ->
 210   let lexbuf = Lexing.from_channel chan in
 211   try
 212     let rec tokens_aux acc =
 213       let tok = Lexer_c.token lexbuf in
 214       (* fill in the line and col information *)
 215       let tok = tok +> TH.visitor_info_of_tok (fun ii ->
 216         { ii with Ast_c.pinfo=
 217           (* could assert pinfo.filename = file ? *)
 218           match Ast_c.pinfo_of_info ii with
 219             Ast_c.OriginTok pi ->
 220               Ast_c.OriginTok (Common.complete_parse_info file table pi)
 221           | Ast_c.ExpandedTok (pi,vpi) ->
 222               Ast_c.ExpandedTok((Common.complete_parse_info file table pi),vpi)
 223           | Ast_c.FakeTok (s,vpi) -> Ast_c.FakeTok (s,vpi)
 224           | Ast_c.AbstractLineTok pi -> failwith "should not occur"
 225       })
 226       in
 227
 228       if TH.is_eof tok
 229       then List.rev (tok::acc)
 230       else tokens_aux (tok::acc)
 231     in
 232     tokens_aux []
 233   with
 234     | Lexer_c.Lexical s ->
 235         failwith ("lexical error " ^ s ^ "\n =" ^
 236                   (Common.error_message file (lexbuf_to_strpos lexbuf)))
 237     | e -> raise e
 238  )
 239
 240 let time_lexing ?(profile=true) a =
 241   if profile
 242   then Common.profile_code_exclusif "LEXING" (fun () -> tokens2 a)
 243   else tokens2 a
 244 let tokens ?profile a =
 245   Common.profile_code "C parsing.tokens" (fun () -> time_lexing ?profile a)
 246
 247
 248 let tokens_of_string string =
 249   let lexbuf = Lexing.from_string string in
 250   try
 251     let rec tokens_s_aux () =
 252       let tok = Lexer_c.token lexbuf in
 253       if TH.is_eof tok
 254       then [tok]
 255       else tok::(tokens_s_aux ())
 256     in
 257     tokens_s_aux ()
 258   with
 259     | Lexer_c.Lexical s -> failwith ("lexical error " ^ s ^ "\n =" )
 260     | e -> raise e
 261
 262
 263 (*****************************************************************************)
 264 (* Parsing, but very basic, no more used *)
 265 (*****************************************************************************)
 266
 267 (*
 268  * !!!Those function use refs, and are not reentrant !!! so take care.
 269  * It use globals defined in Lexer_parser.
 270  *
 271  * update: because now lexer return comments tokens, those functions
 272  * may not work anymore.
 273  *)
 274
 275 let parse file =
 276   let lexbuf = Lexing.from_channel (open_in file) in
 277   let result = Parser_c.main Lexer_c.token lexbuf in
 278   result
 279
 280
 281 let parse_print_error file =
 282   let chan = (open_in file) in
 283   let lexbuf = Lexing.from_channel chan in
 284
 285   let error_msg () = Common.error_message file (lexbuf_to_strpos lexbuf) in
 286   try
 287     lexbuf +> Parser_c.main Lexer_c.token
 288   with
 289   | Lexer_c.Lexical s ->
 290       failwith ("lexical error " ^s^ "\n =" ^  error_msg ())
 291   | Parsing.Parse_error ->
 292       failwith ("parse error \n = " ^ error_msg ())
 293   | Semantic_c.Semantic (s, i) ->
 294       failwith ("semantic error " ^ s ^ "\n =" ^ error_msg ())
 295   | e -> raise e
 296
 297
 298
 299
 300 (*****************************************************************************)
 301 (* Parsing subelements, useful to debug parser *)
 302 (*****************************************************************************)
 303
 304 (*
 305  * !!!Those function use refs, and are not reentrant !!! so take care.
 306  * It use globals defined in Lexer_parser.
 307  *)
 308
 309
 310 (* old:
 311  *   let parse_gen parsefunc s =
 312  *     let lexbuf = Lexing.from_string s in
 313  *     let result = parsefunc Lexer_c.token lexbuf in
 314  *     result
 315  *)
 316
 317 let parse_gen parsefunc s =
 318   let toks = tokens_of_string s +> List.filter TH.is_not_comment in
 319
 320
 321   (* Why use this lexing scheme ? Why not classically give lexer func
 322    * to parser ? Because I now keep comments in lexer. Could
 323    * just do a simple wrapper that when comment ask again for a token,
 324    * but maybe simpler to use cur_tok technique.
 325    *)
 326   let all_tokens = ref toks in
 327   let cur_tok    = ref (List.hd !all_tokens) in
 328
 329   let lexer_function =
 330     (fun _ ->
 331       if TH.is_eof !cur_tok
 332       then (pr2 "LEXER: ALREADY AT END"; !cur_tok)
 333       else
 334         let v = Common.pop2 all_tokens in
 335         cur_tok := v;
 336         !cur_tok
 337     )
 338   in
 339   let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
 340   let result = parsefunc lexer_function lexbuf_fake in
 341   result
 342
 343
 344 let type_of_string       = parse_gen Parser_c.type_name
 345 let statement_of_string  = parse_gen Parser_c.statement
 346 let expression_of_string = parse_gen Parser_c.expr
 347
 348 (* ex: statement_of_string "(struct us_data* )psh->hostdata = NULL;" *)
 349
 350
 351
 352
 353
 354 (*****************************************************************************)
 355 (* Consistency checking *)
 356 (*****************************************************************************)
 357
 358 (* todo:
 359  *  could check that an ident has always the same class, be it a typedef
 360  *  (but sometimes do 'acpi_val acpi_val;'), an ident, a TMacroStatement,
 361  *  etc.
 362  *)
 363
 364 type class_ident =
 365   | CIdent (* can be var, func, field, tag, enum constant *)
 366   | CTypedef
 367
 368 let str_of_class_ident = function
 369   | CIdent -> "Ident"
 370   | CTypedef -> "Typedef"
 371
 372 (*
 373   | CMacro
 374   | CMacroString
 375   | CMacroStmt
 376   | CMacroDecl
 377   | CMacroIterator
 378   | CAttr
 379
 380 (* but take care that must still be able to use '=' *)
 381 type context = InFunction | InEnum | InStruct | InInitializer | InParams
 382 type class_token =
 383   | CIdent of class_ident
 384
 385   | CComment
 386   | CSpace
 387   | CCommentCpp of cppkind
 388   | CCommentMisc
 389   | CCppDirective
 390
 391   | COPar
 392   | CCPar
 393   | COBrace
 394   | CCBrace
 395
 396   | CSymbol
 397   | CReservedKwd (type | decl | qualif | flow | misc | attr)
 398 *)
 399
 400 let ident_to_typename ident =
 401   (Ast_c.nQ, (Ast_c.TypeName  (ident, Ast_c.noTypedefDef()), Ast_c.noii))
 402
 403
 404 (* parse_typedef_fix4 *)
 405 let consistency_checking2 xs =
 406
 407   (* first phase, gather data *)
 408   let stat = Hashtbl.create 101 in
 409
 410   (* default value for hash *)
 411   let v1 () = Hashtbl.create 101 in
 412   let v2 () = ref 0 in
 413
 414   let bigf = { Visitor_c.default_visitor_c with
 415
 416     Visitor_c.kexpr = (fun (k,bigf) x ->
 417       match Ast_c.unwrap_expr x with
 418       | Ast_c.Ident (id) ->
 419           let s = Ast_c.str_of_name id in
 420           stat +>
 421             Common.hfind_default s v1 +> Common.hfind_default CIdent v2 +>
 422             (fun aref -> incr aref)
 423
 424       | _ -> k x
 425     );
 426     Visitor_c.ktype = (fun (k,bigf) t ->
 427       match Ast_c.unwrap_typeC t with
 428       | Ast_c.TypeName (name,_typ) ->
 429           let s = Ast_c.str_of_name name in
 430           stat +>
 431             Common.hfind_default s v1 +> Common.hfind_default CTypedef v2 +>
 432             (fun aref -> incr aref)
 433
 434       | _ -> k t
 435     );
 436   }
 437   in
 438   xs +> List.iter (fun (p, info_item) -> Visitor_c.vk_toplevel bigf p);
 439
 440
 441   let ident_to_type = ref [] in
 442
 443
 444   (* second phase, analyze data *)
 445   stat +> Hashtbl.iter (fun k v ->
 446     let xs = Common.hash_to_list v in
 447     if List.length xs >= 2
 448     then begin
 449       pr2 ("CONFLICT:" ^ k);
 450       let sorted = xs +> List.sort (fun (ka,va) (kb,vb) ->
 451         if !va =|= !vb then
 452           (match ka, kb with
 453           | CTypedef, _ -> 1 (* first is smaller *)
 454           | _, CTypedef -> -1
 455           | _ -> 0
 456           )
 457         else compare !va !vb
 458       ) in
 459       let sorted = List.rev sorted in
 460       match sorted with
 461       | [CTypedef, i1;CIdent, i2] ->
 462           pr2 ("transforming some ident in typedef");
 463           push2 k ident_to_type;
 464       | _ ->
 465           pr2 ("TODO:other transforming?");
 466
 467     end
 468   );
 469
 470   (* third phase, update ast.
 471    * todo? but normally should try to handle correctly scope ? maybe sometime
 472    * sizeof(id) and even if id was for a long time an identifier, maybe
 473    * a few time, because of the scope it's actually really a type.
 474    *)
 475   if (null !ident_to_type)
 476   then xs
 477   else
 478     let bigf = { Visitor_c.default_visitor_c_s with
 479       Visitor_c.kdefineval_s = (fun (k,bigf) x ->
 480         match x with
 481         | Ast_c.DefineExpr e ->
 482             (match e with
 483             | (Ast_c.Ident (ident), _), _ii  ->
 484                 let s = Ast_c.str_of_name ident in
 485                 if List.mem s !ident_to_type
 486                 then
 487                   let t = ident_to_typename ident in
 488                   Ast_c.DefineType t
 489                 else k x
 490             | _ -> k x
 491             )
 492         | _ -> k x
 493       );
 494       Visitor_c.kexpr_s = (fun (k, bigf) x ->
 495         match x with
 496         | (Ast_c.SizeOfExpr e, tref), isizeof ->
 497             let i1 = tuple_of_list1 isizeof in
 498             (match e with
 499             | (Ast_c.ParenExpr e, _), iiparen ->
 500                 (match e with
 501                 | (Ast_c.Ident (ident), _), _ii  ->
 502
 503                     let s = Ast_c.str_of_name ident in
 504                     if List.mem s !ident_to_type
 505                     then
 506                       let t = ident_to_typename ident in
 507                       let (i2, i3) = tuple_of_list2 iiparen in
 508                       (Ast_c.SizeOfType t, tref), [i1;i2;i3]
 509                     else  k x
 510                 | _ -> k x
 511                 )
 512             | _ -> k x
 513             )
 514         | _ -> k x
 515       );
 516     } in
 517     xs +> List.map (fun (p, info_item) ->
 518       Visitor_c.vk_toplevel_s bigf p, info_item
 519     )
 520
 521
 522 let consistency_checking a  =
 523   Common.profile_code "C consistencycheck" (fun () -> consistency_checking2 a)
 524
 525
 526
 527 (*****************************************************************************)
 528 (* Error recovery *)
 529 (*****************************************************************************)
 530
 531 (* todo: do something if find Parser_c.Eof ? *)
 532 let rec find_next_synchro next already_passed =
 533
 534   (* Maybe because not enough }, because for example an ifdef contains
 535    * in both branch some opening {, we later eat too much, "on deborde
 536    * sur la fonction d'apres". So already_passed may be too big and
 537    * looking for next synchro point starting from next may not be the
 538    * best. So maybe we can find synchro point inside already_passed
 539    * instead of looking in next.
 540    *
 541    * But take care! must progress. We must not stay in infinite loop!
 542    * For instance now I have as a error recovery to look for
 543    * a "start of something", corresponding to start of function,
 544    * but must go beyond this start otherwise will loop.
 545    * So look at premier(external_declaration2) in parser.output and
 546    * pass at least those first tokens.
 547    *
 548    * I have chosen to start search for next synchro point after the
 549    * first { I found, so quite sure we will not loop. *)
 550
 551   let last_round = List.rev already_passed in
 552   let is_define =
 553     let xs = last_round +> List.filter TH.is_not_comment in
 554     match xs with
 555     | Parser_c.TDefine _::_ -> true
 556     | _ -> false
 557   in
 558   if is_define
 559   then find_next_synchro_define (last_round ++ next) []
 560   else
 561
 562   let (before, after) =
 563     last_round +> Common.span (fun tok ->
 564       match tok with
 565       (* by looking at TOBrace we are sure that the "start of something"
 566        * will not arrive too early
 567        *)
 568       | Parser_c.TOBrace _ -> false
 569       | Parser_c.TDefine _ -> false
 570       | _ -> true
 571     )
 572   in
 573   find_next_synchro_orig (after ++ next)  (List.rev before)
 574
 575
 576
 577 and find_next_synchro_define next already_passed =
 578   match next with
 579   | [] ->
 580       pr2 "ERROR-RECOV: end of file while in recovery mode";
 581       already_passed, []
 582   | (Parser_c.TDefEOL i as v)::xs  ->
 583       pr2 ("ERROR-RECOV: found sync end of #define, line "^i_to_s(TH.line_of_tok v));
 584       v::already_passed, xs
 585   | v::xs ->
 586       find_next_synchro_define xs (v::already_passed)
 587
 588
 589
 590
 591 and find_next_synchro_orig next already_passed =
 592   match next with
 593   | [] ->
 594       pr2 "ERROR-RECOV: end of file while in recovery mode";
 595       already_passed, []
 596
 597   | (Parser_c.TCBrace i as v)::xs when TH.col_of_tok v =|= 0 ->
 598       pr2 ("ERROR-RECOV: found sync '}' at line "^i_to_s (TH.line_of_tok v));
 599
 600       (match xs with
 601       | [] -> raise Impossible (* there is a EOF token normally *)
 602
 603       (* still useful: now parser.mly allow empty ';' so normally no pb *)
 604       | Parser_c.TPtVirg iptvirg::xs ->
 605           pr2 "ERROR-RECOV: found sync bis, eating } and ;";
 606           (Parser_c.TPtVirg iptvirg)::v::already_passed, xs
 607
 608       | Parser_c.TIdent x::Parser_c.TPtVirg iptvirg::xs ->
 609           pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
 610           (Parser_c.TPtVirg iptvirg)::(Parser_c.TIdent x)::v::already_passed,
 611           xs
 612
 613       | Parser_c.TCommentSpace sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
 614         ::xs ->
 615           pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
 616           (Parser_c.TCommentSpace sp)::
 617             (Parser_c.TPtVirg iptvirg)::
 618             (Parser_c.TIdent x)::
 619             v::
 620             already_passed,
 621           xs
 622
 623       | Parser_c.TCommentNewline sp::Parser_c.TIdent x::Parser_c.TPtVirg iptvirg
 624         ::xs ->
 625           pr2 "ERROR-RECOV: found sync bis, eating ident, }, and ;";
 626           (Parser_c.TCommentNewline sp)::
 627             (Parser_c.TPtVirg iptvirg)::
 628             (Parser_c.TIdent x)::
 629             v::
 630             already_passed,
 631           xs
 632
 633       | _ ->
 634           v::already_passed, xs
 635       )
 636   | v::xs when TH.col_of_tok v =|= 0 && TH.is_start_of_something v  ->
 637       pr2 ("ERROR-RECOV: found sync col 0 at line "^ i_to_s(TH.line_of_tok v));
 638       already_passed, v::xs
 639
 640   | v::xs ->
 641       find_next_synchro_orig xs (v::already_passed)
 642
 643
 644 (*****************************************************************************)
 645 (* Include/Define hacks *)
 646 (*****************************************************************************)
 647
 648 (* Sometimes I prefer to generate a single token for a list of things in the
 649  * lexer so that if I have to passed them, like for passing TInclude then
 650  * it's easy. Also if I don't do a single token, then I need to
 651  * parse the rest which may not need special stuff, like detecting
 652  * end of line which the parser is not really ready for. So for instance
 653  * could I parse a #include <a/b/c/xxx.h> as 2 or more tokens ? just
 654  * lex #include ? so then need recognize <a/b/c/xxx.h> as one token ?
 655  * but this kind of token is valid only after a #include and the
 656  * lexing and parsing rules are different for such tokens so not that
 657  * easy to parse such things in parser_c.mly. Hence the following hacks.
 658  *
 659  * less?: maybe could get rid of this like I get rid of some of fix_define.
 660  *)
 661
 662 (* ------------------------------------------------------------------------- *)
 663 (* helpers *)
 664 (* ------------------------------------------------------------------------- *)
 665
 666 (* used to generate new token from existing one *)
 667 let new_info posadd str ii =
 668   { Ast_c.pinfo =
 669       Ast_c.OriginTok { (Ast_c.parse_info_of_info ii) with
 670         charpos = Ast_c.pos_of_info ii + posadd;
 671         str     = str;
 672         column = Ast_c.col_of_info ii + posadd;
 673       };
 674     (* must generate a new ref each time, otherwise share *)
 675     cocci_tag = ref Ast_c.emptyAnnot;
 676     comments_tag = ref Ast_c.emptyComments;
 677    }
 678
 679
 680 let rec comment_until_defeol xs =
 681   match xs with
 682   | [] -> failwith "cant find end of define token TDefEOL"
 683   | x::xs ->
 684       (match x with
 685       | Parser_c.TDefEOL i ->
 686           Parser_c.TCommentCpp (Token_c.CppDirective, TH.info_of_tok x)
 687           ::xs
 688       | _ ->
 689           let x' =
 690             (* bugfix: otherwise may lose a TComment token *)
 691             if TH.is_real_comment x
 692             then x
 693             else Parser_c.TCommentCpp (Token_c.CppPassingNormal (*good?*), TH.info_of_tok x)
 694           in
 695           x'::comment_until_defeol xs
 696       )
 697
 698 let drop_until_defeol xs =
 699   List.tl
 700     (Common.drop_until (function Parser_c.TDefEOL _ -> true | _ -> false) xs)
 701
 702
 703
 704 (* ------------------------------------------------------------------------- *)
 705 (* returns a pair (replaced token, list of next tokens) *)
 706 (* ------------------------------------------------------------------------- *)
 707
 708 let tokens_include (info, includes, filename, inifdef) =
 709   Parser_c.TIncludeStart (Ast_c.rewrap_str includes info, inifdef),
 710   [Parser_c.TIncludeFilename
 711       (filename, (new_info (String.length includes) filename info))
 712   ]
 713
 714 (*****************************************************************************)
 715 (* Parsing default define macros, usually in a standard.h file *)
 716 (*****************************************************************************)
 717
 718 let parse_cpp_define_file2 file =
 719   let toks = tokens ~profile:false file in
 720   let toks = Parsing_hacks.fix_tokens_define toks in
 721   Parsing_hacks.extract_cpp_define toks
 722
 723 let parse_cpp_define_file a =
 724   Common.profile_code_exclusif "HACK" (fun () -> parse_cpp_define_file2 a)
 725
 726 (* can not be put in parsing_hack, cos then mutually recursive problem as
 727  * we also want to parse the standard.h file.
 728  *)
 729 let init_defs std_h =
 730   if not (Common.lfile_exists std_h)
 731   then pr2 ("warning: Can't find default macro file: " ^ std_h)
 732   else begin
 733     pr2 ("init_defs: " ^ std_h);
 734     Parsing_hacks._defs := Common.hash_of_list (parse_cpp_define_file std_h);
 735   end
 736
 737
 738 (*****************************************************************************)
 739 (* Main entry point *)
 740 (*****************************************************************************)
 741
 742 type info_item =  string * Parser_c.token list
 743
 744 type program2 = toplevel2 list
 745      and toplevel2 = Ast_c.toplevel * info_item
 746
 747 let program_of_program2 xs =
 748   xs +> List.map fst
 749
 750 let with_program2 f program2 =
 751   program2
 752   +> Common.unzip
 753   +> (fun (program, infos) ->
 754     f program, infos
 755   )
 756   +> Common.uncurry Common.zip
 757
 758
 759
 760 (* The use of local refs (remaining_tokens, passed_tokens, ...) makes
 761  * possible error recovery. Indeed, they allow to skip some tokens and
 762  * still be able to call again the ocamlyacc parser. It is ugly code
 763  * because we cant modify ocamllex and ocamlyacc. As we want some
 764  * extended lexing tricks, we have to use such refs.
 765  *
 766  * Those refs are now also used for my lalr(k) technique. Indeed They
 767  * store the futur and previous tokens that were parsed, and so
 768  * provide enough context information for powerful lex trick.
 769  *
 770  * - passed_tokens_last_ckp stores the passed tokens since last
 771  *   checkpoint. Used for NotParsedCorrectly and also to build the
 772  *   info_item attached to each program_element.
 773  * - passed_tokens_clean is used for lookahead, in fact for lookback.
 774  * - remaining_tokens_clean is used for lookahead. Now remaining_tokens
 775  *   contain some comments and so would make pattern matching difficult
 776  *   in lookahead. Hence this variable. We would like also to get rid
 777  *   of cpp instruction because sometimes a cpp instruction is between
 778  *   two tokens and makes a pattern matching fail. But lookahead also
 779  *   transform some cpp instruction (in comment) so can't remove them.
 780  *
 781  * So remaining_tokens, passed_tokens_last_ckp contain comment-tokens,
 782  * whereas passed_tokens_clean and remaining_tokens_clean does not contain
 783  * comment-tokens.
 784  *
 785  * Normally we have:
 786  * toks = (reverse passed_tok) ++ cur_tok ++ remaining_tokens
 787  *    after the call to pop2.
 788  * toks = (reverse passed_tok) ++ remaining_tokens
 789  *     at the and of the lexer_function call.
 790  * At the very beginning, cur_tok and remaining_tokens overlap, but not after.
 791  * At the end of lexer_function call,  cur_tok  overlap  with passed_tok.
 792  *
 793  * convention: I use "tr"  for "tokens refs"
 794  *
 795  * I now also need this lexing trick because the lexer return comment
 796  * tokens.
 797  *)
 798
 799 type tokens_state = {
 800   mutable rest :         Parser_c.token list;
 801   mutable rest_clean :   Parser_c.token list;
 802   mutable current :      Parser_c.token;
 803   (* it's passed since last "checkpoint", not passed from the beginning *)
 804   mutable passed :       Parser_c.token list;
 805   mutable passed_clean : Parser_c.token list;
 806 }
 807 let clone_tokens_stat tr =
 808   { rest = tr.rest;
 809     rest_clean = tr.rest_clean;
 810     current = tr.current;
 811     passed = tr.passed;
 812     passed_clean = tr.passed_clean;
 813   }
 814 let copy_tokens_stat ~src ~dst =
 815   dst.rest <- src.rest;
 816   dst.rest_clean <- src.rest_clean;
 817   dst.current <- src.current;
 818   dst.passed <- src.passed;
 819   dst.passed_clean <-  src.passed_clean;
 820   ()
 821
 822 let rec filter_noise n xs =
 823   match n, xs with
 824   | _, [] -> []
 825   | 0, xs -> xs
 826   | n, x::xs ->
 827       (match x with
 828       | Parser_c.TMacroAttr _ ->
 829           filter_noise (n-1) xs
 830       | _ ->
 831           x::filter_noise (n-1) xs
 832       )
 833
 834 let clean_for_lookahead xs =
 835   match xs with
 836   | [] -> []
 837   | [x] -> [x]
 838   | x::xs ->
 839       x::filter_noise 10 xs
 840
 841
 842
 843 (* Hacked lex. This function use refs passed by parse_print_error_heuristic
 844  * tr means token refs.
 845  *)
 846 let rec lexer_function ~pass tr = fun lexbuf ->
 847   match tr.rest with
 848   | [] -> pr2 "ALREADY AT END"; tr.current
 849   | v::xs ->
 850     tr.rest <- xs;
 851     tr.current <- v;
 852
 853     if !Flag_parsing_c.debug_lexer then Common.pr2_gen v;
 854
 855     if TH.is_comment v
 856     then begin
 857       tr.passed <- v::tr.passed;
 858       lexer_function ~pass tr lexbuf
 859     end
 860     else begin
 861       let x = List.hd tr.rest_clean  in
 862       tr.rest_clean <- List.tl tr.rest_clean;
 863       assert (x =*= v);
 864
 865       (match v with
 866
 867       (* fix_define1.
 868        *
 869        * Why not in parsing_hacks lookahead and do passing like
 870        * I do for some ifdef directives ? Because here I also need to
 871        * generate some tokens sometimes and so I need access to the
 872        * tr.passed, tr.rest, etc.
 873        *)
 874       | Parser_c.TDefine (tok) ->
 875           if not (LP.current_context () =*= LP.InTopLevel) &&
 876             (!Flag_parsing_c.cpp_directive_passing || (pass =|= 2))
 877           then begin
 878             incr Stat.nDefinePassing;
 879             pr2_once ("CPP-DEFINE: inside function, I treat it as comment");
 880             let v' = Parser_c.TCommentCpp (Token_c.CppDirective,TH.info_of_tok v)
 881             in
 882             tr.passed <- v'::tr.passed;
 883             tr.rest       <- comment_until_defeol tr.rest;
 884             tr.rest_clean <- drop_until_defeol tr.rest_clean;
 885             lexer_function ~pass tr lexbuf
 886           end
 887           else begin
 888             tr.passed <- v::tr.passed;
 889             tr.passed_clean <- v::tr.passed_clean;
 890             v
 891           end
 892
 893       | Parser_c.TInclude (includes, filename, inifdef, info) ->
 894           if not (LP.current_context () =*= LP.InTopLevel)  &&
 895             (!Flag_parsing_c.cpp_directive_passing || (pass =|= 2))
 896           then begin
 897             incr Stat.nIncludePassing;
 898             pr2_once ("CPP-INCLUDE: inside function, I treat it as comment");
 899             let v = Parser_c.TCommentCpp(Token_c.CppDirective, info) in
 900             tr.passed <- v::tr.passed;
 901             lexer_function ~pass tr lexbuf
 902           end
 903           else begin
 904             let (v,new_tokens) =
 905               tokens_include (info, includes, filename, inifdef) in
 906             let new_tokens_clean =
 907               new_tokens +> List.filter TH.is_not_comment  in
 908
 909             tr.passed <- v::tr.passed;
 910             tr.passed_clean <- v::tr.passed_clean;
 911             tr.rest <- new_tokens ++ tr.rest;
 912             tr.rest_clean <- new_tokens_clean ++ tr.rest_clean;
 913             v
 914           end
 915
 916       | _ ->
 917
 918           (* typedef_fix1 *)
 919           let v = match v with
 920             | Parser_c.TIdent (s, ii) ->
 921                 if
 922                   LP.is_typedef s &&
 923                     not (!Flag_parsing_c.disable_add_typedef) &&
 924                     pass =|= 1
 925                 then Parser_c.TypedefIdent (s, ii)
 926                 else Parser_c.TIdent (s, ii)
 927             | x -> x
 928           in
 929
 930           let v = Parsing_hacks.lookahead ~pass
 931             (clean_for_lookahead (v::tr.rest_clean))
 932             tr.passed_clean in
 933
 934           tr.passed <- v::tr.passed;
 935
 936           (* the lookahead may have changed the status of the token and
 937            * consider it as a comment, for instance some #include are
 938            * turned into comments, hence this code. *)
 939           match v with
 940           | Parser_c.TCommentCpp _ -> lexer_function ~pass tr lexbuf
 941           | v ->
 942               tr.passed_clean <- v::tr.passed_clean;
 943               v
 944       )
 945     end
 946
 947
 948
 949 let get_one_elem ~pass tr (file, filelines) =
 950
 951   if not (LP.is_enabled_typedef()) && !Flag_parsing_c.debug_typedef
 952   then pr2 "TYPEDEF:_handle_typedef=false. Not normal if dont come from exn";
 953
 954   (* normally have to do that only when come from an exception in which
 955    * case the dt() may not have been done
 956    * TODO but if was in scoped scope ? have to let only the last scope
 957    * so need do a LP.lexer_reset_typedef ();
 958    *)
 959   LP.enable_typedef();
 960   LP._lexer_hint := (LP.default_hint ());
 961   LP.save_typedef_state();
 962
 963   tr.passed <- [];
 964
 965   let lexbuf_fake = Lexing.from_function (fun buf n -> raise Impossible) in
 966
 967   (try
 968       (* -------------------------------------------------- *)
 969       (* Call parser *)
 970       (* -------------------------------------------------- *)
 971       Common.profile_code_exclusif "YACC" (fun () ->
 972         Left (Parser_c.celem (lexer_function ~pass tr) lexbuf_fake)
 973       )
 974     with e -> begin
 975       if (pass =|= 1 && !Flag_parsing_c.disable_two_pass)|| (pass =|= 2)
 976       then begin
 977         (match e with
 978         (* Lexical is not anymore launched I think *)
 979         | Lexer_c.Lexical s ->
 980             pr2 ("lexical error " ^s^ "\n =" ^ error_msg_tok tr.current)
 981         | Parsing.Parse_error ->
 982             pr2 ("parse error \n = " ^ error_msg_tok tr.current)
 983         | Semantic_c.Semantic (s, i) ->
 984             pr2 ("semantic error " ^s^ "\n ="^ error_msg_tok tr.current)
 985         | e -> raise e
 986         )
 987       end;
 988       LP.restore_typedef_state();
 989
 990       (* must keep here, before the code that adjusts the tr fields *)
 991       let line_error = TH.line_of_tok tr.current in
 992
 993
 994       (*  error recovery, go to next synchro point *)
 995       let (passed', rest') = find_next_synchro tr.rest tr.passed in
 996       tr.rest <- rest';
 997       tr.passed <- passed';
 998
 999       tr.current <- List.hd passed';
1000       tr.passed_clean <- [];           (* enough ? *)
1001       (* with error recovery, rest and rest_clean may not be in sync *)
1002       tr.rest_clean <- (tr.rest +> List.filter TH.is_not_comment);
1003
1004
1005       let info_of_bads = Common.map_eff_rev TH.info_of_tok tr.passed in
1006       Right (info_of_bads,  line_error, tr.passed)
1007     end
1008   )
1009
1010
1011
1012
1013 (* note: as now we go in 2 passes, there is first all the error message of
1014  * the lexer, and then the error of the parser. It is not anymore
1015  * interwinded.
1016  *
1017  * !!!This function use refs, and is not reentrant !!! so take care.
1018  * It use globals defined in Lexer_parser and also the _defs global
1019  * in parsing_hack.ml.
1020  *
1021  * This function uses internally some semi globals in the
1022  * tokens_stat record and parsing_stat record.
1023  *)
1024
1025 let parse_print_error_heuristic2 file =
1026
1027   let filelines = Common.cat_array file in
1028   let stat = Parsing_stat.default_stat file in
1029
1030   (* -------------------------------------------------- *)
1031   (* call lexer and get all the tokens *)
1032   (* -------------------------------------------------- *)
1033   LP.lexer_reset_typedef();
1034   Parsing_hacks.ifdef_paren_cnt := 0;
1035   let toks_orig = tokens file in
1036
1037   let toks = Parsing_hacks.fix_tokens_define toks_orig in
1038   let toks = Parsing_hacks.fix_tokens_cpp toks in
1039
1040   let tr = {
1041     rest       = toks;
1042     rest_clean = (toks +> List.filter TH.is_not_comment);
1043     current    = (List.hd toks);
1044     passed = [];
1045     passed_clean = [];
1046   } in
1047
1048
1049
1050
1051   let rec loop tr =
1052
1053     (* todo?: I am not sure that it represents current_line, cos maybe
1054      * tr.current partipated in the previous parsing phase, so maybe tr.current
1055      * is not the first token of the next parsing phase. Same with checkpoint2.
1056      * It would be better to record when we have a } or ; in parser.mly,
1057      *  cos we know that they are the last symbols of external_declaration2.
1058      *
1059      * bugfix: may not be equal to 'file' as after macro expansions we can
1060      * start to parse a new entity from the body of a macro, for instance
1061      * when parsing a define_machine() body, cf standard.h
1062      *)
1063     let checkpoint = TH.line_of_tok tr.current in
1064     let checkpoint_file = TH.file_of_tok tr.current in
1065
1066     let tr_save = clone_tokens_stat tr in
1067
1068     (* call the parser *)
1069     let elem =
1070       let pass1 = get_one_elem ~pass:1 tr (file, filelines) in
1071       match pass1 with
1072       | Left e -> Left e
1073       | Right res ->
1074           if !Flag_parsing_c.disable_two_pass
1075           then Right res
1076           else begin
1077             pr2 "parsing pass2: try again";
1078             copy_tokens_stat ~src:tr_save ~dst: tr;
1079             let pass2 = get_one_elem ~pass:2 tr (file, filelines) in
1080             pass2
1081           end
1082     in
1083
1084
1085     (* again not sure if checkpoint2 corresponds to end of bad region *)
1086     let checkpoint2 = TH.line_of_tok tr.current in (* <> line_error *)
1087     let checkpoint2_file = TH.file_of_tok tr.current in
1088
1089     let was_define =
1090       (match elem with
1091       | Left _ -> false
1092       | Right (_, line_error, _) ->
1093           let was_define =
1094             let xs = tr.passed +> List.rev +> List.filter TH.is_not_comment in
1095             if List.length xs >= 2
1096             then
1097               (match Common.head_middle_tail xs with
1098               | Parser_c.TDefine _, _, Parser_c.TDefEOL _ ->
1099                   true
1100               | _ -> false
1101               )
1102             else begin
1103               pr2 "WEIRD: length list of error recovery tokens < 2 ";
1104               false
1105             end
1106           in
1107           (if was_define && !Flag_parsing_c.filter_msg_define_error
1108           then ()
1109           else
1110             (* bugfix: *)
1111             if (checkpoint_file =$= checkpoint2_file) &&
1112                 checkpoint_file =$= file
1113             then print_bad line_error (checkpoint, checkpoint2) filelines
1114             else pr2 "PB: bad: but on tokens not from original file"
1115           );
1116           was_define
1117       ) in
1118
1119
1120     let diffline =
1121       if (checkpoint_file =$= checkpoint2_file) && (checkpoint_file =$= file)
1122       then (checkpoint2 - checkpoint)
1123       else 0
1124         (* TODO? so if error come in middle of something ? where the
1125          * start token was from original file but synchro found in body
1126          * of macro ? then can have wrong number of lines stat.
1127          * Maybe simpler just to look at tr.passed and count
1128          * the lines in the token from the correct file ?
1129          *)
1130     in
1131     let info = mk_info_item file (List.rev tr.passed) in
1132
1133     (* some stat updates *)
1134     stat.Stat.commentized <-
1135       stat.Stat.commentized + count_lines_commentized (snd info);
1136
1137     let elem =
1138       match elem with
1139       | Left e ->
1140           stat.Stat.correct <- stat.Stat.correct + diffline;
1141           e
1142       | Right (info_of_bads, line_error, toks_of_bads) ->
1143           if was_define && !Flag_parsing_c.filter_define_error
1144           then stat.Stat.correct <- stat.Stat.correct + diffline
1145           else stat.Stat.bad     <- stat.Stat.bad     + diffline;
1146
1147           let pbline =
1148             toks_of_bads
1149             +> Common.filter (TH.is_same_line_or_close line_error)
1150             +> Common.filter TH.is_ident_like
1151           in
1152           let error_info =
1153             (pbline +> List.map TH.str_of_tok), line_error
1154           in
1155           stat.Stat.problematic_lines <-
1156             error_info::stat.Stat.problematic_lines;
1157
1158           Ast_c.NotParsedCorrectly info_of_bads
1159     in
1160
1161     (match elem with
1162     | Ast_c.FinalDef x -> [(Ast_c.FinalDef x, info)]
1163     | xs -> (xs, info):: loop tr (* recurse *)
1164     )
1165   in
1166   let v = loop tr in
1167   let v = consistency_checking v in
1168   (v, stat)
1169
1170
1171 let time_total_parsing a  =
1172   Common.profile_code "TOTAL" (fun () -> parse_print_error_heuristic2 a)
1173
1174 let parse_print_error_heuristic a  =
1175   Common.profile_code "C parsing" (fun () -> time_total_parsing a)
1176
1177
1178 (* alias *)
1179 let parse_c_and_cpp a = parse_print_error_heuristic a
1180
1181 (*****************************************************************************)
1182 (* Same but faster cos memoize stuff *)
1183 (*****************************************************************************)
1184 let parse_cache file =
1185   if not !Flag_parsing_c.use_cache then parse_print_error_heuristic file
1186   else
1187   let _ = pr2 "TOFIX" in
1188   let need_no_changed_files =
1189     (* should use Sys.argv.(0), would be safer. *)
1190
1191     [
1192       (* TOFIX
1193       Config.path ^ "/parsing_c/c_parser.cma";
1194       (* we may also depend now on the semantic patch because
1195          the SP may use macro and so we will disable some of the
1196          macro expansions from standard.h.
1197       *)
1198       !Config.std_h;
1199       *)
1200     ]
1201   in
1202   let need_no_changed_variables =
1203     (* could add some of the flags of flag_parsing_c.ml *)
1204     []
1205   in
1206   Common.cache_computation_robust
1207     file ".ast_raw"
1208     (need_no_changed_files, need_no_changed_variables) ".depend_raw"
1209     (fun () -> parse_print_error_heuristic file)
1210
1211
1212
1213 (*****************************************************************************)
1214 (* Some special cases *)
1215 (*****************************************************************************)
1216
1217 let (cstatement_of_string: string -> Ast_c.statement) = fun s ->
1218   Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ "\n}");
1219   let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1220   program +> Common.find_some (fun (e,_) ->
1221     match e with
1222     | Ast_c.Definition ({Ast_c.f_body = [Ast_c.StmtElem st]},_) -> Some st
1223     | _ -> None
1224   )
1225
1226 let (cexpression_of_string: string -> Ast_c.expression) = fun s ->
1227   Common.write_file ("/tmp/__cocci.c") ("void main() { \n" ^ s ^ ";\n}");
1228   let program = parse_c_and_cpp ("/tmp/__cocci.c") +> fst in
1229   program +> Common.find_some (fun (e,_) ->
1230     match e with
1231     | Ast_c.Definition ({Ast_c.f_body = compound},_) ->
1232         (match compound with
1233         | [Ast_c.StmtElem (Ast_c.ExprStatement (Some e),ii)] -> Some e
1234         | _ -> None
1235         )
1236     | _ -> None
1237   )