commons/ocamlextra/setPt.ml

   1 (*
   2  * Ptset: Sets of integers implemented as Patricia trees.
   3  * Copyright (C) 2000 Jean-Christophe FILLIATRE
   4  *
   5  * This software is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Library General Public
   7  * License version 2, as published by the Free Software Foundation.
   8  *
   9  * This software is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  12  *
  13  * See the GNU Library General Public License version 2 for more details
  14  * (enclosed in the file LGPL).
  15  *)
  16
  17 (*i ptset.ml 1.8 2001/06/28 07:05:55 filliatr Exp  i*)
  18
  19 (*s Sets of integers implemented as Patricia trees, following Chris
  20     Okasaki and Andrew Gill's paper {\em Fast Mergeable Integer Maps}
  21     ({\tt\small http://www.cs.columbia.edu/\~{}cdo/papers.html\#ml98maps}).
  22     Patricia trees provide faster operations than standard library's
  23     module [Set], and especially very fast [union], [subset], [inter]
  24     and [diff] operations. *)
  25
  26 (*s The idea behind Patricia trees is to build a {\em trie} on the
  27     binary digits of the elements, and to compact the representation
  28     by branching only one the relevant bits (i.e. the ones for which
  29     there is at least on element in each subtree). We implement here
  30     {\em little-endian} Patricia trees: bits are processed from
  31     least-significant to most-significant. The trie is implemented by
  32     the following type [t]. [Empty] stands for the empty trie, and
  33     [Leaf k] for the singleton [k]. (Note that [k] is the actual
  34     element.) [Branch (m,p,l,r)] represents a branching, where [p] is
  35     the prefix (from the root of the trie) and [m] is the branching
  36     bit (a power of 2). [l] and [r] contain the subsets for which the
  37     branching bit is respectively 0 and 1. Invariant: the trees [l]
  38     and [r] are not empty. *)
  39
  40 (*i*)
  41 type elt = int
  42 (*i*)
  43
  44 type t =
  45   | Empty
  46   | Leaf of int
  47   | Branch of int * int * t * t
  48
  49 (*s Example: the representation of the set $\{1,4,5\}$ is
  50     $$\mathtt{Branch~(0,~1,~Leaf~4,~Branch~(1,~4,~Leaf~1,~Leaf~5))}$$
  51     The first branching bit is the bit 0 (and the corresponding prefix
  52     is [0b0], not of use here), with $\{4\}$ on the left and $\{1,5\}$ on the
  53     right. Then the right subtree branches on bit 2 (and so has a branching
  54     value of $2^2 = 4$), with prefix [0b01 = 1]. *)
  55
  56 (*s Empty set and singletons. *)
  57
  58 let empty = Empty
  59
  60 let is_empty = function Empty -> true | _ -> false
  61
  62 let singleton k = Leaf k
  63
  64 (*s Testing the occurrence of a value is similar to the search in a
  65     binary search tree, where the branching bit is used to select the
  66     appropriate subtree. *)
  67
  68 let zero_bit k m = (k land m) == 0
  69
  70 let rec mem k = function
  71   | Empty -> false
  72   | Leaf j -> k == j
  73   | Branch (_, m, l, r) -> mem k (if zero_bit k m then l else r)
  74
  75 (*s The following operation [join] will be used in both insertion and
  76     union. Given two non-empty trees [t0] and [t1] with longest common
  77     prefixes [p0] and [p1] respectively, which are supposed to
  78     disagree, it creates the union of [t0] and [t1]. For this, it
  79     computes the first bit [m] where [p0] and [p1] disagree and create
  80     a branching node on that bit. Depending on the value of that bit
  81     in [p0], [t0] will be the left subtree and [t1] the right one, or
  82     the converse. Computing the first branching bit of [p0] and [p1]
  83     uses a nice property of twos-complement representation of integers. *)
  84
  85 let lowest_bit x = x land (-x)
  86
  87 let branching_bit p0 p1 = lowest_bit (p0 lxor p1)
  88
  89 let mask p m = p land (m-1)
  90
  91 let join (p0,t0,p1,t1) =
  92   let m = branching_bit p0 p1 in
  93   if zero_bit p0 m then
  94     Branch (mask p0 m, m, t0, t1)
  95   else
  96     Branch (mask p0 m, m, t1, t0)
  97
  98 (*s Then the insertion of value [k] in set [t] is easily implemented
  99     using [join].  Insertion in a singleton is just the identity or a
 100     call to [join], depending on the value of [k].  When inserting in
 101     a branching tree, we first check if the value to insert [k]
 102     matches the prefix [p]: if not, [join] will take care of creating
 103     the above branching; if so, we just insert [k] in the appropriate
 104     subtree, depending of the branching bit. *)
 105
 106 let match_prefix k p m = (mask k m) == p
 107
 108 let add k t =
 109   let rec ins = function
 110     | Empty -> Leaf k
 111     | Leaf j as t ->
 112         if j == k then t else join (k, Leaf k, j, t)
 113     | Branch (p,m,t0,t1) as t ->
 114         if match_prefix k p m then
 115           if zero_bit k m then
 116             Branch (p, m, ins t0, t1)
 117           else
 118             Branch (p, m, t0, ins t1)
 119         else
 120           join (k, Leaf k, p, t)
 121   in
 122   ins t
 123
 124 (*s The code to remove an element is basically similar to the code of
 125     insertion. But since we have to maintain the invariant that both
 126     subtrees of a [Branch] node are non-empty, we use here the
 127     ``smart constructor'' [branch] instead of [Branch]. *)
 128
 129 let branch = function
 130   | (_,_,Empty,t) -> t
 131   | (_,_,t,Empty) -> t
 132   | (p,m,t0,t1)   -> Branch (p,m,t0,t1)
 133
 134 let remove k t =
 135   let rec rmv = function
 136     | Empty -> Empty
 137     | Leaf j as t -> if k == j then Empty else t
 138     | Branch (p,m,t0,t1) as t ->
 139         if match_prefix k p m then
 140           if zero_bit k m then
 141             branch (p, m, rmv t0, t1)
 142           else
 143             branch (p, m, t0, rmv t1)
 144         else
 145           t
 146   in
 147   rmv t
 148
 149 (*s One nice property of Patricia trees is to support a fast union
 150     operation (and also fast subset, difference and intersection
 151     operations). When merging two branching trees we examine the
 152     following four cases: (1) the trees have exactly the same
 153     prefix; (2/3) one prefix contains the other one; and (4) the
 154     prefixes disagree. In cases (1), (2) and (3) the recursion is
 155     immediate; in case (4) the function [join] creates the appropriate
 156     branching. *)
 157
 158 let rec merge = function
 159   | Empty, t  -> t
 160   | t, Empty  -> t
 161   | Leaf k, t -> add k t
 162   | t, Leaf k -> add k t
 163   | (Branch (p,m,s0,s1) as s), (Branch (q,n,t0,t1) as t) ->
 164       if m == n && match_prefix q p m then
 165         (* The trees have the same prefix. Merge the subtrees. *)
 166         Branch (p, m, merge (s0,t0), merge (s1,t1))
 167       else if m < n && match_prefix q p m then
 168         (* [q] contains [p]. Merge [t] with a subtree of [s]. *)
 169         if zero_bit q m then
 170           Branch (p, m, merge (s0,t), s1)
 171         else
 172           Branch (p, m, s0, merge (s1,t))
 173       else if m > n && match_prefix p q n then
 174         (* [p] contains [q]. Merge [s] with a subtree of [t]. *)
 175         if zero_bit p n then
 176           Branch (q, n, merge (s,t0), t1)
 177         else
 178           Branch (q, n, t0, merge (s,t1))
 179       else
 180         (* The prefixes disagree. *)
 181         join (p, s, q, t)
 182
 183 let union s t = merge (s,t)
 184
 185 (*s When checking if [s1] is a subset of [s2] only two of the above
 186     four cases are relevant: when the prefixes are the same and when the
 187     prefix of [s1] contains the one of [s2], and then the recursion is
 188     obvious. In the other two cases, the result is [false]. *)
 189
 190 let rec subset s1 s2 = match (s1,s2) with
 191   | Empty, _ -> true
 192   | _, Empty -> false
 193   | Leaf k1, _ -> mem k1 s2
 194   | Branch _, Leaf _ -> false
 195   | Branch (p1,m1,l1,r1), Branch (p2,m2,l2,r2) ->
 196       if m1 == m2 && p1 == p2 then
 197         subset l1 l2 && subset r1 r2
 198       else if m1 > m2 && match_prefix p1 p2 m2 then
 199         if zero_bit p1 m2 then
 200           subset l1 l2 && subset r1 l2
 201         else
 202           subset l1 r2 && subset r1 r2
 203       else
 204         false
 205
 206 (*s To compute the intersection and the difference of two sets, we
 207     still examine the same four cases as in [merge]. The recursion is
 208     then obvious. *)
 209
 210 let rec inter s1 s2 = match (s1,s2) with
 211   | Empty, _ -> Empty
 212   | _, Empty -> Empty
 213   | Leaf k1, _ -> if mem k1 s2 then s1 else Empty
 214   | _, Leaf k2 -> if mem k2 s1 then s2 else Empty
 215   | Branch (p1,m1,l1,r1), Branch (p2,m2,l2,r2) ->
 216       if m1 == m2 && p1 == p2 then
 217         merge (inter l1 l2, inter r1 r2)
 218       else if m1 < m2 && match_prefix p2 p1 m1 then
 219         inter (if zero_bit p2 m1 then l1 else r1) s2
 220       else if m1 > m2 && match_prefix p1 p2 m2 then
 221         inter s1 (if zero_bit p1 m2 then l2 else r2)
 222       else
 223         Empty
 224
 225 let rec diff s1 s2 = match (s1,s2) with
 226   | Empty, _ -> Empty
 227   | _, Empty -> s1
 228   | Leaf k1, _ -> if mem k1 s2 then Empty else s1
 229   | _, Leaf k2 -> remove k2 s1
 230   | Branch (p1,m1,l1,r1), Branch (p2,m2,l2,r2) ->
 231       if m1 == m2 && p1 == p2 then
 232         merge (diff l1 l2, diff r1 r2)
 233       else if m1 < m2 && match_prefix p2 p1 m1 then
 234         if zero_bit p2 m1 then
 235           merge (diff l1 s2, r1)
 236         else
 237           merge (l1, diff r1 s2)
 238       else if m1 > m2 && match_prefix p1 p2 m2 then
 239         if zero_bit p1 m2 then diff s1 l2 else diff s1 r2
 240       else
 241         s1
 242
 243 (*s All the following operations ([cardinal], [iter], [fold], [for_all],
 244     [exists], [filter], [partition], [choose], [elements]) are
 245     implemented as for any other kind of binary trees. *)
 246
 247 let rec cardinal = function
 248   | Empty -> 0
 249   | Leaf _ -> 1
 250   | Branch (_,_,t0,t1) -> cardinal t0 + cardinal t1
 251
 252 let rec iter f = function
 253   | Empty -> ()
 254   | Leaf k -> f k
 255   | Branch (_,_,t0,t1) -> iter f t0; iter f t1
 256
 257 let rec fold f s accu = match s with
 258   | Empty -> accu
 259   | Leaf k -> f k accu
 260   | Branch (_,_,t0,t1) -> fold f t0 (fold f t1 accu)
 261
 262 let rec for_all p = function
 263   | Empty -> true
 264   | Leaf k -> p k
 265   | Branch (_,_,t0,t1) -> for_all p t0 && for_all p t1
 266
 267 let rec exists p = function
 268   | Empty -> false
 269   | Leaf k -> p k
 270   | Branch (_,_,t0,t1) -> exists p t0 || exists p t1
 271
 272 let filter p s =
 273   let rec filt acc = function
 274     | Empty -> acc
 275     | Leaf k -> if p k then add k acc else acc
 276     | Branch (_,_,t0,t1) -> filt (filt acc t0) t1
 277   in
 278   filt Empty s
 279
 280 let partition p s =
 281   let rec part (t,f as acc) = function
 282     | Empty -> acc
 283     | Leaf k -> if p k then (add k t, f) else (t, add k f)
 284     | Branch (_,_,t0,t1) -> part (part acc t0) t1
 285   in
 286   part (Empty, Empty) s
 287
 288 let rec choose = function
 289   | Empty -> raise Not_found
 290   | Leaf k -> k
 291   | Branch (_, _,t0,_) -> choose t0   (* we know that [t0] is non-empty *)
 292
 293 let elements s =
 294   let rec elements_aux acc = function
 295     | Empty -> acc
 296     | Leaf k -> k :: acc
 297     | Branch (_,_,l,r) -> elements_aux (elements_aux acc l) r
 298   in
 299   elements_aux [] s
 300
 301 (*s There is no way to give an efficient implementation of [min_elt]
 302     and [max_elt], as with binary search trees.  The following
 303     implementation is a traversal of all elements, barely more
 304     efficient than [fold min t (choose t)] (resp. [fold max t (choose
 305     t)]). Note that we use the fact that there is no constructor
 306     [Empty] under [Branch] and therefore always a minimal
 307     (resp. maximal) element there. *)
 308
 309 let rec min_elt = function
 310   | Empty -> raise Not_found
 311   | Leaf k -> k
 312   | Branch (_,_,s,t) -> min (min_elt s) (min_elt t)
 313
 314 let rec max_elt = function
 315   | Empty -> raise Not_found
 316   | Leaf k -> k
 317   | Branch (_,_,s,t) -> max (max_elt s) (max_elt t)
 318
 319 (*s Another nice property of Patricia trees is to be independent of the
 320     order of insertion. As a consequence, two Patricia trees have the
 321     same elements if and only if they are structurally equal. *)
 322
 323 let equal = (=)
 324
 325 let compare = compare
 326
 327 (*i*)
 328 let make l = List.fold_right add l empty
 329 (*i*)
 330
 331 (*s Additional functions w.r.t to [Set.S]. *)
 332
 333 let rec intersect s1 s2 = match (s1,s2) with
 334   | Empty, _ -> false
 335   | _, Empty -> false
 336   | Leaf k1, _ -> mem k1 s2
 337   | _, Leaf k2 -> mem k2 s1
 338   | Branch (p1,m1,l1,r1), Branch (p2,m2,l2,r2) ->
 339       if m1 == m2 && p1 == p2 then
 340         intersect l1 l2 || intersect r1 r2
 341       else if m1 < m2 && match_prefix p2 p1 m1 then
 342         intersect (if zero_bit p2 m1 then l1 else r1) s2
 343       else if m1 > m2 && match_prefix p1 p2 m2 then
 344         intersect s1 (if zero_bit p1 m2 then l2 else r2)
 345       else
 346         false