2 * Ptset: Sets of integers implemented as Patricia trees.
3 * Copyright (C) 2000 Jean-Christophe FILLIATRE
5 * This software is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License version 2, as published by the Free Software Foundation.
9 * This software is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 * See the GNU Library General Public License version 2 for more details
14 * (enclosed in the file LGPL).
17 (*i ptset.ml 1.8 2001/06/28 07:05:55 filliatr Exp i*)
19 (*s Sets of integers implemented as Patricia trees, following Chris
20 Okasaki and Andrew Gill's paper {\em Fast Mergeable Integer Maps}
21 ({\tt\small http://www.cs.columbia.edu/\~{}cdo/papers.html\#ml98maps}).
22 Patricia trees provide faster operations than standard library's
23 module [Set], and especially very fast [union], [subset], [inter]
24 and [diff] operations. *)
26 (*s The idea behind Patricia trees is to build a {\em trie} on the
27 binary digits of the elements, and to compact the representation
28 by branching only one the relevant bits (i.e. the ones for which
29 there is at least on element in each subtree). We implement here
30 {\em little-endian} Patricia trees: bits are processed from
31 least-significant to most-significant. The trie is implemented by
32 the following type [t]. [Empty] stands for the empty trie, and
33 [Leaf k] for the singleton [k]. (Note that [k] is the actual
34 element.) [Branch (m,p,l,r)] represents a branching, where [p] is
35 the prefix (from the root of the trie) and [m] is the branching
36 bit (a power of 2). [l] and [r] contain the subsets for which the
37 branching bit is respectively 0 and 1. Invariant: the trees [l]
38 and [r] are not empty. *)
47 | Branch
of int * int * t
* t
49 (*s Example: the representation of the set $\{1,4,5\}$ is
50 $$\mathtt{Branch~(0,~1,~Leaf~4,~Branch~(1,~4,~Leaf~1,~Leaf~5))}$$
51 The first branching bit is the bit 0 (and the corresponding prefix
52 is [0b0], not of use here), with $\{4\}$ on the left and $\{1,5\}$ on the
53 right. Then the right subtree branches on bit 2 (and so has a branching
54 value of $2^2 = 4$), with prefix [0b01 = 1]. *)
56 (*s Empty set and singletons. *)
60 let is_empty = function Empty
-> true | _
-> false
62 let singleton k
= Leaf k
64 (*s Testing the occurrence of a value is similar to the search in a
65 binary search tree, where the branching bit is used to select the
66 appropriate subtree. *)
68 let zero_bit k m
= (k
land m
) == 0
70 let rec mem k
= function
73 | Branch
(_
, m
, l
, r
) -> mem k
(if zero_bit k m
then l
else r
)
75 (*s The following operation [join] will be used in both insertion and
76 union. Given two non-empty trees [t0] and [t1] with longest common
77 prefixes [p0] and [p1] respectively, which are supposed to
78 disagree, it creates the union of [t0] and [t1]. For this, it
79 computes the first bit [m] where [p0] and [p1] disagree and create
80 a branching node on that bit. Depending on the value of that bit
81 in [p0], [t0] will be the left subtree and [t1] the right one, or
82 the converse. Computing the first branching bit of [p0] and [p1]
83 uses a nice property of twos-complement representation of integers. *)
85 let lowest_bit x
= x
land (-x
)
87 let branching_bit p0 p1
= lowest_bit (p0
lxor p1
)
89 let mask p m
= p
land (m
-1)
91 let join (p0
,t0
,p1
,t1
) =
92 let m = branching_bit p0 p1
in
94 Branch
(mask p0
m, m, t0
, t1
)
96 Branch
(mask p0
m, m, t1
, t0
)
98 (*s Then the insertion of value [k] in set [t] is easily implemented
99 using [join]. Insertion in a singleton is just the identity or a
100 call to [join], depending on the value of [k]. When inserting in
101 a branching tree, we first check if the value to insert [k]
102 matches the prefix [p]: if not, [join] will take care of creating
103 the above branching; if so, we just insert [k] in the appropriate
104 subtree, depending of the branching bit. *)
106 let match_prefix k p
m = (mask k
m) == p
109 let rec ins = function
112 if j
== k
then t
else join (k
, Leaf k
, j
, t
)
113 | Branch
(p
,m,t0
,t1
) as t
->
114 if match_prefix k p
m then
116 Branch
(p
, m, ins t0
, t1
)
118 Branch
(p
, m, t0
, ins t1
)
120 join (k
, Leaf k
, p
, t
)
124 (*s The code to remove an element is basically similar to the code of
125 insertion. But since we have to maintain the invariant that both
126 subtrees of a [Branch] node are non-empty, we use here the
127 ``smart constructor'' [branch] instead of [Branch]. *)
129 let branch = function
132 | (p
,m,t0
,t1
) -> Branch
(p
,m,t0
,t1
)
135 let rec rmv = function
137 | Leaf j
as t
-> if k
== j
then Empty
else t
138 | Branch
(p
,m,t0
,t1
) as t
->
139 if match_prefix k p
m then
141 branch (p
, m, rmv t0
, t1
)
143 branch (p
, m, t0
, rmv t1
)
149 (*s One nice property of Patricia trees is to support a fast union
150 operation (and also fast subset, difference and intersection
151 operations). When merging two branching trees we examine the
152 following four cases: (1) the trees have exactly the same
153 prefix; (2/3) one prefix contains the other one; and (4) the
154 prefixes disagree. In cases (1), (2) and (3) the recursion is
155 immediate; in case (4) the function [join] creates the appropriate
158 let rec merge = function
161 | Leaf k
, t
-> add k t
162 | t
, Leaf k
-> add k t
163 | (Branch
(p
,m,s0
,s1
) as s
), (Branch
(q
,n
,t0
,t1
) as t
) ->
164 if m == n
&& match_prefix q p
m then
165 (* The trees have the same prefix. Merge the subtrees. *)
166 Branch
(p
, m, merge (s0
,t0
), merge (s1
,t1
))
167 else if m < n
&& match_prefix q p
m then
168 (* [q] contains [p]. Merge [t] with a subtree of [s]. *)
170 Branch
(p
, m, merge (s0
,t
), s1
)
172 Branch
(p
, m, s0
, merge (s1
,t
))
173 else if m > n
&& match_prefix p q n
then
174 (* [p] contains [q]. Merge [s] with a subtree of [t]. *)
176 Branch
(q
, n
, merge (s
,t0
), t1
)
178 Branch
(q
, n
, t0
, merge (s
,t1
))
180 (* The prefixes disagree. *)
183 let union s t
= merge (s
,t
)
185 (*s When checking if [s1] is a subset of [s2] only two of the above
186 four cases are relevant: when the prefixes are the same and when the
187 prefix of [s1] contains the one of [s2], and then the recursion is
188 obvious. In the other two cases, the result is [false]. *)
190 let rec subset s1 s2
= match (s1
,s2
) with
193 | Leaf k1
, _
-> mem k1 s2
194 | Branch _
, Leaf _
-> false
195 | Branch
(p1
,m1
,l1
,r1
), Branch
(p2
,m2
,l2
,r2
) ->
196 if m1
== m2
&& p1
== p2
then
197 subset l1 l2
&& subset r1 r2
198 else if m1
> m2
&& match_prefix p1 p2 m2
then
199 if zero_bit p1 m2
then
200 subset l1 l2
&& subset r1 l2
202 subset l1 r2
&& subset r1 r2
206 (*s To compute the intersection and the difference of two sets, we
207 still examine the same four cases as in [merge]. The recursion is
210 let rec inter s1 s2
= match (s1
,s2
) with
213 | Leaf k1
, _
-> if mem k1 s2
then s1
else Empty
214 | _
, Leaf k2
-> if mem k2 s1
then s2
else Empty
215 | Branch
(p1
,m1
,l1
,r1
), Branch
(p2
,m2
,l2
,r2
) ->
216 if m1
== m2
&& p1
== p2
then
217 merge (inter l1 l2
, inter r1 r2
)
218 else if m1
< m2
&& match_prefix p2 p1 m1
then
219 inter (if zero_bit p2 m1
then l1
else r1
) s2
220 else if m1
> m2
&& match_prefix p1 p2 m2
then
221 inter s1
(if zero_bit p1 m2
then l2
else r2
)
225 let rec diff s1 s2
= match (s1
,s2
) with
228 | Leaf k1
, _
-> if mem k1 s2
then Empty
else s1
229 | _
, Leaf k2
-> remove k2 s1
230 | Branch
(p1
,m1
,l1
,r1
), Branch
(p2
,m2
,l2
,r2
) ->
231 if m1
== m2
&& p1
== p2
then
232 merge (diff l1 l2
, diff r1 r2
)
233 else if m1
< m2
&& match_prefix p2 p1 m1
then
234 if zero_bit p2 m1
then
235 merge (diff l1 s2
, r1
)
237 merge (l1
, diff r1 s2
)
238 else if m1
> m2
&& match_prefix p1 p2 m2
then
239 if zero_bit p1 m2
then diff s1 l2
else diff s1 r2
243 (*s All the following operations ([cardinal], [iter], [fold], [for_all],
244 [exists], [filter], [partition], [choose], [elements]) are
245 implemented as for any other kind of binary trees. *)
247 let rec cardinal = function
250 | Branch
(_
,_
,t0
,t1
) -> cardinal t0
+ cardinal t1
252 let rec iter f
= function
255 | Branch
(_
,_
,t0
,t1
) -> iter f t0
; iter f t1
257 let rec fold f s accu
= match s
with
260 | Branch
(_
,_
,t0
,t1
) -> fold f t0
(fold f t1 accu
)
262 let rec for_all p
= function
265 | Branch
(_
,_
,t0
,t1
) -> for_all p t0
&& for_all p t1
267 let rec exists p
= function
270 | Branch
(_
,_
,t0
,t1
) -> exists p t0
|| exists p t1
273 let rec filt acc
= function
275 | Leaf k
-> if p k
then add k acc
else acc
276 | Branch
(_
,_
,t0
,t1
) -> filt (filt acc t0
) t1
281 let rec part (t
,f
as acc
) = function
283 | Leaf k
-> if p k
then (add k t
, f
) else (t
, add k f
)
284 | Branch
(_
,_
,t0
,t1
) -> part (part acc t0
) t1
286 part (Empty
, Empty
) s
288 let rec choose = function
289 | Empty
-> raise Not_found
291 | Branch
(_
, _
,t0
,_
) -> choose t0
(* we know that [t0] is non-empty *)
294 let rec elements_aux acc
= function
297 | Branch
(_
,_
,l
,r
) -> elements_aux (elements_aux acc l
) r
301 (*s There is no way to give an efficient implementation of [min_elt]
302 and [max_elt], as with binary search trees. The following
303 implementation is a traversal of all elements, barely more
304 efficient than [fold min t (choose t)] (resp. [fold max t (choose
305 t)]). Note that we use the fact that there is no constructor
306 [Empty] under [Branch] and therefore always a minimal
307 (resp. maximal) element there. *)
309 let rec min_elt = function
310 | Empty
-> raise Not_found
312 | Branch
(_
,_
,s
,t
) -> min
(min_elt s
) (min_elt t
)
314 let rec max_elt = function
315 | Empty
-> raise Not_found
317 | Branch
(_
,_
,s
,t
) -> max
(max_elt s
) (max_elt t
)
319 (*s Another nice property of Patricia trees is to be independent of the
320 order of insertion. As a consequence, two Patricia trees have the
321 same elements if and only if they are structurally equal. *)
325 let compare = compare
328 let make l
= List.fold_right
add l
empty
331 (*s Additional functions w.r.t to [Set.S]. *)
333 let rec intersect s1 s2
= match (s1
,s2
) with
336 | Leaf k1
, _
-> mem k1 s2
337 | _
, Leaf k2
-> mem k2 s1
338 | Branch
(p1
,m1
,l1
,r1
), Branch
(p2
,m2
,l2
,r2
) ->
339 if m1
== m2
&& p1
== p2
then
340 intersect l1 l2
|| intersect r1 r2
341 else if m1
< m2
&& match_prefix p2 p1 m1
then
342 intersect (if zero_bit p2 m1
then l1
else r1
) s2
343 else if m1
> m2
&& match_prefix p1 p2 m2
then
344 intersect s1
(if zero_bit p1 m2
then l2
else r2
)