1 (* made by Sebastien Ferre *)
3 (* type of nodes in suffix trees *)
5 seqid
: int; (* sequence index in which the positions start and final are defined *)
6 mutable start
: int; (* start and final position of the word labelling the node *)
8 mutable link
: node
; (* suffix link *)
12 | Children
of (char
,node
) Hashtbl.t
(* for non-leaves: children nodes *)
13 (* for the key '\000', all values are relevant (use Hashtbl.find_all) *)
14 | Index
of int (* for leaves: position of recognized suffix *)
16 (* type of suffix trees *)
17 type t
= string array
* node
19 (* the initial root node *)
20 let empty : unit -> node
=
22 let rec root = {seqid
= -1; start
=0; final
=ref (-1); link
=root; v
=Children
(Hashtbl.create
2)} in
26 (* --------------------------------------------------------------------------------
27 Operations on substrings of sequences
28 -------------------------------------------------------------------------------- *)
30 type subseq
= string * int * int (* (seq, pos, len) *)
32 let subseq_empty = ("",0,0) (* non-significant subseq *)
34 let subseq_is_empty (s
,pos
,len
) = len
= 0
36 let subseq_get (s
,pos
,len
) i
= s
.[pos
+i
]
38 let subseq_length (s
,pos
,len
) = len
40 let subseq_sub (s
,pos
,len
) pos' len'
= (s
,pos
+pos'
,len'
)
42 let subseq_extend (s
,pos
,len
) = (s
,pos
,len
+1)
44 (* -------------------------------------------------------------------------------
45 Operations on implicit nodes (explicit, implicit, child : node * subseq * node)
46 the snd node [child] is significant only when [implicit] is not the empty string,
47 and is the child that recognizes [implicit] starting from [explicit]. [implicit] is
48 defined by a sequence, a start and a length.
49 ------------------------------------------------------------------------------- *)
52 c1
<>'
\000'
& c1
=c2
(* ensures that 2 terminal symbols '\000' are pairwise different (for GST only, not necessary for ST) *)
54 (* returns the child node that recognizes [implicit] from the node [explicit] *)
55 let get_child seqar
(explicit
,implicit
) =
56 if subseq_is_empty implicit
59 let c = subseq_get implicit
0 in
64 | Children h
-> Hashtbl.find h
c
65 | Index _
-> raise Not_found
66 (* List.find (fun child -> eq_char seqar.(child.seqid).[child.start] c) explicit.children *)
68 (* ensures that implicit does not span over another node below [explicit] *)
69 let rec canonical seqar
(explicit
,implicit
,child
) =
70 if subseq_is_empty implicit
71 then (explicit
,implicit
,child
)
73 let l = !(child
.final
) - child
.start
+ 1 in
74 let a = subseq_length implicit
in
76 then (explicit
,implicit
,child
)
78 let implicit'
= subseq_sub implicit l (a-l) in
79 canonical seqar
(child
, implicit'
, get_child seqar
(child
,implicit'
))
81 (* test whether an implicit node is the root node *)
82 let is_root root (explicit
,implicit,_
) =
83 explicit
== root & subseq_is_empty implicit
85 (* test whether the extension of an implicit node by [seqar.(k).[i]] is still recognized in the GST,
86 and if yes, returns the implicit node extended by 1 position, otherwise returns [None]. *)
87 let has_child seqar
(explicit
,implicit,child
) (k
,i
) =
88 let a = subseq_length implicit in
90 if eq_char seqar
.(child
.seqid
).[child
.start
+a] seqar
.(k
).[i
]
91 then Some
(explicit
, subseq_extend implicit, child
)
95 let implicit'
= (seqar
.(k
),i
,1) in
96 Some
(explicit
, implicit'
, get_child seqar
(explicit
,implicit'
))
97 with Not_found
-> None
99 (* --------------------------------
100 creation of new nodes and leaves
101 -------------------------------- *)
103 let add_leaf (seqar
,root) node seqid start final_ref index
=
107 seqar
.(seqid
).[start
]
108 {seqid
=seqid
; start
=start
; final
=final_ref
; link
=root; v
=(Index index
)}
109 | Index _
-> raise
(Invalid_argument
"Suffix_tree.add_leaf: 2nd argument must not be a leaf")
111 (* make explicit an implicit node by inserting a new node between [explicit] and [child] *)
112 let insert_node (seqar
,root) (explicit
,implicit,child
) =
113 let a = subseq_length implicit in
117 match explicit
.v
with
119 let c_child_old = seqar
.(child
.seqid
).[child
.start
] in
120 let c_child_new = seqar
.(child
.seqid
).[child
.start
+a] in
124 final
= ref (child
.start
+a-1);
126 v
= Children
(let h'
= Hashtbl.create
(Hashtbl.length
h) in Hashtbl.add
h'
c_child_new child
; h'
)
128 child
.start
<- child
.start
+a;
129 Hashtbl.replace
h c_child_old n'
;
131 | Index _
-> raise
(Invalid_argument
"Suffix_tree.insert_node: first part of 2nd argument must not be a leaf")
133 (* add a suffix link from [pred_opt] (if defined) to [explicit] *)
134 let add_link root pred_opt explicit
=
135 (*if explicit != root then*) (* create a new suffix link *)
137 | Some
n -> (*if n.link = None then*) n.link
<- explicit
144 (* get the node refered by the suffix link at [n] *)
146 let suffix_link (root : node) (n : node) : node =
148 | None -> root (* by default, the suffix link points to the root node *)
152 (* extend suffix_link for implicit nodes *)
153 let link (seqar
,root) = function (* TODO *)
154 | (explicit
,implicit,_
) when subseq_is_empty implicit ->
155 let explicit'
= explicit.link (*suffix_link root explicit*) in
156 (explicit'
, subseq_empty, explicit'
)
157 | (explicit,implicit,_
) ->
160 let implicit'
= subseq_sub implicit 1 (subseq_length implicit - 1) in
161 canonical seqar
(root, implicit'
, get_child seqar
(root,implicit'
))
163 let explicit'
= explicit.link (*suffix_link root explicit*) in
164 canonical seqar
(explicit'
, implicit, get_child seqar
(explicit'
,implicit))
166 (* --------------------------------------------------------------
167 GST update for the new character c at position i in sequence k
168 -------------------------------------------------------------- *)
170 (* state for 'update' *)
173 mutable startj
: int;
174 mutable startnode
: node
* subseq
* node
177 let rec update (seqar
,root) (k
,i
) res pred_opt
=
178 (* c = seqar.(k).[i] *)
179 match has_child seqar res
.startnode
(k
,i
) with
180 | Some extended_startnode
-> (* startnode can be extended by [c] *)
181 let explicit, implicit, _
= res
.startnode
in
182 assert (pred_opt
= None
or subseq_is_empty implicit);
183 (* if a link has been followed after node creation, then we are on an explicit node *)
184 add_link root pred_opt
explicit;
185 res
.startnode
<- canonical seqar extended_startnode
186 | None
-> (* startnode cannot be extended by [c] ... *)
187 let n'
= insert_node (seqar
,root) res
.startnode
in (* ... so we insert a new node ... *)
188 add_link root pred_opt
n'
; (* ... a suffix link from the last created node (if defined) ... *)
189 if seqar
.(k
).[res
.startj
] <> '
\000'
then
190 add_leaf (seqar
,root) n' k i res
.terminal res
.startj
; (* ... and a new leaf for the suffix at position [res.startj] *)
191 res
.startj
<- res
.startj
+ 1; (* prepare for the next suffix *)
192 if not
(is_root root res
.startnode
)
193 then begin (* while [res.startnode] is not the root, and cannot be extended by [c] ... *)
194 res
.startnode
<- link (seqar
,root) res
.startnode
; (* ... follow the suffix link to find the next suffix ... *)
195 update (seqar
,root) (k
,i
) res
(Some
n'
) end (* ... and loop on [update] *)
197 (* -------------------------------
198 implementing the .mli interface
199 ------------------------------- *)
201 let make : string list
-> t
=
203 let l = List.length l_seq
in
204 let seqar = Array.make l "" in
205 let root = empty () in
206 let st = (seqar, root) in
207 ignore
(List.fold_left
208 (fun k seq
-> (* for every sequence/string [seq], numbered [k] ... *)
209 seqar.(k
) <- seq ^
String.make 1 '
\000'
; (* add a terminal symbol ... *)
210 let res = {terminal
=ref (-1); startj
=0; startnode
=(root,subseq_empty,root)} in (* initialize for [update] ... *)
211 for i
= 0 to String.length
seqar.(k
) - 1 do (* for every position [i] in the sequence ... *)
212 incr
res.terminal
; (* increment the leaves final position ... *)
213 update st (k
,i
) res None
(* call [update] for updating the suffix tree with the character at position [i] *)
219 let string (seqar,root : t
) (k
: int) =
220 let seq = seqar.(k
) in
221 String.sub
seq 0 (String.length
seq - 1) (* removing the terminal symbol *)
223 let string_list (seqar,root : t
) =
224 List.map
(fun seq -> String.sub
seq 0 (String.length
seq - 1)) (Array.to_list
seqar)
226 let root (seq,root : t
) = root
228 let word (seqar,root) node
=
231 else String.sub
seqar.(node
.seqid
) node
.start
(!(node
.final
) - node
.start
+ (match node
.v
with Children _
-> 1 | Index _
-> 0))
233 let children (gst
: t
) node
=
236 Hashtbl.fold
(fun c n l -> n::l) h []
239 let index (seq,root) node
: int * int =
241 | Children _
-> raise
(Invalid_argument
"Suffix_tree.index: 2nd argument must be a leaf")
242 | Index i
-> (node
.seqid
, i
)
244 let linked_node (seqar,root : t
) (n : node
) : node
=
245 n.link (*suffix_link root n*)
247 let rec implicit_node (seqar,node
: t
) (word : string) =
248 let (explicit, (s
,i
,len
), child
) = implicit_node_aux
(seqar,node
) (word,0,String.length
word) in
249 (explicit, String.sub s i len
, child
)
250 and implicit_node_aux
(seqar,node
) implicit =
251 let w = subseq_length implicit in
252 let child = get_child seqar (node
,implicit) in
253 let l = !(child.final
) - child.start
+ 1 in
255 while !a < l & !a < w & eq_char seqar.(child.seqid
).[child.start
+ !a] (subseq_get implicit !a) do
257 done; (* [!a] is the first mismatch position, or the length of [child] label *)
261 else implicit_node_aux
(seqar,child) (subseq_sub implicit !a (w - !a))
262 else (node
,implicit,child)
265 let rec synthesized (seqar,root : t) (f : 'a list -> node -> 'a) =
266 synthesized_node (seqar,root) f root
267 and synthesized_node st f node =
268 f (List.map (synthesized_node st f) (children st node)) node
272 let rec fold : t
-> ('
h -> node
-> bool) -> ('
h -> node
-> '
h) -> ('s list
-> '
h -> node
-> 's
) -> '
h -> 's
=
273 fun gst f
h s init
->
274 fold_node gst f
h s init
(root gst
)
275 and fold_node gst f
h s h_node node
=
278 (fun child -> fold_node gst f
h s
(h h_node
child) child)
279 (List.filter
(f h_node
) (children gst node
)))
283 (* synthesized attributes only *)
284 let fold_s_node gst s node
= fold_node gst
(fun _ _
-> true) (fun _ _
-> ()) (fun l _
n -> s
l n) () node
285 let fold_s gst s
= fold_s_node gst s
(root gst
)
287 (* filtering and synthesizing, no inheritance *)
288 let fold_fs gst f s
= fold gst
(fun _
n -> f
n) (fun _ _
-> ()) (fun l _
n -> s
l n) ()
291 type tree
= Node
of string * tree list
| Leaf
of string * (int * int)
296 let w = word gst
n in
298 then Leaf
(w, index gst
n)
301 (* applications of suffix trees *)
303 let exact_matches : t
-> string -> (int * int) list
=
306 let explicit, implicit, child = implicit_node gst
word in
308 (fun l n -> if l=[] then [index gst
n] else List.concat
l)
314 let contained_string gst
word =
315 List.map
(fun (i
,j
) -> Array.get
(fst gst
) i
) (exact_matches gst
word)