Release coccinelle-0.2.1-rc1
[bpt/coccinelle.git] / commons / ocamlextra / suffix_tree.ml
CommitLineData
34e49164
C
1(* made by Sebastien Ferre *)
2
3(* type of nodes in suffix trees *)
4type node = {
5 seqid : int; (* sequence index in which the positions start and final are defined *)
6 mutable start : int; (* start and final position of the word labelling the node *)
7 final : int ref;
8 mutable link : node; (* suffix link *)
9 v : node_value
10 }
11and node_value =
12 | Children of (char,node) Hashtbl.t (* for non-leaves: children nodes *)
13 (* for the key '\000', all values are relevant (use Hashtbl.find_all) *)
14 | Index of int (* for leaves: position of recognized suffix *)
15
16(* type of suffix trees *)
17type t = string array * node
18
19(* the initial root node *)
20let empty : unit -> node =
21 fun () ->
22 let rec root = {seqid= -1; start=0; final=ref (-1); link=root; v=Children (Hashtbl.create 2)} in
23 root
24
25
26(* --------------------------------------------------------------------------------
27 Operations on substrings of sequences
28 -------------------------------------------------------------------------------- *)
29
30type subseq = string * int * int (* (seq, pos, len) *)
31
32let subseq_empty = ("",0,0) (* non-significant subseq *)
33
34let subseq_is_empty (s,pos,len) = len = 0
35
36let subseq_get (s,pos,len) i = s.[pos+i]
37
38let subseq_length (s,pos,len) = len
39
40let subseq_sub (s,pos,len) pos' len' = (s,pos+pos',len')
41
42let subseq_extend (s,pos,len) = (s,pos,len+1)
43
44(* -------------------------------------------------------------------------------
45 Operations on implicit nodes (explicit, implicit, child : node * subseq * node)
46 the snd node [child] is significant only when [implicit] is not the empty string,
47 and is the child that recognizes [implicit] starting from [explicit]. [implicit] is
48 defined by a sequence, a start and a length.
49 ------------------------------------------------------------------------------- *)
50
51let eq_char c1 c2 =
52 c1<>'\000' & c1=c2 (* ensures that 2 terminal symbols '\000' are pairwise different (for GST only, not necessary for ST) *)
53
54(* returns the child node that recognizes [implicit] from the node [explicit] *)
55let get_child seqar (explicit,implicit) =
56 if subseq_is_empty implicit
57 then explicit
58 else
59 let c = subseq_get implicit 0 in
60 if c = '\000'
61 then raise Not_found
62 else
63 match explicit.v with
64 | Children h -> Hashtbl.find h c
65 | Index _ -> raise Not_found
66 (* List.find (fun child -> eq_char seqar.(child.seqid).[child.start] c) explicit.children *)
67
68(* ensures that implicit does not span over another node below [explicit] *)
69let rec canonical seqar (explicit,implicit,child) =
70 if subseq_is_empty implicit
71 then (explicit,implicit,child)
72 else
73 let l = !(child.final) - child.start + 1 in
74 let a = subseq_length implicit in
75 if a < l
76 then (explicit,implicit,child)
77 else
78 let implicit' = subseq_sub implicit l (a-l) in
79 canonical seqar (child, implicit', get_child seqar (child,implicit'))
80
81(* test whether an implicit node is the root node *)
82let is_root root (explicit,implicit,_) =
83 explicit == root & subseq_is_empty implicit
84
85(* test whether the extension of an implicit node by [seqar.(k).[i]] is still recognized in the GST,
86 and if yes, returns the implicit node extended by 1 position, otherwise returns [None]. *)
87let has_child seqar (explicit,implicit,child) (k,i) =
88 let a = subseq_length implicit in
89 if a <> 0 then
90 if eq_char seqar.(child.seqid).[child.start+a] seqar.(k).[i]
91 then Some (explicit, subseq_extend implicit, child)
92 else None
93 else
94 try
95 let implicit' = (seqar.(k),i,1) in
96 Some (explicit, implicit', get_child seqar (explicit,implicit'))
97 with Not_found -> None
98
99(* --------------------------------
100 creation of new nodes and leaves
101 -------------------------------- *)
102
103let add_leaf (seqar,root) node seqid start final_ref index =
104 match node.v with
105 | Children h ->
106 Hashtbl.add h
107 seqar.(seqid).[start]
108 {seqid=seqid; start=start; final=final_ref; link=root; v=(Index index)}
109 | Index _ -> raise (Invalid_argument "Suffix_tree.add_leaf: 2nd argument must not be a leaf")
110
111(* make explicit an implicit node by inserting a new node between [explicit] and [child] *)
112let insert_node (seqar,root) (explicit,implicit,child) =
113 let a = subseq_length implicit in
114 if a = 0
115 then explicit
116 else
117 match explicit.v with
118 | Children h ->
119 let c_child_old = seqar.(child.seqid).[child.start] in
120 let c_child_new = seqar.(child.seqid).[child.start+a] in
121 let n' = {
122 seqid = child.seqid;
123 start = child.start;
124 final = ref (child.start+a-1);
125 link = root;
126 v = Children (let h' = Hashtbl.create (Hashtbl.length h) in Hashtbl.add h' c_child_new child; h')
127 } in
128 child.start <- child.start+a;
129 Hashtbl.replace h c_child_old n';
130 n'
131 | Index _ -> raise (Invalid_argument "Suffix_tree.insert_node: first part of 2nd argument must not be a leaf")
132
133(* add a suffix link from [pred_opt] (if defined) to [explicit] *)
134let add_link root pred_opt explicit =
135 (*if explicit != root then*) (* create a new suffix link *)
136 match pred_opt with
137 | Some n -> (*if n.link = None then*) n.link <- explicit
138 | None -> ()
139
ae4735db 140(* ------------
34e49164
C
141 suffix links
142 ------------ *)
143
144(* get the node refered by the suffix link at [n] *)
145(*
146let suffix_link (root : node) (n : node) : node =
147 match n.link with
148 | None -> root (* by default, the suffix link points to the root node *)
149 | Some n' -> n'
150*)
151
152(* extend suffix_link for implicit nodes *)
153let link (seqar,root) = function (* TODO *)
154 | (explicit,implicit,_) when subseq_is_empty implicit ->
155 let explicit' = explicit.link (*suffix_link root explicit*) in
156 (explicit', subseq_empty, explicit')
157 | (explicit,implicit,_) ->
158 if explicit == root
159 then
160 let implicit' = subseq_sub implicit 1 (subseq_length implicit - 1) in
161 canonical seqar (root, implicit', get_child seqar (root,implicit'))
162 else
163 let explicit' = explicit.link (*suffix_link root explicit*) in
164 canonical seqar (explicit', implicit, get_child seqar (explicit',implicit))
165
166(* --------------------------------------------------------------
167 GST update for the new character c at position i in sequence k
168 -------------------------------------------------------------- *)
169
170(* state for 'update' *)
171type res = {
172 terminal : int ref;
173 mutable startj : int;
174 mutable startnode : node * subseq * node
175 }
176
177let rec update (seqar,root) (k,i) res pred_opt =
178 (* c = seqar.(k).[i] *)
179 match has_child seqar res.startnode (k,i) with
180 | Some extended_startnode -> (* startnode can be extended by [c] *)
181 let explicit, implicit, _ = res.startnode in
182 assert (pred_opt = None or subseq_is_empty implicit);
183 (* if a link has been followed after node creation, then we are on an explicit node *)
184 add_link root pred_opt explicit;
185 res.startnode <- canonical seqar extended_startnode
186 | None -> (* startnode cannot be extended by [c] ... *)
187 let n' = insert_node (seqar,root) res.startnode in (* ... so we insert a new node ... *)
188 add_link root pred_opt n'; (* ... a suffix link from the last created node (if defined) ... *)
189 if seqar.(k).[res.startj] <> '\000' then
190 add_leaf (seqar,root) n' k i res.terminal res.startj; (* ... and a new leaf for the suffix at position [res.startj] *)
191 res.startj <- res.startj + 1; (* prepare for the next suffix *)
192 if not (is_root root res.startnode)
193 then begin (* while [res.startnode] is not the root, and cannot be extended by [c] ... *)
194 res.startnode <- link (seqar,root) res.startnode; (* ... follow the suffix link to find the next suffix ... *)
195 update (seqar,root) (k,i) res (Some n') end (* ... and loop on [update] *)
196
197(* -------------------------------
198 implementing the .mli interface
199 ------------------------------- *)
200
201let make : string list -> t =
202 fun l_seq ->
203 let l = List.length l_seq in
204 let seqar = Array.make l "" in
205 let root = empty () in
206 let st = (seqar, root) in
207 ignore (List.fold_left
208 (fun k seq -> (* for every sequence/string [seq], numbered [k] ... *)
209 seqar.(k) <- seq ^ String.make 1 '\000'; (* add a terminal symbol ... *)
210 let res = {terminal=ref (-1); startj=0; startnode=(root,subseq_empty,root)} in (* initialize for [update] ... *)
211 for i = 0 to String.length seqar.(k) - 1 do (* for every position [i] in the sequence ... *)
212 incr res.terminal; (* increment the leaves final position ... *)
213 update st (k,i) res None (* call [update] for updating the suffix tree with the character at position [i] *)
214 done;
215 k+1)
216 0 l_seq);
217 st
218
219let string (seqar,root : t) (k : int) =
220 let seq = seqar.(k) in
221 String.sub seq 0 (String.length seq - 1) (* removing the terminal symbol *)
222
223let string_list (seqar,root : t) =
224 List.map (fun seq -> String.sub seq 0 (String.length seq - 1)) (Array.to_list seqar)
225
226let root (seq,root : t) = root
227
228let word (seqar,root) node =
229 if node == root
230 then ""
231 else String.sub seqar.(node.seqid) node.start (!(node.final) - node.start + (match node.v with Children _ -> 1 | Index _ -> 0))
232
233let children (gst : t) node =
234 match node.v with
235 | Children h ->
236 Hashtbl.fold (fun c n l -> n::l) h []
237 | Index _ -> []
238
239let index (seq,root) node : int * int =
240 match node.v with
241 | Children _ -> raise (Invalid_argument "Suffix_tree.index: 2nd argument must be a leaf")
242 | Index i -> (node.seqid, i)
243
244let linked_node (seqar,root : t) (n : node) : node =
245 n.link (*suffix_link root n*)
246
247let rec implicit_node (seqar,node : t) (word : string) =
248 let (explicit, (s,i,len), child) = implicit_node_aux (seqar,node) (word,0,String.length word) in
249 (explicit, String.sub s i len, child)
250and implicit_node_aux (seqar,node) implicit =
251 let w = subseq_length implicit in
252 let child = get_child seqar (node,implicit) in
253 let l = !(child.final) - child.start + 1 in
254 let a = ref 1 in
255 while !a < l & !a < w & eq_char seqar.(child.seqid).[child.start + !a] (subseq_get implicit !a) do
256 incr a
257 done; (* [!a] is the first mismatch position, or the length of [child] label *)
258 if !a < w then
259 if !a < l
260 then raise Not_found
261 else implicit_node_aux (seqar,child) (subseq_sub implicit !a (w - !a))
ae4735db 262 else (node,implicit,child)
34e49164
C
263
264(*
265let rec synthesized (seqar,root : t) (f : 'a list -> node -> 'a) =
266 synthesized_node (seqar,root) f root
267and synthesized_node st f node =
268 f (List.map (synthesized_node st f) (children st node)) node
269*)
270
271(* general fold *)
272let rec fold : t -> ('h -> node -> bool) -> ('h -> node -> 'h) -> ('s list -> 'h -> node -> 's) -> 'h -> 's =
273 fun gst f h s init ->
274 fold_node gst f h s init (root gst)
275and fold_node gst f h s h_node node =
276 s
277 (List.map
278 (fun child -> fold_node gst f h s (h h_node child) child)
279 (List.filter (f h_node) (children gst node)))
280 h_node
281 node
282
283(* synthesized attributes only *)
284let fold_s_node gst s node = fold_node gst (fun _ _ -> true) (fun _ _ -> ()) (fun l _ n -> s l n) () node
285let fold_s gst s = fold_s_node gst s (root gst)
286
287(* filtering and synthesizing, no inheritance *)
288let fold_fs gst f s = fold gst (fun _ n -> f n) (fun _ _ -> ()) (fun l _ n -> s l n) ()
289
290
291type tree = Node of string * tree list | Leaf of string * (int * int)
292
293let readable gst =
294 fold_s gst
295 (fun l n ->
296 let w = word gst n in
297 if l=[]
298 then Leaf (w, index gst n)
299 else Node (w, l))
300
301(* applications of suffix trees *)
302
303let exact_matches : t -> string -> (int * int) list =
304 fun gst word ->
305 try
306 let explicit, implicit, child = implicit_node gst word in
307 fold_s_node gst
308 (fun l n -> if l=[] then [index gst n] else List.concat l)
309 child
310 with Not_found -> []
311
312
313
ae4735db 314let contained_string gst word =
34e49164
C
315 List.map (fun (i,j) -> Array.get (fst gst) i) (exact_matches gst word)
316
317
318