Commit | Line | Data |
---|---|---|
34e49164 C |
1 | (* made by Sebastien Ferre *) |
2 | ||
3 | (* type of nodes in suffix trees *) | |
4 | type node = { | |
5 | seqid : int; (* sequence index in which the positions start and final are defined *) | |
6 | mutable start : int; (* start and final position of the word labelling the node *) | |
7 | final : int ref; | |
8 | mutable link : node; (* suffix link *) | |
9 | v : node_value | |
10 | } | |
11 | and node_value = | |
12 | | Children of (char,node) Hashtbl.t (* for non-leaves: children nodes *) | |
13 | (* for the key '\000', all values are relevant (use Hashtbl.find_all) *) | |
14 | | Index of int (* for leaves: position of recognized suffix *) | |
15 | ||
16 | (* type of suffix trees *) | |
17 | type t = string array * node | |
18 | ||
19 | (* the initial root node *) | |
20 | let empty : unit -> node = | |
21 | fun () -> | |
22 | let rec root = {seqid= -1; start=0; final=ref (-1); link=root; v=Children (Hashtbl.create 2)} in | |
23 | root | |
24 | ||
25 | ||
26 | (* -------------------------------------------------------------------------------- | |
27 | Operations on substrings of sequences | |
28 | -------------------------------------------------------------------------------- *) | |
29 | ||
30 | type subseq = string * int * int (* (seq, pos, len) *) | |
31 | ||
32 | let subseq_empty = ("",0,0) (* non-significant subseq *) | |
33 | ||
34 | let subseq_is_empty (s,pos,len) = len = 0 | |
35 | ||
36 | let subseq_get (s,pos,len) i = s.[pos+i] | |
37 | ||
38 | let subseq_length (s,pos,len) = len | |
39 | ||
40 | let subseq_sub (s,pos,len) pos' len' = (s,pos+pos',len') | |
41 | ||
42 | let subseq_extend (s,pos,len) = (s,pos,len+1) | |
43 | ||
44 | (* ------------------------------------------------------------------------------- | |
45 | Operations on implicit nodes (explicit, implicit, child : node * subseq * node) | |
46 | the snd node [child] is significant only when [implicit] is not the empty string, | |
47 | and is the child that recognizes [implicit] starting from [explicit]. [implicit] is | |
48 | defined by a sequence, a start and a length. | |
49 | ------------------------------------------------------------------------------- *) | |
50 | ||
51 | let eq_char c1 c2 = | |
52 | c1<>'\000' & c1=c2 (* ensures that 2 terminal symbols '\000' are pairwise different (for GST only, not necessary for ST) *) | |
53 | ||
54 | (* returns the child node that recognizes [implicit] from the node [explicit] *) | |
55 | let get_child seqar (explicit,implicit) = | |
56 | if subseq_is_empty implicit | |
57 | then explicit | |
58 | else | |
59 | let c = subseq_get implicit 0 in | |
60 | if c = '\000' | |
61 | then raise Not_found | |
62 | else | |
63 | match explicit.v with | |
64 | | Children h -> Hashtbl.find h c | |
65 | | Index _ -> raise Not_found | |
66 | (* List.find (fun child -> eq_char seqar.(child.seqid).[child.start] c) explicit.children *) | |
67 | ||
68 | (* ensures that implicit does not span over another node below [explicit] *) | |
69 | let rec canonical seqar (explicit,implicit,child) = | |
70 | if subseq_is_empty implicit | |
71 | then (explicit,implicit,child) | |
72 | else | |
73 | let l = !(child.final) - child.start + 1 in | |
74 | let a = subseq_length implicit in | |
75 | if a < l | |
76 | then (explicit,implicit,child) | |
77 | else | |
78 | let implicit' = subseq_sub implicit l (a-l) in | |
79 | canonical seqar (child, implicit', get_child seqar (child,implicit')) | |
80 | ||
81 | (* test whether an implicit node is the root node *) | |
82 | let is_root root (explicit,implicit,_) = | |
83 | explicit == root & subseq_is_empty implicit | |
84 | ||
85 | (* test whether the extension of an implicit node by [seqar.(k).[i]] is still recognized in the GST, | |
86 | and if yes, returns the implicit node extended by 1 position, otherwise returns [None]. *) | |
87 | let has_child seqar (explicit,implicit,child) (k,i) = | |
88 | let a = subseq_length implicit in | |
89 | if a <> 0 then | |
90 | if eq_char seqar.(child.seqid).[child.start+a] seqar.(k).[i] | |
91 | then Some (explicit, subseq_extend implicit, child) | |
92 | else None | |
93 | else | |
94 | try | |
95 | let implicit' = (seqar.(k),i,1) in | |
96 | Some (explicit, implicit', get_child seqar (explicit,implicit')) | |
97 | with Not_found -> None | |
98 | ||
99 | (* -------------------------------- | |
100 | creation of new nodes and leaves | |
101 | -------------------------------- *) | |
102 | ||
103 | let add_leaf (seqar,root) node seqid start final_ref index = | |
104 | match node.v with | |
105 | | Children h -> | |
106 | Hashtbl.add h | |
107 | seqar.(seqid).[start] | |
108 | {seqid=seqid; start=start; final=final_ref; link=root; v=(Index index)} | |
109 | | Index _ -> raise (Invalid_argument "Suffix_tree.add_leaf: 2nd argument must not be a leaf") | |
110 | ||
111 | (* make explicit an implicit node by inserting a new node between [explicit] and [child] *) | |
112 | let insert_node (seqar,root) (explicit,implicit,child) = | |
113 | let a = subseq_length implicit in | |
114 | if a = 0 | |
115 | then explicit | |
116 | else | |
117 | match explicit.v with | |
118 | | Children h -> | |
119 | let c_child_old = seqar.(child.seqid).[child.start] in | |
120 | let c_child_new = seqar.(child.seqid).[child.start+a] in | |
121 | let n' = { | |
122 | seqid = child.seqid; | |
123 | start = child.start; | |
124 | final = ref (child.start+a-1); | |
125 | link = root; | |
126 | v = Children (let h' = Hashtbl.create (Hashtbl.length h) in Hashtbl.add h' c_child_new child; h') | |
127 | } in | |
128 | child.start <- child.start+a; | |
129 | Hashtbl.replace h c_child_old n'; | |
130 | n' | |
131 | | Index _ -> raise (Invalid_argument "Suffix_tree.insert_node: first part of 2nd argument must not be a leaf") | |
132 | ||
133 | (* add a suffix link from [pred_opt] (if defined) to [explicit] *) | |
134 | let add_link root pred_opt explicit = | |
135 | (*if explicit != root then*) (* create a new suffix link *) | |
136 | match pred_opt with | |
137 | | Some n -> (*if n.link = None then*) n.link <- explicit | |
138 | | None -> () | |
139 | ||
ae4735db | 140 | (* ------------ |
34e49164 C |
141 | suffix links |
142 | ------------ *) | |
143 | ||
144 | (* get the node refered by the suffix link at [n] *) | |
145 | (* | |
146 | let suffix_link (root : node) (n : node) : node = | |
147 | match n.link with | |
148 | | None -> root (* by default, the suffix link points to the root node *) | |
149 | | Some n' -> n' | |
150 | *) | |
151 | ||
152 | (* extend suffix_link for implicit nodes *) | |
153 | let link (seqar,root) = function (* TODO *) | |
154 | | (explicit,implicit,_) when subseq_is_empty implicit -> | |
155 | let explicit' = explicit.link (*suffix_link root explicit*) in | |
156 | (explicit', subseq_empty, explicit') | |
157 | | (explicit,implicit,_) -> | |
158 | if explicit == root | |
159 | then | |
160 | let implicit' = subseq_sub implicit 1 (subseq_length implicit - 1) in | |
161 | canonical seqar (root, implicit', get_child seqar (root,implicit')) | |
162 | else | |
163 | let explicit' = explicit.link (*suffix_link root explicit*) in | |
164 | canonical seqar (explicit', implicit, get_child seqar (explicit',implicit)) | |
165 | ||
166 | (* -------------------------------------------------------------- | |
167 | GST update for the new character c at position i in sequence k | |
168 | -------------------------------------------------------------- *) | |
169 | ||
170 | (* state for 'update' *) | |
171 | type res = { | |
172 | terminal : int ref; | |
173 | mutable startj : int; | |
174 | mutable startnode : node * subseq * node | |
175 | } | |
176 | ||
177 | let rec update (seqar,root) (k,i) res pred_opt = | |
178 | (* c = seqar.(k).[i] *) | |
179 | match has_child seqar res.startnode (k,i) with | |
180 | | Some extended_startnode -> (* startnode can be extended by [c] *) | |
181 | let explicit, implicit, _ = res.startnode in | |
182 | assert (pred_opt = None or subseq_is_empty implicit); | |
183 | (* if a link has been followed after node creation, then we are on an explicit node *) | |
184 | add_link root pred_opt explicit; | |
185 | res.startnode <- canonical seqar extended_startnode | |
186 | | None -> (* startnode cannot be extended by [c] ... *) | |
187 | let n' = insert_node (seqar,root) res.startnode in (* ... so we insert a new node ... *) | |
188 | add_link root pred_opt n'; (* ... a suffix link from the last created node (if defined) ... *) | |
189 | if seqar.(k).[res.startj] <> '\000' then | |
190 | add_leaf (seqar,root) n' k i res.terminal res.startj; (* ... and a new leaf for the suffix at position [res.startj] *) | |
191 | res.startj <- res.startj + 1; (* prepare for the next suffix *) | |
192 | if not (is_root root res.startnode) | |
193 | then begin (* while [res.startnode] is not the root, and cannot be extended by [c] ... *) | |
194 | res.startnode <- link (seqar,root) res.startnode; (* ... follow the suffix link to find the next suffix ... *) | |
195 | update (seqar,root) (k,i) res (Some n') end (* ... and loop on [update] *) | |
196 | ||
197 | (* ------------------------------- | |
198 | implementing the .mli interface | |
199 | ------------------------------- *) | |
200 | ||
201 | let make : string list -> t = | |
202 | fun l_seq -> | |
203 | let l = List.length l_seq in | |
204 | let seqar = Array.make l "" in | |
205 | let root = empty () in | |
206 | let st = (seqar, root) in | |
207 | ignore (List.fold_left | |
208 | (fun k seq -> (* for every sequence/string [seq], numbered [k] ... *) | |
209 | seqar.(k) <- seq ^ String.make 1 '\000'; (* add a terminal symbol ... *) | |
210 | let res = {terminal=ref (-1); startj=0; startnode=(root,subseq_empty,root)} in (* initialize for [update] ... *) | |
211 | for i = 0 to String.length seqar.(k) - 1 do (* for every position [i] in the sequence ... *) | |
212 | incr res.terminal; (* increment the leaves final position ... *) | |
213 | update st (k,i) res None (* call [update] for updating the suffix tree with the character at position [i] *) | |
214 | done; | |
215 | k+1) | |
216 | 0 l_seq); | |
217 | st | |
218 | ||
219 | let string (seqar,root : t) (k : int) = | |
220 | let seq = seqar.(k) in | |
221 | String.sub seq 0 (String.length seq - 1) (* removing the terminal symbol *) | |
222 | ||
223 | let string_list (seqar,root : t) = | |
224 | List.map (fun seq -> String.sub seq 0 (String.length seq - 1)) (Array.to_list seqar) | |
225 | ||
226 | let root (seq,root : t) = root | |
227 | ||
228 | let word (seqar,root) node = | |
229 | if node == root | |
230 | then "" | |
231 | else String.sub seqar.(node.seqid) node.start (!(node.final) - node.start + (match node.v with Children _ -> 1 | Index _ -> 0)) | |
232 | ||
233 | let children (gst : t) node = | |
234 | match node.v with | |
235 | | Children h -> | |
236 | Hashtbl.fold (fun c n l -> n::l) h [] | |
237 | | Index _ -> [] | |
238 | ||
239 | let index (seq,root) node : int * int = | |
240 | match node.v with | |
241 | | Children _ -> raise (Invalid_argument "Suffix_tree.index: 2nd argument must be a leaf") | |
242 | | Index i -> (node.seqid, i) | |
243 | ||
244 | let linked_node (seqar,root : t) (n : node) : node = | |
245 | n.link (*suffix_link root n*) | |
246 | ||
247 | let rec implicit_node (seqar,node : t) (word : string) = | |
248 | let (explicit, (s,i,len), child) = implicit_node_aux (seqar,node) (word,0,String.length word) in | |
249 | (explicit, String.sub s i len, child) | |
250 | and implicit_node_aux (seqar,node) implicit = | |
251 | let w = subseq_length implicit in | |
252 | let child = get_child seqar (node,implicit) in | |
253 | let l = !(child.final) - child.start + 1 in | |
254 | let a = ref 1 in | |
255 | while !a < l & !a < w & eq_char seqar.(child.seqid).[child.start + !a] (subseq_get implicit !a) do | |
256 | incr a | |
257 | done; (* [!a] is the first mismatch position, or the length of [child] label *) | |
258 | if !a < w then | |
259 | if !a < l | |
260 | then raise Not_found | |
261 | else implicit_node_aux (seqar,child) (subseq_sub implicit !a (w - !a)) | |
ae4735db | 262 | else (node,implicit,child) |
34e49164 C |
263 | |
264 | (* | |
265 | let rec synthesized (seqar,root : t) (f : 'a list -> node -> 'a) = | |
266 | synthesized_node (seqar,root) f root | |
267 | and synthesized_node st f node = | |
268 | f (List.map (synthesized_node st f) (children st node)) node | |
269 | *) | |
270 | ||
271 | (* general fold *) | |
272 | let rec fold : t -> ('h -> node -> bool) -> ('h -> node -> 'h) -> ('s list -> 'h -> node -> 's) -> 'h -> 's = | |
273 | fun gst f h s init -> | |
274 | fold_node gst f h s init (root gst) | |
275 | and fold_node gst f h s h_node node = | |
276 | s | |
277 | (List.map | |
278 | (fun child -> fold_node gst f h s (h h_node child) child) | |
279 | (List.filter (f h_node) (children gst node))) | |
280 | h_node | |
281 | node | |
282 | ||
283 | (* synthesized attributes only *) | |
284 | let fold_s_node gst s node = fold_node gst (fun _ _ -> true) (fun _ _ -> ()) (fun l _ n -> s l n) () node | |
285 | let fold_s gst s = fold_s_node gst s (root gst) | |
286 | ||
287 | (* filtering and synthesizing, no inheritance *) | |
288 | let fold_fs gst f s = fold gst (fun _ n -> f n) (fun _ _ -> ()) (fun l _ n -> s l n) () | |
289 | ||
290 | ||
291 | type tree = Node of string * tree list | Leaf of string * (int * int) | |
292 | ||
293 | let readable gst = | |
294 | fold_s gst | |
295 | (fun l n -> | |
296 | let w = word gst n in | |
297 | if l=[] | |
298 | then Leaf (w, index gst n) | |
299 | else Node (w, l)) | |
300 | ||
301 | (* applications of suffix trees *) | |
302 | ||
303 | let exact_matches : t -> string -> (int * int) list = | |
304 | fun gst word -> | |
305 | try | |
306 | let explicit, implicit, child = implicit_node gst word in | |
307 | fold_s_node gst | |
308 | (fun l n -> if l=[] then [index gst n] else List.concat l) | |
309 | child | |
310 | with Not_found -> [] | |
311 | ||
312 | ||
313 | ||
ae4735db | 314 | let contained_string gst word = |
34e49164 C |
315 | List.map (fun (i,j) -> Array.get (fst gst) i) (exact_matches gst word) |
316 | ||
317 | ||
318 |