(* match(T,P) is the smallest shift at which P occurs within T.
 * Raises NoMatch if there is no such shift. *)
signature STRING_MATCH = sig
  exception NoMatch
  val match: string * string -> int
end

(* Brute-force matcher: tries every shift from left to right. *)
structure Naive :> STRING_MATCH = struct
  exception NoMatch

  fun match(T,P) = let
    val m = String.size(P)
    val n = String.size(T)

    (* inner(s,j) is whether the characters at indices j..m-1
     * in P match the corresponding characters at shift s in T.
     * Requires: j,s >= 0 and s <= n-m *)
    fun inner(s: int, j: int): bool =
      j >= m orelse
      String.sub(T,s+j) = String.sub(P,j) andalso inner(s,j+1)

    (* outer(s) is the smallest shift at least s
     * at which P occurs in T. Raises NoMatch if no
     * such shift. *)
    fun outer(s: int) =
      if s <= n-m then
        if inner(s, 0) then s else outer(s+1)
      else raise NoMatch
  in
    outer(0)
  end
end

(* Boyer-Moore matcher: compares the pattern right-to-left and, on a
 * mismatch, advances by the larger of the bad-character shift and the
 * good-suffix shift. Prints a trace of its tables and of every shift
 * it tries. *)
structure BoyerMoore :> STRING_MATCH = struct
  exception NoMatch
  (* Only handed to HashTable.mkTable as its "not found" exception. *)
  exception None

  type last_type = (char, int) HashTable.hash_table

  fun match(T,P) = let
    val m = String.size(P)
    val n = String.size(T)

    (* last maps each character of P to the largest index at which it
     * occurs in P (filled in by compute_last). *)
    val last: last_type = HashTable.mkTable
      (fn(c) => Word.fromInt(Char.ord(c)), fn(c1,c2) => (c1 = c2))
      (100,None)

    (* Effect: record indices j..m-1 of P in last; later (larger)
     * indices overwrite earlier ones. *)
    fun compute_last(j) =
      if j < m then (
        HashTable.insert last (String.sub(P,j),j);
        compute_last(j+1))
      else ()

    (* compute_prefix(S) is the prefix table of S: entry k (0 <= k <= |S|)
     * is the length of the longest proper prefix of S[0..k-1] that is
     * also a suffix of it. Entries 0 and 1 are always 0.
     * Effect: prints each entry from index 2 on as it is computed. *)
    fun compute_prefix(S: string): int Array.array = let
      val len = String.size(S)
      (* Zero-filled, so entries 0 and 1 need no explicit store.  (An
       * explicit store to index 1 would raise Subscript when S is
       * empty, because the table then has length 1.) *)
      val ret: int Array.array = Array.array(len+1,0)

      (* Effect: fill in entries i+1..len of ret, where j is entry i. *)
      fun compute_i(i: int, j: int) =
        if i < len then let
          val Si = String.sub(S, i)
          (* Fall back through shorter borders until one can be
           * extended by Si, or none is left. *)
          fun next_j(j:int) =
            if j > 0 andalso String.sub(S, j) <> Si
            then next_j(Array.sub(ret, j))
            else j
          val j' = next_j(j)
          val j'' = if String.sub(S, j') = Si then j'+1 else j'
        in
          print ("prefix["^(Int.toString(i+1))^"] = " ^ (Int.toString(j'')) ^ "\n");
          Array.update(ret, i+1, j'');
          compute_i(i+1, j'')
        end
        else ((*done*))
    in
      print ("Computing prefix table for " ^ S ^ "\n");
      compute_i(1,0);
      ret
    end

    (* good_suffix[j] is the shift to apply when the first mismatch
     * (scanning right-to-left) is at index j of P. *)
    val good_suffix = Array.array(m,0)

    (* Effect: fill in good_suffix from the prefix tables of P and of
     * P reversed, then print every entry. *)
    fun compute_good_suffix() = let
      val prefix = compute_prefix(P)
      val P' = String.implode(List.rev(String.explode(P)))
      val prefix' = compute_prefix(P')
      (* Default shift: slide P so that its longest border lines up. *)
      val default_shift = m - Array.sub(prefix, m)

      (* Effect: for each j' in j'..m, lower the entry selected by the
       * reversed-pattern prefix table if it offers a smaller shift. *)
      fun loop(j': int) =
        if j' <= m then let
          val k = Array.sub(prefix', j')
          val j = m - k - 1
        in
          Array.update(good_suffix, j,
                       Int.min(Array.sub(good_suffix, j), j' - k));
          loop(j'+1)
        end
        else ((*done*))

      (* Effect: print entries j..m-1 of good_suffix.  Written as an
       * index loop because the argument shape of Array.appi differs
       * between revisions of the Basis Library. *)
      fun dump(j: int) =
        if j < Array.length good_suffix then (
          print ("good_suffix[" ^ Int.toString(j) ^ "] = " ^
                 Int.toString(Array.sub(good_suffix, j))^"\n");
          dump(j+1))
        else ()
    in
      Array.modify (fn _ => default_shift) good_suffix;
      loop(1);
      dump(0)
    end

    (* inner(s,j) is NONE if P[0..j] matches T at shift s, otherwise
     * SOME of the largest index <= j at which they differ.
     * Requires: 0 <= s <= n-m and j < m *)
    fun inner(s: int, j: int): int option =
      if j < 0 then NONE
      else if String.sub(T,s+j) = String.sub(P,j) then inner(s,j-1)
      else SOME j

    (* outer(s) is the smallest shift at least s at which P occurs in
     * T. Raises NoMatch if no such shift. *)
    fun outer(s:int) = (
      print ("trying shift " ^ (Int.toString(s)) ^ "\n");
      if s <= n-m then
        (case inner(s, m-1) of
           NONE => s
         | SOME j => let
             (* May be zero or negative when the character's last
              * occurrence lies right of j; taking the max below
              * with the good-suffix shift supplies the actual step. *)
             val bad_char_shift =
               case HashTable.find last (String.sub(T, s+j)) of
                 SOME pos => j - pos
               | NONE => j + 1
             val good_suffix_shift = Array.sub(good_suffix, j)
           in
             print("bad char shift = "^(Int.toString(bad_char_shift))^"\n");
             print("good suffix shift = "^(Int.toString(good_suffix_shift))^"\n");
             outer(s + Int.max(good_suffix_shift, bad_char_shift))
           end)
      else raise NoMatch
    )
  in
    compute_last(0);
    compute_good_suffix();
    outer(0)
  end
end

val _ = BoyerMoore.match("atcgatcgatcgtatatcgagcgaagcggagttgagca", "gagcg");

(* Shifts tried for the call above, with the rule that produced each one:

T = atcgatcgatcgtatatcgagcgaagcggagttgagca
P = gagcg
       gagcg (bad char)
           gagcg (good suffix)
               gagcg (good suffix)
                  gagcg (bad char)
                      gagcg (good suffix)
*)