Simple implementation of SuffixTree (suffix trie) in Ruby
Build tree in linear time
class SuffixTree
# node = [ suffix_link, parent, edges ]
# edges = { first_char => edge }
# edge = [ range, node ]
# range = [ first, last ] # corresponds to interval [ first, last )
def initialize(text)
@text = text
@joker_edge = [[0,1]]
@joker = [nil, nil, lambda{|c| @joker_edge}]
@root = [@joker, nil, {}]
@joker_edge << @root
@infty = @text.size
build_tree(@root, 0, @infty)
end
private
def build_tree(node, n, infty, skip=0)
while n < infty
c = @text[n]
edges = node[2]
if edge = edges[c]
first,last = edge[0]
i,n0 = first,n
(can_skip = [skip, last-first].min
i += can_skip
n += can_skip
skip -= can_skip) if skip > 0
while i < last && n < infty && (@text[i] == @text[n] || edge.equal?(@joker_edge))
i += 1; n += 1
end
if i == last
# came to the next node
node = edge[1]
else
# splitting edge
middle_node = [ nil, node, {@text[i] => edge} ]
edge_to_middle = [ [first, i], middle_node ]
edges[c] = edge_to_middle
edge[0][0] = i
edge[1][1] = middle_node
middle_node[0] = build_tree(node[0], n0, n, i-first)
node = middle_node
end
else
# no way to go; creating leaf
new_leaf = [ nil, node, {} ]
edges[c] = [ [n, @infty], new_leaf ]
node = node[0]
end
end
node
end
end
Search method
class SuffixTree
public
def search(word)
node = @root
i = n = 0
infty = word.size
while n < infty
edges = node[2]
c = word[n]
if edge = edges[ c ]
i = edge[0][0]
last = edge[0][1]
(i += 1 and n += 1) while @text[i] == word[n] && n < infty && i < last
if i == last
node = edge[1]
else
break
end
else
break
end
end
n == infty ? i - infty : nil
end
end
Verbose version
Simple
require 'yaml'
...
tree = SuffixTree.new("abrakadabra")
puts tree.to_yaml
could give you all information about tree internals.
But let's write
pritty_print for educational purposes and
make
build_tree more verbose.
require 'yaml'
class NilClass
def name
"nil"
end
end
class Array
def name
@name ||= (
if self[1]
#puts self.inspect
range = self[1][2].to_a.find{|k,v| v[1].object_id==self.object_id}[1][0]
self[1].name + $text[range[0]...range[1]]
else
if self[2].is_a?(Proc)
"&"
else
"$"
end
end
)
end
end
class SuffixTree
# node = [ suffix_link, parent, edges ]
# edges = { first_char => edge }
# edge = [ range, node ]
# range = [ first, last ] # corresponds to interval [ first, last )
def initialize(text)
$text = @text = text
@joker_edge = [[0,1]]
@joker = [nil, nil, lambda{|c| @joker_edge}]
@root = [@joker, nil, {}]
@joker_edge << @root
@infty = @text.size
build_tree(@root, 0, @infty)
end
def build_tree(node, n, infty, skip=0)
#puts "ENTER #{[node.name, @text[n...infty]].inspect}"
while n < infty
puts "-----------------------"
pp(@root)
#puts "SEARCH #{@text[n...infty]} FROM #{node.name}"
c = @text[n]
edges = node[2]
if edge = edges[c]
first,last = edge[0]
i,n0 = first,n
can_skip = [skip, last-first].min
i += can_skip
n += can_skip
skip -= can_skip
while i < last && n < infty && (@text[i]==@text[n] || edge==@joker_edge)
i += 1; n += 1
end
if i == last
# came to the next node
puts "NEXT NODE #{@text[n0...n]}+#{@text[n...infty]}: #{node.name} -> #{edge[1].name}"
node = edge[1]
else
# splitting edge
puts "SPLIT EDGE (#{@text[(0...n0)]},#{@text[(n0...n)]},#{@text[(n...@infty)]}) #{@text[ (first...last)]} = #{@text[ (first...i)]}+#{@text[ (i...last)]} "
middle_node = [ nil, node, {@text[i] => edge} ]
edge_to_middle = [ [first, i], middle_node ]
edges[c] = edge_to_middle
edge[0][0] = i
edge[1][1] = middle_node
node = build_tree(node[0], n0, n, i - first)
middle_node[0] = node
node = middle_node
end
else
# no way to go; creating leaf
new_leaf = [ nil, node, {} ]
edges[c] = [ [n, @infty], new_leaf ]
puts "CREATE LEAF (#{@text[(n...@infty)]}). #{node.name} -> #{node[0].name}"
node = node[0]
end
end
#puts "OUT #{node.name}"
node
end
# pretty print
def pp(node=nil, indent = 0)
node ||= @root
space = " " * indent
puts space + "ID : #{node.name}"
puts space + "link : #{node[0].name}"
# puts space + "parent: #{node[1].name}"
puts space + "edges : "
if node == @joker
puts space + " JOKER"
else
node[2].each do |k,v|
puts space + " -#{k.to_i.chr} [#{v[0][0]},#{v[0][1]})=#{@text[ v[0][0]...v[0][1] ] }:"
pp(v[1], indent + 1)
end
end
end
def search(word)
node = @root
n = 0
infty = word.size
i = 0
while n < infty
edges = node[2]
c = word[n]
if edge = edges[ c ]
i,last = edge[0]
while @text[i] == word[n] && n < infty && i < last
i += 1; n += 1
end
if i == last
node = edge[1]
else
break
end
else
break
end
end
n == infty ? i - infty : nil
end
end
text = "abrababb"
tree = SuffixTree.new(text)
tree.pp
Tree for "abrababb"
ID : $
link : &
edges :
-a [0,2)=ab:
ID : $ab
link : $b
edges :
-a [5,8)=abb:
ID : $ababb
link : nil
edges :
-r [2,8)=rababb:
ID : $abrababb
link : nil
edges :
-b [7,8)=b:
ID : $abb
link : nil
edges :
-r [2,8)=rababb:
ID : $rababb
link : nil
edges :
-b [1,2)=b:
ID : $b
link : $
edges :
-a [5,8)=abb:
ID : $babb
link : nil
edges :
-r [2,8)=rababb:
ID : $brababb
link : nil
edges :
-b [7,8)=b:
ID : $bb
link : nil
edges :
Log of building tree for "abrababb"
Benchmarks and Testing
chars = "qwertyuiopasdfghjklzxcvbnm,./'][;1234567890".split(//)
require 'benchmark'
Benchmark.bmbm do |b|
[2, 3, 40].each do |alpha_size|
[5, 10, 20].each do |i|
n = 100 * i
text = ""
n.times{ text << chars[rand(alpha_size)] }
text = text + text.reverse + text
b.report("#{text.size}-#{alpha_size}") do
SuffixTree.new(text)
end
end
end
end
require 'test/unit'
class TestSuffixTree < Test::Unit::TestCase
def test_simple
text = "abrakadabraababaabbrrakadabkakadabrabrkadbkadabraaaaz"
tree = SuffixTree.new(text)
tree.search( "abra" )
assert( tree.search( "" ) )
assert( tree.search( text[(7..12)] ) )
assert( tree.search( text[(3..12)] ) )
assert( tree.search( text[(3..22)] ) )
assert( tree.search( text[(21..22)] ) )
assert( tree.search( "abrakadaa" ) == nil )
assert( tree.search( text ) )
end
def test_small_strings
[
"a",
"aaaaaaa",
"azazaz",
"abcabc",
"abrababb",
"abraabaabb",
"azazazaza",
"aaaaaaaaaaaaaaaaaaaa",
"aaaaaaaaaaaaaaaaaaaaz",
"zaaaaaaaaaaaaaaaaaaaa",
"aaaaaaazaaaaaaaaaaaa",
"aaaaaaazaaaaazaaaaaa",
"abrakadabraababaabbrrakadabkakadabrabrkadbkadabraaaa",
"abrakadabraababaabbrrakadabkakadabrabrkadbkadabraaaaz"
].each do |text|
tree = SuffixTree.new(text)
(1..text.size).each do |length|
(0...text.size-length).each do |i|
word = text[ (i...i+length) ]
assert( text[tree.search(word).to_i, word.length] == word,
"Searching for #{word} in #{text}" )
end
end
end
end
end
Output:
user system total real
1500-2 0.030000 0.000000 0.030000 ( 0.028018)
3000-2 0.050000 0.010000 0.060000 ( 0.055245)
6000-2 0.100000 0.010000 0.110000 ( 0.110966)
1500-3 0.030000 0.000000 0.030000 ( 0.026031)
3000-3 0.050000 0.000000 0.050000 ( 0.051156)
6000-3 0.100000 0.000000 0.100000 ( 0.104076)
1500-40 0.020000 0.000000 0.020000 ( 0.020655)
3000-40 0.040000 0.010000 0.050000 ( 0.044020)
6000-40 0.080000 0.000000 0.080000 ( 0.088967)
Loaded suite ../suffix_tree
Started
..
Finished in 0.36168 seconds.
2 tests, 3861 assertions, 0 failures, 0 errors