Раздел «Язык Ruby».SuffixTree:

Simple implementation of SuffixTree (suffix trie) in Ruby

Build tree in linear time

class SuffixTree
  
  # node =  [ suffix_link, parent, edges  ]
  # edges = { first_char => edge }
  # edge =  [ range, target_node ]
  # range = [ first, last ] # corresponds to interval  [ first, last ) 
  
  def initialize(text)
    @text = text
    @joker_edge = [[0,1]] # empty interval
    @joker = [nil, nil, lambda{|c| @joker_edge}]
    @root = [@joker, nil, {}]
    @joker_edge << @root
    @infty = @text.size
    build_tree(@root, 0, @infty)
  end
  
  private
  def build_tree(node, n, infty, skip=0)
    while n < infty
      c = @text[n]
      edges = node[2]
      if edge = edges[c]
        first,last = edge[0]
        i,n0 = first,n
        (can_skip = [skip, last-first].min
        i += can_skip
        n += can_skip
        skip -= can_skip) if skip > 0
        while i < last && n < infty && (@text[i] == @text[n] || edge.equal?(@joker_edge))
           i += 1; n += 1
        end
        if i == last
          # came to the next node
          node = edge[1]
        else
          # splitting edge
          middle_node = [ nil, node, {@text[i] => edge} ]
          edge_to_middle = [ [first, i], middle_node ]
          edges[c] = edge_to_middle
          edge[0][0] = i
          edge[1][1] = middle_node
          middle_node[0] = build_tree(node[0], n0, n, i-first)
          node = middle_node
        end
      else
        # no way to go; creating leaf
        new_leaf = [ nil,  node, {} ]
        edges[c] = [ [n, @infty], new_leaf ]
        node = node[0]
      end
    end
    node
  end
end

Search method

class SuffixTree
  public
  def search(word)
    node = @root
    i = n = 0
    infty = word.size
    while n < infty
      edges = node[2]
      c = word[n]
      if edge = edges[ c ]
        i = edge[0][0]
        last = edge[0][1]
        (i += 1 and n += 1) while @text[i] == word[n] && n < infty && i < last
        if i == last
          node = edge[1]
        else
          break
        end
      else
        break
      end
    end
    n == infty ?  i - infty : nil
  end
end

Verbose version

Simple

require 'yaml'
...
tree = SuffixTree.new("abrakadabra")
puts tree.to_yaml

could give you all information about tree internals. But let's write pritty_print for educational purposes and make build_tree more verbose.

require 'yaml'

class NilClass
  def name
    "nil"
  end
end

class Array
  def name
    @name ||= (
      if self[1]
        #puts self.inspect
        range =  self[1][2].to_a.find{|k,v| v[1].object_id==self.object_id}[1][0]
        self[1].name + $text[range[0]...range[1]]
      else
        if self[2].is_a?(Proc)
          "&"
        else
          "$"
        end
      end
    )
  end
end

class SuffixTree
  
  # node =  [ suffix_link, parent, edges  ]
  # edges = { first_char => edge }
  # edge =  [ range, node ]
  # range = [ first, last ] # corresponds to interval  [ first, last ) 
  
  def initialize(text)
    $text = @text = text
    @joker_edge = [[0,1]]
    @joker = [nil, nil, lambda{|c| @joker_edge}]
    @root = [@joker, nil, {}]
    @joker_edge << @root
    @infty = @text.size
    build_tree(@root, 0, @infty)
  end
     
  def build_tree(node, n, infty, skip=0)
    #puts "ENTER #{[node.name, @text[n...infty]].inspect}"
    while n < infty
      puts "-----------------------"
      pp(@root)
      #puts "SEARCH #{@text[n...infty]} FROM #{node.name}"
      c = @text[n]
      edges = node[2]
      if edge = edges[c]
        first,last = edge[0]
        i,n0 = first,n
        can_skip = [skip, last-first].min
        i += can_skip
        n += can_skip
        skip -= can_skip
        while i < last && n < infty && (@text[i]==@text[n] || edge==@joker_edge)
          i += 1; n += 1
        end
        if i == last
          # came to the next node
          puts "NEXT NODE #{@text[n0...n]}+#{@text[n...infty]}:  #{node.name} -> #{edge[1].name}"
          node = edge[1]
        else
          # splitting edge
          puts "SPLIT EDGE (#{@text[(0...n0)]},#{@text[(n0...n)]},#{@text[(n...@infty)]})  #{@text[ (first...last)]} = #{@text[ (first...i)]}+#{@text[ (i...last)]} "
          middle_node = [ nil, node, {@text[i] => edge} ]
          edge_to_middle = [ [first, i], middle_node ]
          edges[c] = edge_to_middle
          edge[0][0] = i
          edge[1][1] = middle_node
          node = build_tree(node[0], n0, n, i - first)
          middle_node[0] = node
          node = middle_node
        end
      else
        # no way to go; creating leaf
        new_leaf = [ nil,  node, {} ]
        edges[c] = [ [n, @infty], new_leaf ]
        puts "CREATE LEAF (#{@text[(n...@infty)]}). #{node.name} -> #{node[0].name}"
        node = node[0]
      end
    end
    #puts "OUT #{node.name}"
    node
  end
  
  # pretty print
  def pp(node=nil, indent = 0)
    node ||= @root
    space = "    " * indent
    puts  space + "ID    : #{node.name}"
    puts  space + "link  : #{node[0].name}"
    # puts  space + "parent: #{node[1].name}"
    puts  space + "edges : " 
    if node == @joker
      puts  space + "  — JOKER"
    else
      node[2].each do |k,v|
        puts  space + "  -#{k.to_i.chr} [#{v[0][0]},#{v[0][1]})=#{@text[ v[0][0]...v[0][1] ] }:" 
        pp(v[1], indent + 1)
      end
    end
  end
  
  def search(word)
    node = @root
    n = 0
    infty = word.size
    i = 0
    while n < infty
      edges = node[2]
      c = word[n]
      if edge = edges[ c ]
        i,last = edge[0]
        while @text[i] == word[n] && n < infty && i < last
          i += 1; n += 1
        end
        if i == last
          node = edge[1]
        else
          break
        end
      else
        break
      end
    end
    n == infty ?  i - infty : nil
  end
  
end

text = "abrababb"
tree = SuffixTree.new(text)
tree.pp

Tree for "abrababb"

ID    : $
link  : &
edges : 
  -a [0,2)=ab:
    ID    : $ab
    link  : $b
    edges : 
      -a [5,8)=abb:
        ID    : $ababb
        link  : nil
        edges : 
      -r [2,8)=rababb:
        ID    : $abrababb
        link  : nil
        edges : 
      -b [7,8)=b:
        ID    : $abb
        link  : nil
        edges : 
  -r [2,8)=rababb:
    ID    : $rababb
    link  : nil
    edges : 
  -b [1,2)=b:
    ID    : $b
    link  : $
    edges : 
      -a [5,8)=abb:
        ID    : $babb
        link  : nil
        edges : 
      -r [2,8)=rababb:
        ID    : $brababb
        link  : nil
        edges : 
      -b [7,8)=b:
        ID    : $bb
        link  : nil
        edges : 

Log of building tree for "abrababb"

Benchmarks and Testing

chars = "qwertyuiopasdfghjklzxcvbnm,./'][;1234567890".split(//)

require 'benchmark'

Benchmark.bmbm do |b|
  [2, 3, 40].each do |alpha_size|
    [5, 10, 20].each do |i|
      n = 100 * i
      text = ""
      n.times{ text << chars[rand(alpha_size)] }
      text = text + text.reverse + text
      b.report("#{text.size}-#{alpha_size}") do
        SuffixTree.new(text)
      end
    end
  end
end

require 'test/unit'

class TestSuffixTree < Test::Unit::TestCase
  def test_simple
    text = "abrakadabraababaabbrrakadabkakadabrabrkadbkadabraaaaz"
    tree = SuffixTree.new(text)
    tree.search( "abra" )
    assert( tree.search( "" ) )
    assert( tree.search( text[(7..12)] ) )
    assert( tree.search( text[(3..12)] ) )
    assert( tree.search( text[(3..22)] ) )
    assert( tree.search( text[(21..22)] ) )
    assert( tree.search( "abrakadaa" ) == nil )
    assert( tree.search( text ) )
  end

  def test_small_strings
    [
      "a",
      "aaaaaaa",
      "azazaz",
      "abcabc",
      "abrababb",
      "abraabaabb",
      "azazazaza",
      "aaaaaaaaaaaaaaaaaaaa",
      "aaaaaaaaaaaaaaaaaaaaz",
      "zaaaaaaaaaaaaaaaaaaaa",
      "aaaaaaazaaaaaaaaaaaa",
      "aaaaaaazaaaaazaaaaaa",
      "abrakadabraababaabbrrakadabkakadabrabrkadbkadabraaaa",
      "abrakadabraababaabbrrakadabkakadabrabrkadbkadabraaaaz"
    ].each do |text|
      tree = SuffixTree.new(text)
      (1..text.size).each do |length|
        (0...text.size-length).each do |i|
          word = text[ (i...i+length) ]
          assert( text[tree.search(word).to_i, word.length] == word,
                   "Searching for #{word} in #{text}" )
        end
      end
    end
  end
end

Output:

              user     system      total        real
1500-2    0.030000   0.000000   0.030000 (  0.028018)
3000-2    0.050000   0.010000   0.060000 (  0.055245)
6000-2    0.100000   0.010000   0.110000 (  0.110966)
1500-3    0.030000   0.000000   0.030000 (  0.026031)
3000-3    0.050000   0.000000   0.050000 (  0.051156)
6000-3    0.100000   0.000000   0.100000 (  0.104076)
1500-40   0.020000   0.000000   0.020000 (  0.020655)
3000-40   0.040000   0.010000   0.050000 (  0.044020)
6000-40   0.080000   0.000000   0.080000 (  0.088967)
Loaded suite ../suffix_tree
Started
..
Finished in 0.36168 seconds.

2 tests, 3861 assertions, 0 failures, 0 errors

Attachment sort Action Size Date Who Comment
suffix_tree.rb manage 3.7 K 13 Sep 2008 - 21:35 ArtemVoroztsov Ruby implementation of suffix tries
suffix_tree_build_log.txt manage 6.0 K 13 Sep 2008 - 21:05 ArtemVoroztsov Log of building suffix tree for "abrababb"
01ianote.pdf manage 343.2 K 03 May 2010 - 11:04 ArtemVoroztsov лекция Юрия Лифшица