# Copyright (C) 2005 Network Applied Communication Laboratory Co., Ltd.
#
# This file is part of Rast.
# See the file COPYING for redistribution information.
#

# -*- mode: Ruby; coding: euc-japan; -*-

require "test/unit"
require "fileutils"

require "rast_test"
require "bdb"
require "test-utility"
require "rast/database-generatable"

module Rast
  # Tests for Rast::TextIndex: registering documents, searching them and
  # optimizing an index into a new one. Besides going through the public
  # TextIndex API, the register tests open the files that belong to an index
  # (".rng", ".ngm", ".pos", ".pfl") and assert on their raw contents.
  class TextIndexTest < Test::Unit::TestCase
    include DatabaseGeneratable

    # An index opened with the default flags accepts #register, while one
    # opened with Rast::DB::RDONLY must raise RastError on #register.
    # NOTE(review): neither index handle is closed here -- confirm that this
    # is intentional.
    def test_initialize
      name = generate_text_index_name

      writable = TextIndex.new(name)
      assert_nothing_raised do
        writable.register(1, "foo")
      end

      read_only = TextIndex.new(name, Rast::DB::RDONLY)
      assert_raise(RastError) do
        read_only.register(1, "foo")
      end
    end

    # Runs the three register scenarios in a fixed order.
    def test_register
      register_test
      register_test_free_list
      register_test_set_block_size
    end

    # Registers six documents into one index, one stage at a time, and after
    # every close/sync inspects the on-disk files with the check_* helpers.
    def register_test
      index_name = generate_text_index_name

      # Stage 1: first document, written out via close.
      doc_id_1 = 0
      text_index = TextIndex.new(index_name)
      text_index.register(doc_id_1, "abcdefdef")
      text_index.close
      open_db do |rare_db, ngram_db, pos_file, free_list|
        check_rare_ngram_db(rare_db, "abc", doc_id_1, 0)
        check_rare_ngram_db(rare_db, "bcd", doc_id_1, 1)
        check_ngram_db(ngram_db, pos_file, "def",
                       PositionInfo.new(doc_id_1, [3, 6]))
        check_rare_ngram_db(rare_db, "ef", doc_id_1, 7)
        check_rare_ngram_db(rare_db, "f", doc_id_1, 8)
        check_free_list(free_list)
      end

      # Stage 2: second document, written out via sync (the index stays open).
      doc_id_2 = 1
      text_index = TextIndex.new(index_name)
      text_index.register(doc_id_2, "1234123")
      text_index.sync
      open_db do |rare_db, ngram_db, pos_file, free_list|
        check_ngram_db(ngram_db, pos_file, "123",
                       PositionInfo.new(doc_id_2, [0, 4]))
        check_rare_ngram_db(rare_db, "234", doc_id_2, 1)
        check_rare_ngram_db(rare_db, "341", doc_id_2, 2)
        check_rare_ngram_db(rare_db, "23", doc_id_2, 5)
        check_rare_ngram_db(rare_db, "3", doc_id_2, 6)
        check_free_list(free_list)
      end

      # Stage 3: third document on the same, still open index. "234" is now
      # expected in the n-gram db and no longer in the rare n-gram db.
      doc_id_3 = 2
      text_index.register(doc_id_3, "01234")
      text_index.sync
      open_db do |rare_db, ngram_db, pos_file, free_list|
        check_rare_ngram_db(rare_db, "012", doc_id_3, 0)
        check_ngram_db(ngram_db, pos_file, "123",
                       PositionInfo.new(doc_id_2, [0, 4]),
                       PositionInfo.new(doc_id_3, [1]))
        check_ngram_db(ngram_db, pos_file, "234",
                       PositionInfo.new(doc_id_2, [1]),
                       PositionInfo.new(doc_id_3, [2]))
        assert_equal(false, rare_db.key?("234"))
        check_free_list(free_list)
      end

      # Stage 4: a short run of one repeated character.
      # NOTE(review): the index from stages 2-3 is replaced here without
      # being closed -- confirm that this is intentional.
      doc_id_4 = 3
      text_index = TextIndex.new(index_name)
      text_index.register(doc_id_4, "x" * (2 + 2))
      text_index.close
      open_db do |rare_db, ngram_db, pos_file, free_list|
        check_ngram_db(ngram_db, pos_file, "xxx",
                       PositionInfo.new(doc_id_4, (0..1).to_a))
        check_rare_ngram_db(rare_db, "xx", doc_id_4, 2)
        check_rare_ngram_db(rare_db, "x", doc_id_4, 3)
        check_free_list(free_list)
      end

      # Stage 5: a long run of the same character; afterwards the free list
      # is expected to hold exactly one entry (block 3, 1 block).
      doc_id_5 = 4
      text_index = TextIndex.new(index_name)
      text_index.register(doc_id_5, "x" * (512 + 2))
      text_index.close
      open_db do |rare_db, ngram_db, pos_file, free_list|
        check_ngram_db(ngram_db, pos_file, "xxx",
                       PositionInfo.new(doc_id_4, (0..1).to_a),
                       PositionInfo.new(doc_id_5, (0..511).to_a))
        check_ngram_db(ngram_db, pos_file, "xx",
                       PositionInfo.new(doc_id_4, [2]),
                       PositionInfo.new(doc_id_5, [512]))
        check_ngram_db(ngram_db, pos_file, "x",
                       PositionInfo.new(doc_id_4, [3]),
                       PositionInfo.new(doc_id_5, [513]))
        assert_equal(false, rare_db.key?("xx"))
        assert_equal(false, rare_db.key?("x"))
        check_free_list(free_list, FreeListEntry.new(3, 1))
      end

      # Opening and closing without registering anything must keep the same
      # free-list entry (check_free_list still expects a new version number).
      text_index = TextIndex.new(index_name)
      text_index.close
      open_db do |_rare_db, _ngram_db, _pos_file, free_list|
        check_free_list(free_list, FreeListEntry.new(3, 1))
      end

      # Stage 6: a small document; afterwards the free list is expected to be
      # empty again.
      doc_id_6 = 6
      text_index = TextIndex.new(index_name)
      text_index.register(doc_id_6, "yyyy")
      text_index.close
      open_db do |rare_db, ngram_db, pos_file, free_list|
        check_ngram_db(ngram_db, pos_file, "xxx",
                       PositionInfo.new(doc_id_4, (0..1).to_a),
                       PositionInfo.new(doc_id_5, (0..511).to_a))
        check_ngram_db(ngram_db, pos_file, "xx",
                       PositionInfo.new(doc_id_4, [2]),
                       PositionInfo.new(doc_id_5, [512]))
        check_ngram_db(ngram_db, pos_file, "x",
                       PositionInfo.new(doc_id_4, [3]),
                       PositionInfo.new(doc_id_5, [513]))
        check_ngram_db(ngram_db, pos_file, "yyy",
                       PositionInfo.new(doc_id_6, [0, 1]))
        check_rare_ngram_db(rare_db, "yy", doc_id_6, 2)
        check_rare_ngram_db(rare_db, "y", doc_id_6, 3)
        check_free_list(free_list)
      end
    end

    # Follows the free-list file while documents are added to a single index
    # that is synced twice and finally closed.
    def register_test_free_list
      text_index = TextIndex.new(generate_text_index_name)

      doc_id_5 = 5
      # (512: 896bytes < 2blocks)
      text_index.register(doc_id_5, "x" * (512 + 2))
      text_index.sync
      open_db do |_rare_db, _ngram_db, _pos_file, free_list|
        check_free_list(free_list)
      end

      doc_id_6 = 6
      # (512: 896bytes < 2blocks)
      text_index.register(doc_id_6, "x" * (512 + 2))
      text_index.sync
      open_db do |_rare_db, _ngram_db, _pos_file, free_list|
        check_free_list(free_list, FreeListEntry.new(0, 2))
      end

      doc_id_7 = 7
      text_index.register(doc_id_7, "yyyy")
      text_index.close
      open_db do |_rare_db, _ngram_db, _pos_file, free_list|
        check_free_list(free_list, FreeListEntry.new(1, 1))
      end
    end

    # Same kind of on-disk checks as register_test, but with 256 passed as the
    # third argument of TextIndex.new and used as the block size when the
    # position file is read back.
    def register_test_set_block_size
      block_size = 256
      index_name = generate_text_index_name

      doc_id_1 = 1
      text_index = TextIndex.new(index_name, Rast::DB::RDWR, block_size)
      text_index.register(doc_id_1, "aaaabbbb")
      text_index.close
      open_db do |_rare_db, ngram_db, pos_file, free_list|
        check_ngram_db_with_block_size(ngram_db, pos_file, "aaa", block_size,
                                       PositionInfo.new(doc_id_1, [0, 1]))
        check_ngram_db_with_block_size(ngram_db, pos_file, "bbb", block_size,
                                       PositionInfo.new(doc_id_1, [4, 5]))
        check_free_list(free_list)
      end

      doc_id_2 = 2
      text_index = TextIndex.new(index_name, Rast::DB::RDWR, block_size)
      text_index.register(doc_id_2, "a" * (256 + 2))
      text_index.close
      open_db do |_rare_db, ngram_db, pos_file, free_list|
        check_ngram_db_with_block_size(ngram_db, pos_file, "aaa", block_size,
                                       PositionInfo.new(doc_id_1, [0, 1]),
                                       PositionInfo.new(doc_id_2,
                                                        (0..255).to_a))
        check_free_list(free_list, FreeListEntry.new(0, 1))
      end
    end

    # Runs the four search scenarios in a fixed order.
    def test_search
      search_test_simple
      search_test_same_ngram
      search_test_other
      search_test_without_sync
    end

    # Registers eight documents, reopens the index and checks the reported
    # terms and candidates (document id, count, position) for many queries.
    def search_test_simple
      index_name = generate_text_index_name
      index = TextIndex.new(index_name)
      index.register(0, "abcdef")
      index.register(1, "あいうえお")
      index.register(2, "defgh")
      index.register(3, "abc bcd")
      index.register(4, "abcd ef")
      index.register(5, "abcd bcdef cde")
      index.register(6, "abcabc")
      index.register(7, "foo tcl/tk8 bar")
      index.close

      index = TextIndex.new(index_name)

      # An empty query still reports one term, but no candidates.
      result = index.search("")
      assert_equal("", result.terms[0].term)
      assert_equal(0, result.terms[0].doc_count)
      assert_equal(1, result.terms.length)
      assert_equal(0, result.candidates.length)

      result = index.search("cde")
      assert_equal("cde", result.terms[0].term)
      assert_equal(2, result.terms[0].doc_count)
      assert_equal(1, result.terms.length)
      assert_equal(0, result.candidates[0].doc_id)
      assert_equal(1, result.candidates[0].terms[0].count)
      assert_equal(2, result.candidates[0].terms[0].pos)
      assert_equal(5, result.candidates[1].doc_id)
      assert_equal(2, result.candidates[1].terms[0].count)
      assert_equal(6, result.candidates[1].terms[0].pos)
      assert_equal(2, result.candidates.length)

      result = index.search("def")
      assert_equal("def", result.terms[0].term)
      assert_equal(3, result.terms[0].doc_count)
      assert_equal(1, result.terms.length)
      assert_equal(0, result.candidates[0].doc_id)
      assert_equal(1, result.candidates[0].terms[0].count)
      assert_equal(3, result.candidates[0].terms[0].pos)
      assert_equal(2, result.candidates[1].doc_id)
      assert_equal(1, result.candidates[1].terms[0].count)
      assert_equal(0, result.candidates[1].terms[0].pos)
      assert_equal(5, result.candidates[2].doc_id)
      assert_equal(1, result.candidates[2].terms[0].count)
      assert_equal(7, result.candidates[2].terms[0].pos)
      assert_equal(3, result.candidates.length)

      result = index.search("de")
      assert_equal("de", result.terms[0].term)
      assert_equal(3, result.terms[0].doc_count)
      assert_equal(1, result.terms.length)
      assert_equal(0, result.candidates[0].doc_id)
      assert_equal(1, result.candidates[0].terms[0].count)
      assert_equal(3, result.candidates[0].terms[0].pos)
      assert_equal(2, result.candidates[1].doc_id)
      assert_equal(1, result.candidates[1].terms[0].count)
      assert_equal(0, result.candidates[1].terms[0].pos)
      assert_equal(5, result.candidates[2].doc_id)
      assert_equal(2, result.candidates[2].terms[0].count)
      assert_equal(7, result.candidates[2].terms[0].pos)
      assert_equal(3, result.candidates.length)

      result = index.search("いうえ")
      assert_equal(1, result.candidates[0].doc_id)
      assert_equal(1, result.candidates.length)

      result = index.search("cdef")
      assert_equal(0, result.candidates[0].doc_id)
      assert_equal(5, result.candidates[1].doc_id)
      assert_equal(2, result.candidates.length)

      result = index.search("abcd")
      assert_equal(0, result.candidates[0].doc_id)
      assert_equal(4, result.candidates[1].doc_id)
      assert_equal(5, result.candidates[2].doc_id)
      assert_equal(3, result.candidates.length)

      result = index.search("abcdef")
      assert_equal(0, result.candidates[0].doc_id)
      assert_equal(1, result.candidates.length)

      result = index.search("xyz")
      assert_equal(0, result.candidates.length)

      result = index.search("abc")
      assert_equal(0, result.candidates[0].doc_id)
      assert_equal(3, result.candidates[1].doc_id)
      assert_equal(4, result.candidates[2].doc_id)
      assert_equal(5, result.candidates[3].doc_id)
      assert_equal(6, result.candidates[4].doc_id)
      assert_equal(5, result.candidates.length)

      result = index.search("い")
      assert_equal("い", result.terms[0].term)
      assert_equal(1, result.terms[0].doc_count)
      assert_equal(1, result.terms.length)
      assert_equal(1, result.candidates[0].doc_id)
      assert_equal(1, result.candidates[0].terms[0].count)
      assert_equal(1, result.candidates[0].terms[0].pos)
      assert_equal(1, result.candidates.length)

      result = index.search("tcl/tk")
      assert_equal("tcl/tk", result.terms[0].term)
      assert_equal(1, result.terms[0].doc_count)
      assert_equal(1, result.terms.length)
      assert_equal(7, result.candidates[0].doc_id)
      assert_equal(1, result.candidates[0].terms[0].count)
      assert_equal(4, result.candidates[0].terms[0].pos)
      assert_equal(1, result.candidates.length)

      result = index.search("not-found-string")
      assert_equal("not-found-string", result.terms[0].term)
      assert_equal(0, result.terms[0].doc_count)
      assert_equal(1, result.terms.length)
      assert_equal(0, result.candidates.length)

      index.close
    end

    # Searches for two whole-document strings ("DE F g" and "100000") and
    # expects exactly one candidate, at position 0, for each of them.
    def search_test_same_ngram
      index_name = generate_text_index_name
      index = TextIndex.new(index_name)
      index.register(1, "DE F g")
      index.register(2, "100000")
      index.close

      index = TextIndex.new(index_name)

      result = index.search("DE F g")
      assert_equal("DE F g", result.terms[0].term)
      assert_equal(1, result.terms[0].doc_count)
      assert_equal(1, result.terms.length)
      assert_equal(1, result.candidates[0].doc_id)
      assert_equal(1, result.candidates[0].terms[0].count)
      assert_equal(0, result.candidates[0].terms[0].pos)
      assert_equal(1, result.candidates.length)

      result = index.search("100000")
      assert_equal("100000", result.terms[0].term)
      assert_equal(1, result.terms[0].doc_count)
      assert_equal(1, result.terms.length)
      assert_equal(2, result.candidates[0].doc_id)
      assert_equal(1, result.candidates[0].terms[0].count)
      assert_equal(0, result.candidates[0].terms[0].pos)
      assert_equal(1, result.candidates.length)

      index.close
    end

    # Single-document scenarios, each in a fresh index, in which the pieces
    # of the query occur in the text but the whole query only sometimes does.
    def search_test_other
      scenarios = [
        # [document text,     query,    expected number of candidates]
        [" bcd abc",         "abcd",   0],
        ["cde abcde abcdef", "abcdef", 1],
        ["abc bcd",          "abcd",   0],
      ]
      scenarios.each do |text, query, expected_count|
        index_name = generate_text_index_name
        index = TextIndex.new(index_name)
        index.register(0, text)
        index.close

        index = TextIndex.new(index_name)
        result = index.search(query)
        assert_equal(expected_count, result.candidates.length)
        index.close
      end
    end

    # Documents must be searchable right after #register, both before and
    # after an intermediate #sync, without reopening the index.
    def search_test_without_sync
      index = TextIndex.new(generate_text_index_name)

      result = index.search("abc")
      assert_equal(0, result.candidates.length)

      index.register(0, "abcdef")
      result = index.search("abc")
      assert_equal(1, result.candidates.length)
      result = index.search("a")
      assert_equal(1, result.candidates.length)

      index.sync
      index.register(1, "abc")
      result = index.search("abc")
      assert_equal(2, result.candidates.length)
      result = index.search("a")
      assert_equal(2, result.candidates.length)

      index.close
    end

    # TextIndex.optimize is called with the map {1 => 1, 3 => 2}. In the new
    # index the text of old document 1 must be found under id 1, the text of
    # old document 3 under id 2, and the text of old document 2 (absent from
    # the map) must not be found at all.
    def test_optimize
      old_index_name = generate_text_index_name
      new_index_name = generate_text_index_name

      index = TextIndex.new(old_index_name)
      index.register(1, "abcd" * 2)
      index.register(2, "efgh" * 2)
      index.register(3, "ijkl" * 2)
      index.close

      # Before optimizing, every query hits exactly its own document.
      index = TextIndex.new(old_index_name)
      expected_before = [
        ["abc", 1], ["efg", 2], ["ijk", 3],
        ["cd", 1], ["gh", 2], ["kl", 3],
      ]
      expected_before.each do |query, doc_id|
        result = index.search(query)
        assert_equal(doc_id, result.candidates[0].doc_id)
        assert_equal(1, result.candidates.length)
      end
      index.close

      TextIndex.optimize(old_index_name, new_index_name, {1 => 1, 3 => 2})

      index = TextIndex.new(new_index_name)

      result = index.search("abc")
      assert_equal(1, result.candidates[0].doc_id)
      assert_equal(1, result.candidates.length)

      result = index.search("efg")
      assert_equal(0, result.candidates.length)

      result = index.search("ijk")
      assert_equal(2, result.candidates[0].doc_id)
      assert_equal(1, result.candidates.length)

      result = index.search("cd")
      assert_equal(1, result.candidates[0].doc_id)
      assert_equal(1, result.candidates.length)

      result = index.search("gh")
      assert_equal(0, result.candidates.length)

      result = index.search("kl")
      assert_equal(2, result.candidates[0].doc_id)
      assert_equal(1, result.candidates.length)

      index.close
    end

    private

    # Expected occurrences of one n-gram in one document.
    PositionInfo = Struct.new(:doc_id, :positions)
    # Expected (block_no, block_count) pair read from the free-list file.
    FreeListEntry = Struct.new(:block_no, :block_count)

    # Generates a fresh index name through generate_db_name, remembers the
    # names of the four files derived from it for open_db, and resets the
    # free-list version that check_free_list expects next to 1.
    # Returns the index name.
    def generate_text_index_name
      base_name = generate_db_name
      @rare_ngram_db_name = base_name + ".rng"
      @ngram_db_name = base_name + ".ngm"
      @pos_file_name = base_name + ".pos"
      @free_list_file_name = base_name + ".pfl"
      @expected_free_list_version = 1
      base_name
    end

    # Decodes db[ngram] as a leading byte count followed by that many bytes
    # of packed (doc_id, number of positions, positions...) groups, and
    # compares the groups with position_infos in order.
    # NOTE(review): nothing in this file calls this helper.
    def check_ngram(db, ngram, *position_infos)
      record = db[ngram]
      # The first number: any run of bytes >= 0x80 closed by one byte < 0x80.
      length_prefix = record.slice(/[\x80-\xFF]*[\x00-\x7F]/)
      payload_size = Rast::VNUM.unpack(length_prefix).first
      numbers = Rast::VNUM.unpack(record[length_prefix.length, payload_size])
      until numbers.empty?
        doc_id = numbers.shift
        position_count = numbers.shift
        positions = numbers.slice!(0, position_count)
        expected = position_infos.shift
        assert_equal(expected.doc_id, doc_id)
        assert_equal(expected.positions, positions)
      end
      assert_equal(true, position_infos.empty?)
    end

    # Reads db[ngram] as native unsigned integers (block_no, block_count,
    # data_nbytes, num_docs), seeks to block_no * block_size in file, reads
    # data_nbytes bytes and compares the per-document position lists stored
    # there with position_infos in order.
    def check_ngram_db_with_block_size(db, file, ngram, block_size,
                                       *position_infos)
      block_no, _block_count, data_nbytes, num_docs = db[ngram].unpack("I*")
      assert_equal(position_infos.length, num_docs)

      file.seek(block_no * block_size)
      data = file.read(data_nbytes)
      until data.empty?
        # Every entry opens with two packed numbers: the document id and the
        # byte length of the packed position list that follows.
        header = data.slice!(/\A(?:[\x80-\xFF]*[\x00-\x7F]){2}/)
        doc_id, bytes = *Rast::VNUM.unpack(header)
        positions = Rast::VNUM.unpack(data.slice!(0, bytes))
        expected = position_infos.shift
        assert_equal(expected.doc_id, doc_id)
        assert_equal(expected.positions, positions)
      end
      assert_equal(true, position_infos.empty?)
    end

    # check_ngram_db_with_block_size with a block size of 512.
    def check_ngram_db(db, file, ngram, *position_infos)
      check_ngram_db_with_block_size(db, file, ngram, 512, *position_infos)
    end

    # Asserts that db[ngram] consists of exactly one entry: the expected
    # document id, a byte length that covers the rest of the record, and a
    # position list that holds only expected_position.
    def check_rare_ngram_db(db, ngram, expected_doc_id, expected_position)
      record = db[ngram]
      header = record.slice!(/\A(?:[\x80-\xFF]*[\x00-\x7F]){2}/)
      doc_id, bytes = *Rast::VNUM.unpack(header)
      assert_equal(record.length, bytes)
      positions = Rast::VNUM.unpack(record.slice!(0, bytes))
      assert_equal(0, record.length)
      assert_equal(expected_doc_id, doc_id)
      assert_equal([expected_position], positions)
    end

    # Reads the whole free-list file as native unsigned integers: a version
    # number followed by (block_no, block_count) pairs. The version must be
    # equal to @expected_free_list_version, which is then incremented, so the
    # result depends on how often this method ran since the last call of
    # generate_text_index_name. The pairs must match expected_entries exactly.
    def check_free_list(file, *expected_entries)
      values = file.read.unpack("I*")
      # NOTE(review): String#unpack returns an Array, so this branch looks
      # unreachable -- confirm before relying on it for an empty file.
      if values.nil?
        assert_equal(true, expected_entries.empty?)
        return
      end

      assert_equal(@expected_free_list_version, values.shift)
      @expected_free_list_version += 1
      expected_entries.each do |entry|
        assert_equal(entry.block_no, values.shift)
        assert_equal(entry.block_count, values.shift)
      end
      assert_equal(true, values.empty?)
    end

    # Opens the four files named by the last generate_text_index_name call
    # and yields (rare_ngram_db, ngram_db, pos_file, free_list_file). The
    # free-list file is opened with File::CREAT, so a missing file is created
    # instead of raising.
    def open_db
      BDB::Btree.open(@rare_ngram_db_name, nil, 0) do |rare_ngram_db|
        BDB::Btree.open(@ngram_db_name, nil, 0) do |ngram_db|
          File.open(@pos_file_name) do |pos_file|
            free_list_mode = File::RDONLY | File::CREAT
            File.open(@free_list_file_name, free_list_mode) do |free_list_file|
              yield(rare_ngram_db, ngram_db, pos_file, free_list_file)
            end
          end
        end
      end
    end
  end
end