# Copyright (C) 2005 Network Applied Communication Laboratory Co., Ltd.
#
# This file is part of Rast.
# See the file COPYING for redistribution information.
#

require "test/unit"
require "rast_test"
require "test-utility"
require "make-variable"
require "rast/database-generatable"
require "tempfile"
require "nkf"

module Rast
  # Tests FilterChain#invoke by registering documents into freshly created
  # local databases and then reading the stored text / properties back.
  #
  # Every scenario follows the same shape:
  #   1. create a database (see #create_filter_test_db),
  #   2. open it read-write, push a brigade through a FilterChain, commit,
  #   3. reopen it and assert on what was stored.
  class FilterChainTest < Test::Unit::TestCase
    include DatabaseGeneratable

    # The only method Test::Unit discovers; it runs each invoke_test_*
    # scenario in the order listed, so the first failing scenario aborts
    # the remaining ones.
    def test_invoke
      invoke_test_simple
      invoke_test_nil_mime_type
      invoke_test_with_filename
      invoke_test_only_eos_bucket
      invoke_test_c_mime_filter
      invoke_test_c_mime_filter_with_charset_conversion
      invoke_test_c_mime_filter_with_properties
      invoke_test_c_text_filter
      invoke_test_ruby_mime_filter
      invoke_test_ruby_mime_filter_with_multiple_stage
      invoke_test_ruby_mime_filter_with_charset_conversion
      invoke_test_ruby_text_filter
      invoke_test_no_such_text_filter
      invoke_test_with_invalid_charset
      invoke_test_protect_filter_instance
    end

    # Plain text with an explicit "text/plain" type is stored verbatim, and
    # an unrecognised MIME type makes #invoke raise RastError.
    def invoke_test_simple
      db_name = create_filter_test_db("utf8")
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        doc = db.create_document
        chain = FilterChain.new(doc)
        brigade = brigade_with_eos(TransientBucket.new("本日は晴天なり"))
        chain.invoke(brigade, "text/plain")
        doc.commit

        # The same brigade object is deliberately handed to a second chain.
        doc = db.create_document
        chain = FilterChain.new(doc)
        assert_raise(RastError) do
          chain.invoke(brigade, "unknown/mime-type")
        end
      end
      LocalDB.open(db_name) do |db|
        assert_equal("本日は晴天なり", db.get_text(1))
      end
    end

    # Passing nil as the MIME type still registers both an in-memory string
    # and a tar fixture read from disk.
    def invoke_test_nil_mime_type
      db_name = create_filter_test_db("utf8")
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        doc = db.create_document
        chain = FilterChain.new(doc)
        brigade = brigade_with_eos(TransientBucket.new("本日は晴天なり"))
        chain.invoke(brigade, nil)
        doc.commit

        doc = db.create_document
        chain = FilterChain.new(doc)
        # The tar archive is binary data, so open it in binary mode.
        File.open(filter_fixture_path("utf-8.txt.tar"), "rb") do |f|
          brigade = brigade_with_eos(FileBucket.new(f))
          chain.invoke(brigade, nil)
          doc.commit
        end
      end
      LocalDB.open(db_name) do |db|
        assert_equal("本日は晴天なり", db.get_text(1))
        assert_match(/tar テスト これは tar ファイルの1つ目です/, db.get_text(2))
        assert_match(/tar テスト これは tar ファイルの2つ目です/, db.get_text(2))
      end
    end

    # With a nil MIME type and the file name passed as the third argument,
    # office-document fixtures are registered and their text is extracted.
    def invoke_test_with_filename
      db_name = create_filter_test_db("utf8")
      names = ["open-office-org.sxw", "excel.xls", "powerpoint.ppt"]
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        names.each do |name|
          filename = filter_fixture_path(name)
          # Office documents are binary; avoid text-mode translation.
          File.open(filename, "rb") do |f|
            doc = db.create_document
            chain = FilterChain.new(doc)
            brigade = brigade_with_eos(FileBucket.new(f))
            chain.invoke(brigade, nil, filename)
            doc.commit
          end
        end
      end
      LocalDB.open(db_name) do |db|
        assert_match(/これはWriterファイルです/, db.get_text(1))
        assert_match(/これはMS Excelファイルです/, db.get_text(2))
        assert_match(/これは MS PowerPoint ファイルです/, db.get_text(3))
      end
    end

    # A brigade holding nothing but the EOS bucket yields an empty document.
    def invoke_test_only_eos_bucket
      db_name = create_filter_test_db("utf8")
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        doc = db.create_document
        chain = FilterChain.new(doc)
        brigade = brigade_with_eos
        chain.invoke(brigade, nil)
        doc.commit
      end
      LocalDB.open(db_name) do |db|
        assert_equal("", db.get_text(1))
      end
    end

    # Registers input through the "text/x-rast-test-html" MIME type and
    # checks the stored body text.
    def invoke_test_c_mime_filter
      db_name = create_filter_test_db("utf8")
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        # NOTE(review): these fragments contain no markup even though the
        # MIME type is an HTML one; tags may have been stripped from this
        # copy of the file -- confirm against the upstream source.
        s = ""
        s.concat("天気")
        s.concat("本日は晴天なり")
        doc = db.create_document
        chain = FilterChain.new(doc)
        brigade = brigade_with_eos(TransientBucket.new(s))
        chain.invoke(brigade, "text/x-rast-test-html")
        doc.commit
      end
      LocalDB.open(db_name, Rast::DB::RDONLY) do |db|
        assert_equal("本日は晴天なり", db.get_text(1))
      end
    end

    # Shift_JIS input declared via "charset=" is stored in an euc_jp
    # database; both body text and the "title" property must round-trip.
    def invoke_test_c_mime_filter_with_charset_conversion
      db_name = create_filter_test_db("euc_jp",
                                      [string_property_option("title")])
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        # NOTE(review): the empty fragments look like placeholders for
        # markup that is missing from this copy -- confirm upstream.
        s = ""
        s.concat("")
        s.concat("")
        s.concat("題名のエンコーディング変換テスト")
        s.concat("")
        s.concat("本文のエンコーディング変換テスト")
        s.concat("")
        doc = db.create_document
        chain = FilterChain.new(doc)
        # "-Ws": convert the UTF-8 source text to Shift_JIS before feeding it.
        brigade = brigade_with_eos(TransientBucket.new(NKF.nkf("-Ws", s)))
        chain.invoke(brigade, "text/x-rast-test-html; charset=Shift_JIS")
        doc.commit
      end
      LocalDB.open(db_name, Rast::DB::RDONLY) do |db|
        # "-Ew": convert the EUC-JP database output back to UTF-8.
        assert_equal("本文のエンコーディング変換テスト",
                     NKF.nkf("-Ew", db.get_text(1)))
        result = db.search(NKF.nkf("-We", "本文"),
                           {"properties" => ["title"]})
        assert_equal("題名のエンコーディング変換テスト",
                     NKF.nkf("-Ew", result.items[0].properties.title))
      end
    end

    # The "text/x-rast-test-html" type also populates the "title" property,
    # which is returned with the search result.
    def invoke_test_c_mime_filter_with_properties
      db_name = create_filter_test_db("utf8",
                                      [string_property_option("title")])
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        # NOTE(review): no markup is present in these fragments; it may
        # have been stripped from this copy -- confirm upstream.
        s = ""
        s.concat("天気")
        s.concat("本日は晴天なり")
        doc = db.create_document
        chain = FilterChain.new(doc)
        brigade = brigade_with_eos(TransientBucket.new(s))
        chain.invoke(brigade, "text/x-rast-test-html")
        doc.commit
      end
      LocalDB.open(db_name, Rast::DB::RDONLY) do |db|
        result = db.search("本日", {"properties" => ["title"]})
        assert_equal(1, result.hit_count)
        assert_equal(1, result.num_docs)
        assert_equal("本日", result.terms[0].term)
        assert_equal(1, result.terms[0].doc_count)
        assert_equal(1, result.terms.length)
        assert_equal(1, result.items[0].doc_id)
        assert_equal("天気", result.items[0].properties[0])
      end
    end

    # A mail-like message passed with a nil MIME type has its headers mapped
    # to the "from" / "to" / "subject" properties and its body stored as text.
    def invoke_test_ruby_mime_filter
      properties = ["from", "to", "subject"].map do |name|
        string_property_option(name)
      end
      db_name = create_filter_test_db("utf8", properties)
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        s = ""
        s.concat("From: rast@example.com\n")
        s.concat("To: rast-dev@example.com\n")
        s.concat("Subject: test mail\n")
        s.concat("\n")
        s.concat("本日は晴天なり\n")
        s.concat("明日は雨です\n")
        doc = db.create_document
        chain = FilterChain.new(doc)
        brigade = brigade_with_eos(TransientBucket.new(s))
        chain.invoke(brigade, nil)
        doc.commit
      end
      LocalDB.open(db_name, Rast::DB::RDONLY) do |db|
        # NOTE(review): the input separates lines with "\n" but this
        # expected value uses spaces; whitespace may have been collapsed in
        # this copy -- confirm against the real filter output.
        assert_equal("本日は晴天なり 明日は雨です ", db.get_text(1))
        result = db.search("本日",
                           {"properties" => ["from", "to", "subject"]})
        assert_equal(1, result.hit_count)
        assert_equal(1, result.num_docs)
        assert_equal("本日", result.terms[0].term)
        assert_equal(1, result.terms[0].doc_count)
        assert_equal(1, result.terms.length)
        assert_equal(1, result.items[0].doc_id)
        assert_equal("rast@example.com", result.items[0].properties[0])
        assert_equal("rast-dev@example.com", result.items[0].properties[1])
        assert_equal("test mail", result.items[0].properties[2])
      end
    end

    # An MS Word fixture registered as "application/msword" yields both the
    # document text and the "title" / "author" properties.
    def invoke_test_ruby_mime_filter_with_multiple_stage
      properties = [
        string_property_option("title"),
        string_property_option("author"),
      ]
      path = filter_fixture_path("msword.doc")
      db_name = create_filter_test_db("utf8", properties)
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        # Word documents are binary; avoid text-mode translation.
        File.open(path, "rb") do |f|
          doc = db.create_document
          chain = FilterChain.new(doc)
          brigade = brigade_with_eos(FileBucket.new(f))
          chain.invoke(brigade, "application/msword")
          doc.commit
        end
      end
      LocalDB.open(db_name, Rast::DB::RDONLY) do |db|
        text = db.get_text(1)
        assert_match(/\sMicrosoft Word テスト\s/, text)
        assert_match(/\sこれはMS Word ファイルです\s/, text)
        assert_match(/\s1ページ目本文\s/, text)
        assert_match(/\s2ページ目本文\s/, text)
        result = db.search("word", {
                             "need_summary" => true,
                             "properties" => ["author", "title"]
                           })
        assert_equal("著者", result.items[0].properties.author)
        assert_equal("タイトル", result.items[0].properties.title)
        assert_equal(2, result.items[0].properties.length)
      end
    end

    # A Shift_JIS "message/rfc822" message is stored in an euc_jp database;
    # body text and the "subject" property must both round-trip.
    def invoke_test_ruby_mime_filter_with_charset_conversion
      db_name = create_filter_test_db("euc_jp",
                                      [string_property_option("subject")])
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        s = ""
        s.concat("Subject: 題名のエンコーディング変換テスト\n")
        s.concat("\n")
        s.concat("本文のエンコーディング変換テスト")
        doc = db.create_document
        chain = FilterChain.new(doc)
        # "-Ws": convert the UTF-8 source text to Shift_JIS before feeding it.
        brigade = brigade_with_eos(TransientBucket.new(NKF.nkf("-Ws", s)))
        chain.invoke(brigade, "message/rfc822; charset=Shift_JIS")
        doc.commit
      end
      LocalDB.open(db_name) do |db|
        # "-Ew": convert the EUC-JP database output back to UTF-8.
        assert_equal("本文のエンコーディング変換テスト",
                     NKF.nkf("-Ew", db.get_text(1)))
        result = db.search(NKF.nkf("-We", "本文"),
                           {"properties" => ["subject"]})
        assert_equal("題名のエンコーディング変換テスト",
                     NKF.nkf("-Ew", result.items[0].properties.subject))
      end
    end

    # Registers three documents through the "combine-lineend-hyphen" text
    # filter and checks the stored text of each.
    def invoke_test_c_text_filter
      db_name = create_filter_test_db("utf8")
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        s = ""
        s.concat(" Ruby-\n.")
        s.concat(" Pyth-\non.")
        s.concat(" -\nPerl.")
        register_with_text_filters(db, ["combine-lineend-hyphen"], s)
        register_with_text_filters(db, ["combine-lineend-hyphen"], "-Scheme.")
        register_with_text_filters(db, ["combine-lineend-hyphen"], "Haskell-")
      end
      LocalDB.open(db_name, Rast::DB::RDONLY) do |db|
        # NOTE(review): the whitespace in this expected value (spaces plus a
        # single newline) may have been altered in this copy of the file --
        # confirm against the real filter output.
        assert_equal(" Ruby- . Python. \n- Perl.", db.get_text(1))
        assert_equal("-Scheme.", db.get_text(2))
        assert_equal("Haskell-", db.get_text(3))
      end
    end

    # Registers mixed Japanese / English text through the
    # "combine-lineend-japanese" text filter and checks the stored text.
    def invoke_test_ruby_text_filter
      db_name = create_filter_test_db("utf8")
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        s = ""
        s.concat("本日は\n晴天なり。\n")
        s.concat("明日は\n雨天なり。\n")
        s.concat("This is\na pen.")
        register_with_text_filters(db, ["combine-lineend-japanese"], s)
      end
      LocalDB.open(db_name, Rast::DB::RDONLY) do |db|
        # NOTE(review): spaces here may originally have been newlines --
        # confirm against the real filter output.
        assert_equal("本日は晴天なり。明日は雨天なり。 This is a pen.",
                     db.get_text(1))
      end
    end

    # Constructing a chain with an unknown text filter name raises RastError.
    def invoke_test_no_such_text_filter
      db_name = create_filter_test_db("utf8")
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        doc = db.create_document
        assert_raise(RastError) do
          FilterChain.new(doc, ["no-such-text-filter"])
        end
      end
    end

    # Invoking with "charset=ISO-8859-1" on this Japanese input against an
    # euc_jp database raises RastError.
    def invoke_test_with_invalid_charset
      db_name = create_filter_test_db("euc_jp")
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        doc = db.create_document
        chain = FilterChain.new(doc)
        bucket = TransientBucket.new("不正エンコーディング名テスト")
        brigade = brigade_with_eos(bucket)
        assert_raise(RastError) do
          chain.invoke(brigade, "text/plain; charset=ISO-8859-1")
        end
      end
    end

    # Runs a chain configured with the "rast-test-gc-start-invoke" text
    # filter and checks the text is still stored intact.
    # NOTE(review): judging by the names, this guards against the filter
    # instance being garbage-collected during #invoke -- confirm.
    def invoke_test_protect_filter_instance
      db_name = create_filter_test_db("utf8")
      LocalDB.open(db_name, Rast::DB::RDWR) do |db|
        doc = db.create_document
        chain = FilterChain.new(doc, ["rast-test-gc-start-invoke"])
        brigade = brigade_with_eos(TransientBucket.new("test test"))
        chain.invoke(brigade, "text/plain; charset=UTF-8")
        doc.commit
      end
      LocalDB.open(db_name) do |db|
        assert_equal("test test", db.get_text(1))
      end
    end

    # Registers the string +s+ into +db+ as one "text/plain" document,
    # passing it through a FilterChain built with +text_filters+.
    def register_with_text_filters(db, text_filters, s)
      doc = db.create_document
      chain = FilterChain.new(doc, text_filters)
      brigade = brigade_with_eos(TransientBucket.new(s))
      chain.invoke(brigade, "text/plain")
      doc.commit
    end

    private

    # Creates a new local database with text preservation enabled and
    # returns its generated name.
    def create_filter_test_db(encoding, properties = [])
      create_options = {
        "encoding" => encoding,
        "preserve_text" => true,
        "properties" => properties,
      }
      db_name = generate_db_name
      LocalDB.create(db_name, create_options)
      db_name
    end

    # Builds the definition of a string property named +name+ with every
    # search-related flag and the unique flag switched off.
    def string_property_option(name)
      {
        "name" => name,
        "type" => Rast::PROPERTY_TYPE_STRING,
        "search" => false,
        "text_search" => false,
        "full_text_search" => false,
        "unique" => false,
      }
    end

    # Returns the path of the fixture +name+ under tests/data/filter/ja.
    def filter_fixture_path(name)
      File.join(MakeVariable::TOP_SRCDIR, "tests", "data", "filter", "ja",
                name)
    end

    # Returns a new Brigade containing +buckets+ in order, terminated by an
    # EOSBucket.
    def brigade_with_eos(*buckets)
      brigade = Brigade.new
      buckets.each { |bucket| brigade.insert_tail(bucket) }
      brigade.insert_tail(EOSBucket.new)
      brigade
    end
  end
end