# Copyright (C) 2005 Network Applied Communication Laboratory Co., Ltd.
#
# This file is part of Rast.
# See the file COPYING for redistribution information.
#
# -*- mode: Ruby; coding: euc-japan; -*-

require "test/unit"
require "rast_test"

module Rast
  class Encoding
    # Tests for the encoding object returned by Encoding["utf8"].
    #
    # Covered methods:
    # * register_tokenize / search_tokenize -- each yields
    #   (ngram, pos, complete) triples to its block.  The expectations below
    #   show register_tokenize also yielding the shorter trailing tokens of
    #   the input, while search_tokenize omits them unless the whole input
    #   is a single character.
    # * normalize_text / normalize_chars -- each returns a normalized copy
    #   of the given string.
    #
    # Token expectations are compared as complete lists.  Checking
    # result[i][j] field by field raises NoMethodError on nil as soon as the
    # tokenizer yields fewer tokens than expected, which hides the actual
    # mismatch; a whole-list assert_equal reports a normal failure that
    # shows every token that was produced.
    class UTF8Test < Test::Unit::TestCase
      def setup
        @encoding = Encoding["utf8"]
      end

      def test_register_tokenize
        assert_tokens(:register_tokenize, "ruby is great.",
                      [["rub", 0, true],
                       ["uby", 1, true],
                       ["by ", 2, true],
                       ["y i", 3, true],
                       [" is", 4, true],
                       ["is ", 5, true],
                       ["s g", 6, true],
                       [" gr", 7, true],
                       ["gre", 8, true],
                       ["rea", 9, true],
                       ["eat", 10, true],
                       ["at", 11, true],
                       ["t.", 12, true],
                       [".", 13, false]])

        assert_tokens(:register_tokenize, "アイウエオ",
                      [["アイウ", 0, true],
                       ["イウエ", 1, true],
                       ["ウエオ", 2, true],
                       ["エオ", 3, false],
                       ["オ", 4, false]])

        assert_tokens(:register_tokenize, "あいうえお",
                      [["あいう", 0, true],
                       ["いうえ", 1, true],
                       ["うえお", 2, true],
                       ["えお", 3, false],
                       ["お", 4, false]])

        assert_tokens(:register_tokenize, "日本語",
                      [["日本", 0, true],
                       ["本語", 1, true],
                       ["語", 2, false]])

        # Mixed Latin / kana input.
        assert_tokens(:register_tokenize, "Rubyですよ",
                      [["Rub", 0, true],
                       ["uby", 1, true],
                       ["by", 2, true],
                       ["yで", 3, true],
                       ["ですよ", 4, true],
                       ["すよ", 5, false],
                       ["よ", 6, false]])

        # Empty input yields nothing.
        assert_tokens(:register_tokenize, "", [])

        # Single-character inputs yield one incomplete token.
        assert_tokens(:register_tokenize, "a", [["a", 0, false]])
        assert_tokens(:register_tokenize, "あ", [["あ", 0, false]])

        # NOTE(review): the expected positions run from 0 to 8 although the
        # input literal as written here is only 6 characters long; runs of
        # spaces in these literals may have been collapsed at some point.
        # Verify against the upstream copy of this file.
        assert_tokens(:register_tokenize, "DE F g",
                      [["DE ", 0, true],
                       ["E ", 1, true],
                       [" ", 2, true],
                       [" ", 3, true],
                       [" F", 4, true],
                       [" F ", 5, true],
                       ["F g", 6, true],
                       [" g", 7, false],
                       ["g", 8, false]])

        # Repeated identical n-grams are reported once per position.
        assert_tokens(:register_tokenize, "100000",
                      [["100", 0, true],
                       ["000", 1, true],
                       ["000", 2, true],
                       ["000", 3, true],
                       ["00", 4, false],
                       ["0", 5, false]])

        # An embedded NUL byte is tokenized like any other character.
        assert_tokens(:register_tokenize, "abc\0def",
                      [["abc", 0, true],
                       ["bc", 1, true],
                       ["c\0", 2, true],
                       ["\0d", 3, true],
                       ["def", 4, true],
                       ["ef", 5, false],
                       ["f", 6, false]])

        # Inputs ending in \xE3, the first byte of a three-byte UTF-8
        # sequence with its continuation bytes missing.
        assert_tokens(:register_tokenize, "\xE3", [["\xE3", 0, false]])

        assert_tokens(:register_tokenize, "\xE3\x81\x82\xE3",
                      [["\xE3\x81\x82\xE3", 0, true],
                       ["\xE3", 1, false]])

        assert_tokens(:register_tokenize, "\xE3\x81\x82\xE3\x81\x82\xE3",
                      [["\xE3\x81\x82\xE3\x81\x82", 0, true],
                       ["\xE3\x81\x82\xE3", 1, true],
                       ["\xE3", 2, false]])
      end

      def test_search_tokenize
        assert_tokens(:search_tokenize, "ruby is great.",
                      [["rub", 0, true],
                       ["uby", 1, true],
                       ["by ", 2, true],
                       ["y i", 3, true],
                       [" is", 4, true],
                       ["is ", 5, true],
                       ["s g", 6, true],
                       [" gr", 7, true],
                       ["gre", 8, true],
                       ["rea", 9, true],
                       ["eat", 10, true],
                       ["at", 11, true],
                       ["t.", 12, true]])

        assert_tokens(:search_tokenize, "アイウエオ",
                      [["アイウ", 0, true],
                       ["イウエ", 1, true],
                       ["ウエオ", 2, true]])

        assert_tokens(:search_tokenize, "あいうえお",
                      [["あいう", 0, true],
                       ["いうえ", 1, true],
                       ["うえお", 2, true]])

        assert_tokens(:search_tokenize, "日本語",
                      [["日本", 0, true],
                       ["本語", 1, true]])

        # Mixed Latin / kana input.
        assert_tokens(:search_tokenize, "Rubyですよ",
                      [["Rub", 0, true],
                       ["uby", 1, true],
                       ["by", 2, true],
                       ["yで", 3, true],
                       ["ですよ", 4, true]])

        # Empty input yields nothing.
        assert_tokens(:search_tokenize, "", [])

        # A single-character query is still reported, flagged incomplete.
        assert_tokens(:search_tokenize, "a", [["a", 0, false]])
        assert_tokens(:search_tokenize, "あ", [["あ", 0, false]])

        # NOTE(review): the expected positions run from 0 to 6 although the
        # input literal as written here is only 6 characters long; runs of
        # spaces in these literals may have been collapsed at some point.
        # Verify against the upstream copy of this file.
        assert_tokens(:search_tokenize, "DE F g",
                      [["DE ", 0, true],
                       ["E ", 1, true],
                       [" ", 2, true],
                       [" ", 3, true],
                       [" F", 4, true],
                       [" F ", 5, true],
                       ["F g", 6, true]])

        assert_tokens(:search_tokenize, "100000",
                      [["100", 0, true],
                       ["000", 1, true],
                       ["000", 2, true],
                       ["000", 3, true]])

        assert_tokens(:search_tokenize, "コート", [["コート", 0, true]])
      end

      def test_normalize_text
        # Whitespace handling.
        assert_equal(" abc ", @encoding.normalize_text(" abc "))
        assert_equal(" abc abc", @encoding.normalize_text(" abc\nabc"))
        assert_equal("a b c d e ",
                     @encoding.normalize_text("a\n \t b\nc\r\rd \ne "))

        # Katakana, including voiced and semi-voiced forms.
        assert_equal("アイウ", @encoding.normalize_text("アイウ"))
        assert_equal("アイウアイウ", @encoding.normalize_text("アイウアイウ"))
        assert_equal("タチツテト", @encoding.normalize_text("タチツテト"))
        assert_equal("サシスセソタチツテト",
                     @encoding.normalize_text("サシスセソタチツテト"))
        assert_equal("ガギグゲゴ", @encoding.normalize_text("ガギグゲゴ"))
        assert_equal("ダヂヅデド", @encoding.normalize_text("ダヂヅデド"))
        assert_equal("パピプペポ", @encoding.normalize_text("パピプペポ"))
        assert_equal("ヴ", @encoding.normalize_text("ヴ"))
        assert_equal("バヴパ", @encoding.normalize_text("バヴパ"))

        # Letters and symbols.
        assert_equal(" ABC+&<>", @encoding.normalize_text(" ABC+&<>"))

        # Input made up only of whitespace is expected to become one space.
        assert_equal(" ", @encoding.normalize_text("   \n\r \t  "))
      end

      def test_normalize_chars
        assert_equal("abc", @encoding.normalize_chars("ABC"))

        # A two-byte sequence is mapped to another two-byte sequence.
        assert_equal("\xC3\xA1", @encoding.normalize_chars("\xC3\x81"))

        # includes copyright sign: \xC2\xA9
        assert_equal("ruby \xC2\xA9", @encoding.normalize_chars("Ruby \xC2\xA9"))

        # TODO: disabled case -- the input includes an invalid UTF-8
        # character: \xA9
        #   s = @encoding.normalize_chars("Ruby \xA9")
        #   assert_equal("ruby \xA9", s)
      end

      private

      # Calls the tokenizer method named +tokenizer+ on @encoding with +text+
      # and returns every yielded (ngram, pos, complete) triple, in the order
      # yielded, as an array of three-element arrays.
      def collect_tokens(tokenizer, text)
        tokens = []
        @encoding.__send__(tokenizer, text) do |ngram, pos, complete|
          tokens.push([ngram, pos, complete])
        end
        tokens
      end

      # Asserts that tokenizing +text+ with +tokenizer+ yields exactly the
      # triples in +expected+: same values, same order, same count.
      def assert_tokens(tokenizer, text, expected)
        assert_equal(expected, collect_tokens(tokenizer, text))
      end
    end
  end
end