# Copyright (C) 2005 Network Applied Communication Laboratory Co., Ltd.
#
# This file is part of Rast.
# See the file COPYING for redistribution information.
#
# -*- mode: Ruby; coding: euc-japan; -*-

require "test/unit"
require "rast_test"

module Rast
  class Encoding
    # Exercises the encoding object returned by Encoding["euc_jp"]:
    # its two tokenizers (register_tokenize / search_tokenize) and its
    # two normalizers (normalize_text / normalize_chars).
    #
    # Both tokenizers yield (ngram, pos, complete) to their block; the
    # tests below compare the full sequence of yielded triples.
    class EucJpTest < Test::Unit::TestCase
      # Looks up the encoding under test before every test method.
      def setup
        @encoding = Encoding["euc_jp"]
      end

      # register_tokenize must yield exactly the listed triples, in order.
      #
      # Each expectation compares the whole token list at once, so a
      # tokenizer that yields too few or too many tokens produces an
      # ordinary assertion failure with a full diff rather than a
      # NoMethodError from indexing into a missing (nil) entry.
      def test_register_tokenize
        # ASCII text containing spaces and a trailing period.
        assert_equal([
                       ["rub", 0, true],
                       ["uby", 1, true],
                       ["by ", 2, true],
                       ["y i", 3, true],
                       [" is", 4, true],
                       ["is ", 5, true],
                       ["s g", 6, true],
                       [" gr", 7, true],
                       ["gre", 8, true],
                       ["rea", 9, true],
                       ["eat", 10, true],
                       ["at", 11, true],
                       ["t.", 12, true],
                       [".", 13, false],
                     ],
                     collect_tokens(:register_tokenize, "ruby is great."))

        # Katakana only.
        assert_equal([
                       ["アイウ", 0, true],
                       ["イウエ", 1, true],
                       ["ウエオ", 2, true],
                       ["エオ", 3, false],
                       ["オ", 4, false],
                     ],
                     collect_tokens(:register_tokenize, "アイウエオ"))

        # Hiragana only.
        assert_equal([
                       ["あいう", 0, true],
                       ["いうえ", 1, true],
                       ["うえお", 2, true],
                       ["えお", 3, false],
                       ["お", 4, false],
                     ],
                     collect_tokens(:register_tokenize, "あいうえお"))

        # Kanji only: the expected n-grams are two characters long here.
        assert_equal([
                       ["日本", 0, true],
                       ["本語", 1, true],
                       ["語", 2, false],
                     ],
                     collect_tokens(:register_tokenize, "日本語"))

        # ASCII followed by hiragana.
        assert_equal([
                       ["Rub", 0, true],
                       ["uby", 1, true],
                       ["by", 2, true],
                       ["yで", 3, true],
                       ["ですよ", 4, true],
                       ["すよ", 5, false],
                       ["よ", 6, false],
                     ],
                     collect_tokens(:register_tokenize, "Rubyですよ"))

        # Boundary inputs: empty string and single characters.
        assert_equal([], collect_tokens(:register_tokenize, ""))
        assert_equal([["a", 0, false]],
                     collect_tokens(:register_tokenize, "a"))
        assert_equal([["あ", 0, false]],
                     collect_tokens(:register_tokenize, "あ"))
      end

      # search_tokenize on a three-character katakana word must yield one
      # complete token covering the whole word.
      #
      # Kept separate from test_register_tokenize so that a failure is
      # reported against the tokenizer that actually broke.
      def test_search_tokenize
        assert_equal([["コート", 0, true]],
                     collect_tokens(:search_tokenize, "コート"))
      end

      # normalize_text must map each input to the listed expected string.
      def test_normalize_text
        # Each row is [expected, input].
        #
        # NOTE(review): apart from the whitespace rows, every input below is
        # identical to its expected output as the file reads today.  If the
        # fixtures were meant to use full-width or half-width forms, confirm
        # that the literals survived the file's encoding conversion.
        cases = [
          # Whitespace handling (spaces, newlines, tabs, carriage returns).
          [" abc ", " abc "],
          [" abc abc", " abc\nabc"],
          ["a b c d e ", "a\n \t b\nc\r\rd \ne "],
          # Digits and Latin capitals.
          ["012ABC", "012ABC"],
          # Plain katakana.
          ["アイウ", "アイウ"],
          ["アイウアイウ", "アイウアイウ"],
          ["タチツテト", "タチツテト"],
          ["サシスセソタチツテト", "サシスセソタチツテト"],
          # Voiced and semi-voiced katakana.
          ["ガギグゲゴ", "ガギグゲゴ"],
          ["ダヂヅデド", "ダヂヅデド"],
          ["パピプペポ", "パピプペポ"],
          ["ヴ", "ヴ"],
        ]

        cases.each do |expected, input|
          assert_equal(expected, @encoding.normalize_text(input),
                       "normalize_text(#{input.inspect})")
        end
      end

      # normalize_chars must turn "ABC" into "abc".
      def test_normalize_chars
        assert_equal("abc", @encoding.normalize_chars("ABC"))
      end

      private

      # Calls the tokenizer method named +tokenizer+ on @encoding with
      # +text+ and returns every yielded [ngram, pos, complete] triple as
      # an array, in the order they were yielded.
      def collect_tokens(tokenizer, text)
        tokens = []
        @encoding.__send__(tokenizer, text) do |ngram, pos, complete|
          tokens.push([ngram, pos, complete])
        end
        tokens
      end
    end
  end
end