# Copyright (C) 2005 Network Applied Communication Laboratory Co., Ltd.
#
# This file is part of Rast.
# See the file COPYING for redistribution information.
#

require "htree"
require File.join(File.dirname(__FILE__), "read-buckets-to-buffer")

# Filter module for "text/html" input.
#
# Parses the buffered document with HTree, publishes the title, <meta>
# name/content pairs and author information as properties on the filter,
# and passes the extracted body text downstream as "text/plain".
#
# @buffer is read here but never assigned in this class; presumably it is
# filled by ReadBucketsToBuffer before process_buffer is invoked (confirm
# against that module).
class TextHtml
  SUPPORTED_VERSION = 1
  MIME_TYPE = "text/html"
  ENCODINGS = ["UTF-8", "EUC-JP"]

  # Namespace prefix HTree uses for (X)HTML element names.
  XHTML_NS = "{http://www.w3.org/1999/xhtml}"

  include ReadBucketsToBuffer

  private

  # Extracts properties and plain text from the buffered HTML document.
  #
  # filter::    object providing db_encoding, set_property and pass.
  # mime_type:: unused here; kept for interface compatibility.
  #
  # Properties set on the filter (all converted to the DB encoding):
  # * "title"  - the document title, when present.
  # * <name>   - one property per <meta name=... content=...> element.
  # * "author" - from <link rev="made" href=...>, and, when an <address>
  #   element exists, the concatenation of every author source found.
  def process_buffer(filter, mime_type)
    db_encoding = filter.db_encoding
    space = Rast::EncodingConverter.convert_encoding("US-ASCII", db_encoding,
                                                     " ")
    # NOTE(review): the search string below is reproduced as found in the
    # file. If it was originally an entity such as "&nbsp;" that got decoded
    # somewhere along the way, restore it -- confirm against upstream.
    buf = @buffer.gsub(" ", space)
    tree = HTree.parse(buf)
    input_encoding = get_input_encoding(tree)

    # Single place for the input -> DB encoding conversion used below.
    convert = lambda do |text|
      Rast::EncodingConverter.convert_encoding(input_encoding, db_encoding,
                                               text.to_s)
    end

    title = tree.title
    filter.set_property("title", convert.call(title)) if title

    # Accumulates every author hint found; only written back as the
    # "author" property when an <address> element is present (see below).
    author = ""

    tree.traverse_element("#{XHTML_NS}meta") do |elem|
      name = elem.get_attribute("name")
      content = elem.get_attribute("content")
      next unless name && content

      converted_name = convert.call(name)
      converted_content = convert.call(content)
      filter.set_property(converted_name, converted_content)
      if converted_name == "author"
        author.concat(converted_content)
        author.concat(" ")
      end
    end

    tree.traverse_element("#{XHTML_NS}link") do |elem|
      href = elem.get_attribute("href")
      rev = elem.get_attribute("rev")
      next unless href && rev && rev.to_s == "made"

      href_str = href.to_s
      if (match_data = /mailto:(.*)/.match(href_str))
        href_str = match_data[1]
      end
      # Convert like every other property so that the accumulated author
      # string never mixes input-encoded and DB-encoded bytes.
      href_str = convert.call(href_str)
      filter.set_property("author", href_str)
      author.concat(href_str)
      author.concat(" ")
    end

    # Fall back to the document root when there is no <body> element, so a
    # body-less fragment no longer fails with NoMethodError on nil.
    # NOTE(review): relies on the parsed document responding to
    # find_element/extract_text like an element does -- confirm with HTree.
    body = tree.find_element("#{XHTML_NS}body") || tree

    address = body.find_element("#{XHTML_NS}address")
    if address
      author.concat(convert.call(address.extract_text.to_s.strip))
      filter.set_property("author", author)
    end

    text = convert.call(body.extract_text)
    next_brigade = Rast::Brigade.new
    next_brigade.insert_tail(Rast::TransientBucket.new(text))
    next_brigade.insert_tail(Rast::EOSBucket.new)
    filter.pass(next_brigade, "text/plain")
  end

  # Determines the character encoding of the parsed document.
  #
  # Returns the charset declared by the first
  # <meta http-equiv="Content-Type" content="...; charset=..."> element
  # that carries one; otherwise asks Rast::EncodingConverter.guess to pick
  # from Rast::JAPANESE_ENCODINGS based on the raw buffer.
  def get_input_encoding(tree)
    tree.traverse_element("#{XHTML_NS}meta") do |elem|
      http_equiv = elem.get_attribute("http-equiv").to_s.downcase
      next unless http_equiv == "content-type"

      content = elem.get_attribute("content")
      # Whitespace after ";" is optional: "text/html;charset=UTF-8" is as
      # common as "text/html; charset=UTF-8".
      if (match_data = /;\s*charset=([^;]+)/i.match(content.to_s))
        return match_data[1].strip
      end
    end
    Rast::EncodingConverter.guess(@buffer, Rast::JAPANESE_ENCODINGS)
  end
end