# Copyright (C) 2005 Network Applied Communication Laboratory Co., Ltd.
#
# This file is part of Rast.
# See the file COPYING for redistribution information.
#

require "open3"
require File.join(File.dirname(__FILE__), "read-buckets-to-file")

# Filter module for "application/pdf" documents.
#
# Text is obtained by running the external `pdftotext` command on the file
# and is passed on to the filter chain as "text/plain", one bucket per
# form-feed-delimited chunk of output.  The title, subject and author lines
# of the `pdfinfo` output are registered as properties on the filter.
class ApplicationPDF
  SUPPORTED_VERSION = 1
  MIME_TYPE = "application/pdf"

  include ReadBucketsToFile

  private

  # Converts the PDF at +path+ to text and feeds it to +filter+.
  #
  # filter::    object providing +db_encoding+, +pass+ and +set_property+.
  # mime_type:: unused here; kept for interface compatibility.
  # path::      location of the PDF file on disk.
  #
  # Raises Rast::RastError when `pdftotext` exits unsuccessfully and no text
  # at all was passed on.
  def process_file(filter, mime_type, path)
    db_encoding = filter.db_encoding
    input_encoding = nil
    passed_nbytes = 0

    # Converts one chunk of text to the database encoding and passes it
    # down the chain.  The input encoding is guessed from the first
    # non-empty chunk and reused afterwards.
    emit_page = lambda do |text|
      next if text.empty?
      input_encoding ||=
        Rast::EncodingConverter.guess(text, Rast::JAPANESE_ENCODINGS)
      converted = Rast::EncodingConverter.convert_encoding(input_encoding,
                                                           db_encoding, text)
      brigade = Rast::Brigade.new
      brigade.insert_tail(Rast::TransientBucket.new(converted))
      filter.pass(brigade, "text/plain")
      passed_nbytes += converted.length
    end

    # The command is given as an argument vector so that no shell is
    # involved and paths containing spaces or metacharacters are safe.
    IO.popen(["pdftotext", path.to_s, "-"]) do |io|
      page = ""
      while (line = io.gets)
        fragments = line.split("\f", -1)
        if fragments.length == 1
          # No form feed on this line: keep accumulating the current page.
          page.concat(line.strip)
          next
        end

        # Everything before each form feed completes a page; whatever
        # follows the last form feed starts the next one.  The remainder is
        # kept even when the page that just ended was empty.
        pending = fragments.pop
        fragments.each do |fragment|
          emit_page.call(page.concat(fragment))
          page = ""
        end
        page = pending
      end
      # Do not lose text that was not terminated by a form feed.
      emit_page.call(page) unless page.strip.empty?
    end
    if !$?.success? && passed_nbytes == 0
      raise Rast::RastError, "failed to convert file"
    end

    Open3.popen3("pdfinfo", path.to_s) do |stdin, stdout, _stderr|
      stdin.close
      info = stdout.read
      %w[title subject author].each do |name|
        # [ \t]* rather than \s* so that an empty field cannot swallow the
        # newline and capture the following line as its value.
        match_data = /^#{name}:[ \t]*(.*)$/i.match(info)
        next unless match_data

        # NOTE(review): when no page text was produced the encoding has not
        # been guessed yet; fall back to guessing from the pdfinfo output
        # instead of handing nil to convert_encoding -- confirm this matches
        # the intended behaviour of Rast::EncodingConverter.
        input_encoding ||=
          Rast::EncodingConverter.guess(info, Rast::JAPANESE_ENCODINGS)
        value = Rast::EncodingConverter.convert_encoding(input_encoding,
                                                         db_encoding,
                                                         match_data[1])
        filter.set_property(name, value)
      end
    end

    # Signal end of stream to the rest of the filter chain.
    next_brigade = Rast::Brigade.new
    next_brigade.insert_tail(Rast::EOSBucket.new)
    filter.pass(next_brigade, "text/plain")
  end
end