/* * Copyright (C) 2005 Network Applied Communication Laboratory Co., Ltd. * * This file is part of Rast. * See the file COPYING for redistribution information. * */ #ifndef RAST_ENCODING_H #define RAST_ENCODING_H /** * @file encoding.h encoding */ #include #include "rast/rast.h" #include "rast/error.h" #include "rast/macros.h" RAST_EXTERN_C_BEGIN /** * @defgroup encoding encoding * @{ */ typedef struct rast_encoding_module_t rast_encoding_module_t; /** A structure that represents a character */ typedef struct { rast_encoding_module_t *encoding_module; const char *ptr; int nbytes; } rast_char_t; /** A structure that represents a token */ typedef struct { const char *ptr; int nbytes; int nchars; rast_pos_t pos; int is_complete; } rast_token_t; /** A structure that represents a tokenizer */ typedef struct { rast_encoding_module_t *encoding_module; apr_pool_t *pool; const unsigned char *ptr; const unsigned char *ptr_end; rast_pos_t pos; void *context; } rast_tokenizer_t; /** A structure that represents an encoding module definition */ struct rast_encoding_module_t { const char *encoding; /** * Return the length of the current character. * @param tokenizer The tokenizer. * @param len The length of the current character in bytes * @return RAST_OK if succeeded, error otherwise */ rast_error_t *(*get_char_len)(rast_tokenizer_t *tokenizer, rast_size_t *len); /** * Return the current token. * @param tokenizer The tokenizer. * @param token The current token * @return RAST_OK if succeeded, error otherwise */ rast_error_t *(*get_token)(rast_tokenizer_t *tokenizer, rast_token_t *token); /** * Return the offset to the next token. * @param tokenizer The tokenizer. * @param byte_offset The byte offset to the next token * @param char_offset The character offset to the next token * @return RAST_OK if succeeded, error otherwise */ rast_error_t *(*get_next_offset)(rast_tokenizer_t *tokenizer, rast_size_t *byte_offset, rast_size_t *char_offset); /** * Normalize text. This function may change number of characters. * @param pool The pool to allocate the memory out of * @param src The source string * @param src_len The length of the source string * @param dst The destination string * @param dst_len The length of the destination string */ void (*normalize_text)(apr_pool_t *pool, const char *src, rast_size_t src_len, char **dst, rast_size_t *dst_len); /** * Normalize each character in src. This function should not change * number of characters. * @param pool The pool to allocate the memory out of * @param src The source string * @param src_len The length of the source string * @param dst The destination string * @param dst_len The length of the destination string */ void (*normalize_chars)(apr_pool_t *pool, const char *src, rast_size_t src_len, char **dst, rast_size_t *dst_len); /** * Check whether a character is a space character or not. * @param ch The character to check * @return 1 if ch is a space character, 0 otherwise */ int (*is_space)(rast_char_t *ch); }; /** * Load encoding modules. * @param dirname The name of the directory where encoding modules are located. * @return RAST_OK if succeeded, error otherwise */ rast_error_t *rast_load_encoding_modules(const char *dirname); /** * Unload encoding modules. * @return RAST_OK if succeeded, error otherwise */ rast_error_t *rast_unload_encoding_modules(); /** * Get an encoding module. * @param name The name of the encoding module * @param module The encoding module * @return RAST_OK if succeeded, error otherwise */ rast_error_t *rast_get_encoding_module(const char *name, rast_encoding_module_t **module); /** * Normalize the text by the specified encoding_module. This function calls * rast_encoding_module_t::normalize_text at first, then calls * rast_encoding_module_t::normalize_chars. * @param encoding_module The encoding module for normalizing. * @param s The source string * @param nbytes The length of the source string in bytes. * @param new_nbytes The length of the normalized string in bytes. * @param pool The pool to allocate the memory out of * @return The normalized string */ char *rast_normalize_text(rast_encoding_module_t *encoding_module, const char *s, rast_size_t nbytes, rast_size_t *new_nbytes, apr_pool_t *pool); rast_tokenizer_t *rast_char_tokenizer_create(apr_pool_t *pool, rast_encoding_module_t *, const char *s, rast_size_t nbytes); rast_error_t *rast_char_tokenizer_next(rast_tokenizer_t *tokenizer); rast_error_t *rast_char_tokenizer_get_current(rast_tokenizer_t *tokenizer, rast_char_t *ch); int rast_char_tokenizer_is_done(rast_tokenizer_t *tokenizer); rast_tokenizer_t *rast_register_tokenizer_create(apr_pool_t *pool, rast_encoding_module_t *, const char *s, rast_size_t nbytes); rast_error_t *rast_register_tokenizer_next(rast_tokenizer_t *tokenizer); rast_error_t *rast_register_tokenizer_get_current(rast_tokenizer_t *tokenizer, rast_token_t *token); int rast_register_tokenizer_is_done(rast_tokenizer_t *tokenizer); rast_tokenizer_t *rast_search_tokenizer_create(apr_pool_t *pool, rast_encoding_module_t *, const char *s, rast_size_t nbytes); rast_error_t *rast_search_tokenizer_next(rast_tokenizer_t *tokenizer); rast_error_t *rast_search_tokenizer_get_current(rast_tokenizer_t *tokenizer, rast_token_t *token); int rast_search_tokenizer_is_done(rast_tokenizer_t *tokenizer); int rast_count_chars(rast_encoding_module_t *encoding_module, const char *s, rast_size_t nbytes, apr_pool_t *pool); int rast_char_is_space(rast_char_t *ch); /** @} */ RAST_EXTERN_C_END #endif /* RAST_ENCODING_H */ /* vim: set filetype=c sw=4 expandtab : */