X-Git-Url: https://git.hcoop.net/hcoop/debian/courier-authlib.git/blobdiff_plain/01037b081eab5fb3f208489dc3e052ec3a2c8ba1..1420868b3e321353480efbb7eb35e1e8d9943223:/libs/unicode/unicode.h diff --git a/libs/unicode/unicode.h b/libs/unicode/unicode.h deleted file mode 100644 index 4e018b7..0000000 --- a/libs/unicode/unicode.h +++ /dev/null @@ -1,1692 +0,0 @@ -#ifndef unicode_h -#define unicode_h - -/* -** Copyright 2000-2011 Double Precision, Inc. -** See COPYING for distribution information. -** -*/ - -#ifdef __cplusplus - -#include -#include -#include - -extern "C" { -#endif - -#if 0 -} -#endif - -#include "unicode/unicode_config.h" - -#include - -#include -#if HAVE_WCHAR_H -#include -#endif - -#if HAVE_STDDEF_H -#include -#endif -#include - -#include - -typedef uint32_t unicode_char; - -/* -** The system default character set, from the locale. -*/ - -extern const char *unicode_default_chset(); - -/* Unicode upper/lower/title case conversion functions */ - -extern unicode_char unicode_uc(unicode_char); -extern unicode_char unicode_lc(unicode_char); -extern unicode_char unicode_tc(unicode_char); - -/* -** Look up HTML 4.0/XHTML entity. -** -** n="amp", etc... -** -** Returns the unicode entity value, or 0 if no such entity is defined. -*/ - -unicode_char unicode_html40ent_lookup(const char *n); - -/* -** -** Return "width" of unicode character. -** -** This is defined as follows: for characters having the F or W property in -** tr11 (EastAsianWidth), unicode_wcwidth() returns 2. -** -** Otherwise, characters having the BK, CR, LF, CM, NL, WJ, and ZW line -** breaking property as per tr14, unicode_wcwdith() returns 0. For all other -** cases, 1. -** -** This provides a rough estimate of the "width" of the character if its -** shown on a text console. -*/ - -extern int unicode_wcwidth(unicode_char c); -extern size_t unicode_wcwidth_str(const unicode_char *c); - -/* -** The unicode-ish isspace() -*/ -extern int unicode_isspace(unicode_char ch); - -/* Internal unicode table lookup function */ - -extern uint8_t unicode_tab_lookup(unicode_char ch, - const size_t *unicode_indextab, - size_t unicode_indextab_sizeof, - const uint8_t (*unicode_rangetab)[2], - const uint8_t *unicode_classtab, - uint8_t uclass); - -/* -** Implementation of grapheme cluster boundary rules, as per tr29, -** including GB9a and GB9b. -** -** Returns non-zero if there's a grapheme break between the two referenced -** characters. -*/ - -int unicode_grapheme_break(unicode_char a, unicode_char b); - -/* -** Implementation of line break rules, as per tr14. -** -** Invoke unicode_lb_init() to initialize the linebreaking algorithm. The -** first parameter is a callback function that gets invoked with two -** arguments: UNICODE_LB_{MANDATORY|NONE|ALLOWED}, and a passthrough argument. -** The second parameter to unicode_lb_init() is the opaque passthrough -** pointer, that is passed as the second argument to the callback function -** with no further interpretation. -** -** unicode_lb_init() returns an opaque handle. Invoke unicode_lb_next(), -** passing the handle and one unicode character. Repeatedly invoke -** unicode_lb_next() to specify the input string for the linebreaking -** algorithm, then invoke unicode_lb_end() to finish calculating the -** linebreaking algorithm, and deallocate the opaque linebreaking handle. -** -** The callback function gets invoked once for each invocation of -** unicode_lb_next(). The contract is that before unicode_lb_end() returns, -** the callback function will get invoked the exact number of times that -** unicode_lb_next(), as long as each invocation of the callback function -** returned 0; nothing more, nothing less. The first parameter to the callback -** function will be one of the following values: -** -** UNICODE_LB_MANDATORY - a linebreak is MANDATORY before the corresponding -** character. -** UNICODE_LB_NONE - a linebreak is PROHIBITED before the corresponding -** character. -** UNICODE_LB_ALLOWED - a linebreak is OPTIONAL before the corresponding -** character (the preceding character is a space, or an equivalent). -** -** The callback function should return 0. A non-zero value indicates an -** error, which gets propagated up to the caller. The contract that the -** callback function gets invoked the same number of times that -** unicode_lb_next() gets invoked is now broken. -*/ - -#define UNICODE_LB_MANDATORY -1 -#define UNICODE_LB_NONE 0 -#define UNICODE_LB_ALLOWED 1 - -struct unicode_lb_info; - -typedef struct unicode_lb_info *unicode_lb_info_t; - -/* -** Allocate a linebreaking handle. -*/ -extern unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *), - void *cb_arg); - -/* -** Feed the next character through the linebreaking algorithm. -** A non-zero return code indicates that the callback function was invoked -** and it returned a non-zero return code (which is propagated as a return -** value). unicode_lb_end() must still be invoked, in this case. -** -** A zero return code indicates that if the callback function was invoked, -** it returned 0. -*/ - -extern int unicode_lb_next(unicode_lb_info_t i, unicode_char ch); - -/* -** Convenience function that invokes unicode_lb_next() with a list of -** unicode chars. Returns 0 if all invocations of unicode_lb_next() returned -** 0, or the first non-zero return value from unicode_lb_next(). -*/ - -extern int unicode_lb_next_cnt(unicode_lb_info_t i, - const unicode_char *chars, - size_t cnt); - -/* -** Finish the linebreaking algorithm. -** -** A non-zero return code indicates that the callback function was invoked -** and it returned a non-zero return code (which is propagated as a return -** value). -** -** A zero return code indicates that if the callback function was invoked, -** it returned 0, and that the callback function was invoked exactly the same -** number of times that unicode_lb_next() was invoked. -** -** In all case, the linebreak handle will no longer be valid when this -** function returns. -*/ - -extern int unicode_lb_end(unicode_lb_info_t i); - -/* -** An alternative linebreak API where the callback function receives the -** original unicode character in addition to its linebreak value. -** -** User unicode_lbc_init(), unicode_lbc_next(), and unicode_lbc_end(), whose -** semantics are the same as their _lb_ counterparts. -*/ - -struct unicode_lbc_info; - -typedef struct unicode_lbc_info *unicode_lbc_info_t; - -extern unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, - void *), - void *cb_arg); -extern int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch); -extern int unicode_lbc_end(unicode_lbc_info_t i); - -/* -** Set linebreaking options. -** -** OPTIONS SUBJECT TO CHANGE. -*/ - -extern void unicode_lb_set_opts(unicode_lb_info_t i, int opts); - -extern void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts); - -/* -** Tailorization of LB24: Prevent pluses, as in "C++", from breaking. -** -** Adds the following to LB24: -** -** PR x PR -** -** AL x PR -** -** ID x PR -**/ -#define UNICODE_LB_OPT_PRBREAK 0x0001 - - -/* -** Tailored / breaking rules. -** -** Adds the following rule to LB13: -** -** SY x EX -** -** SY x AL -** -** SY x ID -** -** SP ÷ SY, which takes precedence over "x SY". -*/ -#define UNICODE_LB_OPT_SYBREAK 0x0002 - -/* -** Tailored / breaking rules. -** -** This reclassifies U+2013 and U+2014 as class WJ, prohibiting breaks before -** and after mdash and ndash. -*/ -#define UNICODE_LB_OPT_DASHWJ 0x0004 - -/* -** Implemention of word break rules, as per tr29. -** -** Invoke unicode_wb_init() to initialize the wordbreaking algorithm. The -** first parameter is a callback function that gets invoked with two -** arguments: an int flag, and a passthrough argument. The second parameter to -** unicode_wb_init() is the opaque passthrough pointer, that is passed as the -** second argument to the callback function with no further interpretation. -** -** unicode_wb_init() returns an opaque handle. Invoke unicode_wb_next(), -** passing the handle and one unicode character. Repeatedly invoke -** unicode_wb_next() to specify the input string for the wordbreaking -** algorithm, then invoke unicode_wb_end() to finish calculating the -** wordbreaking algorithm, and deallocate the opaque wordbreaking handle. -** -** The callback function gets invoked once for each invocation of -** unicode_wb_next(). The contract is that before unicode_wb_end() returns, -** the callback function will get invoked the exact number of times that -** unicode_wb_next(), as long as each invocation of the callback function -** returned 0; nothing more, nothing less. The first parameter to the callback -** function will be an int. A non-zero value indicates that there is a word -** break between this character and the preceding one. -** -** The callback function should return 0. A non-zero value indicates an -** error, which gets propagated up to the caller. The contract that the -** callback function gets invoked the same number of times that -** unicode_lb_next() gets invoked is now broken. -*/ - -struct unicode_wb_info; - -typedef struct unicode_wb_info *unicode_wb_info_t; - -/* -** Allocate a wordbreaking handle. -*/ -extern unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *), - void *cb_arg); - -/* -** Feed the next character through the wordbreaking algorithm. -** A non-zero return code indicates that the callback function was invoked -** and it returned a non-zero return code (which is propagated as a return -** value). unicode_wb_end() must still be invoked, in this case. -** -** A zero return code indicates that if the callback function was invoked, -** it returned 0. -*/ - -extern int unicode_wb_next(unicode_wb_info_t i, unicode_char ch); - -/* -** Convenience function that invokes unicode_wb_next() with a list of -** unicode chars. Returns 0 if all invocations of unicode_wb_next() returned -** 0, or the first non-zero return value from unicode_wb_next(). -*/ - -extern int unicode_wb_next_cnt(unicode_wb_info_t i, - const unicode_char *chars, - size_t cnt); - -/* -** Finish the wordbreaking algorithm. -** -** A non-zero return code indicates that the callback function was invoked -** and it returned a non-zero return code (which is propagated as a return -** value). -** -** A zero return code indicates that if the callback function was invoked, -** it returned 0, and that the callback function was invoked exactly the same -** number of times that unicode_wb_next() was invoked. -** -** In all case, the wordbreak handle will no longer be valid when this -** function returns. -*/ - -extern int unicode_wb_end(unicode_wb_info_t i); - -/* -** Search for a word boundary. -** -** Obtain a handle by calling unicode_wbscan_init(), then invoke -** unicode_wbscan_next() to provide a unicode stream, then invoke -** unicode_wbscan_end(). unicode_wbscan_end() returns the number of unicode -** characters from the beginning of the stream until the first word boundary. -** -** You may prematurely stop calling unicode_wbscan_next() once it returns a -** non-0 value, which means that there is sufficient context to compute the -** first word boundary, and all further calls to unicode_wbscan_next() will -** be internal no-ops. -*/ - -struct unicode_wbscan_info; - -typedef struct unicode_wbscan_info *unicode_wbscan_info_t; - -unicode_wbscan_info_t unicode_wbscan_init(); - -int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch); - -size_t unicode_wbscan_end(unicode_wbscan_info_t i); - -/* -** A buffer that holds unicode characters, and dynamically grows as needed. -*/ - -struct unicode_buf { - unicode_char *ptr; /* The unicode characters */ - size_t size, /* Buffer size */ - len, /* How many characters in ptr are initialized */ - max; /* Maximum size the buffer can grow to */ -}; - -/* -** Initialize a buffer. Constructor. -*/ - -void unicode_buf_init(/* Initialize this structure. ptr, size, len cleared */ - struct unicode_buf *p, - - /* - ** Maximum size the buffer can grow to. (size_t)-1 - ** means unlimited. - */ - size_t max); -/* -** Like unicode_buf_init, and initialize the new buffer with the contents of -** another buffer. The maximum size of the initialized buffer is exactly the -** number of characters in the existing buffer. This copies a buffer using -** the minimum amount of heap space. -*/ - -#define unicode_buf_init_copy(a,b) \ - do { \ - unicode_buf_init((a), unicode_buf_len(b)); \ - unicode_buf_append_buf((a),(b)); \ - } while (0) - -/* -** Deinitialize the buffer. Destructor. Frees memory. -*/ - -void unicode_buf_deinit(struct unicode_buf *p); - -/* -** Official way to access the characters in the unicode buffer. -*/ -#define unicode_buf_ptr(p) ((p)->ptr) - -/* -** Official way of obtaining the number of characters in the unicode buffer. -*/ -#define unicode_buf_len(p) ((p)->len) - -/* -** Remove all existing characters from an initialized buffer. Sets len to 0. -*/ - -#define unicode_buf_clear(p) ((p)->len=0) - -/* -** Append characters to the existing characters in the unicode buffer. -** The buffer grows, if needed. If the buffer would exceed its maximum size, -** the extra characters get truncated. -** -** Returns 0 if the characters were appended. -1 for a malloc failure. -*/ - -int unicode_buf_append(struct unicode_buf *p, /* The buffer */ - const unicode_char *uc, /* Characters to append */ - size_t l); /* How many of them */ - -/* -** Convert an iso-8859-1 char string and invoke unicode_buf_append(). -*/ - -void unicode_buf_append_char(struct unicode_buf *dst, - const char *str, - size_t cnt); - -/* -** Remove some portion of the unicode buffer -*/ - -void unicode_buf_remove(struct unicode_buf *p, /* The buffer */ - size_t pos, /* Offset in buffer */ - size_t cnt); /* How many to remove */ - -/* -** Append the contents of an existing buffer to another one. -*/ - -#define unicode_buf_append_buf(a,b) \ - unicode_buf_append((a), unicode_buf_ptr(b), unicode_buf_len(b)) - - -/* -** The equivalent of strcmp() for unicode buffers. -*/ - -int unicode_buf_cmp(const struct unicode_buf *a, - const struct unicode_buf *b); - -/* -** The equivalent of unicode_buf_cmp, except that the second buffer is an -** iso-8859-1 string. -*/ - -int unicode_buf_cmp_str(const struct unicode_buf *p, - const char *c, /* iso-8859-1 string */ - size_t cl); /* Number of chars in c */ - -/* -** A wrapper for iconv(3). This wrapper provides a different API for iconv(3). -** A handle gets created by libmail_u_convert_init(). -** libmail_u_convert_init() receives a pointer to the output function -** which receives converted character text. -** -** The output function receives a pointer to the converted character text, and -** the number of characters in the converted text. -** -** The character text to convert gets passed, repeatedly, to -** libmail_u_convert(). Each call to libmail_u_convert() results in -** the output function being invoked, zero or more times, with the converted -** text. Finally, libmail_u_convert_deinit() stops the conversion and -** deallocates the conversion handle. -** -** Internal buffering takes place. libmail_u_convert_deinit() may result -** in the output function being called one or more times, to receive the final -** part of the converted character stream. -** -** The output function should return 0. A non-0 value causes -** libmail_u_convert() and/or libmail_u_convert_deinit() returning -** non-0. -*/ - -struct libmail_u_convert_hdr; - -typedef struct libmail_u_convert_hdr *libmail_u_convert_handle_t; - -/* -** libmail_u_convert_init() returns a non-NULL handle for the requested -** conversion, or NULL if the requested conversion is not available. -*/ - -libmail_u_convert_handle_t -libmail_u_convert_init(/* Convert from this chset */ - const char *src_chset, - - /* Convert to this chset */ - const char *dst_chset, - - /* The output function */ - - int (*output_func)(const char *, size_t, void *), - - /* Passthrough arg */ - void *convert_arg); - -/* -** Repeatedly pass the character text to convert to libmail_u_convert(). -** -** Returns non-0 if the output function returned non-0, or 0 if all invocations -** of the output function returned 0. -*/ - -int libmail_u_convert(/* The conversion handle */ - libmail_u_convert_handle_t handle, - - /* Text to convert */ - const char *text, - - /* Number of bytes to convert */ - size_t cnt); - -/* -** Finish character set conversion. The handle gets deallocated. -** -** May still result in one or more invocations of the output function. -** Returns non-zero if any previous invocation of the output function returned -** non-zero (this includes any invocations of the output function resulting -** from this call, or prior libmail_u_convert() calls), or 0 if all -** invocations of the output function returned 0. -** -** If the errptr is not NULL, *errptr is set to non-zero if there were any -** conversion errors -- if there was any text that could not be converted to -** the destination character text. -*/ - -int libmail_u_convert_deinit(libmail_u_convert_handle_t handle, - int *errptr); - - -/* -** Specialization: save converted character text in a buffer. -** -** Implementation: call libmail_u_convert_tocbuf_init() instead of -** libmail_u_convert_init(), then call libmail_u_convert() and -** libmail_u_convert_deinit(), as usual. -** -** If libmail_u_convert_deinit() returns 0, *cbufptr_ret gets initialized to a -** malloc()ed buffer, and the number of converted characters, the size of the -** malloc()ed buffer, are placed into *csize_ret arguments, that were passed -** to libmail_u_convert_tou_init(). -** -** Note: if the converted string is an empty string, *cbufsize_ret is set to 0, -** but *cbufptr_ptr still gets initialized (to a dummy malloced buffer). -** -** The optional nullterminate places a trailing \0 character after the -** converted string (this is included in *cbufsize_ret). -*/ - -libmail_u_convert_handle_t -libmail_u_convert_tocbuf_init(/* Convert from this chset */ - const char *src_chset, - - /* Convert to this chset */ - const char *dst_chset, - - /* malloced buffer */ - char **cbufptr_ret, - - /* size of the malloced buffer */ - size_t *cbufsize_ret, - - /* null terminate the resulting string */ - int nullterminate - ); - - -/* -** Specialization: convert some character text to a unicode_char array. -** -** This is like libmail_u_convert_tocbuf_init(), but converts to a unicode_char -** array. -** -** The returned *ucsize_ret is initialized with the number of unicode_chars, -** rather than the byte count. -** -** In all other ways, this function behaves identically to -** libmail_u_convert_tocbuf_init(). -*/ - -libmail_u_convert_handle_t -libmail_u_convert_tou_init(/* Convert from this chset */ - const char *src_chset, - - /* malloc()ed buffer pointer, on exit. */ - unicode_char **ucptr_ret, - - /* size of the malloc()ed buffer, upon exit */ - size_t *ucsize_ret, - - /* If true, terminate with U+0x0000, for convenience */ - int nullterminate - ); - -/* -** Specialization: convert a unicode_char array to some character text. -** -** This is the opposite of libmail_u_convert_tou_init(). Call this to -** initialize the conversion handle, then use libmail_u_convert_uc() -** instead of libmail_u_convert. -*/ - -libmail_u_convert_handle_t -libmail_u_convert_fromu_init(/* Convert to this chset */ - const char *dst_chset, - - /* malloc()ed buffer pointer, on exit. */ - char **cbufptr_ret, - - /* size of the malloc()ed buffer, upon exit */ - size_t *cbufsize_ret, - - /* If true, terminate with U+0x0000, for convenience */ - int nullterminate - ); - -int libmail_u_convert_uc(/* The conversion handle */ - libmail_u_convert_handle_t handle, - - /* Text to convert */ - const unicode_char *text, - - /* Number of bytes to convert */ - size_t cnt); - -/* -** Initialize conversion to UTF-8. -** -** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the -** destination charset as UTF-8. -*/ - -libmail_u_convert_handle_t -libmail_u_convert_tocbuf_toutf8_init(const char *src_chset, - char **cbufptr_ret, - size_t *cbufsize_ret, - int nullterminate); - -/* -** Initialize conversion from UTF-8. -** -** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the -** source charset as UTF-8. -*/ - -libmail_u_convert_handle_t -libmail_u_convert_tocbuf_fromutf8_init(const char *dst_chset, - char **cbufptr_ret, - size_t *cbufsize_ret, - int nullterminate); - -/* -** Convert a character string to UTF-8. -** -** Returns a malloc-ed buffer holding the UTF-8 string, or NULL if an -** error occured. -*/ -char *libmail_u_convert_toutf8(/* Text to convert to UTF-8 */ - const char *text, - - /* Character set to convert to UTF-8 */ - const char *charset, - - /* - ** If non-NULL, and a non-NULL pointer is - ** returned, *error is set to non-zero if - ** a character conversion error has occured. - */ - int *error); - -/* -** Convert UTF-8 text to another character set. -** -** Returns a malloc-ed buffer holding the string converted to the specified -** character set, or NULL if an error occured. -*/ - -char *libmail_u_convert_fromutf8(/* A UTF-8 string */ - const char *text, - - /* - ** Convert the UTF-8 string to this character - ** set. - */ - - const char *charset, - - /* - ** If non-NULL, and a non-NULL pointer is - ** returned, *error is set to non-zero if - ** a character conversion error has occured. - */ - int *error); - -/* -** Convert one charset to another charset, placing the result in a malloc-ed -** buffer. -** -** Returns a malloc-ed buffer holding the string converted to the specified -** character set, or NULL if an error occured. -*/ - -char *libmail_u_convert_tobuf(/* A string to convert */ - const char *text, - - /* - ** String's charset. - */ - - const char *charset, - - /* - ** Destination charset - */ - const char *dstcharset, - - /* - ** If non-NULL, and a non-NULL pointer is - ** returned, *error is set to non-zero if - ** a character conversion error has occured. - */ - int *error); - -/* -** Convenience function: call libmail_u_convert_tou_init(), feed the -** character string through libmail_u_convert(), then call -** libmail_u_convert_deinit(). -** -** If this function returns 0, *uc and *ucsize is set to a malloced buffer+size -** holding the unicode char array. -*/ - -int libmail_u_convert_tou_tobuf(/* Character text to convert */ - const char *text, - - /* Number of characters */ - size_t text_l, - - /* text's charset */ - const char *charset, - - /* - ** If this function returns 0, this gets - ** initialized - */ - unicode_char **uc, - - /* - ** Size of the allocated buffer - */ - size_t *ucsize, - - /* - ** If not null and this function returns 0, - ** this is set to non-0 if there - ** was a conversion error (but the output - ** buffer gets still allocated and - ** initialized) - */ - int *err); - -/* -** Convenience function: call libmail_u_convert_fromu_init(), feed the -** unicode_array through libmail_u_convert_uc(), then call -** libmail_u_convert_deinit(). -** -** If this function returns 0, *uc and *ucsize is set to a malloced buffer+size -** holding the converted character string -*/ - -int libmail_u_convert_fromu_tobuf(/* Unicode array to convert to a char str */ - const unicode_char *utext, - - /* - ** Size of the unicode array. - ** If this is (size_t)-1, utext is a - ** 0-terminated array. - */ - size_t utext_l, - - /* - ** Convert the unicode array to this charset. - */ - const char *charset, - - /* - ** If libmail_u_convert_fromu_tobuf() - ** returns 0, this is initialized to a - ** malloced buffer with a 0-terminated - ** string is kept. - */ - char **c, - - /* - ** Size of the initialized array, including - ** the 0-terminator. - */ - size_t *csize, - - /* - ** If libmail_u_convert_fromu_tobuf() - ** returns 0 and this is not NULL, - ** *err is set to non-0 if there was a - ** conversion error to the requested - ** character set. - */ - int *err); - -/* -** Convenience function: convert a string in a given character set -** to/from uppercase, lowercase, or something else. -** -** This is done by calling libmail_u_convert_tou_tobuf() first, -** applying the title_func and char_func, then using -** libmail_u_convert_fromu_tobuf(). -** -** A NULL return indicates that the requested conversion cannot be performed. -*/ - -char *libmail_u_convert_tocase( /* String to convert */ - const char *str, - - /* String's character set */ - - const char *charset, - - /* - ** Conversion of the first character in - ** str: unicode_uc, unicode_lc, or unicode_tc: - */ - - unicode_char (*first_char_func)(unicode_char), - - /* - ** Conversion of the second and the remaining - ** character in str. If NULL, same as - ** first_char_func. - */ - unicode_char (*char_func)(unicode_char)); - - - -/* Either UCS-4BE or UCS-4LE, matching the native unicode_char endianness */ - -extern const char libmail_u_ucs4_native[]; - -/* Either UCS-2BE or UCS-2LE, matching the native unicode_char endianness */ - -extern const char libmail_u_ucs2_native[]; - -/* -** Modified-UTF7 encoding used for IMAP folder names. Pass it for a charset -** parameter. -** -** This can be followed by a " " and up to 15 characters to be escaped in -** addition to unicode chars. -*/ - -#define unicode_x_imap_modutf7 "x-imap-modutf7" - -#if 0 -{ -#endif - -#ifdef __cplusplus -} - -extern size_t unicode_wcwidth(const std::vector &uc); - -namespace mail { - - /* - ** Interface to iconv. - ** - ** Subclass converted(). Invoke begin(), then operator(), repeatedly, - ** then end(). - ** - ** converted() receives the converted text. - */ - - class iconvert { - - libmail_u_convert_handle_t handle; - - public: - iconvert(); - virtual ~iconvert(); - - /* Start conversion. - ** Returns false if the requested conversion cannot be done. - **/ - - bool begin(/* Convert from */ - const std::string &src_chset, - - /* Convert to */ - const std::string &dst_chset); - - /* Feed iconv(3). Returns false if the conversion was aborted. - */ - - bool operator()(const char *, size_t); - - bool operator()(const unicode_char *, size_t); - - /* - ** Get the results here. If the subclass returns a non-0 - ** value, the conversion is aborted. - */ - - virtual int converted(const char *, size_t); - - /* - ** End of conversion. - ** - ** Returns true if all calls to converted() returned 0, - ** false if the conversion was aborted. - ** - ** errflag is set to true if there was a character that could - ** not be converted, and passed to converted(). - */ - - bool end(bool &errflag) - { - return end(&errflag); - } - - bool end() - { - return end(NULL); - } - - /* Convert between two different charsets */ - - static std::string convert(const std::string &text, - const std::string &charset, - const std::string &dstcharset, - bool &errflag); - - /* Convert between two different charsets */ - - static std::string convert(const std::string &text, - const std::string &charset, - const std::string &dstcharset) - { - bool dummy; - - return convert(text, charset, dstcharset, dummy); - } - - /* Convert from unicode to a charset */ - - static std::string convert(const std::vector &uc, - const std::string &dstcharset, - bool &errflag); - - /* Convert from unicode to a charset */ - - static std::string convert(const std::vector &uc, - const std::string &dstcharset) - { - bool dummy; - - return convert(uc, dstcharset, dummy); - } - - /* Convert charset to unicode */ - - static bool convert(const std::string &text, - const std::string &charset, - std::vector &uc); - - - /* Convert to upper/lower/title case */ - - static std::string - convert_tocase(/* Text string */ - const std::string &text, - - /* Its charset */ - const std::string &charset, - - /* First character: unicode_uc, unicode_lc, or unicode_tc */ - unicode_char (*first_char_func)(unicode_char), - - /* If not NULL, second and subsequent chars */ - unicode_char (*char_func)(unicode_char) - =NULL) - { - bool dummy; - - return convert_tocase(text, charset, dummy, - first_char_func, - char_func); - } - - /* Convert to upper/lower/title case */ - - static std::string - convert_tocase(/* Text string */ - const std::string &text, - - /* Its charset */ - const std::string &charset, - - /* Set if there's a conversion error */ - bool &err, - - /* First character: unicode_uc, unicode_lc, or unicode_tc */ - unicode_char (*first_char_func)(unicode_char), - - /* If not NULL, second and subsequent chars */ - unicode_char (*char_func)(unicode_char) - =NULL); - private: - bool end(bool *); - - public: - class tou; - class fromu; - }; - - /* Convert output of iconvert to unicode_chars. */ - - class iconvert::tou : public iconvert { - - public: - bool begin(const std::string &chset); - - virtual int converted(const unicode_char *, size_t); - - using iconvert::operator(); - private: - int converted(const char *ptr, size_t cnt); - - public: - template class to_iter_class; - - template - static output_iter_t convert(input_iter_t from_iter, - input_iter_t to_iter, - const std::string &chset, - output_iter_t out_iter); - - template - static void convert(input_iter_t from_iter, - input_iter_t to_iter, - const std::string &chset, - std::vector &out_buf) - { - out_buf.clear(); - std::back_insert_iterator > - insert_iter(out_buf); - - convert(from_iter, to_iter, chset, insert_iter); - } - - static void convert(const std::string &str, - const std::string &chset, - std::vector &out_buf); - }; - - /* Helper class that saves unicode output into an output iterator */ - - template - class iconvert::tou::to_iter_class : public iconvert::tou { - - iter_t iter; - public: - - to_iter_class(iter_t iterValue) - : iter(iterValue) {} - - using tou::operator(); - - operator iter_t() const { return iter; } - - private: - int converted(const unicode_char *ptr, size_t cnt) - { - while (cnt) - { - *iter=*ptr; - - ++iter; - ++ptr; - --cnt; - } - return 0; - } - }; - - template - output_iter_t iconvert::tou::convert(input_iter_t from_iter, - input_iter_t to_iter, - const std::string &chset, - output_iter_t out_iter) - { - class to_iter_class out(out_iter); - - if (!out.begin(chset)) - return out; - - std::vector string; - - while (from_iter != to_iter) - { - string.push_back(*from_iter++); - - if (string.size() > 31) - { - out(&string[0], string.size()); - string.clear(); - } - } - - if (string.size() > 0) - out(&string[0], string.size()); - - out.end(); - return out; - } - - /* Convert output of iconvert from unicode_chars. */ - - class iconvert::fromu : public iconvert { - - public: - bool begin(const std::string &chset); - - using iconvert::operator(); - - template class to_iter_class; - - template - static output_iter_t convert(input_iter_t from_iter, - input_iter_t to_iter, - const std::string &chset, - output_iter_t out_iter); - - template - static void convert(input_iter_t from_iter, - input_iter_t to_iter, - const std::string &chset, - std::string &out_buf) - { - out_buf=""; - std::back_insert_iterator - insert_iter(out_buf); - - convert(from_iter, to_iter, chset, insert_iter); - } - - static void convert(const std::vector &ubuf, - const std::string &chset, - std::string &out_buf); - - static std::string convert(const std::vector - &ubuf, - const std::string &chset); - }; - - /* Helper class that saves unicode output into an output iterator */ - - template - class iconvert::fromu::to_iter_class : public iconvert::fromu { - - iter_t iter; - public: - - to_iter_class(iter_t iterValue) - : iter(iterValue) {} - - using fromu::operator(); - - operator iter_t() const { return iter; } - - private: - int converted(const char *ptr, size_t cnt) - { - while (cnt) - { - *iter=*ptr; - - ++iter; - ++ptr; - --cnt; - } - return 0; - } - }; - - template - output_iter_t iconvert::fromu::convert(input_iter_t from_iter, - input_iter_t to_iter, - const std::string &chset, - output_iter_t out_iter) - { - class to_iter_class out(out_iter); - - if (!out.begin(chset)) - return out; - - std::vector string; - - while (from_iter != to_iter) - { - string.push_back(*from_iter++); - - if (string.size() > 31) - { - out(&string[0], string.size()); - string.clear(); - } - } - - if (string.size() > 0) - out(&string[0], string.size()); - - out.end(); - return out; - } - - /* - ** Unicode linebreaking algorithm, tr14. - */ - - extern "C" int linebreak_trampoline(int value, void *ptr); - extern "C" int linebreakc_trampoline(int value, unicode_char ch, - void *ptr); - - /* - ** Subclass linebreak_callback_base, implement operator()(int). - ** - ** Use operator<< or operator()(iterator, iterator) to feed - ** unicode_chars into the linebreaking algorithm. The subclass receives - ** UNICODE_LB values, as they become available. - */ - - class linebreak_callback_base { - - unicode_lb_info_t handle; - - int opts; - - linebreak_callback_base(const linebreak_callback_base &); - /* NOT IMPLEMENTED */ - - linebreak_callback_base &operator==(const - linebreak_callback_base &); - /* NOT IMPLEMENTED */ - - public: - linebreak_callback_base(); - virtual ~linebreak_callback_base(); - - void finish(); - - void set_opts(int opts); - - friend int linebreak_trampoline(int, void *); - - linebreak_callback_base &operator<<(unicode_char uc); - - template - linebreak_callback_base &operator()(iter_type beg_iter, - iter_type end_iter) - { - while (beg_iter != end_iter) - operator<<(*beg_iter++); - return *this; - } - - linebreak_callback_base &operator<<(const - std::vector - &vec) - { - return operator()(vec.begin(), vec.end()); - } - private: - virtual int operator()(int); - }; - - class linebreak_callback_save_buf : public linebreak_callback_base { - - public: - std::list lb_buf; - - linebreak_callback_save_buf(); - ~linebreak_callback_save_buf(); - - private: - int operator()(int value); - }; - - /* - ** Convert an input iterator sequence over unicode_chars into - ** an input iterator sequence over linebreak values. - */ - - template class linebreak_iter - : public std::iterator - { - mutable input_t iter_value, end_iter_value; - - mutable linebreak_callback_save_buf *buf; - - void fill() const - { - if (buf == NULL) - return; - - while (buf->lb_buf.empty()) - { - if (iter_value == end_iter_value) - { - buf->finish(); - if (buf->lb_buf.empty()) - { - delete buf; - buf=NULL; - } - break; - } - - buf->operator<<(*iter_value++); - } - } - - mutable value_type bufvalue; - - public: - linebreak_iter(const input_t &iter_valueArg, - const input_t &iter_endvalueArg) - : iter_value(iter_valueArg), - end_iter_value(iter_endvalueArg), - buf(new linebreak_callback_save_buf) - { - } - - linebreak_iter() : buf(NULL) - { - } - - void set_opts(int opts) - { - if (buf) - buf->set_opts(opts); - } - - ~linebreak_iter() - { - if (buf) - delete buf; - } - - linebreak_iter(const linebreak_iter &v) - : buf(NULL) - { - operator=(v); - } - - linebreak_iter &operator=(const - linebreak_iter &v) - { - if (buf) - delete buf; - buf=v.buf; - iter_value=v.iter_value; - end_iter_value=v.end_iter_value; - v.buf=NULL; - return *this; - } - - bool operator==(const linebreak_iter &v) const - { - fill(); - v.fill(); - - return buf == NULL && v.buf == NULL; - } - - bool operator!=(const linebreak_iter &v) const - { - return !operator==(v); - } - - value_type operator*() const - { - fill(); - return buf == NULL ? UNICODE_LB_MANDATORY: - buf->lb_buf.front(); - } - - linebreak_iter &operator++() - { - bufvalue=operator*(); - - if (buf) - buf->lb_buf.pop_front(); - return *this; - } - - const value_type *operator++(int) - { - operator++(); - return &bufvalue; - } - }; - - /* - ** Like linebreak_callback_base, except the subclass receives both - ** the linebreaking value, and the unicode character. - */ - - class linebreakc_callback_base { - - unicode_lbc_info_t handle; - - int opts; - - linebreakc_callback_base(const linebreakc_callback_base &); - /* NOT IMPLEMENTED */ - - linebreakc_callback_base &operator==(const - linebreakc_callback_base - &); - /* NOT IMPLEMENTED */ - - - public: - linebreakc_callback_base(); - virtual ~linebreakc_callback_base(); - - void finish(); - - void set_opts(int opts); - - friend int linebreakc_trampoline(int, unicode_char, void *); - - linebreakc_callback_base &operator<<(unicode_char uc); - - template - linebreakc_callback_base &operator()(iter_type beg_iter, - iter_type end_iter) - { - while (beg_iter != end_iter) - operator<<(*beg_iter++); - return *this; - } - - linebreakc_callback_base &operator<<(const - std::vector - &vec) - { - return operator()(vec.begin(), vec.end()); - } - private: - virtual int operator()(int, unicode_char); - }; - - class linebreakc_callback_save_buf : public linebreakc_callback_base { - - public: - std::list > lb_buf; - - linebreakc_callback_save_buf(); - ~linebreakc_callback_save_buf(); - - private: - int operator()(int, unicode_char); - }; - - - /* - ** Convert an input iterator sequence over unicode_chars into - ** an input iterator sequence over std::pair, - ** the original unicode character, and the linebreaking value before - ** the character. - */ - - template class linebreakc_iter - : public std::iterator, void> - { - mutable input_t iter_value, end_iter_value; - - mutable linebreakc_callback_save_buf *buf; - - void fill() const - { - if (buf == NULL) - return; - - while (buf->lb_buf.empty()) - { - if (iter_value == end_iter_value) - { - buf->finish(); - if (buf->lb_buf.empty()) - { - delete buf; - buf=NULL; - } - break; - } - - buf->operator<<(*iter_value); - ++iter_value; - } - } - - mutable value_type bufvalue; - - public: - linebreakc_iter(const input_t &iter_valueArg, - const input_t &iter_endvalueArg) - : iter_value(iter_valueArg), - end_iter_value(iter_endvalueArg), - buf(new linebreakc_callback_save_buf) - { - } - - linebreakc_iter() : buf(NULL) - { - } - - ~linebreakc_iter() - { - if (buf) - delete buf; - } - - linebreakc_iter(const linebreakc_iter &v) - : buf(NULL) - { - operator=(v); - } - - linebreakc_iter &operator=(const - linebreakc_iter &v) - { - if (buf) - delete buf; - buf=v.buf; - iter_value=v.iter_value; - end_iter_value=v.end_iter_value; - v.buf=NULL; - return *this; - } - - bool operator==(const linebreakc_iter &v) const - { - fill(); - v.fill(); - - return buf == NULL && v.buf == NULL; - } - - bool operator!=(const linebreakc_iter &v) const - { - return !operator==(v); - } - - value_type operator*() const - { - fill(); - return buf == NULL ? - std::make_pair(UNICODE_LB_MANDATORY, - (unicode_char)0): - buf->lb_buf.front(); - } - - linebreakc_iter &operator++() - { - bufvalue=operator*(); - - if (buf) - buf->lb_buf.pop_front(); - return *this; - } - - const value_type *operator++(int) - { - operator++(); - return &bufvalue; - } - }; - - - /* - ** Subclass wordbreak_callback_base, implement operator()(int). - ** - ** Use operator<< or operator()(iterator, iterator) to feed - ** unicode_chars into the wordbreaking algorithm. The subclass receives - ** word flags, as they become available. - */ - - extern "C" int wordbreak_trampoline(int value, void *ptr); - - class wordbreak_callback_base { - - unicode_wb_info_t handle; - - wordbreak_callback_base(const wordbreak_callback_base &); - /* NOT IMPLEMENTED */ - - wordbreak_callback_base &operator==(const - wordbreak_callback_base &); - /* NOT IMPLEMENTED */ - - public: - wordbreak_callback_base(); - virtual ~wordbreak_callback_base(); - - void finish(); - - friend int wordbreak_trampoline(int, void *); - - wordbreak_callback_base &operator<<(unicode_char uc); - - template - wordbreak_callback_base &operator()(iter_type beg_iter, - iter_type end_iter) - { - while (beg_iter != end_iter) - operator<<(*beg_iter++); - return *this; - } - - wordbreak_callback_base &operator<<(const - std::vector - &vec) - { - return operator()(vec.begin(), vec.end()); - } - private: - virtual int operator()(bool); - }; - - /* - ** A C++ wrapper for unicode_wbscan. - */ - - class wordbreakscan { - - unicode_wbscan_info_t handle; - - wordbreakscan(const wordbreakscan &); - /* NOT IMPLEMENTED */ - - wordbreakscan &operator==(const wordbreakscan &); - /* NOT IMPLEMENTED */ - public: - - wordbreakscan(); - ~wordbreakscan(); - - bool operator<<(unicode_char uc); - - size_t finish(); - }; - -} -#endif - -#endif