5 ** Copyright 2000-2011 Double Precision, Inc.
6 ** See COPYING for distribution information.
23 #include "unicode/unicode_config.h"
37 #include <sys/types.h>
39 typedef uint32_t unicode_char
;
42 ** The system default character set, from the locale.
45 extern const char *unicode_default_chset();
47 /* Unicode upper/lower/title case conversion functions */
49 extern unicode_char
unicode_uc(unicode_char
);
50 extern unicode_char
unicode_lc(unicode_char
);
51 extern unicode_char
unicode_tc(unicode_char
);
54 ** Look up HTML 4.0/XHTML entity.
58 ** Returns the unicode entity value, or 0 if no such entity is defined.
61 unicode_char
unicode_html40ent_lookup(const char *n
);
65 ** Return "width" of unicode character.
67 ** This is defined as follows: for characters having the F or W property in
68 ** tr11 (EastAsianWidth), unicode_wcwidth() returns 2.
70 ** Otherwise, characters having the BK, CR, LF, CM, NL, WJ, and ZW line
71 ** breaking property as per tr14, unicode_wcwidth() returns 0. For all other
74 ** This provides a rough estimate of the "width" of the character if it's
75 ** shown on a text console.
78 extern int unicode_wcwidth(unicode_char c
);
79 extern size_t unicode_wcwidth_str(const unicode_char
*c
);
82 ** The unicode-ish isspace()
84 extern int unicode_isspace(unicode_char ch
);
86 /* Internal unicode table lookup function */
88 extern uint8_t unicode_tab_lookup(unicode_char ch
,
89 const size_t *unicode_indextab
,
90 size_t unicode_indextab_sizeof
,
91 const uint8_t (*unicode_rangetab
)[2],
92 const uint8_t *unicode_classtab
,
96 ** Implementation of grapheme cluster boundary rules, as per tr29,
97 ** including GB9a and GB9b.
99 ** Returns non-zero if there's a grapheme break between the two referenced
103 int unicode_grapheme_break(unicode_char a
, unicode_char b
);
106 ** Implementation of line break rules, as per tr14.
108 ** Invoke unicode_lb_init() to initialize the linebreaking algorithm. The
109 ** first parameter is a callback function that gets invoked with two
110 ** arguments: UNICODE_LB_{MANDATORY|NONE|ALLOWED}, and a passthrough argument.
111 ** The second parameter to unicode_lb_init() is the opaque passthrough
112 ** pointer, that is passed as the second argument to the callback function
113 ** with no further interpretation.
115 ** unicode_lb_init() returns an opaque handle. Invoke unicode_lb_next(),
116 ** passing the handle and one unicode character. Repeatedly invoke
117 ** unicode_lb_next() to specify the input string for the linebreaking
118 ** algorithm, then invoke unicode_lb_end() to finish calculating the
119 ** linebreaking algorithm, and deallocate the opaque linebreaking handle.
121 ** The callback function gets invoked once for each invocation of
122 ** unicode_lb_next(). The contract is that before unicode_lb_end() returns,
123 ** the callback function will get invoked the exact number of times that
124 ** unicode_lb_next() was invoked, as long as each invocation of the callback function
125 ** returned 0; nothing more, nothing less. The first parameter to the callback
126 ** function will be one of the following values:
128 ** UNICODE_LB_MANDATORY - a linebreak is MANDATORY before the corresponding
130 ** UNICODE_LB_NONE - a linebreak is PROHIBITED before the corresponding
132 ** UNICODE_LB_ALLOWED - a linebreak is OPTIONAL before the corresponding
133 ** character (the preceding character is a space, or an equivalent).
135 ** The callback function should return 0. A non-zero value indicates an
136 ** error, which gets propagated up to the caller. The contract that the
137 ** callback function gets invoked the same number of times that
138 ** unicode_lb_next() gets invoked is now broken.
141 #define UNICODE_LB_MANDATORY -1
142 #define UNICODE_LB_NONE 0
143 #define UNICODE_LB_ALLOWED 1
145 struct unicode_lb_info
;
147 typedef struct unicode_lb_info
*unicode_lb_info_t
;
150 ** Allocate a linebreaking handle.
152 extern unicode_lb_info_t
unicode_lb_init(int (*cb_func
)(int, void *),
156 ** Feed the next character through the linebreaking algorithm.
157 ** A non-zero return code indicates that the callback function was invoked
158 ** and it returned a non-zero return code (which is propagated as a return
159 ** value). unicode_lb_end() must still be invoked, in this case.
161 ** A zero return code indicates that if the callback function was invoked,
165 extern int unicode_lb_next(unicode_lb_info_t i
, unicode_char ch
);
168 ** Convenience function that invokes unicode_lb_next() with a list of
169 ** unicode chars. Returns 0 if all invocations of unicode_lb_next() returned
170 ** 0, or the first non-zero return value from unicode_lb_next().
173 extern int unicode_lb_next_cnt(unicode_lb_info_t i
,
174 const unicode_char
*chars
,
178 ** Finish the linebreaking algorithm.
180 ** A non-zero return code indicates that the callback function was invoked
181 ** and it returned a non-zero return code (which is propagated as a return
184 ** A zero return code indicates that if the callback function was invoked,
185 ** it returned 0, and that the callback function was invoked exactly the same
186 ** number of times that unicode_lb_next() was invoked.
188 ** In all cases, the linebreak handle will no longer be valid when this
192 extern int unicode_lb_end(unicode_lb_info_t i
);
195 ** An alternative linebreak API where the callback function receives the
196 ** original unicode character in addition to its linebreak value.
198 ** Use unicode_lbc_init(), unicode_lbc_next(), and unicode_lbc_end(), whose
199 ** semantics are the same as their _lb_ counterparts.
202 struct unicode_lbc_info
;
204 typedef struct unicode_lbc_info
*unicode_lbc_info_t
;
206 extern unicode_lbc_info_t
unicode_lbc_init(int (*cb_func
)(int, unicode_char
,
209 extern int unicode_lbc_next(unicode_lbc_info_t i
, unicode_char ch
);
210 extern int unicode_lbc_end(unicode_lbc_info_t i
);
213 ** Set linebreaking options.
215 ** OPTIONS SUBJECT TO CHANGE.
218 extern void unicode_lb_set_opts(unicode_lb_info_t i
, int opts
);
220 extern void unicode_lbc_set_opts(unicode_lbc_info_t i
, int opts
);
223 ** Tailorization of LB24: Prevent pluses, as in "C++", from breaking.
225 ** Adds the following to LB24:
233 #define UNICODE_LB_OPT_PRBREAK 0x0001
237 ** Tailored / breaking rules.
239 ** Adds the following rule to LB13:
247 ** SP ÷ SY, which takes precedence over "x SY".
249 #define UNICODE_LB_OPT_SYBREAK 0x0002
252 ** Tailored / breaking rules.
254 ** This reclassifies U+2013 and U+2014 as class WJ, prohibiting breaks before
255 ** and after mdash and ndash.
257 #define UNICODE_LB_OPT_DASHWJ 0x0004
260 ** Implementation of word break rules, as per tr29.
262 ** Invoke unicode_wb_init() to initialize the wordbreaking algorithm. The
263 ** first parameter is a callback function that gets invoked with two
264 ** arguments: an int flag, and a passthrough argument. The second parameter to
265 ** unicode_wb_init() is the opaque passthrough pointer, that is passed as the
266 ** second argument to the callback function with no further interpretation.
268 ** unicode_wb_init() returns an opaque handle. Invoke unicode_wb_next(),
269 ** passing the handle and one unicode character. Repeatedly invoke
270 ** unicode_wb_next() to specify the input string for the wordbreaking
271 ** algorithm, then invoke unicode_wb_end() to finish calculating the
272 ** wordbreaking algorithm, and deallocate the opaque wordbreaking handle.
274 ** The callback function gets invoked once for each invocation of
275 ** unicode_wb_next(). The contract is that before unicode_wb_end() returns,
276 ** the callback function will get invoked the exact number of times that
277 ** unicode_wb_next() was invoked, as long as each invocation of the callback function
278 ** returned 0; nothing more, nothing less. The first parameter to the callback
279 ** function will be an int. A non-zero value indicates that there is a word
280 ** break between this character and the preceding one.
282 ** The callback function should return 0. A non-zero value indicates an
283 ** error, which gets propagated up to the caller. The contract that the
284 ** callback function gets invoked the same number of times that
285 ** unicode_wb_next() gets invoked is now broken.
288 struct unicode_wb_info
;
290 typedef struct unicode_wb_info
*unicode_wb_info_t
;
293 ** Allocate a wordbreaking handle.
295 extern unicode_wb_info_t
unicode_wb_init(int (*cb_func
)(int, void *),
299 ** Feed the next character through the wordbreaking algorithm.
300 ** A non-zero return code indicates that the callback function was invoked
301 ** and it returned a non-zero return code (which is propagated as a return
302 ** value). unicode_wb_end() must still be invoked, in this case.
304 ** A zero return code indicates that if the callback function was invoked,
308 extern int unicode_wb_next(unicode_wb_info_t i
, unicode_char ch
);
311 ** Convenience function that invokes unicode_wb_next() with a list of
312 ** unicode chars. Returns 0 if all invocations of unicode_wb_next() returned
313 ** 0, or the first non-zero return value from unicode_wb_next().
316 extern int unicode_wb_next_cnt(unicode_wb_info_t i
,
317 const unicode_char
*chars
,
321 ** Finish the wordbreaking algorithm.
323 ** A non-zero return code indicates that the callback function was invoked
324 ** and it returned a non-zero return code (which is propagated as a return
327 ** A zero return code indicates that if the callback function was invoked,
328 ** it returned 0, and that the callback function was invoked exactly the same
329 ** number of times that unicode_wb_next() was invoked.
331 ** In all cases, the wordbreak handle will no longer be valid when this
335 extern int unicode_wb_end(unicode_wb_info_t i
);
338 ** Search for a word boundary.
340 ** Obtain a handle by calling unicode_wbscan_init(), then invoke
341 ** unicode_wbscan_next() to provide a unicode stream, then invoke
342 ** unicode_wbscan_end(). unicode_wbscan_end() returns the number of unicode
343 ** characters from the beginning of the stream until the first word boundary.
345 ** You may prematurely stop calling unicode_wbscan_next() once it returns a
346 ** non-0 value, which means that there is sufficient context to compute the
347 ** first word boundary, and all further calls to unicode_wbscan_next() will
348 ** be internal no-ops.
351 struct unicode_wbscan_info
;
353 typedef struct unicode_wbscan_info
*unicode_wbscan_info_t
;
355 unicode_wbscan_info_t
unicode_wbscan_init();
357 int unicode_wbscan_next(unicode_wbscan_info_t i
, unicode_char ch
);
359 size_t unicode_wbscan_end(unicode_wbscan_info_t i
);
362 ** A buffer that holds unicode characters, and dynamically grows as needed.
366 unicode_char
*ptr
; /* The unicode characters */
367 size_t size
, /* Buffer size */
368 len
, /* How many characters in ptr are initialized */
369 max
; /* Maximum size the buffer can grow to */
373 ** Initialize a buffer. Constructor.
376 void unicode_buf_init(/* Initialize this structure. ptr, size, len cleared */
377 struct unicode_buf
*p
,
380 ** Maximum size the buffer can grow to. (size_t)-1
385 ** Like unicode_buf_init, and initialize the new buffer with the contents of
386 ** another buffer. The maximum size of the initialized buffer is exactly the
387 ** number of characters in the existing buffer. This copies a buffer using
388 ** the minimum amount of heap space.
391 #define unicode_buf_init_copy(a,b) \
393 unicode_buf_init((a), unicode_buf_len(b)); \
394 unicode_buf_append_buf((a),(b)); \
398 ** Deinitialize the buffer. Destructor. Frees memory.
401 void unicode_buf_deinit(struct unicode_buf
*p
);
404 ** Official way to access the characters in the unicode buffer.
406 #define unicode_buf_ptr(p) ((p)->ptr)
409 ** Official way of obtaining the number of characters in the unicode buffer.
411 #define unicode_buf_len(p) ((p)->len)
414 ** Remove all existing characters from an initialized buffer. Sets len to 0.
417 #define unicode_buf_clear(p) ((p)->len=0)
420 ** Append characters to the existing characters in the unicode buffer.
421 ** The buffer grows, if needed. If the buffer would exceed its maximum size,
422 ** the extra characters get truncated.
424 ** Returns 0 if the characters were appended. -1 for a malloc failure.
427 int unicode_buf_append(struct unicode_buf
*p
, /* The buffer */
428 const unicode_char
*uc
, /* Characters to append */
429 size_t l
); /* How many of them */
432 ** Convert an iso-8859-1 char string and invoke unicode_buf_append().
435 void unicode_buf_append_char(struct unicode_buf
*dst
,
440 ** Remove some portion of the unicode buffer
443 void unicode_buf_remove(struct unicode_buf
*p
, /* The buffer */
444 size_t pos
, /* Offset in buffer */
445 size_t cnt
); /* How many to remove */
448 ** Append the contents of an existing buffer to another one.
451 #define unicode_buf_append_buf(a,b) \
452 unicode_buf_append((a), unicode_buf_ptr(b), unicode_buf_len(b))
456 ** The equivalent of strcmp() for unicode buffers.
459 int unicode_buf_cmp(const struct unicode_buf
*a
,
460 const struct unicode_buf
*b
);
463 ** The equivalent of unicode_buf_cmp, except that the second buffer is an
464 ** iso-8859-1 string.
467 int unicode_buf_cmp_str(const struct unicode_buf
*p
,
468 const char *c
, /* iso-8859-1 string */
469 size_t cl
); /* Number of chars in c */
472 ** A wrapper for iconv(3). This wrapper provides a different API for iconv(3).
473 ** A handle gets created by libmail_u_convert_init().
474 ** libmail_u_convert_init() receives a pointer to the output function
475 ** which receives converted character text.
477 ** The output function receives a pointer to the converted character text, and
478 ** the number of characters in the converted text.
480 ** The character text to convert gets passed, repeatedly, to
481 ** libmail_u_convert(). Each call to libmail_u_convert() results in
482 ** the output function being invoked, zero or more times, with the converted
483 ** text. Finally, libmail_u_convert_deinit() stops the conversion and
484 ** deallocates the conversion handle.
486 ** Internal buffering takes place. libmail_u_convert_deinit() may result
487 ** in the output function being called one or more times, to receive the final
488 ** part of the converted character stream.
490 ** The output function should return 0. A non-0 value causes
491 ** libmail_u_convert() and/or libmail_u_convert_deinit() returning
495 struct libmail_u_convert_hdr
;
497 typedef struct libmail_u_convert_hdr
*libmail_u_convert_handle_t
;
500 ** libmail_u_convert_init() returns a non-NULL handle for the requested
501 ** conversion, or NULL if the requested conversion is not available.
504 libmail_u_convert_handle_t
505 libmail_u_convert_init(/* Convert from this chset */
506 const char *src_chset
,
508 /* Convert to this chset */
509 const char *dst_chset
,
511 /* The output function */
513 int (*output_func
)(const char *, size_t, void *),
515 /* Passthrough arg */
519 ** Repeatedly pass the character text to convert to libmail_u_convert().
521 ** Returns non-0 if the output function returned non-0, or 0 if all invocations
522 ** of the output function returned 0.
525 int libmail_u_convert(/* The conversion handle */
526 libmail_u_convert_handle_t handle
,
528 /* Text to convert */
531 /* Number of bytes to convert */
535 ** Finish character set conversion. The handle gets deallocated.
537 ** May still result in one or more invocations of the output function.
538 ** Returns non-zero if any previous invocation of the output function returned
539 ** non-zero (this includes any invocations of the output function resulting
540 ** from this call, or prior libmail_u_convert() calls), or 0 if all
541 ** invocations of the output function returned 0.
543 ** If the errptr is not NULL, *errptr is set to non-zero if there were any
544 ** conversion errors -- if there was any text that could not be converted to
545 ** the destination character text.
548 int libmail_u_convert_deinit(libmail_u_convert_handle_t handle
,
553 ** Specialization: save converted character text in a buffer.
555 ** Implementation: call libmail_u_convert_tocbuf_init() instead of
556 ** libmail_u_convert_init(), then call libmail_u_convert() and
557 ** libmail_u_convert_deinit(), as usual.
559 ** If libmail_u_convert_deinit() returns 0, *cbufptr_ret gets initialized to a
560 ** malloc()ed buffer, and the number of converted characters, the size of the
561 ** malloc()ed buffer, is placed into the *cbufsize_ret argument that was passed
562 ** to libmail_u_convert_tocbuf_init().
564 ** Note: if the converted string is an empty string, *cbufsize_ret is set to 0,
565 ** but *cbufptr_ret still gets initialized (to a dummy malloced buffer).
567 ** The optional nullterminate places a trailing \0 character after the
568 ** converted string (this is included in *cbufsize_ret).
571 libmail_u_convert_handle_t
572 libmail_u_convert_tocbuf_init(/* Convert from this chset */
573 const char *src_chset
,
575 /* Convert to this chset */
576 const char *dst_chset
,
578 /* malloced buffer */
581 /* size of the malloced buffer */
582 size_t *cbufsize_ret
,
584 /* null terminate the resulting string */
590 ** Specialization: convert some character text to a unicode_char array.
592 ** This is like libmail_u_convert_tocbuf_init(), but converts to a unicode_char
595 ** The returned *ucsize_ret is initialized with the number of unicode_chars,
596 ** rather than the byte count.
598 ** In all other ways, this function behaves identically to
599 ** libmail_u_convert_tocbuf_init().
602 libmail_u_convert_handle_t
603 libmail_u_convert_tou_init(/* Convert from this chset */
604 const char *src_chset
,
606 /* malloc()ed buffer pointer, on exit. */
607 unicode_char
**ucptr_ret
,
609 /* size of the malloc()ed buffer, upon exit */
612 /* If true, terminate with U+0x0000, for convenience */
617 ** Specialization: convert a unicode_char array to some character text.
619 ** This is the opposite of libmail_u_convert_tou_init(). Call this to
620 ** initialize the conversion handle, then use libmail_u_convert_uc()
621 ** instead of libmail_u_convert.
624 libmail_u_convert_handle_t
625 libmail_u_convert_fromu_init(/* Convert to this chset */
626 const char *dst_chset
,
628 /* malloc()ed buffer pointer, on exit. */
631 /* size of the malloc()ed buffer, upon exit */
632 size_t *cbufsize_ret
,
634 /* If true, terminate with U+0x0000, for convenience */
638 int libmail_u_convert_uc(/* The conversion handle */
639 libmail_u_convert_handle_t handle
,
641 /* Text to convert */
642 const unicode_char
*text
,
644 /* Number of bytes to convert */
648 ** Initialize conversion to UTF-8.
650 ** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the
651 ** destination charset as UTF-8.
654 libmail_u_convert_handle_t
655 libmail_u_convert_tocbuf_toutf8_init(const char *src_chset
,
657 size_t *cbufsize_ret
,
661 ** Initialize conversion from UTF-8.
663 ** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the
664 ** source charset as UTF-8.
667 libmail_u_convert_handle_t
668 libmail_u_convert_tocbuf_fromutf8_init(const char *dst_chset
,
670 size_t *cbufsize_ret
,
674 ** Convert a character string to UTF-8.
676 ** Returns a malloc-ed buffer holding the UTF-8 string, or NULL if an
679 char *libmail_u_convert_toutf8(/* Text to convert to UTF-8 */
682 /* Character set to convert to UTF-8 */
686 ** If non-NULL, and a non-NULL pointer is
687 ** returned, *error is set to non-zero if
688 ** a character conversion error has occurred.
693 ** Convert UTF-8 text to another character set.
695 ** Returns a malloc-ed buffer holding the string converted to the specified
696 ** character set, or NULL if an error occurred.
699 char *libmail_u_convert_fromutf8(/* A UTF-8 string */
703 ** Convert the UTF-8 string to this character
710 ** If non-NULL, and a non-NULL pointer is
711 ** returned, *error is set to non-zero if
712 ** a character conversion error has occurred.
717 ** Convert one charset to another charset, placing the result in a malloc-ed
720 ** Returns a malloc-ed buffer holding the string converted to the specified
721 ** character set, or NULL if an error occurred.
724 char *libmail_u_convert_tobuf(/* A string to convert */
734 ** Destination charset
736 const char *dstcharset
,
739 ** If non-NULL, and a non-NULL pointer is
740 ** returned, *error is set to non-zero if
741 ** a character conversion error has occurred.
746 ** Convenience function: call libmail_u_convert_tou_init(), feed the
747 ** character string through libmail_u_convert(), then call
748 ** libmail_u_convert_deinit().
750 ** If this function returns 0, *uc and *ucsize are set to a malloced buffer+size
751 ** holding the unicode char array.
754 int libmail_u_convert_tou_tobuf(/* Character text to convert */
757 /* Number of characters */
764 ** If this function returns 0, this gets
770 ** Size of the allocated buffer
775 ** If not null and this function returns 0,
776 ** this is set to non-0 if there
777 ** was a conversion error (but the output
778 ** buffer gets still allocated and
784 ** Convenience function: call libmail_u_convert_fromu_init(), feed the
785 ** unicode_array through libmail_u_convert_uc(), then call
786 ** libmail_u_convert_deinit().
788 ** If this function returns 0, *uc and *ucsize are set to a malloced buffer+size
789 ** holding the converted character string
792 int libmail_u_convert_fromu_tobuf(/* Unicode array to convert to a char str */
793 const unicode_char
*utext
,
796 ** Size of the unicode array.
797 ** If this is (size_t)-1, utext is a
798 ** 0-terminated array.
803 ** Convert the unicode array to this charset.
808 ** If libmail_u_convert_fromu_tobuf()
809 ** returns 0, this is initialized to a
810 ** malloced buffer with a 0-terminated
816 ** Size of the initialized array, including
822 ** If libmail_u_convert_fromu_tobuf()
823 ** returns 0 and this is not NULL,
824 ** *err is set to non-0 if there was a
825 ** conversion error to the requested
831 ** Convenience function: convert a string in a given character set
832 ** to/from uppercase, lowercase, or something else.
834 ** This is done by calling libmail_u_convert_tou_tobuf() first,
835 ** applying the title_func and char_func, then using
836 ** libmail_u_convert_fromu_tobuf().
838 ** A NULL return indicates that the requested conversion cannot be performed.
841 char *libmail_u_convert_tocase( /* String to convert */
844 /* String's character set */
849 ** Conversion of the first character in
850 ** str: unicode_uc, unicode_lc, or unicode_tc:
853 unicode_char (*first_char_func
)(unicode_char
),
856 ** Conversion of the second and the remaining
857 ** character in str. If NULL, same as
860 unicode_char (*char_func
)(unicode_char
));
864 /* Either UCS-4BE or UCS-4LE, matching the native unicode_char endianness */
866 extern const char libmail_u_ucs4_native
[];
868 /* Either UCS-2BE or UCS-2LE, matching the native unicode_char endianness */
870 extern const char libmail_u_ucs2_native
[];
873 ** Modified-UTF7 encoding used for IMAP folder names. Pass it for a charset
876 ** This can be followed by a " " and up to 15 characters to be escaped in
877 ** addition to unicode chars.
880 #define unicode_x_imap_modutf7 "x-imap-modutf7"
889 extern size_t unicode_wcwidth(const std::vector
<unicode_char
> &uc
);
894 ** Interface to iconv.
896 ** Subclass converted(). Invoke begin(), then operator(), repeatedly,
899 ** converted() receives the converted text.
904 libmail_u_convert_handle_t handle
;
911 ** Returns false if the requested conversion cannot be done.
914 bool begin(/* Convert from */
915 const std::string
&src_chset
,
918 const std::string
&dst_chset
);
920 /* Feed iconv(3). Returns false if the conversion was aborted.
923 bool operator()(const char *, size_t);
925 bool operator()(const unicode_char
*, size_t);
928 ** Get the results here. If the subclass returns a non-0
929 ** value, the conversion is aborted.
932 virtual int converted(const char *, size_t);
935 ** End of conversion.
937 ** Returns true if all calls to converted() returned 0,
938 ** false if the conversion was aborted.
940 ** errflag is set to true if there was a character that could
941 ** not be converted, and passed to converted().
944 bool end(bool &errflag
)
946 return end(&errflag
);
954 /* Convert between two different charsets */
956 static std::string
convert(const std::string
&text
,
957 const std::string
&charset
,
958 const std::string
&dstcharset
,
961 /* Convert between two different charsets */
963 static std::string
convert(const std::string
&text
,
964 const std::string
&charset
,
965 const std::string
&dstcharset
)
969 return convert(text
, charset
, dstcharset
, dummy
);
972 /* Convert from unicode to a charset */
974 static std::string
convert(const std::vector
<unicode_char
> &uc
,
975 const std::string
&dstcharset
,
978 /* Convert from unicode to a charset */
980 static std::string
convert(const std::vector
<unicode_char
> &uc
,
981 const std::string
&dstcharset
)
985 return convert(uc
, dstcharset
, dummy
);
988 /* Convert charset to unicode */
990 static bool convert(const std::string
&text
,
991 const std::string
&charset
,
992 std::vector
<unicode_char
> &uc
);
995 /* Convert to upper/lower/title case */
998 convert_tocase(/* Text string */
999 const std::string
&text
,
1002 const std::string
&charset
,
1004 /* First character: unicode_uc, unicode_lc, or unicode_tc */
1005 unicode_char (*first_char_func
)(unicode_char
),
1007 /* If not NULL, second and subsequent chars */
1008 unicode_char (*char_func
)(unicode_char
)
1013 return convert_tocase(text
, charset
, dummy
,
1018 /* Convert to upper/lower/title case */
1021 convert_tocase(/* Text string */
1022 const std::string
&text
,
1025 const std::string
&charset
,
1027 /* Set if there's a conversion error */
1030 /* First character: unicode_uc, unicode_lc, or unicode_tc */
1031 unicode_char (*first_char_func
)(unicode_char
),
1033 /* If not NULL, second and subsequent chars */
1034 unicode_char (*char_func
)(unicode_char
)
1044 /* Convert output of iconvert to unicode_chars. */
1046 class iconvert::tou
: public iconvert
{
1049 bool begin(const std::string
&chset
);
1051 virtual int converted(const unicode_char
*, size_t);
1053 using iconvert::operator();
1055 int converted(const char *ptr
, size_t cnt
);
1058 template<typename iter_t
> class to_iter_class
;
1060 template<typename input_iter_t
,
1061 typename output_iter_t
>
1062 static output_iter_t
convert(input_iter_t from_iter
,
1063 input_iter_t to_iter
,
1064 const std::string
&chset
,
1065 output_iter_t out_iter
);
1067 template<typename input_iter_t
>
1068 static void convert(input_iter_t from_iter
,
1069 input_iter_t to_iter
,
1070 const std::string
&chset
,
1071 std::vector
<unicode_char
> &out_buf
)
1074 std::back_insert_iterator
<std::vector
<unicode_char
> >
1075 insert_iter(out_buf
);
1077 convert(from_iter
, to_iter
, chset
, insert_iter
);
1080 static void convert(const std::string
&str
,
1081 const std::string
&chset
,
1082 std::vector
<unicode_char
> &out_buf
);
1085 /* Helper class that saves unicode output into an output iterator */
1087 template<typename iter_t
>
1088 class iconvert::tou::to_iter_class
: public iconvert::tou
{
1093 to_iter_class(iter_t iterValue
)
1094 : iter(iterValue
) {}
1096 using tou::operator();
1098 operator iter_t() const { return iter
; }
1101 int converted(const unicode_char
*ptr
, size_t cnt
)
1115 template<typename input_iter_t
,
1116 typename output_iter_t
>
1117 output_iter_t
iconvert::tou::convert(input_iter_t from_iter
,
1118 input_iter_t to_iter
,
1119 const std::string
&chset
,
1120 output_iter_t out_iter
)
1122 class to_iter_class
<output_iter_t
> out(out_iter
);
1124 if (!out
.begin(chset
))
1127 std::vector
<char> string
;
1129 while (from_iter
!= to_iter
)
1131 string
.push_back(*from_iter
++);
1133 if (string
.size() > 31)
1135 out(&string
[0], string
.size());
1140 if (string
.size() > 0)
1141 out(&string
[0], string
.size());
1147 /* Convert output of iconvert from unicode_chars. */
1149 class iconvert::fromu
: public iconvert
{
1152 bool begin(const std::string
&chset
);
1154 using iconvert::operator();
1156 template<typename iter_t
> class to_iter_class
;
1158 template<typename input_iter_t
,
1159 typename output_iter_t
>
1160 static output_iter_t
convert(input_iter_t from_iter
,
1161 input_iter_t to_iter
,
1162 const std::string
&chset
,
1163 output_iter_t out_iter
);
1165 template<typename input_iter_t
>
1166 static void convert(input_iter_t from_iter
,
1167 input_iter_t to_iter
,
1168 const std::string
&chset
,
1169 std::string
&out_buf
)
1172 std::back_insert_iterator
<std::string
>
1173 insert_iter(out_buf
);
1175 convert(from_iter
, to_iter
, chset
, insert_iter
);
1178 static void convert(const std::vector
<unicode_char
> &ubuf
,
1179 const std::string
&chset
,
1180 std::string
&out_buf
);
1182 static std::string
convert(const std::vector
<unicode_char
>
1184 const std::string
&chset
);
1187 /* Helper class that saves unicode output into an output iterator */
1189 template<typename iter_t
>
1190 class iconvert::fromu::to_iter_class
: public iconvert::fromu
{
1195 to_iter_class(iter_t iterValue
)
1196 : iter(iterValue
) {}
1198 using fromu::operator();
1200 operator iter_t() const { return iter
; }
1203 int converted(const char *ptr
, size_t cnt
)
1217 template<typename input_iter_t
,
1218 typename output_iter_t
>
1219 output_iter_t
iconvert::fromu::convert(input_iter_t from_iter
,
1220 input_iter_t to_iter
,
1221 const std::string
&chset
,
1222 output_iter_t out_iter
)
1224 class to_iter_class
<output_iter_t
> out(out_iter
);
1226 if (!out
.begin(chset
))
1229 std::vector
<unicode_char
> string
;
1231 while (from_iter
!= to_iter
)
1233 string
.push_back(*from_iter
++);
1235 if (string
.size() > 31)
1237 out(&string
[0], string
.size());
1242 if (string
.size() > 0)
1243 out(&string
[0], string
.size());
1250 ** Unicode linebreaking algorithm, tr14.
1253 extern "C" int linebreak_trampoline(int value
, void *ptr
);
1254 extern "C" int linebreakc_trampoline(int value
, unicode_char ch
,
1258 ** Subclass linebreak_callback_base, implement operator()(int).
1260 ** Use operator<< or operator()(iterator, iterator) to feed
1261 ** unicode_chars into the linebreaking algorithm. The subclass receives
1262 ** UNICODE_LB values, as they become available.
1265 class linebreak_callback_base
{
1267 unicode_lb_info_t handle
;
1271 linebreak_callback_base(const linebreak_callback_base
&);
1272 /* NOT IMPLEMENTED */
1274 linebreak_callback_base
&operator==(const
1275 linebreak_callback_base
&);
1276 /* NOT IMPLEMENTED */
1279 linebreak_callback_base();
1280 virtual ~linebreak_callback_base();
1284 void set_opts(int opts
);
1286 friend int linebreak_trampoline(int, void *);
1288 linebreak_callback_base
&operator<<(unicode_char uc
);
1290 template<typename iter_type
>
1291 linebreak_callback_base
&operator()(iter_type beg_iter
,
1294 while (beg_iter
!= end_iter
)
1295 operator<<(*beg_iter
++);
1299 linebreak_callback_base
&operator<<(const
1300 std::vector
<unicode_char
>
1303 return operator()(vec
.begin(), vec
.end());
1306 virtual int operator()(int);
1309 class linebreak_callback_save_buf
: public linebreak_callback_base
{
1312 std::list
<int> lb_buf
;
1314 linebreak_callback_save_buf();
1315 ~linebreak_callback_save_buf();
1318 int operator()(int value
);
1322 ** Convert an input iterator sequence over unicode_chars into
1323 ** an input iterator sequence over linebreak values.
/*
** Input iterator that yields int linebreak values computed from an
** underlying sequence of unicode_chars given by a pair of input_t
** iterators.
*/
1326 template<typename input_t
> class linebreak_iter
1327 : public std::iterator
<std::input_iterator_tag
, int, void>
/* Current position in, and end of, the underlying input sequence. */
1329 mutable input_t iter_value
, end_iter_value
;
/*
** Buffering callback object. A NULL pointer is the end-of-sequence state:
** the default constructor sets it, and operator== tests for it.
*/
1331 mutable linebreak_callback_save_buf
*buf
;
/*
** While no linebreak value is buffered, feed the next input character
** into buf, advancing iter_value, until a value shows up or the input
** sequence is exhausted.
*/
1338 while (buf
->lb_buf
.empty())
1340 if (iter_value
== end_iter_value
)
1343 if (buf
->lb_buf
.empty())
1351 buf
->operator<<(*iter_value
++);
/* Value saved by operator++() before it is removed from the buffer. */
1355 mutable value_type bufvalue
;
/*
** Construct an iterator over [iter_valueArg, iter_endvalueArg). A new
** linebreak_callback_save_buf is allocated for it.
*/
1358 linebreak_iter(const input_t
&iter_valueArg
,
1359 const input_t
&iter_endvalueArg
)
1360 : iter_value(iter_valueArg
),
1361 end_iter_value(iter_endvalueArg
),
1362 buf(new linebreak_callback_save_buf
)
/* Default constructor: buf is NULL, the end-of-sequence state. */
1366 linebreak_iter() : buf(NULL
)
/* Pass the options on to the buffering callback object. */
1370 void set_opts(int opts
)
1373 buf
->set_opts(opts
);
/* Copy constructor. */
1382 linebreak_iter(const linebreak_iter
<input_t
> &v
)
/* Assignment; takes over v's input position and end marker. */
1388 linebreak_iter
<input_t
> &operator=(const
1389 linebreak_iter
<input_t
> &v
)
1394 iter_value
=v
.iter_value
;
1395 end_iter_value
=v
.end_iter_value
;
/* The returned expression is true only when both iterators have a NULL buf. */
1400 bool operator==(const linebreak_iter
<input_t
> &v
) const
1405 return buf
== NULL
&& v
.buf
== NULL
;
/* Negation of operator==. */
1408 bool operator!=(const linebreak_iter
<input_t
> &v
) const
1410 return !operator==(v
);
/*
** Yields UNICODE_LB_MANDATORY when buf is NULL, otherwise the value at the
** front of the buffer.
*/
1413 value_type
operator*() const
1416 return buf
== NULL
? UNICODE_LB_MANDATORY
:
1417 buf
->lb_buf
.front();
/* Pre-increment: save the current value in bufvalue, then drop it from the buffer. */
1420 linebreak_iter
<input_t
> &operator++()
1422 bufvalue
=operator*();
1425 buf
->lb_buf
.pop_front();
/* Post-increment: returns a pointer to a value_type, not an iterator copy. */
1429 const value_type
*operator++(int)
1437 ** Like linebreak_callback_base, except the subclass receives both
1438 ** the linebreaking value, and the unicode character.
/*
** Base class that feeds unicode_chars into the linebreaking algorithm and
** hands each resulting value, together with its unicode character, to a
** virtual operator()(int, unicode_char).
**
** Objects of this class are not copyable: both the copy constructor and
** the copy assignment operator are declared, but never implemented.
*/
1441 class linebreakc_callback_base
{
/* Linebreaking handle for the value-plus-character variant. */
1443 unicode_lbc_info_t handle
;
/* Copy constructor, declared only to forbid copying. */
1447 linebreakc_callback_base(const linebreakc_callback_base
&);
1448 /* NOT IMPLEMENTED */
/*
** Copy assignment, declared only to forbid assignment. This must be
** operator=, not operator==: declaring a comparison operator here would
** leave the implicitly generated copy assignment available, and assigning
** one object to another would then copy the handle.
*/
1450 linebreakc_callback_base
&operator=(const
1451 linebreakc_callback_base
1453 /* NOT IMPLEMENTED */
1457 linebreakc_callback_base();
1458 virtual ~linebreakc_callback_base();
1462 void set_opts(int opts
);
/* The C trampoline is a friend so that it can reach this class's members. */
1464 friend int linebreakc_trampoline(int, unicode_char
, void *);
/* Feed a single unicode_char into the linebreaking algorithm. */
1466 linebreakc_callback_base
&operator<<(unicode_char uc
);
/* Feed every element of [beg_iter, end_iter), one operator<< call each. */
1468 template<typename iter_type
>
1469 linebreakc_callback_base
&operator()(iter_type beg_iter
,
1472 while (beg_iter
!= end_iter
)
1473 operator<<(*beg_iter
++);
/* Feed a whole vector by delegating to the iterator-range overload. */
1477 linebreakc_callback_base
&operator<<(const
1478 std::vector
<unicode_char
>
1481 return operator()(vec
.begin(), vec
.end());
/* Callback for subclasses to override; receives a linebreak value and a unicode character. */
1484 virtual int operator()(int, unicode_char
);
/*
** A linebreakc_callback_base subclass that carries a buffer of
** std::pair<int, unicode_char>. linebreakc_iter reads that buffer through
** empty(), front() and pop_front().
*/
1487 class linebreakc_callback_save_buf
: public linebreakc_callback_base
{
/* Buffered pairs (presumably appended by operator()(int, unicode_char) - confirm). */
1490 std::list
<std::pair
<int, unicode_char
> > lb_buf
;
1492 linebreakc_callback_save_buf();
1493 ~linebreakc_callback_save_buf();
/* Overrides the base class callback. */
1496 int operator()(int, unicode_char
);
1501 ** Convert an input iterator sequence over unicode_chars into
1502 ** an input iterator sequence over std::pair<int, unicode_char>,
1503 ** the original unicode character, and the linebreaking value before
/*
** Input iterator that yields std::pair<int, unicode_char> values computed
** from an underlying sequence of unicode_chars given by a pair of input_t
** iterators.
*/
1507 template<typename input_t
> class linebreakc_iter
1508 : public std::iterator
<std::input_iterator_tag
,
1509 std::pair
<int, unicode_char
>, void>
/* Current position in, and end of, the underlying input sequence. */
1511 mutable input_t iter_value
, end_iter_value
;
/*
** Buffering callback object. A NULL pointer is the end-of-sequence state:
** the default constructor sets it, and operator== tests for it.
*/
1513 mutable linebreakc_callback_save_buf
*buf
;
/*
** While nothing is buffered, feed the current input character into buf,
** until a value shows up or the input sequence is exhausted.
*/
1520 while (buf
->lb_buf
.empty())
1522 if (iter_value
== end_iter_value
)
1525 if (buf
->lb_buf
.empty())
/*
** NOTE(review): iter_value is not incremented in this statement; confirm
** that it gets advanced separately, or this loop cannot make progress.
*/
1533 buf
->operator<<(*iter_value
);
/* Value saved by operator++() before it is removed from the buffer. */
1538 mutable value_type bufvalue
;
/*
** Construct an iterator over [iter_valueArg, iter_endvalueArg). A new
** linebreakc_callback_save_buf is allocated for it.
*/
1541 linebreakc_iter(const input_t
&iter_valueArg
,
1542 const input_t
&iter_endvalueArg
)
1543 : iter_value(iter_valueArg
),
1544 end_iter_value(iter_endvalueArg
),
1545 buf(new linebreakc_callback_save_buf
)
/* Default constructor: buf is NULL, the end-of-sequence state. */
1549 linebreakc_iter() : buf(NULL
)
/* Copy constructor. */
1559 linebreakc_iter(const linebreakc_iter
<input_t
> &v
)
/* Assignment; takes over v's input position and end marker. */
1565 linebreakc_iter
<input_t
> &operator=(const
1566 linebreakc_iter
<input_t
> &v
)
1571 iter_value
=v
.iter_value
;
1572 end_iter_value
=v
.end_iter_value
;
/* The returned expression is true only when both iterators have a NULL buf. */
1577 bool operator==(const linebreakc_iter
<input_t
> &v
) const
1582 return buf
== NULL
&& v
.buf
== NULL
;
/* Negation of operator==. */
1585 bool operator!=(const linebreakc_iter
<input_t
> &v
) const
1587 return !operator==(v
);
/*
** When buf is NULL, yields a pair whose first member is
** UNICODE_LB_MANDATORY; otherwise yields the pair at the front of the
** buffer.
*/
1590 value_type
operator*() const
1593 return buf
== NULL
?
1594 std::make_pair(UNICODE_LB_MANDATORY
,
1596 buf
->lb_buf
.front();
/* Pre-increment: save the current value in bufvalue, then drop it from the buffer. */
1599 linebreakc_iter
<input_t
> &operator++()
1601 bufvalue
=operator*();
1604 buf
->lb_buf
.pop_front();
/* Post-increment: returns a pointer to a value_type, not an iterator copy. */
1608 const value_type
*operator++(int)
1617 ** Subclass wordbreak_callback_base, implement operator()(bool).
1619 ** Use operator<< or operator()(iterator, iterator) to feed
1620 ** unicode_chars into the wordbreaking algorithm. The subclass receives
1621 ** word flags, as they become available.
/*
** C-linkage trampoline for the wordbreaking callback. It is declared a
** friend of wordbreak_callback_base; ptr presumably points at the C++
** object to call back into (confirm against the definition).
*/
1624 extern "C" int wordbreak_trampoline(int value
, void *ptr
);
/*
** Base class that feeds unicode_chars into the wordbreaking algorithm and
** hands the resulting flags to a virtual operator()(bool).
**
** Objects of this class are not copyable: both the copy constructor and
** the copy assignment operator are declared, but never implemented.
*/
1626 class wordbreak_callback_base
{
/* Wordbreaking handle. */
1628 unicode_wb_info_t handle
;
/* Copy constructor, declared only to forbid copying. */
1630 wordbreak_callback_base(const wordbreak_callback_base
&);
1631 /* NOT IMPLEMENTED */
/*
** Copy assignment, declared only to forbid assignment. This must be
** operator=, not operator==: declaring a comparison operator here would
** leave the implicitly generated copy assignment available, and assigning
** one object to another would then copy the handle.
*/
1633 wordbreak_callback_base
&operator=(const
1634 wordbreak_callback_base
&);
1635 /* NOT IMPLEMENTED */
1638 wordbreak_callback_base();
1639 virtual ~wordbreak_callback_base();
/* The C trampoline is a friend so that it can reach this class's members. */
1643 friend int wordbreak_trampoline(int, void *);
/* Feed a single unicode_char into the wordbreaking algorithm. */
1645 wordbreak_callback_base
&operator<<(unicode_char uc
);
/* Feed every element of [beg_iter, end_iter), one operator<< call each. */
1647 template<typename iter_type
>
1648 wordbreak_callback_base
&operator()(iter_type beg_iter
,
1651 while (beg_iter
!= end_iter
)
1652 operator<<(*beg_iter
++);
/* Feed a whole vector by delegating to the iterator-range overload. */
1656 wordbreak_callback_base
&operator<<(const
1657 std::vector
<unicode_char
>
1660 return operator()(vec
.begin(), vec
.end());
/* Callback for subclasses to override; receives a bool word flag. */
1663 virtual int operator()(bool);
1667 ** A C++ wrapper for unicode_wbscan.
1670 class wordbreakscan
{
1672 unicode_wbscan_info_t handle
;
1674 wordbreakscan(const wordbreakscan
&);
1675 /* NOT IMPLEMENTED */
/*
** Copy assignment, declared only to forbid assignment. This must be
** operator=, not operator==: declaring a comparison operator here would
** leave the implicitly generated copy assignment available, and assigning
** one object to another would then copy the handle.
*/
1677 wordbreakscan
&operator=(const wordbreakscan
&);
1678 /* NOT IMPLEMENTED */
1684 bool operator<<(unicode_char uc
);