Imported Upstream version 0.66.1
[hcoop/debian/courier-authlib.git] / libs / unicode / unicode.h
1 #ifndef unicode_h
2 #define unicode_h
3
4 /*
5 ** Copyright 2000-2011 Double Precision, Inc.
6 ** See COPYING for distribution information.
7 **
8 */
9
10 #ifdef __cplusplus
11
#include <iterator>
#include <list>
#include <string>
#include <vector>
15
16 extern "C" {
17 #endif
18
19 #if 0
20 }
21 #endif
22
23 #include "unicode/unicode_config.h"
24
25 #include <stdlib.h>
26
27 #include <stdio.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_STDDEF_H
33 #include <stddef.h>
34 #endif
35 #include <stdint.h>
36
37 #include <sys/types.h>
38
/*
** A unicode character, held in a 32-bit unsigned integer.
*/
typedef uint32_t unicode_char;

/*
** The system default character set, from the locale.
**
** The parameter list is spelled (void) so that C compilers see a complete
** prototype; an empty () in C leaves the arguments unchecked.
*/

extern const char *unicode_default_chset(void);
46
47 /* Unicode upper/lower/title case conversion functions */
48
49 extern unicode_char unicode_uc(unicode_char);
50 extern unicode_char unicode_lc(unicode_char);
51 extern unicode_char unicode_tc(unicode_char);
52
53 /*
54 ** Look up HTML 4.0/XHTML entity.
55 **
56 ** n="amp", etc...
57 **
58 ** Returns the unicode entity value, or 0 if no such entity is defined.
59 */
60
61 unicode_char unicode_html40ent_lookup(const char *n);
62
63 /*
64 **
65 ** Return "width" of unicode character.
66 **
67 ** This is defined as follows: for characters having the F or W property in
68 ** tr11 (EastAsianWidth), unicode_wcwidth() returns 2.
69 **
70 ** Otherwise, characters having the BK, CR, LF, CM, NL, WJ, and ZW line
71 ** breaking property as per tr14, unicode_wcwdith() returns 0. For all other
72 ** cases, 1.
73 **
74 ** This provides a rough estimate of the "width" of the character if its
75 ** shown on a text console.
76 */
77
78 extern int unicode_wcwidth(unicode_char c);
79 extern size_t unicode_wcwidth_str(const unicode_char *c);
80
81 /*
82 ** The unicode-ish isspace()
83 */
84 extern int unicode_isspace(unicode_char ch);
85
86 /* Internal unicode table lookup function */
87
88 extern uint8_t unicode_tab_lookup(unicode_char ch,
89 const size_t *unicode_indextab,
90 size_t unicode_indextab_sizeof,
91 const uint8_t (*unicode_rangetab)[2],
92 const uint8_t *unicode_classtab,
93 uint8_t uclass);
94
95 /*
96 ** Implementation of grapheme cluster boundary rules, as per tr29,
97 ** including GB9a and GB9b.
98 **
99 ** Returns non-zero if there's a grapheme break between the two referenced
100 ** characters.
101 */
102
103 int unicode_grapheme_break(unicode_char a, unicode_char b);
104
105 /*
106 ** Implementation of line break rules, as per tr14.
107 **
108 ** Invoke unicode_lb_init() to initialize the linebreaking algorithm. The
109 ** first parameter is a callback function that gets invoked with two
110 ** arguments: UNICODE_LB_{MANDATORY|NONE|ALLOWED}, and a passthrough argument.
111 ** The second parameter to unicode_lb_init() is the opaque passthrough
112 ** pointer, that is passed as the second argument to the callback function
113 ** with no further interpretation.
114 **
115 ** unicode_lb_init() returns an opaque handle. Invoke unicode_lb_next(),
116 ** passing the handle and one unicode character. Repeatedly invoke
117 ** unicode_lb_next() to specify the input string for the linebreaking
118 ** algorithm, then invoke unicode_lb_end() to finish calculating the
119 ** linebreaking algorithm, and deallocate the opaque linebreaking handle.
120 **
121 ** The callback function gets invoked once for each invocation of
122 ** unicode_lb_next(). The contract is that before unicode_lb_end() returns,
123 ** the callback function will get invoked the exact number of times that
124 ** unicode_lb_next() was invoked, as long as each invocation of the callback function
125 ** returned 0; nothing more, nothing less. The first parameter to the callback
126 ** function will be one of the following values:
127 **
128 ** UNICODE_LB_MANDATORY - a linebreak is MANDATORY before the corresponding
129 ** character.
130 ** UNICODE_LB_NONE - a linebreak is PROHIBITED before the corresponding
131 ** character.
132 ** UNICODE_LB_ALLOWED - a linebreak is OPTIONAL before the corresponding
133 ** character (the preceding character is a space, or an equivalent).
134 **
135 ** The callback function should return 0. A non-zero value indicates an
136 ** error, which gets propagated up to the caller. The contract that the
137 ** callback function gets invoked the same number of times that
138 ** unicode_lb_next() gets invoked is now broken.
139 */
140
/*
** Linebreak values that the linebreaking callback function receives as its
** first argument.
**
** The negative value is parenthesized so that the macro always expands to a
** single, self-contained expression.
*/
#define UNICODE_LB_MANDATORY	(-1)
#define UNICODE_LB_NONE		0
#define UNICODE_LB_ALLOWED	1
144
/*
** The linebreaking handle is an opaque pointer; the structure is only
** declared here.
*/
struct unicode_lb_info;

typedef struct unicode_lb_info *unicode_lb_info_t;

/*
** Allocate a linebreaking handle.
**
** cb_func is the callback function. cb_arg is the passthrough pointer that
** the callback function receives as its second argument.
*/
unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
				  void *cb_arg);
154
155 /*
156 ** Feed the next character through the linebreaking algorithm.
157 ** A non-zero return code indicates that the callback function was invoked
158 ** and it returned a non-zero return code (which is propagated as a return
159 ** value). unicode_lb_end() must still be invoked, in this case.
160 **
161 ** A zero return code indicates that if the callback function was invoked,
162 ** it returned 0.
163 */
164
165 extern int unicode_lb_next(unicode_lb_info_t i, unicode_char ch);
166
167 /*
168 ** Convenience function that invokes unicode_lb_next() with a list of
169 ** unicode chars. Returns 0 if all invocations of unicode_lb_next() returned
170 ** 0, or the first non-zero return value from unicode_lb_next().
171 */
172
173 extern int unicode_lb_next_cnt(unicode_lb_info_t i,
174 const unicode_char *chars,
175 size_t cnt);
176
177 /*
178 ** Finish the linebreaking algorithm.
179 **
180 ** A non-zero return code indicates that the callback function was invoked
181 ** and it returned a non-zero return code (which is propagated as a return
182 ** value).
183 **
184 ** A zero return code indicates that if the callback function was invoked,
185 ** it returned 0, and that the callback function was invoked exactly the same
186 ** number of times that unicode_lb_next() was invoked.
187 **
188 ** In all case, the linebreak handle will no longer be valid when this
189 ** function returns.
190 */
191
192 extern int unicode_lb_end(unicode_lb_info_t i);
193
194 /*
195 ** An alternative linebreak API where the callback function receives the
196 ** original unicode character in addition to its linebreak value.
197 **
198 ** User unicode_lbc_init(), unicode_lbc_next(), and unicode_lbc_end(), whose
199 ** semantics are the same as their _lb_ counterparts.
200 */
201
202 struct unicode_lbc_info;
203
204 typedef struct unicode_lbc_info *unicode_lbc_info_t;
205
206 extern unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char,
207 void *),
208 void *cb_arg);
209 extern int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch);
210 extern int unicode_lbc_end(unicode_lbc_info_t i);
211
212 /*
213 ** Set linebreaking options.
214 **
215 ** OPTIONS SUBJECT TO CHANGE.
216 */
217
218 extern void unicode_lb_set_opts(unicode_lb_info_t i, int opts);
219
220 extern void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts);
221
/*
** Tailorization of LB24, to keep pluses, as in "C++", from breaking.
**
** LB24 gets the following additions:
**
**            PR x PR
**
**            AL x PR
**
**            ID x PR
*/
#define UNICODE_LB_OPT_PRBREAK		0x0001

/*
** Tailored / breaking rules.
**
** LB13 gets the following additions:
**
**            SY x EX
**
**            SY x AL
**
**            SY x ID
**
**            SP ÷ SY, which takes precedence over "x SY".
*/
#define UNICODE_LB_OPT_SYBREAK		0x0002

/*
** Tailored dash breaking rules.
**
** U+2013 and U+2014 get reclassified as class WJ. This prohibits breaks
** before and after mdash and ndash.
*/
#define UNICODE_LB_OPT_DASHWJ		0x0004
258
259 /*
260 ** Implementation of word break rules, as per tr29.
261 **
262 ** Invoke unicode_wb_init() to initialize the wordbreaking algorithm. The
263 ** first parameter is a callback function that gets invoked with two
264 ** arguments: an int flag, and a passthrough argument. The second parameter to
265 ** unicode_wb_init() is the opaque passthrough pointer, that is passed as the
266 ** second argument to the callback function with no further interpretation.
267 **
268 ** unicode_wb_init() returns an opaque handle. Invoke unicode_wb_next(),
269 ** passing the handle and one unicode character. Repeatedly invoke
270 ** unicode_wb_next() to specify the input string for the wordbreaking
271 ** algorithm, then invoke unicode_wb_end() to finish calculating the
272 ** wordbreaking algorithm, and deallocate the opaque wordbreaking handle.
273 **
274 ** The callback function gets invoked once for each invocation of
275 ** unicode_wb_next(). The contract is that before unicode_wb_end() returns,
276 ** the callback function will get invoked the exact number of times that
277 ** unicode_wb_next() was invoked, as long as each invocation of the callback function
278 ** returned 0; nothing more, nothing less. The first parameter to the callback
279 ** function will be an int. A non-zero value indicates that there is a word
280 ** break between this character and the preceding one.
281 **
282 ** The callback function should return 0. A non-zero value indicates an
283 ** error, which gets propagated up to the caller. The contract that the
284 ** callback function gets invoked the same number of times that
285 ** unicode_wb_next() gets invoked is now broken.
286 */
287
/*
** The wordbreaking handle is an opaque pointer; the structure is only
** declared here.
*/
struct unicode_wb_info;

typedef struct unicode_wb_info *unicode_wb_info_t;

/*
** Allocate a wordbreaking handle.
**
** cb_func is the callback function. cb_arg is the passthrough pointer that
** the callback function receives as its second argument.
*/
unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
				  void *cb_arg);
297
298 /*
299 ** Feed the next character through the wordbreaking algorithm.
300 ** A non-zero return code indicates that the callback function was invoked
301 ** and it returned a non-zero return code (which is propagated as a return
302 ** value). unicode_wb_end() must still be invoked, in this case.
303 **
304 ** A zero return code indicates that if the callback function was invoked,
305 ** it returned 0.
306 */
307
308 extern int unicode_wb_next(unicode_wb_info_t i, unicode_char ch);
309
310 /*
311 ** Convenience function that invokes unicode_wb_next() with a list of
312 ** unicode chars. Returns 0 if all invocations of unicode_wb_next() returned
313 ** 0, or the first non-zero return value from unicode_wb_next().
314 */
315
316 extern int unicode_wb_next_cnt(unicode_wb_info_t i,
317 const unicode_char *chars,
318 size_t cnt);
319
320 /*
321 ** Finish the wordbreaking algorithm.
322 **
323 ** A non-zero return code indicates that the callback function was invoked
324 ** and it returned a non-zero return code (which is propagated as a return
325 ** value).
326 **
327 ** A zero return code indicates that if the callback function was invoked,
328 ** it returned 0, and that the callback function was invoked exactly the same
329 ** number of times that unicode_wb_next() was invoked.
330 **
331 ** In all case, the wordbreak handle will no longer be valid when this
332 ** function returns.
333 */
334
335 extern int unicode_wb_end(unicode_wb_info_t i);
336
337 /*
338 ** Search for a word boundary.
339 **
340 ** Obtain a handle by calling unicode_wbscan_init(), then invoke
341 ** unicode_wbscan_next() to provide a unicode stream, then invoke
342 ** unicode_wbscan_end(). unicode_wbscan_end() returns the number of unicode
343 ** characters from the beginning of the stream until the first word boundary.
344 **
345 ** You may prematurely stop calling unicode_wbscan_next() once it returns a
346 ** non-0 value, which means that there is sufficient context to compute the
347 ** first word boundary, and all further calls to unicode_wbscan_next() will
348 ** be internal no-ops.
349 */
350
351 struct unicode_wbscan_info;
352
353 typedef struct unicode_wbscan_info *unicode_wbscan_info_t;
354
355 unicode_wbscan_info_t unicode_wbscan_init();
356
357 int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch);
358
359 size_t unicode_wbscan_end(unicode_wbscan_info_t i);
360
361 /*
362 ** A buffer that holds unicode characters, and dynamically grows as needed.
363 */
364
365 struct unicode_buf {
366 unicode_char *ptr; /* The unicode characters */
367 size_t size, /* Buffer size */
368 len, /* How many characters in ptr are initialized */
369 max; /* Maximum size the buffer can grow to */
370 };
371
/*
** Constructor: initialize a buffer.
**
** p   - the structure to initialize. ptr, size, len get cleared.
** max - maximum size the buffer can grow to. (size_t)-1 means unlimited.
*/

void unicode_buf_init(struct unicode_buf *p, size_t max);
/*
** Constructor that copies another buffer: same as unicode_buf_init(),
** followed by appending the contents of the existing buffer, b, to the new
** buffer, a.
**
** The new buffer's maximum size is exactly the number of characters in the
** existing buffer, so the copy uses the minimum amount of heap space.
**
** Both macro arguments get evaluated more than once.
*/

#define unicode_buf_init_copy(a,b)				\
	do {							\
		unicode_buf_init((a), unicode_buf_len(b));	\
		unicode_buf_append_buf((a),(b));		\
	} while (0)
396
/*
** Destructor: deinitialize the buffer, and free its memory.
*/

void unicode_buf_deinit(struct unicode_buf *p);
402
/*
** The characters in the unicode buffer. This is the official way to access
** them.
*/
#define unicode_buf_ptr(p)	((p)->ptr)

/*
** The number of characters in the unicode buffer. This is the official way
** of obtaining it.
*/
#define unicode_buf_len(p)	((p)->len)

/*
** Empty an initialized buffer, by setting len to 0.
*/

#define unicode_buf_clear(p)	((p)->len = 0)
418
419 /*
420 ** Append characters to the existing characters in the unicode buffer.
421 ** The buffer grows, if needed. If the buffer would exceed its maximum size,
422 ** the extra characters get truncated.
423 **
424 ** Returns 0 if the characters were appended. -1 for a malloc failure.
425 */
426
427 int unicode_buf_append(struct unicode_buf *p, /* The buffer */
428 const unicode_char *uc, /* Characters to append */
429 size_t l); /* How many of them */
430
/*
** Convert an iso-8859-1 char string, then invoke unicode_buf_append().
**
** This function returns void, so the result of unicode_buf_append() is not
** reported to the caller.
*/

void unicode_buf_append_char(struct unicode_buf *dst,
			     const char *str,
			     size_t cnt);
438
/*
** Remove some portion of the unicode buffer.
**
** p   - the buffer
** pos - offset in buffer
** cnt - how many to remove
*/

void unicode_buf_remove(struct unicode_buf *p,
			size_t pos,
			size_t cnt);
446
/*
** Append the contents of an existing buffer, b, to another one, a.
**
** This expands to a call to unicode_buf_append(). b gets evaluated twice.
*/

#define unicode_buf_append_buf(a,b)					\
	unicode_buf_append((a), unicode_buf_ptr(b), unicode_buf_len(b))
453
454
/*
** strcmp(), for unicode buffers.
*/

int unicode_buf_cmp(const struct unicode_buf *a,
		    const struct unicode_buf *b);
461
/*
** Like unicode_buf_cmp(), but the second buffer is an iso-8859-1 string.
**
** c  - the iso-8859-1 string
** cl - number of chars in c
*/

int unicode_buf_cmp_str(const struct unicode_buf *p,
			const char *c,
			size_t cl);
470
471 /*
472 ** A wrapper for iconv(3). This wrapper provides a different API for iconv(3).
473 ** A handle gets created by libmail_u_convert_init().
474 ** libmail_u_convert_init() receives a pointer to the output function
475 ** which receives converted character text.
476 **
477 ** The output function receives a pointer to the converted character text, and
478 ** the number of characters in the converted text.
479 **
480 ** The character text to convert gets passed, repeatedly, to
481 ** libmail_u_convert(). Each call to libmail_u_convert() results in
482 ** the output function being invoked, zero or more times, with the converted
483 ** text. Finally, libmail_u_convert_deinit() stops the conversion and
484 ** deallocates the conversion handle.
485 **
486 ** Internal buffering takes place. libmail_u_convert_deinit() may result
487 ** in the output function being called one or more times, to receive the final
488 ** part of the converted character stream.
489 **
490 ** The output function should return 0. A non-0 value causes
491 ** libmail_u_convert() and/or libmail_u_convert_deinit() to return
492 ** non-0.
493 */
494
/*
** The conversion handle is an opaque pointer; the structure is only declared
** here.
*/
struct libmail_u_convert_hdr;

typedef struct libmail_u_convert_hdr *libmail_u_convert_handle_t;

/*
** Create a conversion handle.
**
** src_chset   - convert from this chset
** dst_chset   - convert to this chset
** output_func - the output function
** convert_arg - passthrough arg
**
** Returns a non-NULL handle for the requested conversion, or NULL if the
** requested conversion is not available.
*/

libmail_u_convert_handle_t
libmail_u_convert_init(const char *src_chset,
		       const char *dst_chset,
		       int (*output_func)(const char *, size_t, void *),
		       void *convert_arg);
517
518 /*
519 ** Repeatedly pass the character text to convert to libmail_u_convert().
520 **
521 ** Returns non-0 if the output function returned non-0, or 0 if all invocations
522 ** of the output function returned 0.
523 */
524
525 int libmail_u_convert(/* The conversion handle */
526 libmail_u_convert_handle_t handle,
527
528 /* Text to convert */
529 const char *text,
530
531 /* Number of bytes to convert */
532 size_t cnt);
533
534 /*
535 ** Finish character set conversion. The handle gets deallocated.
536 **
537 ** May still result in one or more invocations of the output function.
538 ** Returns non-zero if any previous invocation of the output function returned
539 ** non-zero (this includes any invocations of the output function resulting
540 ** from this call, or prior libmail_u_convert() calls), or 0 if all
541 ** invocations of the output function returned 0.
542 **
543 ** If the errptr is not NULL, *errptr is set to non-zero if there were any
544 ** conversion errors -- if there was any text that could not be converted to
545 ** the destination character text.
546 */
547
548 int libmail_u_convert_deinit(libmail_u_convert_handle_t handle,
549 int *errptr);
550
551
552 /*
553 ** Specialization: save converted character text in a buffer.
554 **
555 ** Implementation: call libmail_u_convert_tocbuf_init() instead of
556 ** libmail_u_convert_init(), then call libmail_u_convert() and
557 ** libmail_u_convert_deinit(), as usual.
558 **
559 ** If libmail_u_convert_deinit() returns 0, *cbufptr_ret gets initialized to a
560 ** malloc()ed buffer, and the number of converted characters, the size of the
561 ** malloc()ed buffer, are placed into *csize_ret arguments, that were passed
562 ** to libmail_u_convert_tou_init().
563 **
564 ** Note: if the converted string is an empty string, *cbufsize_ret is set to 0,
565 ** but *cbufptr_ptr still gets initialized (to a dummy malloced buffer).
566 **
567 ** The optional nullterminate places a trailing \0 character after the
568 ** converted string (this is included in *cbufsize_ret).
569 */
570
571 libmail_u_convert_handle_t
572 libmail_u_convert_tocbuf_init(/* Convert from this chset */
573 const char *src_chset,
574
575 /* Convert to this chset */
576 const char *dst_chset,
577
578 /* malloced buffer */
579 char **cbufptr_ret,
580
581 /* size of the malloced buffer */
582 size_t *cbufsize_ret,
583
584 /* null terminate the resulting string */
585 int nullterminate
586 );
587
588
589 /*
590 ** Specialization: convert some character text to a unicode_char array.
591 **
592 ** This is like libmail_u_convert_tocbuf_init(), but converts to a unicode_char
593 ** array.
594 **
595 ** The returned *ucsize_ret is initialized with the number of unicode_chars,
596 ** rather than the byte count.
597 **
598 ** In all other ways, this function behaves identically to
599 ** libmail_u_convert_tocbuf_init().
600 */
601
602 libmail_u_convert_handle_t
603 libmail_u_convert_tou_init(/* Convert from this chset */
604 const char *src_chset,
605
606 /* malloc()ed buffer pointer, on exit. */
607 unicode_char **ucptr_ret,
608
609 /* size of the malloc()ed buffer, upon exit */
610 size_t *ucsize_ret,
611
612 /* If true, terminate with U+0x0000, for convenience */
613 int nullterminate
614 );
615
616 /*
617 ** Specialization: convert a unicode_char array to some character text.
618 **
619 ** This is the opposite of libmail_u_convert_tou_init(). Call this to
620 ** initialize the conversion handle, then use libmail_u_convert_uc()
621 ** instead of libmail_u_convert.
622 */
623
624 libmail_u_convert_handle_t
625 libmail_u_convert_fromu_init(/* Convert to this chset */
626 const char *dst_chset,
627
628 /* malloc()ed buffer pointer, on exit. */
629 char **cbufptr_ret,
630
631 /* size of the malloc()ed buffer, upon exit */
632 size_t *cbufsize_ret,
633
634 /* If true, terminate with U+0x0000, for convenience */
635 int nullterminate
636 );
637
638 int libmail_u_convert_uc(/* The conversion handle */
639 libmail_u_convert_handle_t handle,
640
641 /* Text to convert */
642 const unicode_char *text,
643
644 /* Number of bytes to convert */
645 size_t cnt);
646
647 /*
648 ** Initialize conversion to UTF-8.
649 **
650 ** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the
651 ** destination charset as UTF-8.
652 */
653
654 libmail_u_convert_handle_t
655 libmail_u_convert_tocbuf_toutf8_init(const char *src_chset,
656 char **cbufptr_ret,
657 size_t *cbufsize_ret,
658 int nullterminate);
659
660 /*
661 ** Initialize conversion from UTF-8.
662 **
663 ** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the
664 ** source charset as UTF-8.
665 */
666
667 libmail_u_convert_handle_t
668 libmail_u_convert_tocbuf_fromutf8_init(const char *dst_chset,
669 char **cbufptr_ret,
670 size_t *cbufsize_ret,
671 int nullterminate);
672
/*
** Convert a character string to UTF-8.
**
** text    - text to convert to UTF-8
** charset - character set to convert to UTF-8
** error   - if non-NULL, and a non-NULL pointer is returned, *error is set
**           to non-zero if a character conversion error has occurred
**
** Returns a malloc-ed buffer holding the UTF-8 string, or NULL if an error
** occurred.
*/
char *libmail_u_convert_toutf8(const char *text,
			       const char *charset,
			       int *error);
691
/*
** Convert UTF-8 text to another character set.
**
** text    - a UTF-8 string
** charset - convert the UTF-8 string to this character set
** error   - if non-NULL, and a non-NULL pointer is returned, *error is set
**           to non-zero if a character conversion error has occurred
**
** Returns a malloc-ed buffer holding the string converted to the specified
** character set, or NULL if an error occurred.
*/

char *libmail_u_convert_fromutf8(const char *text,
				 const char *charset,
				 int *error);
715
/*
** Convert one charset to another charset, with the result placed in a
** malloc-ed buffer.
**
** text       - a string to convert
** charset    - string's charset
** dstcharset - destination charset
** error      - if non-NULL, and a non-NULL pointer is returned, *error is
**              set to non-zero if a character conversion error has occurred
**
** Returns a malloc-ed buffer holding the string converted to the specified
** character set, or NULL if an error occurred.
*/

char *libmail_u_convert_tobuf(const char *text,
			      const char *charset,
			      const char *dstcharset,
			      int *error);
744
745 /*
746 ** Convenience function: call libmail_u_convert_tou_init(), feed the
747 ** character string through libmail_u_convert(), then call
748 ** libmail_u_convert_deinit().
749 **
750 ** If this function returns 0, *uc and *ucsize is set to a malloced buffer+size
751 ** holding the unicode char array.
752 */
753
754 int libmail_u_convert_tou_tobuf(/* Character text to convert */
755 const char *text,
756
757 /* Number of characters */
758 size_t text_l,
759
760 /* text's charset */
761 const char *charset,
762
763 /*
764 ** If this function returns 0, this gets
765 ** initialized
766 */
767 unicode_char **uc,
768
769 /*
770 ** Size of the allocated buffer
771 */
772 size_t *ucsize,
773
774 /*
775 ** If not null and this function returns 0,
776 ** this is set to non-0 if there
777 ** was a conversion error (but the output
778 ** buffer gets still allocated and
779 ** initialized)
780 */
781 int *err);
782
783 /*
784 ** Convenience function: call libmail_u_convert_fromu_init(), feed the
785 ** unicode_array through libmail_u_convert_uc(), then call
786 ** libmail_u_convert_deinit().
787 **
788 ** If this function returns 0, *uc and *ucsize is set to a malloced buffer+size
789 ** holding the converted character string
790 */
791
792 int libmail_u_convert_fromu_tobuf(/* Unicode array to convert to a char str */
793 const unicode_char *utext,
794
795 /*
796 ** Size of the unicode array.
797 ** If this is (size_t)-1, utext is a
798 ** 0-terminated array.
799 */
800 size_t utext_l,
801
802 /*
803 ** Convert the unicode array to this charset.
804 */
805 const char *charset,
806
807 /*
808 ** If libmail_u_convert_fromu_tobuf()
809 ** returns 0, this is initialized to a
810 ** malloced buffer with a 0-terminated
811 ** string is kept.
812 */
813 char **c,
814
815 /*
816 ** Size of the initialized array, including
817 ** the 0-terminator.
818 */
819 size_t *csize,
820
821 /*
822 ** If libmail_u_convert_fromu_tobuf()
823 ** returns 0 and this is not NULL,
824 ** *err is set to non-0 if there was a
825 ** conversion error to the requested
826 ** character set.
827 */
828 int *err);
829
830 /*
831 ** Convenience function: convert a string in a given character set
832 ** to/from uppercase, lowercase, or something else.
833 **
834 ** This is done by calling libmail_u_convert_tou_tobuf() first,
835 ** applying the title_func and char_func, then using
836 ** libmail_u_convert_fromu_tobuf().
837 **
838 ** A NULL return indicates that the requested conversion cannot be performed.
839 */
840
841 char *libmail_u_convert_tocase( /* String to convert */
842 const char *str,
843
844 /* String's character set */
845
846 const char *charset,
847
848 /*
849 ** Conversion of the first character in
850 ** str: unicode_uc, unicode_lc, or unicode_tc:
851 */
852
853 unicode_char (*first_char_func)(unicode_char),
854
855 /*
856 ** Conversion of the second and the remaining
857 ** character in str. If NULL, same as
858 ** first_char_func.
859 */
860 unicode_char (*char_func)(unicode_char));
861
862
863
/*
** Charset names that match the native unicode_char endianness.
**
** libmail_u_ucs4_native is either UCS-4BE or UCS-4LE.
*/

extern const char libmail_u_ucs4_native[];

/*
** libmail_u_ucs2_native is either UCS-2BE or UCS-2LE.
*/

extern const char libmail_u_ucs2_native[];
871
/*
** Name of the modified-UTF7 encoding that is used for IMAP folder names.
** Use it as a charset parameter.
**
** The name can be followed by a " " and up to 15 characters that are to be
** escaped, in addition to unicode chars.
*/

#define unicode_x_imap_modutf7	"x-imap-modutf7"
881
882 #if 0
883 {
884 #endif
885
886 #ifdef __cplusplus
887 }
888
889 extern size_t unicode_wcwidth(const std::vector<unicode_char> &uc);
890
891 namespace mail {
892
893 /*
894 ** Interface to iconv.
895 **
896 ** Subclass converted(). Invoke begin(), then operator(), repeatedly,
897 ** then end().
898 **
899 ** converted() receives the converted text.
900 */
901
902 class iconvert {
903
904 libmail_u_convert_handle_t handle;
905
906 public:
907 iconvert();
908 virtual ~iconvert();
909
910 /* Start conversion.
911 ** Returns false if the requested conversion cannot be done.
912 **/
913
914 bool begin(/* Convert from */
915 const std::string &src_chset,
916
917 /* Convert to */
918 const std::string &dst_chset);
919
920 /* Feed iconv(3). Returns false if the conversion was aborted.
921 */
922
923 bool operator()(const char *, size_t);
924
925 bool operator()(const unicode_char *, size_t);
926
927 /*
928 ** Get the results here. If the subclass returns a non-0
929 ** value, the conversion is aborted.
930 */
931
932 virtual int converted(const char *, size_t);
933
934 /*
935 ** End of conversion.
936 **
937 ** Returns true if all calls to converted() returned 0,
938 ** false if the conversion was aborted.
939 **
940 ** errflag is set to true if there was a character that could
941 ** not be converted, and passed to converted().
942 */
943
944 bool end(bool &errflag)
945 {
946 return end(&errflag);
947 }
948
949 bool end()
950 {
951 return end(NULL);
952 }
953
954 /* Convert between two different charsets */
955
956 static std::string convert(const std::string &text,
957 const std::string &charset,
958 const std::string &dstcharset,
959 bool &errflag);
960
961 /* Convert between two different charsets */
962
963 static std::string convert(const std::string &text,
964 const std::string &charset,
965 const std::string &dstcharset)
966 {
967 bool dummy;
968
969 return convert(text, charset, dstcharset, dummy);
970 }
971
972 /* Convert from unicode to a charset */
973
974 static std::string convert(const std::vector<unicode_char> &uc,
975 const std::string &dstcharset,
976 bool &errflag);
977
978 /* Convert from unicode to a charset */
979
980 static std::string convert(const std::vector<unicode_char> &uc,
981 const std::string &dstcharset)
982 {
983 bool dummy;
984
985 return convert(uc, dstcharset, dummy);
986 }
987
988 /* Convert charset to unicode */
989
990 static bool convert(const std::string &text,
991 const std::string &charset,
992 std::vector<unicode_char> &uc);
993
994
995 /* Convert to upper/lower/title case */
996
997 static std::string
998 convert_tocase(/* Text string */
999 const std::string &text,
1000
1001 /* Its charset */
1002 const std::string &charset,
1003
1004 /* First character: unicode_uc, unicode_lc, or unicode_tc */
1005 unicode_char (*first_char_func)(unicode_char),
1006
1007 /* If not NULL, second and subsequent chars */
1008 unicode_char (*char_func)(unicode_char)
1009 =NULL)
1010 {
1011 bool dummy;
1012
1013 return convert_tocase(text, charset, dummy,
1014 first_char_func,
1015 char_func);
1016 }
1017
1018 /* Convert to upper/lower/title case */
1019
1020 static std::string
1021 convert_tocase(/* Text string */
1022 const std::string &text,
1023
1024 /* Its charset */
1025 const std::string &charset,
1026
1027 /* Set if there's a conversion error */
1028 bool &err,
1029
1030 /* First character: unicode_uc, unicode_lc, or unicode_tc */
1031 unicode_char (*first_char_func)(unicode_char),
1032
1033 /* If not NULL, second and subsequent chars */
1034 unicode_char (*char_func)(unicode_char)
1035 =NULL);
1036 private:
1037 bool end(bool *);
1038
1039 public:
1040 class tou;
1041 class fromu;
1042 };
1043
1044 /* Convert output of iconvert to unicode_chars. */
1045
1046 class iconvert::tou : public iconvert {
1047
1048 public:
1049 bool begin(const std::string &chset);
1050
1051 virtual int converted(const unicode_char *, size_t);
1052
1053 using iconvert::operator();
1054 private:
1055 int converted(const char *ptr, size_t cnt);
1056
1057 public:
1058 template<typename iter_t> class to_iter_class;
1059
1060 template<typename input_iter_t,
1061 typename output_iter_t>
1062 static output_iter_t convert(input_iter_t from_iter,
1063 input_iter_t to_iter,
1064 const std::string &chset,
1065 output_iter_t out_iter);
1066
1067 template<typename input_iter_t>
1068 static void convert(input_iter_t from_iter,
1069 input_iter_t to_iter,
1070 const std::string &chset,
1071 std::vector<unicode_char> &out_buf)
1072 {
1073 out_buf.clear();
1074 std::back_insert_iterator<std::vector<unicode_char> >
1075 insert_iter(out_buf);
1076
1077 convert(from_iter, to_iter, chset, insert_iter);
1078 }
1079
1080 static void convert(const std::string &str,
1081 const std::string &chset,
1082 std::vector<unicode_char> &out_buf);
1083 };
1084
1085 /* Helper class that saves unicode output into an output iterator */
1086
1087 template<typename iter_t>
1088 class iconvert::tou::to_iter_class : public iconvert::tou {
1089
1090 iter_t iter;
1091 public:
1092
1093 to_iter_class(iter_t iterValue)
1094 : iter(iterValue) {}
1095
1096 using tou::operator();
1097
1098 operator iter_t() const { return iter; }
1099
1100 private:
1101 int converted(const unicode_char *ptr, size_t cnt)
1102 {
1103 while (cnt)
1104 {
1105 *iter=*ptr;
1106
1107 ++iter;
1108 ++ptr;
1109 --cnt;
1110 }
1111 return 0;
1112 }
1113 };
1114
1115 template<typename input_iter_t,
1116 typename output_iter_t>
1117 output_iter_t iconvert::tou::convert(input_iter_t from_iter,
1118 input_iter_t to_iter,
1119 const std::string &chset,
1120 output_iter_t out_iter)
1121 {
1122 class to_iter_class<output_iter_t> out(out_iter);
1123
1124 if (!out.begin(chset))
1125 return out;
1126
1127 std::vector<char> string;
1128
1129 while (from_iter != to_iter)
1130 {
1131 string.push_back(*from_iter++);
1132
1133 if (string.size() > 31)
1134 {
1135 out(&string[0], string.size());
1136 string.clear();
1137 }
1138 }
1139
1140 if (string.size() > 0)
1141 out(&string[0], string.size());
1142
1143 out.end();
1144 return out;
1145 }
1146
1147 /* Convert output of iconvert from unicode_chars. */
1148
1149 class iconvert::fromu : public iconvert {
1150
1151 public:
1152 bool begin(const std::string &chset);
1153
1154 using iconvert::operator();
1155
1156 template<typename iter_t> class to_iter_class;
1157
1158 template<typename input_iter_t,
1159 typename output_iter_t>
1160 static output_iter_t convert(input_iter_t from_iter,
1161 input_iter_t to_iter,
1162 const std::string &chset,
1163 output_iter_t out_iter);
1164
1165 template<typename input_iter_t>
1166 static void convert(input_iter_t from_iter,
1167 input_iter_t to_iter,
1168 const std::string &chset,
1169 std::string &out_buf)
1170 {
1171 out_buf="";
1172 std::back_insert_iterator<std::string>
1173 insert_iter(out_buf);
1174
1175 convert(from_iter, to_iter, chset, insert_iter);
1176 }
1177
1178 static void convert(const std::vector<unicode_char> &ubuf,
1179 const std::string &chset,
1180 std::string &out_buf);
1181
1182 static std::string convert(const std::vector<unicode_char>
1183 &ubuf,
1184 const std::string &chset);
1185 };
1186
1187 /* Helper class that saves unicode output into an output iterator */
1188
1189 template<typename iter_t>
1190 class iconvert::fromu::to_iter_class : public iconvert::fromu {
1191
1192 iter_t iter;
1193 public:
1194
1195 to_iter_class(iter_t iterValue)
1196 : iter(iterValue) {}
1197
1198 using fromu::operator();
1199
1200 operator iter_t() const { return iter; }
1201
1202 private:
1203 int converted(const char *ptr, size_t cnt)
1204 {
1205 while (cnt)
1206 {
1207 *iter=*ptr;
1208
1209 ++iter;
1210 ++ptr;
1211 --cnt;
1212 }
1213 return 0;
1214 }
1215 };
1216
1217 template<typename input_iter_t,
1218 typename output_iter_t>
1219 output_iter_t iconvert::fromu::convert(input_iter_t from_iter,
1220 input_iter_t to_iter,
1221 const std::string &chset,
1222 output_iter_t out_iter)
1223 {
1224 class to_iter_class<output_iter_t> out(out_iter);
1225
1226 if (!out.begin(chset))
1227 return out;
1228
1229 std::vector<unicode_char> string;
1230
1231 while (from_iter != to_iter)
1232 {
1233 string.push_back(*from_iter++);
1234
1235 if (string.size() > 31)
1236 {
1237 out(&string[0], string.size());
1238 string.clear();
1239 }
1240 }
1241
1242 if (string.size() > 0)
1243 out(&string[0], string.size());
1244
1245 out.end();
1246 return out;
1247 }
1248
1249 /*
1250 ** Unicode linebreaking algorithm, tr14.
1251 */
1252
1253 extern "C" int linebreak_trampoline(int value, void *ptr);
1254 extern "C" int linebreakc_trampoline(int value, unicode_char ch,
1255 void *ptr);
1256
1257 /*
1258 ** Subclass linebreak_callback_base, implement operator()(int).
1259 **
1260 ** Use operator<< or operator()(iterator, iterator) to feed
1261 ** unicode_chars into the linebreaking algorithm. The subclass receives
1262 ** UNICODE_LB values, as they become available.
1263 */
1264
1265 class linebreak_callback_base {
1266
1267 unicode_lb_info_t handle;
1268
1269 int opts;
1270
1271 linebreak_callback_base(const linebreak_callback_base &);
1272 /* NOT IMPLEMENTED */
1273
1274 linebreak_callback_base &operator==(const
1275 linebreak_callback_base &);
1276 /* NOT IMPLEMENTED */
1277
1278 public:
1279 linebreak_callback_base();
1280 virtual ~linebreak_callback_base();
1281
1282 void finish();
1283
1284 void set_opts(int opts);
1285
1286 friend int linebreak_trampoline(int, void *);
1287
1288 linebreak_callback_base &operator<<(unicode_char uc);
1289
1290 template<typename iter_type>
1291 linebreak_callback_base &operator()(iter_type beg_iter,
1292 iter_type end_iter)
1293 {
1294 while (beg_iter != end_iter)
1295 operator<<(*beg_iter++);
1296 return *this;
1297 }
1298
1299 linebreak_callback_base &operator<<(const
1300 std::vector<unicode_char>
1301 &vec)
1302 {
1303 return operator()(vec.begin(), vec.end());
1304 }
1305 private:
1306 virtual int operator()(int);
1307 };
1308
1309 class linebreak_callback_save_buf : public linebreak_callback_base {
1310
1311 public:
1312 std::list<int> lb_buf;
1313
1314 linebreak_callback_save_buf();
1315 ~linebreak_callback_save_buf();
1316
1317 private:
1318 int operator()(int value);
1319 };
1320
1321 /*
1322 ** Convert an input iterator sequence over unicode_chars into
1323 ** an input iterator sequence over linebreak values.
1324 */
1325
1326 template<typename input_t> class linebreak_iter
1327 : public std::iterator<std::input_iterator_tag, int, void>
1328 {
1329 mutable input_t iter_value, end_iter_value;
1330
1331 mutable linebreak_callback_save_buf *buf;
1332
1333 void fill() const
1334 {
1335 if (buf == NULL)
1336 return;
1337
1338 while (buf->lb_buf.empty())
1339 {
1340 if (iter_value == end_iter_value)
1341 {
1342 buf->finish();
1343 if (buf->lb_buf.empty())
1344 {
1345 delete buf;
1346 buf=NULL;
1347 }
1348 break;
1349 }
1350
1351 buf->operator<<(*iter_value++);
1352 }
1353 }
1354
1355 mutable value_type bufvalue;
1356
1357 public:
1358 linebreak_iter(const input_t &iter_valueArg,
1359 const input_t &iter_endvalueArg)
1360 : iter_value(iter_valueArg),
1361 end_iter_value(iter_endvalueArg),
1362 buf(new linebreak_callback_save_buf)
1363 {
1364 }
1365
1366 linebreak_iter() : buf(NULL)
1367 {
1368 }
1369
1370 void set_opts(int opts)
1371 {
1372 if (buf)
1373 buf->set_opts(opts);
1374 }
1375
1376 ~linebreak_iter()
1377 {
1378 if (buf)
1379 delete buf;
1380 }
1381
1382 linebreak_iter(const linebreak_iter<input_t> &v)
1383 : buf(NULL)
1384 {
1385 operator=(v);
1386 }
1387
1388 linebreak_iter<input_t> &operator=(const
1389 linebreak_iter<input_t> &v)
1390 {
1391 if (buf)
1392 delete buf;
1393 buf=v.buf;
1394 iter_value=v.iter_value;
1395 end_iter_value=v.end_iter_value;
1396 v.buf=NULL;
1397 return *this;
1398 }
1399
1400 bool operator==(const linebreak_iter<input_t> &v) const
1401 {
1402 fill();
1403 v.fill();
1404
1405 return buf == NULL && v.buf == NULL;
1406 }
1407
1408 bool operator!=(const linebreak_iter<input_t> &v) const
1409 {
1410 return !operator==(v);
1411 }
1412
1413 value_type operator*() const
1414 {
1415 fill();
1416 return buf == NULL ? UNICODE_LB_MANDATORY:
1417 buf->lb_buf.front();
1418 }
1419
1420 linebreak_iter<input_t> &operator++()
1421 {
1422 bufvalue=operator*();
1423
1424 if (buf)
1425 buf->lb_buf.pop_front();
1426 return *this;
1427 }
1428
1429 const value_type *operator++(int)
1430 {
1431 operator++();
1432 return &bufvalue;
1433 }
1434 };
1435
1436 /*
1437 ** Like linebreak_callback_base, except the subclass receives both
1438 ** the linebreaking value, and the unicode character.
1439 */
1440
1441 class linebreakc_callback_base {
1442
1443 unicode_lbc_info_t handle;
1444
1445 int opts;
1446
1447 linebreakc_callback_base(const linebreakc_callback_base &);
1448 /* NOT IMPLEMENTED */
1449
1450 linebreakc_callback_base &operator==(const
1451 linebreakc_callback_base
1452 &);
1453 /* NOT IMPLEMENTED */
1454
1455
1456 public:
1457 linebreakc_callback_base();
1458 virtual ~linebreakc_callback_base();
1459
1460 void finish();
1461
1462 void set_opts(int opts);
1463
1464 friend int linebreakc_trampoline(int, unicode_char, void *);
1465
1466 linebreakc_callback_base &operator<<(unicode_char uc);
1467
1468 template<typename iter_type>
1469 linebreakc_callback_base &operator()(iter_type beg_iter,
1470 iter_type end_iter)
1471 {
1472 while (beg_iter != end_iter)
1473 operator<<(*beg_iter++);
1474 return *this;
1475 }
1476
1477 linebreakc_callback_base &operator<<(const
1478 std::vector<unicode_char>
1479 &vec)
1480 {
1481 return operator()(vec.begin(), vec.end());
1482 }
1483 private:
1484 virtual int operator()(int, unicode_char);
1485 };
1486
1487 class linebreakc_callback_save_buf : public linebreakc_callback_base {
1488
1489 public:
1490 std::list<std::pair<int, unicode_char> > lb_buf;
1491
1492 linebreakc_callback_save_buf();
1493 ~linebreakc_callback_save_buf();
1494
1495 private:
1496 int operator()(int, unicode_char);
1497 };
1498
1499
1500 /*
1501 ** Convert an input iterator sequence over unicode_chars into
1502 ** an input iterator sequence over std::pair<int, unicode_char>,
1503 ** the original unicode character, and the linebreaking value before
1504 ** the character.
1505 */
1506
1507 template<typename input_t> class linebreakc_iter
1508 : public std::iterator<std::input_iterator_tag,
1509 std::pair<int, unicode_char>, void>
1510 {
1511 mutable input_t iter_value, end_iter_value;
1512
1513 mutable linebreakc_callback_save_buf *buf;
1514
1515 void fill() const
1516 {
1517 if (buf == NULL)
1518 return;
1519
1520 while (buf->lb_buf.empty())
1521 {
1522 if (iter_value == end_iter_value)
1523 {
1524 buf->finish();
1525 if (buf->lb_buf.empty())
1526 {
1527 delete buf;
1528 buf=NULL;
1529 }
1530 break;
1531 }
1532
1533 buf->operator<<(*iter_value);
1534 ++iter_value;
1535 }
1536 }
1537
1538 mutable value_type bufvalue;
1539
1540 public:
1541 linebreakc_iter(const input_t &iter_valueArg,
1542 const input_t &iter_endvalueArg)
1543 : iter_value(iter_valueArg),
1544 end_iter_value(iter_endvalueArg),
1545 buf(new linebreakc_callback_save_buf)
1546 {
1547 }
1548
1549 linebreakc_iter() : buf(NULL)
1550 {
1551 }
1552
1553 ~linebreakc_iter()
1554 {
1555 if (buf)
1556 delete buf;
1557 }
1558
1559 linebreakc_iter(const linebreakc_iter<input_t> &v)
1560 : buf(NULL)
1561 {
1562 operator=(v);
1563 }
1564
1565 linebreakc_iter<input_t> &operator=(const
1566 linebreakc_iter<input_t> &v)
1567 {
1568 if (buf)
1569 delete buf;
1570 buf=v.buf;
1571 iter_value=v.iter_value;
1572 end_iter_value=v.end_iter_value;
1573 v.buf=NULL;
1574 return *this;
1575 }
1576
1577 bool operator==(const linebreakc_iter<input_t> &v) const
1578 {
1579 fill();
1580 v.fill();
1581
1582 return buf == NULL && v.buf == NULL;
1583 }
1584
1585 bool operator!=(const linebreakc_iter<input_t> &v) const
1586 {
1587 return !operator==(v);
1588 }
1589
1590 value_type operator*() const
1591 {
1592 fill();
1593 return buf == NULL ?
1594 std::make_pair(UNICODE_LB_MANDATORY,
1595 (unicode_char)0):
1596 buf->lb_buf.front();
1597 }
1598
1599 linebreakc_iter<input_t> &operator++()
1600 {
1601 bufvalue=operator*();
1602
1603 if (buf)
1604 buf->lb_buf.pop_front();
1605 return *this;
1606 }
1607
1608 const value_type *operator++(int)
1609 {
1610 operator++();
1611 return &bufvalue;
1612 }
1613 };
1614
1615
1616 /*
1617 ** Subclass wordbreak_callback_base, implement operator()(bool).
1618 **
1619 ** Use operator<< or operator()(iterator, iterator) to feed
1620 ** unicode_chars into the wordbreaking algorithm. The subclass receives
1621 ** word flags, as they become available.
1622 */
1623
1624 extern "C" int wordbreak_trampoline(int value, void *ptr);
1625
1626 class wordbreak_callback_base {
1627
1628 unicode_wb_info_t handle;
1629
1630 wordbreak_callback_base(const wordbreak_callback_base &);
1631 /* NOT IMPLEMENTED */
1632
1633 wordbreak_callback_base &operator==(const
1634 wordbreak_callback_base &);
1635 /* NOT IMPLEMENTED */
1636
1637 public:
1638 wordbreak_callback_base();
1639 virtual ~wordbreak_callback_base();
1640
1641 void finish();
1642
1643 friend int wordbreak_trampoline(int, void *);
1644
1645 wordbreak_callback_base &operator<<(unicode_char uc);
1646
1647 template<typename iter_type>
1648 wordbreak_callback_base &operator()(iter_type beg_iter,
1649 iter_type end_iter)
1650 {
1651 while (beg_iter != end_iter)
1652 operator<<(*beg_iter++);
1653 return *this;
1654 }
1655
1656 wordbreak_callback_base &operator<<(const
1657 std::vector<unicode_char>
1658 &vec)
1659 {
1660 return operator()(vec.begin(), vec.end());
1661 }
1662 private:
1663 virtual int operator()(bool);
1664 };
1665
1666 /*
1667 ** A C++ wrapper for unicode_wbscan.
1668 */
1669
1670 class wordbreakscan {
1671
1672 unicode_wbscan_info_t handle;
1673
1674 wordbreakscan(const wordbreakscan &);
1675 /* NOT IMPLEMENTED */
1676
1677 wordbreakscan &operator==(const wordbreakscan &);
1678 /* NOT IMPLEMENTED */
1679 public:
1680
1681 wordbreakscan();
1682 ~wordbreakscan();
1683
1684 bool operator<<(unicode_char uc);
1685
1686 size_t finish();
1687 };
1688
1689 }
1690 #endif
1691
1692 #endif