Imported Upstream version 0.66.1
[hcoop/debian/courier-authlib.git] / libs / unicode / unicode.h
CommitLineData
b0322a85
CE
1#ifndef unicode_h
2#define unicode_h
3
4/*
5** Copyright 2000-2011 Double Precision, Inc.
6** See COPYING for distribution information.
7**
8*/
9
10#ifdef __cplusplus
11
12#include <string>
13#include <vector>
14#include <list>
15
16extern "C" {
17#endif
18
19#if 0
20}
21#endif
22
23#include "unicode/unicode_config.h"
24
25#include <stdlib.h>
26
27#include <stdio.h>
28#if HAVE_WCHAR_H
29#include <wchar.h>
30#endif
31
32#if HAVE_STDDEF_H
33#include <stddef.h>
34#endif
35#include <stdint.h>
36
37#include <sys/types.h>
38
39typedef uint32_t unicode_char;
40
/*
** The system default character set, from the locale.
**
** Declared with an explicit (void) parameter list: in C an empty pair of
** parentheses is not a prototype, and would let callers pass stray
** arguments without any diagnostic.
*/

extern const char *unicode_default_chset(void);
46
47/* Unicode upper/lower/title case conversion functions */
48
49extern unicode_char unicode_uc(unicode_char);
50extern unicode_char unicode_lc(unicode_char);
51extern unicode_char unicode_tc(unicode_char);
52
53/*
54** Look up HTML 4.0/XHTML entity.
55**
56** n="amp", etc...
57**
58** Returns the unicode entity value, or 0 if no such entity is defined.
59*/
60
61unicode_char unicode_html40ent_lookup(const char *n);
62
63/*
64**
65** Return "width" of unicode character.
66**
67** This is defined as follows: for characters having the F or W property in
68** tr11 (EastAsianWidth), unicode_wcwidth() returns 2.
69**
70** Otherwise, for characters having the BK, CR, LF, CM, NL, WJ, and ZW line
71** breaking property as per tr14, unicode_wcwidth() returns 0. For all other
72** cases, 1.
73**
74** This provides a rough estimate of the "width" of the character if it's
75** shown on a text console.
76*/
77
78extern int unicode_wcwidth(unicode_char c);
79extern size_t unicode_wcwidth_str(const unicode_char *c);
80
81/*
82** The unicode-ish isspace()
83*/
84extern int unicode_isspace(unicode_char ch);
85
86/* Internal unicode table lookup function */
87
88extern uint8_t unicode_tab_lookup(unicode_char ch,
89 const size_t *unicode_indextab,
90 size_t unicode_indextab_sizeof,
91 const uint8_t (*unicode_rangetab)[2],
92 const uint8_t *unicode_classtab,
93 uint8_t uclass);
94
95/*
96** Implementation of grapheme cluster boundary rules, as per tr29,
97** including GB9a and GB9b.
98**
99** Returns non-zero if there's a grapheme break between the two referenced
100** characters.
101*/
102
103int unicode_grapheme_break(unicode_char a, unicode_char b);
104
105/*
106** Implementation of line break rules, as per tr14.
107**
108** Invoke unicode_lb_init() to initialize the linebreaking algorithm. The
109** first parameter is a callback function that gets invoked with two
110** arguments: UNICODE_LB_{MANDATORY|NONE|ALLOWED}, and a passthrough argument.
111** The second parameter to unicode_lb_init() is the opaque passthrough
112** pointer, that is passed as the second argument to the callback function
113** with no further interpretation.
114**
115** unicode_lb_init() returns an opaque handle. Invoke unicode_lb_next(),
116** passing the handle and one unicode character. Repeatedly invoke
117** unicode_lb_next() to specify the input string for the linebreaking
118** algorithm, then invoke unicode_lb_end() to finish calculating the
119** linebreaking algorithm, and deallocate the opaque linebreaking handle.
120**
121** The callback function gets invoked once for each invocation of
122** unicode_lb_next(). The contract is that before unicode_lb_end() returns,
123** the callback function will get invoked the exact number of times that
124** unicode_lb_next() was invoked, as long as each invocation of the callback
125** function returned 0; nothing more, nothing less. The first parameter to the
126** callback function will be one of the following values:
127**
128** UNICODE_LB_MANDATORY - a linebreak is MANDATORY before the corresponding
129** character.
130** UNICODE_LB_NONE - a linebreak is PROHIBITED before the corresponding
131** character.
132** UNICODE_LB_ALLOWED - a linebreak is OPTIONAL before the corresponding
133** character (the preceding character is a space, or an equivalent).
134**
135** The callback function should return 0. A non-zero value indicates an
136** error, which gets propagated up to the caller. In that case the contract
137** that the callback function gets invoked the same number of times that
138** unicode_lb_next() gets invoked no longer holds.
139*/
140
/*
** Values passed as the first argument of the linebreak callback function.
**
** The negative value is parenthesized so that the macro expands as a single
** operand inside any larger expression.
*/
#define UNICODE_LB_MANDATORY	(-1)
#define UNICODE_LB_NONE		0
#define UNICODE_LB_ALLOWED	1
144
145struct unicode_lb_info;
146
147typedef struct unicode_lb_info *unicode_lb_info_t;
148
149/*
150** Allocate a linebreaking handle.
151*/
152extern unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
153 void *cb_arg);
154
155/*
156** Feed the next character through the linebreaking algorithm.
157** A non-zero return code indicates that the callback function was invoked
158** and it returned a non-zero return code (which is propagated as a return
159** value). unicode_lb_end() must still be invoked, in this case.
160**
161** A zero return code indicates that if the callback function was invoked,
162** it returned 0.
163*/
164
165extern int unicode_lb_next(unicode_lb_info_t i, unicode_char ch);
166
167/*
168** Convenience function that invokes unicode_lb_next() with a list of
169** unicode chars. Returns 0 if all invocations of unicode_lb_next() returned
170** 0, or the first non-zero return value from unicode_lb_next().
171*/
172
173extern int unicode_lb_next_cnt(unicode_lb_info_t i,
174 const unicode_char *chars,
175 size_t cnt);
176
177/*
178** Finish the linebreaking algorithm.
179**
180** A non-zero return code indicates that the callback function was invoked
181** and it returned a non-zero return code (which is propagated as a return
182** value).
183**
184** A zero return code indicates that if the callback function was invoked,
185** it returned 0, and that the callback function was invoked exactly the same
186** number of times that unicode_lb_next() was invoked.
187**
188** In all cases, the linebreak handle will no longer be valid when this
189** function returns.
190*/
191
192extern int unicode_lb_end(unicode_lb_info_t i);
193
194/*
195** An alternative linebreak API where the callback function receives the
196** original unicode character in addition to its linebreak value.
197**
198** Use unicode_lbc_init(), unicode_lbc_next(), and unicode_lbc_end(), whose
199** semantics are the same as their _lb_ counterparts.
200*/
201
202struct unicode_lbc_info;
203
204typedef struct unicode_lbc_info *unicode_lbc_info_t;
205
206extern unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char,
207 void *),
208 void *cb_arg);
209extern int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch);
210extern int unicode_lbc_end(unicode_lbc_info_t i);
211
212/*
213** Set linebreaking options.
214**
215** OPTIONS SUBJECT TO CHANGE.
216*/
217
218extern void unicode_lb_set_opts(unicode_lb_info_t i, int opts);
219
220extern void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts);
221
222/*
223** Tailorization of LB24: Prevent pluses, as in "C++", from breaking.
224**
225** Adds the following to LB24:
226**
227** PR x PR
228**
229** AL x PR
230**
231** ID x PR
232**/
233#define UNICODE_LB_OPT_PRBREAK 0x0001
234
235
236/*
237** Tailored / breaking rules.
238**
239** Adds the following rule to LB13:
240**
241** SY x EX
242**
243** SY x AL
244**
245** SY x ID
246**
247** SP ÷ SY, which takes precedence over "x SY".
248*/
249#define UNICODE_LB_OPT_SYBREAK 0x0002
250
251/*
252** Tailored / breaking rules.
253**
254** This reclassifies U+2013 and U+2014 as class WJ, prohibiting breaks before
255** and after mdash and ndash.
256*/
257#define UNICODE_LB_OPT_DASHWJ 0x0004
258
259/*
260** Implementation of word break rules, as per tr29.
261**
262** Invoke unicode_wb_init() to initialize the wordbreaking algorithm. The
263** first parameter is a callback function that gets invoked with two
264** arguments: an int flag, and a passthrough argument. The second parameter to
265** unicode_wb_init() is the opaque passthrough pointer, that is passed as the
266** second argument to the callback function with no further interpretation.
267**
268** unicode_wb_init() returns an opaque handle. Invoke unicode_wb_next(),
269** passing the handle and one unicode character. Repeatedly invoke
270** unicode_wb_next() to specify the input string for the wordbreaking
271** algorithm, then invoke unicode_wb_end() to finish calculating the
272** wordbreaking algorithm, and deallocate the opaque wordbreaking handle.
273**
274** The callback function gets invoked once for each invocation of
275** unicode_wb_next(). The contract is that before unicode_wb_end() returns,
276** the callback function will get invoked the exact number of times that
277** unicode_wb_next() was invoked, as long as each invocation of the callback
278** function returned 0; nothing more, nothing less. The first parameter to the
279** callback function will be an int. A non-zero value indicates that there is
280** a word break between this character and the preceding one.
281**
282** The callback function should return 0. A non-zero value indicates an
283** error, which gets propagated up to the caller. In that case the contract
284** that the callback function gets invoked the same number of times that
285** unicode_wb_next() gets invoked no longer holds.
286*/
287
288struct unicode_wb_info;
289
290typedef struct unicode_wb_info *unicode_wb_info_t;
291
292/*
293** Allocate a wordbreaking handle.
294*/
295extern unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
296 void *cb_arg);
297
298/*
299** Feed the next character through the wordbreaking algorithm.
300** A non-zero return code indicates that the callback function was invoked
301** and it returned a non-zero return code (which is propagated as a return
302** value). unicode_wb_end() must still be invoked, in this case.
303**
304** A zero return code indicates that if the callback function was invoked,
305** it returned 0.
306*/
307
308extern int unicode_wb_next(unicode_wb_info_t i, unicode_char ch);
309
310/*
311** Convenience function that invokes unicode_wb_next() with a list of
312** unicode chars. Returns 0 if all invocations of unicode_wb_next() returned
313** 0, or the first non-zero return value from unicode_wb_next().
314*/
315
316extern int unicode_wb_next_cnt(unicode_wb_info_t i,
317 const unicode_char *chars,
318 size_t cnt);
319
320/*
321** Finish the wordbreaking algorithm.
322**
323** A non-zero return code indicates that the callback function was invoked
324** and it returned a non-zero return code (which is propagated as a return
325** value).
326**
327** A zero return code indicates that if the callback function was invoked,
328** it returned 0, and that the callback function was invoked exactly the same
329** number of times that unicode_wb_next() was invoked.
330**
331** In all cases, the wordbreak handle will no longer be valid when this
332** function returns.
333*/
334
335extern int unicode_wb_end(unicode_wb_info_t i);
336
337/*
338** Search for a word boundary.
339**
340** Obtain a handle by calling unicode_wbscan_init(), then invoke
341** unicode_wbscan_next() to provide a unicode stream, then invoke
342** unicode_wbscan_end(). unicode_wbscan_end() returns the number of unicode
343** characters from the beginning of the stream until the first word boundary.
344**
345** You may prematurely stop calling unicode_wbscan_next() once it returns a
346** non-0 value, which means that there is sufficient context to compute the
347** first word boundary, and all further calls to unicode_wbscan_next() will
348** be internal no-ops.
349*/
350
/* Word boundary scan state. Only forward-declared here; used via the handle. */
struct unicode_wbscan_info;

/* Handle passed to unicode_wbscan_next() and unicode_wbscan_end(). */
typedef struct unicode_wbscan_info *unicode_wbscan_info_t;

/*
** Obtain a word boundary scan handle.
**
** Declared with an explicit (void) parameter list: in C an empty pair of
** parentheses is not a prototype, and would let callers pass stray
** arguments without any diagnostic.
*/
unicode_wbscan_info_t unicode_wbscan_init(void);
356
357int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch);
358
359size_t unicode_wbscan_end(unicode_wbscan_info_t i);
360
361/*
362** A buffer that holds unicode characters, and dynamically grows as needed.
363*/
364
365struct unicode_buf {
366 unicode_char *ptr; /* The unicode characters */
367 size_t size, /* Buffer size */
368 len, /* How many characters in ptr are initialized */
369 max; /* Maximum size the buffer can grow to */
370};
371
372/*
373** Initialize a buffer. Constructor.
374*/
375
376void unicode_buf_init(/* Initialize this structure. ptr, size, len cleared */
377 struct unicode_buf *p,
378
379 /*
380 ** Maximum size the buffer can grow to. (size_t)-1
381 ** means unlimited.
382 */
383 size_t max);
384/*
385** Like unicode_buf_init, and initialize the new buffer with the contents of
386** another buffer. The maximum size of the initialized buffer is exactly the
387** number of characters in the existing buffer. This copies a buffer using
388** the minimum amount of heap space.
**
** a is the buffer to initialize and b is the buffer to copy from; both are
** pointers to struct unicode_buf.
**
** This expands to a do { } while (0) statement, so it cannot be used as an
** expression, and the result of the underlying unicode_buf_append() call is
** discarded. Each argument is expanded more than once; do not pass
** expressions with side effects.
389*/
390
391#define unicode_buf_init_copy(a,b) \
392 do { \
393 unicode_buf_init((a), unicode_buf_len(b)); \
394 unicode_buf_append_buf((a),(b)); \
395 } while (0)
396
397/*
398** Deinitialize the buffer. Destructor. Frees memory.
399*/
400
401void unicode_buf_deinit(struct unicode_buf *p);
402
403/*
404** Official way to access the characters in the unicode buffer.
**
** p is a pointer to a struct unicode_buf; the macro expands to its ptr member.
405*/
406#define unicode_buf_ptr(p) ((p)->ptr)
407
408/*
409** Official way of obtaining the number of characters in the unicode buffer.
**
** p is a pointer to a struct unicode_buf; the macro expands to its len member.
410*/
411#define unicode_buf_len(p) ((p)->len)
412
413/*
414** Remove all existing characters from an initialized buffer. Sets len to 0.
**
** Only len is assigned; the ptr, size and max members are left untouched.
415*/
416
417#define unicode_buf_clear(p) ((p)->len=0)
418
419/*
420** Append characters to the existing characters in the unicode buffer.
421** The buffer grows, if needed. If the buffer would exceed its maximum size,
422** the extra characters get truncated.
423**
424** Returns 0 if the characters were appended. -1 for a malloc failure.
425*/
426
427int unicode_buf_append(struct unicode_buf *p, /* The buffer */
428 const unicode_char *uc, /* Characters to append */
429 size_t l); /* How many of them */
430
431/*
432** Convert an iso-8859-1 char string and invoke unicode_buf_append().
433*/
434
435void unicode_buf_append_char(struct unicode_buf *dst,
436 const char *str,
437 size_t cnt);
438
439/*
440** Remove some portion of the unicode buffer
441*/
442
443void unicode_buf_remove(struct unicode_buf *p, /* The buffer */
444 size_t pos, /* Offset in buffer */
445 size_t cnt); /* How many to remove */
446
447/*
448** Append the contents of an existing buffer to another one.
**
** a is the destination buffer and b is the source buffer. The macro expands
** to a single unicode_buf_append() call, so it yields that function's return
** value. b is expanded twice; do not pass an expression with side effects.
449*/
450
451#define unicode_buf_append_buf(a,b) \
452 unicode_buf_append((a), unicode_buf_ptr(b), unicode_buf_len(b))
453
454
455/*
456** The equivalent of strcmp() for unicode buffers.
457*/
458
459int unicode_buf_cmp(const struct unicode_buf *a,
460 const struct unicode_buf *b);
461
462/*
463** The equivalent of unicode_buf_cmp, except that the second buffer is an
464** iso-8859-1 string.
465*/
466
467int unicode_buf_cmp_str(const struct unicode_buf *p,
468 const char *c, /* iso-8859-1 string */
469 size_t cl); /* Number of chars in c */
470
471/*
472** A wrapper for iconv(3). This wrapper provides a different API for iconv(3).
473** A handle gets created by libmail_u_convert_init().
474** libmail_u_convert_init() receives a pointer to the output function
475** which receives converted character text.
476**
477** The output function receives a pointer to the converted character text, and
478** the number of characters in the converted text.
479**
480** The character text to convert gets passed, repeatedly, to
481** libmail_u_convert(). Each call to libmail_u_convert() results in
482** the output function being invoked, zero or more times, with the converted
483** text. Finally, libmail_u_convert_deinit() stops the conversion and
484** deallocates the conversion handle.
485**
486** Internal buffering takes place. libmail_u_convert_deinit() may result
487** in the output function being called one or more times, to receive the final
488** part of the converted character stream.
489**
490** The output function should return 0. A non-0 value causes
491** libmail_u_convert() and/or libmail_u_convert_deinit() returning
492** non-0.
493*/
494
495struct libmail_u_convert_hdr;
496
497typedef struct libmail_u_convert_hdr *libmail_u_convert_handle_t;
498
499/*
500** libmail_u_convert_init() returns a non-NULL handle for the requested
501** conversion, or NULL if the requested conversion is not available.
502*/
503
504libmail_u_convert_handle_t
505libmail_u_convert_init(/* Convert from this chset */
506 const char *src_chset,
507
508 /* Convert to this chset */
509 const char *dst_chset,
510
511 /* The output function */
512
513 int (*output_func)(const char *, size_t, void *),
514
515 /* Passthrough arg */
516 void *convert_arg);
517
518/*
519** Repeatedly pass the character text to convert to libmail_u_convert().
520**
521** Returns non-0 if the output function returned non-0, or 0 if all invocations
522** of the output function returned 0.
523*/
524
525int libmail_u_convert(/* The conversion handle */
526 libmail_u_convert_handle_t handle,
527
528 /* Text to convert */
529 const char *text,
530
531 /* Number of bytes to convert */
532 size_t cnt);
533
534/*
535** Finish character set conversion. The handle gets deallocated.
536**
537** May still result in one or more invocations of the output function.
538** Returns non-zero if any previous invocation of the output function returned
539** non-zero (this includes any invocations of the output function resulting
540** from this call, or prior libmail_u_convert() calls), or 0 if all
541** invocations of the output function returned 0.
542**
543** If the errptr is not NULL, *errptr is set to non-zero if there were any
544** conversion errors -- if there was any text that could not be converted to
545** the destination character text.
546*/
547
548int libmail_u_convert_deinit(libmail_u_convert_handle_t handle,
549 int *errptr);
550
551
552/*
553** Specialization: save converted character text in a buffer.
554**
555** Implementation: call libmail_u_convert_tocbuf_init() instead of
556** libmail_u_convert_init(), then call libmail_u_convert() and
557** libmail_u_convert_deinit(), as usual.
558**
559** If libmail_u_convert_deinit() returns 0, *cbufptr_ret gets initialized to a
560** malloc()ed buffer, and the number of converted characters, the size of the
561** malloc()ed buffer, is placed into *cbufsize_ret. Both pointers are the
562** arguments that were passed to libmail_u_convert_tocbuf_init().
563**
564** Note: if the converted string is an empty string, *cbufsize_ret is set to 0,
565** but *cbufptr_ret still gets initialized (to a dummy malloced buffer).
566**
567** The optional nullterminate places a trailing \0 character after the
568** converted string (this is included in *cbufsize_ret).
569*/
570
571libmail_u_convert_handle_t
572libmail_u_convert_tocbuf_init(/* Convert from this chset */
573 const char *src_chset,
574
575 /* Convert to this chset */
576 const char *dst_chset,
577
578 /* malloced buffer */
579 char **cbufptr_ret,
580
581 /* size of the malloced buffer */
582 size_t *cbufsize_ret,
583
584 /* null terminate the resulting string */
585 int nullterminate
586 );
587
588
589/*
590** Specialization: convert some character text to a unicode_char array.
591**
592** This is like libmail_u_convert_tocbuf_init(), but converts to a unicode_char
593** array.
594**
595** The returned *ucsize_ret is initialized with the number of unicode_chars,
596** rather than the byte count.
597**
598** In all other ways, this function behaves identically to
599** libmail_u_convert_tocbuf_init().
600*/
601
602libmail_u_convert_handle_t
603libmail_u_convert_tou_init(/* Convert from this chset */
604 const char *src_chset,
605
606 /* malloc()ed buffer pointer, on exit. */
607 unicode_char **ucptr_ret,
608
609 /* size of the malloc()ed buffer, upon exit */
610 size_t *ucsize_ret,
611
612 /* If true, terminate with U+0x0000, for convenience */
613 int nullterminate
614 );
615
616/*
617** Specialization: convert a unicode_char array to some character text.
618**
619** This is the opposite of libmail_u_convert_tou_init(). Call this to
620** initialize the conversion handle, then use libmail_u_convert_uc()
621** instead of libmail_u_convert.
622*/
623
624libmail_u_convert_handle_t
625libmail_u_convert_fromu_init(/* Convert to this chset */
626 const char *dst_chset,
627
628 /* malloc()ed buffer pointer, on exit. */
629 char **cbufptr_ret,
630
631 /* size of the malloc()ed buffer, upon exit */
632 size_t *cbufsize_ret,
633
634 /* If true, terminate the converted string with a \0, for convenience */
635 int nullterminate
636 );
637
638int libmail_u_convert_uc(/* The conversion handle */
639 libmail_u_convert_handle_t handle,
640
641 /* Text to convert */
642 const unicode_char *text,
643
644 /* Number of bytes to convert */
645 size_t cnt);
646
647/*
648** Initialize conversion to UTF-8.
649**
650** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the
651** destination charset as UTF-8.
652*/
653
654libmail_u_convert_handle_t
655libmail_u_convert_tocbuf_toutf8_init(const char *src_chset,
656 char **cbufptr_ret,
657 size_t *cbufsize_ret,
658 int nullterminate);
659
660/*
661** Initialize conversion from UTF-8.
662**
663** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the
664** source charset as UTF-8.
665*/
666
667libmail_u_convert_handle_t
668libmail_u_convert_tocbuf_fromutf8_init(const char *dst_chset,
669 char **cbufptr_ret,
670 size_t *cbufsize_ret,
671 int nullterminate);
672
673/*
674** Convert a character string to UTF-8.
675**
676** Returns a malloc-ed buffer holding the UTF-8 string, or NULL if an
677** error occurred.
678*/
679char *libmail_u_convert_toutf8(/* Text to convert to UTF-8 */
680 const char *text,
681
682 /* Character set to convert to UTF-8 */
683 const char *charset,
684
685 /*
686 ** If non-NULL, and a non-NULL pointer is
687 ** returned, *error is set to non-zero if
688 ** a character conversion error has occurred.
689 */
690 int *error);
691
692/*
693** Convert UTF-8 text to another character set.
694**
695** Returns a malloc-ed buffer holding the string converted to the specified
696** character set, or NULL if an error occurred.
697*/
698
699char *libmail_u_convert_fromutf8(/* A UTF-8 string */
700 const char *text,
701
702 /*
703 ** Convert the UTF-8 string to this character
704 ** set.
705 */
706
707 const char *charset,
708
709 /*
710 ** If non-NULL, and a non-NULL pointer is
711 ** returned, *error is set to non-zero if
712 ** a character conversion error has occurred.
713 */
714 int *error);
715
716/*
717** Convert one charset to another charset, placing the result in a malloc-ed
718** buffer.
719**
720** Returns a malloc-ed buffer holding the string converted to the specified
721** character set, or NULL if an error occurred.
722*/
723
724char *libmail_u_convert_tobuf(/* A string to convert */
725 const char *text,
726
727 /*
728 ** String's charset.
729 */
730
731 const char *charset,
732
733 /*
734 ** Destination charset
735 */
736 const char *dstcharset,
737
738 /*
739 ** If non-NULL, and a non-NULL pointer is
740 ** returned, *error is set to non-zero if
741 ** a character conversion error has occurred.
742 */
743 int *error);
744
745/*
746** Convenience function: call libmail_u_convert_tou_init(), feed the
747** character string through libmail_u_convert(), then call
748** libmail_u_convert_deinit().
749**
750** If this function returns 0, *uc and *ucsize are set to a malloced
751** buffer+size holding the unicode char array.
752*/
753
754int libmail_u_convert_tou_tobuf(/* Character text to convert */
755 const char *text,
756
757 /* Number of characters */
758 size_t text_l,
759
760 /* text's charset */
761 const char *charset,
762
763 /*
764 ** If this function returns 0, this gets
765 ** initialized
766 */
767 unicode_char **uc,
768
769 /*
770 ** Size of the allocated buffer
771 */
772 size_t *ucsize,
773
774 /*
775 ** If not null and this function returns 0,
776 ** this is set to non-0 if there
777 ** was a conversion error (but the output
778 ** buffer gets still allocated and
779 ** initialized)
780 */
781 int *err);
782
783/*
784** Convenience function: call libmail_u_convert_fromu_init(), feed the
785** unicode_array through libmail_u_convert_uc(), then call
786** libmail_u_convert_deinit().
787**
788** If this function returns 0, *c and *csize are set to a malloced buffer+size
789** holding the converted character string.
790*/
791
792int libmail_u_convert_fromu_tobuf(/* Unicode array to convert to a char str */
793 const unicode_char *utext,
794
795 /*
796 ** Size of the unicode array.
797 ** If this is (size_t)-1, utext is a
798 ** 0-terminated array.
799 */
800 size_t utext_l,
801
802 /*
803 ** Convert the unicode array to this charset.
804 */
805 const char *charset,
806
807 /*
808 ** If libmail_u_convert_fromu_tobuf()
809 ** returns 0, this is initialized to a
810 ** malloced buffer holding the
811 ** 0-terminated string.
812 */
813 char **c,
814
815 /*
816 ** Size of the initialized array, including
817 ** the 0-terminator.
818 */
819 size_t *csize,
820
821 /*
822 ** If libmail_u_convert_fromu_tobuf()
823 ** returns 0 and this is not NULL,
824 ** *err is set to non-0 if there was a
825 ** conversion error to the requested
826 ** character set.
827 */
828 int *err);
829
830/*
831** Convenience function: convert a string in a given character set
832** to/from uppercase, lowercase, or something else.
833**
834** This is done by calling libmail_u_convert_tou_tobuf() first,
835** applying the first_char_func and char_func, then using
836** libmail_u_convert_fromu_tobuf().
837**
838** A NULL return indicates that the requested conversion cannot be performed.
839*/
840
841char *libmail_u_convert_tocase( /* String to convert */
842 const char *str,
843
844 /* String's character set */
845
846 const char *charset,
847
848 /*
849 ** Conversion of the first character in
850 ** str: unicode_uc, unicode_lc, or unicode_tc:
851 */
852
853 unicode_char (*first_char_func)(unicode_char),
854
855 /*
856 ** Conversion of the second and the remaining
857 ** characters in str. If NULL, same as
858 ** first_char_func.
859 */
860 unicode_char (*char_func)(unicode_char));
861
862
863
864/* Either UCS-4BE or UCS-4LE, matching the native unicode_char endianness */
865
866extern const char libmail_u_ucs4_native[];
867
868/* Either UCS-2BE or UCS-2LE, matching the native unicode_char endianness */
869
870extern const char libmail_u_ucs2_native[];
871
872/*
873** Modified-UTF7 encoding used for IMAP folder names. Pass it for a charset
874** parameter.
875**
876** This can be followed by a " " and up to 15 characters to be escaped in
877** addition to unicode chars.
878*/
879
880#define unicode_x_imap_modutf7 "x-imap-modutf7"
881
882#if 0
883{
884#endif
885
886#ifdef __cplusplus
887}
888
889extern size_t unicode_wcwidth(const std::vector<unicode_char> &uc);
890
891namespace mail {
892
893 /*
894 ** Interface to iconv.
895 **
896 ** Subclass converted(). Invoke begin(), then operator(), repeatedly,
897 ** then end().
898 **
899 ** converted() receives the converted text.
900 */
901
902 class iconvert {
903
904 libmail_u_convert_handle_t handle;
905
906 public:
907 iconvert();
908 virtual ~iconvert();
909
910 /* Start conversion.
911 ** Returns false if the requested conversion cannot be done.
912 **/
913
914 bool begin(/* Convert from */
915 const std::string &src_chset,
916
917 /* Convert to */
918 const std::string &dst_chset);
919
920 /* Feed iconv(3). Returns false if the conversion was aborted.
921 */
922
923 bool operator()(const char *, size_t);
924
925 bool operator()(const unicode_char *, size_t);
926
927 /*
928 ** Get the results here. If the subclass returns a non-0
929 ** value, the conversion is aborted.
930 */
931
932 virtual int converted(const char *, size_t);
933
934 /*
935 ** End of conversion.
936 **
937 ** Returns true if all calls to converted() returned 0,
938 ** false if the conversion was aborted.
939 **
940 ** errflag is set to true if there was a character that could
941 ** not be converted, and passed to converted().
942 */
943
944 bool end(bool &errflag)
945 {
946 return end(&errflag);
947 }
948
949 bool end()
950 {
951 return end(NULL);
952 }
953
954 /* Convert between two different charsets */
955
956 static std::string convert(const std::string &text,
957 const std::string &charset,
958 const std::string &dstcharset,
959 bool &errflag);
960
961 /* Convert between two different charsets */
962
963 static std::string convert(const std::string &text,
964 const std::string &charset,
965 const std::string &dstcharset)
966 {
967 bool dummy;
968
969 return convert(text, charset, dstcharset, dummy);
970 }
971
972 /* Convert from unicode to a charset */
973
974 static std::string convert(const std::vector<unicode_char> &uc,
975 const std::string &dstcharset,
976 bool &errflag);
977
978 /* Convert from unicode to a charset */
979
980 static std::string convert(const std::vector<unicode_char> &uc,
981 const std::string &dstcharset)
982 {
983 bool dummy;
984
985 return convert(uc, dstcharset, dummy);
986 }
987
988 /* Convert charset to unicode */
989
990 static bool convert(const std::string &text,
991 const std::string &charset,
992 std::vector<unicode_char> &uc);
993
994
995 /* Convert to upper/lower/title case */
996
997 static std::string
998 convert_tocase(/* Text string */
999 const std::string &text,
1000
1001 /* Its charset */
1002 const std::string &charset,
1003
1004 /* First character: unicode_uc, unicode_lc, or unicode_tc */
1005 unicode_char (*first_char_func)(unicode_char),
1006
1007 /* If not NULL, second and subsequent chars */
1008 unicode_char (*char_func)(unicode_char)
1009 =NULL)
1010 {
1011 bool dummy;
1012
1013 return convert_tocase(text, charset, dummy,
1014 first_char_func,
1015 char_func);
1016 }
1017
1018 /* Convert to upper/lower/title case */
1019
1020 static std::string
1021 convert_tocase(/* Text string */
1022 const std::string &text,
1023
1024 /* Its charset */
1025 const std::string &charset,
1026
1027 /* Set if there's a conversion error */
1028 bool &err,
1029
1030 /* First character: unicode_uc, unicode_lc, or unicode_tc */
1031 unicode_char (*first_char_func)(unicode_char),
1032
1033 /* If not NULL, second and subsequent chars */
1034 unicode_char (*char_func)(unicode_char)
1035 =NULL);
1036 private:
1037 bool end(bool *);
1038
1039 public:
1040 class tou;
1041 class fromu;
1042 };
1043
1044 /* Convert output of iconvert to unicode_chars. */
1045
1046 class iconvert::tou : public iconvert {
1047
1048 public:
1049 bool begin(const std::string &chset);
1050
1051 virtual int converted(const unicode_char *, size_t);
1052
1053 using iconvert::operator();
1054 private:
1055 int converted(const char *ptr, size_t cnt);
1056
1057 public:
1058 template<typename iter_t> class to_iter_class;
1059
1060 template<typename input_iter_t,
1061 typename output_iter_t>
1062 static output_iter_t convert(input_iter_t from_iter,
1063 input_iter_t to_iter,
1064 const std::string &chset,
1065 output_iter_t out_iter);
1066
1067 template<typename input_iter_t>
1068 static void convert(input_iter_t from_iter,
1069 input_iter_t to_iter,
1070 const std::string &chset,
1071 std::vector<unicode_char> &out_buf)
1072 {
1073 out_buf.clear();
1074 std::back_insert_iterator<std::vector<unicode_char> >
1075 insert_iter(out_buf);
1076
1077 convert(from_iter, to_iter, chset, insert_iter);
1078 }
1079
1080 static void convert(const std::string &str,
1081 const std::string &chset,
1082 std::vector<unicode_char> &out_buf);
1083 };
1084
1085 /* Helper class that saves unicode output into an output iterator */
1086
1087 template<typename iter_t>
1088 class iconvert::tou::to_iter_class : public iconvert::tou {
1089
1090 iter_t iter;
1091 public:
1092
1093 to_iter_class(iter_t iterValue)
1094 : iter(iterValue) {}
1095
1096 using tou::operator();
1097
1098 operator iter_t() const { return iter; }
1099
1100 private:
1101 int converted(const unicode_char *ptr, size_t cnt)
1102 {
1103 while (cnt)
1104 {
1105 *iter=*ptr;
1106
1107 ++iter;
1108 ++ptr;
1109 --cnt;
1110 }
1111 return 0;
1112 }
1113 };
1114
1115 template<typename input_iter_t,
1116 typename output_iter_t>
1117 output_iter_t iconvert::tou::convert(input_iter_t from_iter,
1118 input_iter_t to_iter,
1119 const std::string &chset,
1120 output_iter_t out_iter)
1121 {
1122 class to_iter_class<output_iter_t> out(out_iter);
1123
1124 if (!out.begin(chset))
1125 return out;
1126
1127 std::vector<char> string;
1128
1129 while (from_iter != to_iter)
1130 {
1131 string.push_back(*from_iter++);
1132
1133 if (string.size() > 31)
1134 {
1135 out(&string[0], string.size());
1136 string.clear();
1137 }
1138 }
1139
1140 if (string.size() > 0)
1141 out(&string[0], string.size());
1142
1143 out.end();
1144 return out;
1145 }
1146
1147 /* Convert output of iconvert from unicode_chars. */
1148
1149 class iconvert::fromu : public iconvert {
1150
1151 public:
1152 bool begin(const std::string &chset);
1153
1154 using iconvert::operator();
1155
1156 template<typename iter_t> class to_iter_class;
1157
1158 template<typename input_iter_t,
1159 typename output_iter_t>
1160 static output_iter_t convert(input_iter_t from_iter,
1161 input_iter_t to_iter,
1162 const std::string &chset,
1163 output_iter_t out_iter);
1164
1165 template<typename input_iter_t>
1166 static void convert(input_iter_t from_iter,
1167 input_iter_t to_iter,
1168 const std::string &chset,
1169 std::string &out_buf)
1170 {
1171 out_buf="";
1172 std::back_insert_iterator<std::string>
1173 insert_iter(out_buf);
1174
1175 convert(from_iter, to_iter, chset, insert_iter);
1176 }
1177
1178 static void convert(const std::vector<unicode_char> &ubuf,
1179 const std::string &chset,
1180 std::string &out_buf);
1181
1182 static std::string convert(const std::vector<unicode_char>
1183 &ubuf,
1184 const std::string &chset);
1185 };
1186
/* Helper class that saves converted char output into an output iterator */
1188
1189 template<typename iter_t>
1190 class iconvert::fromu::to_iter_class : public iconvert::fromu {
1191
1192 iter_t iter;
1193 public:
1194
1195 to_iter_class(iter_t iterValue)
1196 : iter(iterValue) {}
1197
1198 using fromu::operator();
1199
1200 operator iter_t() const { return iter; }
1201
1202 private:
1203 int converted(const char *ptr, size_t cnt)
1204 {
1205 while (cnt)
1206 {
1207 *iter=*ptr;
1208
1209 ++iter;
1210 ++ptr;
1211 --cnt;
1212 }
1213 return 0;
1214 }
1215 };
1216
1217 template<typename input_iter_t,
1218 typename output_iter_t>
1219 output_iter_t iconvert::fromu::convert(input_iter_t from_iter,
1220 input_iter_t to_iter,
1221 const std::string &chset,
1222 output_iter_t out_iter)
1223 {
1224 class to_iter_class<output_iter_t> out(out_iter);
1225
1226 if (!out.begin(chset))
1227 return out;
1228
1229 std::vector<unicode_char> string;
1230
1231 while (from_iter != to_iter)
1232 {
1233 string.push_back(*from_iter++);
1234
1235 if (string.size() > 31)
1236 {
1237 out(&string[0], string.size());
1238 string.clear();
1239 }
1240 }
1241
1242 if (string.size() > 0)
1243 out(&string[0], string.size());
1244
1245 out.end();
1246 return out;
1247 }
1248
1249 /*
1250 ** Unicode linebreaking algorithm, tr14.
1251 */
1252
1253 extern "C" int linebreak_trampoline(int value, void *ptr);
1254 extern "C" int linebreakc_trampoline(int value, unicode_char ch,
1255 void *ptr);
1256
1257 /*
1258 ** Subclass linebreak_callback_base, implement operator()(int).
1259 **
1260 ** Use operator<< or operator()(iterator, iterator) to feed
1261 ** unicode_chars into the linebreaking algorithm. The subclass receives
1262 ** UNICODE_LB values, as they become available.
1263 */
1264
1265 class linebreak_callback_base {
1266
1267 unicode_lb_info_t handle;
1268
1269 int opts;
1270
1271 linebreak_callback_base(const linebreak_callback_base &);
1272 /* NOT IMPLEMENTED */
1273
1274 linebreak_callback_base &operator==(const
1275 linebreak_callback_base &);
1276 /* NOT IMPLEMENTED */
1277
1278 public:
1279 linebreak_callback_base();
1280 virtual ~linebreak_callback_base();
1281
1282 void finish();
1283
1284 void set_opts(int opts);
1285
1286 friend int linebreak_trampoline(int, void *);
1287
1288 linebreak_callback_base &operator<<(unicode_char uc);
1289
1290 template<typename iter_type>
1291 linebreak_callback_base &operator()(iter_type beg_iter,
1292 iter_type end_iter)
1293 {
1294 while (beg_iter != end_iter)
1295 operator<<(*beg_iter++);
1296 return *this;
1297 }
1298
1299 linebreak_callback_base &operator<<(const
1300 std::vector<unicode_char>
1301 &vec)
1302 {
1303 return operator()(vec.begin(), vec.end());
1304 }
1305 private:
1306 virtual int operator()(int);
1307 };
1308
1309 class linebreak_callback_save_buf : public linebreak_callback_base {
1310
1311 public:
1312 std::list<int> lb_buf;
1313
1314 linebreak_callback_save_buf();
1315 ~linebreak_callback_save_buf();
1316
1317 private:
1318 int operator()(int value);
1319 };
1320
1321 /*
1322 ** Convert an input iterator sequence over unicode_chars into
1323 ** an input iterator sequence over linebreak values.
1324 */
1325
1326 template<typename input_t> class linebreak_iter
1327 : public std::iterator<std::input_iterator_tag, int, void>
1328 {
1329 mutable input_t iter_value, end_iter_value;
1330
1331 mutable linebreak_callback_save_buf *buf;
1332
1333 void fill() const
1334 {
1335 if (buf == NULL)
1336 return;
1337
1338 while (buf->lb_buf.empty())
1339 {
1340 if (iter_value == end_iter_value)
1341 {
1342 buf->finish();
1343 if (buf->lb_buf.empty())
1344 {
1345 delete buf;
1346 buf=NULL;
1347 }
1348 break;
1349 }
1350
1351 buf->operator<<(*iter_value++);
1352 }
1353 }
1354
1355 mutable value_type bufvalue;
1356
1357 public:
1358 linebreak_iter(const input_t &iter_valueArg,
1359 const input_t &iter_endvalueArg)
1360 : iter_value(iter_valueArg),
1361 end_iter_value(iter_endvalueArg),
1362 buf(new linebreak_callback_save_buf)
1363 {
1364 }
1365
1366 linebreak_iter() : buf(NULL)
1367 {
1368 }
1369
1370 void set_opts(int opts)
1371 {
1372 if (buf)
1373 buf->set_opts(opts);
1374 }
1375
1376 ~linebreak_iter()
1377 {
1378 if (buf)
1379 delete buf;
1380 }
1381
1382 linebreak_iter(const linebreak_iter<input_t> &v)
1383 : buf(NULL)
1384 {
1385 operator=(v);
1386 }
1387
1388 linebreak_iter<input_t> &operator=(const
1389 linebreak_iter<input_t> &v)
1390 {
1391 if (buf)
1392 delete buf;
1393 buf=v.buf;
1394 iter_value=v.iter_value;
1395 end_iter_value=v.end_iter_value;
1396 v.buf=NULL;
1397 return *this;
1398 }
1399
1400 bool operator==(const linebreak_iter<input_t> &v) const
1401 {
1402 fill();
1403 v.fill();
1404
1405 return buf == NULL && v.buf == NULL;
1406 }
1407
1408 bool operator!=(const linebreak_iter<input_t> &v) const
1409 {
1410 return !operator==(v);
1411 }
1412
1413 value_type operator*() const
1414 {
1415 fill();
1416 return buf == NULL ? UNICODE_LB_MANDATORY:
1417 buf->lb_buf.front();
1418 }
1419
1420 linebreak_iter<input_t> &operator++()
1421 {
1422 bufvalue=operator*();
1423
1424 if (buf)
1425 buf->lb_buf.pop_front();
1426 return *this;
1427 }
1428
1429 const value_type *operator++(int)
1430 {
1431 operator++();
1432 return &bufvalue;
1433 }
1434 };
1435
1436 /*
1437 ** Like linebreak_callback_base, except the subclass receives both
1438 ** the linebreaking value, and the unicode character.
1439 */
1440
1441 class linebreakc_callback_base {
1442
1443 unicode_lbc_info_t handle;
1444
1445 int opts;
1446
1447 linebreakc_callback_base(const linebreakc_callback_base &);
1448 /* NOT IMPLEMENTED */
1449
1450 linebreakc_callback_base &operator==(const
1451 linebreakc_callback_base
1452 &);
1453 /* NOT IMPLEMENTED */
1454
1455
1456 public:
1457 linebreakc_callback_base();
1458 virtual ~linebreakc_callback_base();
1459
1460 void finish();
1461
1462 void set_opts(int opts);
1463
1464 friend int linebreakc_trampoline(int, unicode_char, void *);
1465
1466 linebreakc_callback_base &operator<<(unicode_char uc);
1467
1468 template<typename iter_type>
1469 linebreakc_callback_base &operator()(iter_type beg_iter,
1470 iter_type end_iter)
1471 {
1472 while (beg_iter != end_iter)
1473 operator<<(*beg_iter++);
1474 return *this;
1475 }
1476
1477 linebreakc_callback_base &operator<<(const
1478 std::vector<unicode_char>
1479 &vec)
1480 {
1481 return operator()(vec.begin(), vec.end());
1482 }
1483 private:
1484 virtual int operator()(int, unicode_char);
1485 };
1486
1487 class linebreakc_callback_save_buf : public linebreakc_callback_base {
1488
1489 public:
1490 std::list<std::pair<int, unicode_char> > lb_buf;
1491
1492 linebreakc_callback_save_buf();
1493 ~linebreakc_callback_save_buf();
1494
1495 private:
1496 int operator()(int, unicode_char);
1497 };
1498
1499
1500 /*
1501 ** Convert an input iterator sequence over unicode_chars into
1502 ** an input iterator sequence over std::pair<int, unicode_char>,
1503 ** the original unicode character, and the linebreaking value before
1504 ** the character.
1505 */
1506
1507 template<typename input_t> class linebreakc_iter
1508 : public std::iterator<std::input_iterator_tag,
1509 std::pair<int, unicode_char>, void>
1510 {
1511 mutable input_t iter_value, end_iter_value;
1512
1513 mutable linebreakc_callback_save_buf *buf;
1514
1515 void fill() const
1516 {
1517 if (buf == NULL)
1518 return;
1519
1520 while (buf->lb_buf.empty())
1521 {
1522 if (iter_value == end_iter_value)
1523 {
1524 buf->finish();
1525 if (buf->lb_buf.empty())
1526 {
1527 delete buf;
1528 buf=NULL;
1529 }
1530 break;
1531 }
1532
1533 buf->operator<<(*iter_value);
1534 ++iter_value;
1535 }
1536 }
1537
1538 mutable value_type bufvalue;
1539
1540 public:
1541 linebreakc_iter(const input_t &iter_valueArg,
1542 const input_t &iter_endvalueArg)
1543 : iter_value(iter_valueArg),
1544 end_iter_value(iter_endvalueArg),
1545 buf(new linebreakc_callback_save_buf)
1546 {
1547 }
1548
1549 linebreakc_iter() : buf(NULL)
1550 {
1551 }
1552
1553 ~linebreakc_iter()
1554 {
1555 if (buf)
1556 delete buf;
1557 }
1558
1559 linebreakc_iter(const linebreakc_iter<input_t> &v)
1560 : buf(NULL)
1561 {
1562 operator=(v);
1563 }
1564
1565 linebreakc_iter<input_t> &operator=(const
1566 linebreakc_iter<input_t> &v)
1567 {
1568 if (buf)
1569 delete buf;
1570 buf=v.buf;
1571 iter_value=v.iter_value;
1572 end_iter_value=v.end_iter_value;
1573 v.buf=NULL;
1574 return *this;
1575 }
1576
1577 bool operator==(const linebreakc_iter<input_t> &v) const
1578 {
1579 fill();
1580 v.fill();
1581
1582 return buf == NULL && v.buf == NULL;
1583 }
1584
1585 bool operator!=(const linebreakc_iter<input_t> &v) const
1586 {
1587 return !operator==(v);
1588 }
1589
1590 value_type operator*() const
1591 {
1592 fill();
1593 return buf == NULL ?
1594 std::make_pair(UNICODE_LB_MANDATORY,
1595 (unicode_char)0):
1596 buf->lb_buf.front();
1597 }
1598
1599 linebreakc_iter<input_t> &operator++()
1600 {
1601 bufvalue=operator*();
1602
1603 if (buf)
1604 buf->lb_buf.pop_front();
1605 return *this;
1606 }
1607
1608 const value_type *operator++(int)
1609 {
1610 operator++();
1611 return &bufvalue;
1612 }
1613 };
1614
1615
/*
** Subclass wordbreak_callback_base, implement operator()(bool).
**
** Use operator<< or operator()(iterator, iterator) to feed
** unicode_chars into the wordbreaking algorithm. The subclass receives
** word flags, as they become available.
*/
1623
/*
** A function with C linkage; wordbreak_callback_base declares it as
** a friend.
*/
extern "C" {
	int wordbreak_trampoline(int value, void *ptr);
}
1625
1626 class wordbreak_callback_base {
1627
1628 unicode_wb_info_t handle;
1629
1630 wordbreak_callback_base(const wordbreak_callback_base &);
1631 /* NOT IMPLEMENTED */
1632
1633 wordbreak_callback_base &operator==(const
1634 wordbreak_callback_base &);
1635 /* NOT IMPLEMENTED */
1636
1637 public:
1638 wordbreak_callback_base();
1639 virtual ~wordbreak_callback_base();
1640
1641 void finish();
1642
1643 friend int wordbreak_trampoline(int, void *);
1644
1645 wordbreak_callback_base &operator<<(unicode_char uc);
1646
1647 template<typename iter_type>
1648 wordbreak_callback_base &operator()(iter_type beg_iter,
1649 iter_type end_iter)
1650 {
1651 while (beg_iter != end_iter)
1652 operator<<(*beg_iter++);
1653 return *this;
1654 }
1655
1656 wordbreak_callback_base &operator<<(const
1657 std::vector<unicode_char>
1658 &vec)
1659 {
1660 return operator()(vec.begin(), vec.end());
1661 }
1662 private:
1663 virtual int operator()(bool);
1664 };
1665
1666 /*
1667 ** A C++ wrapper for unicode_wbscan.
1668 */
1669
1670 class wordbreakscan {
1671
1672 unicode_wbscan_info_t handle;
1673
1674 wordbreakscan(const wordbreakscan &);
1675 /* NOT IMPLEMENTED */
1676
1677 wordbreakscan &operator==(const wordbreakscan &);
1678 /* NOT IMPLEMENTED */
1679 public:
1680
1681 wordbreakscan();
1682 ~wordbreakscan();
1683
1684 bool operator<<(unicode_char uc);
1685
1686 size_t finish();
1687 };
1688
1689}
1690#endif
1691
1692#endif