libs/unicode/unicode.h

   1 #ifndef unicode_h
   2 #define unicode_h
   3
   4 /*
   5 ** Copyright 2000-2011 Double Precision, Inc.
   6 ** See COPYING for distribution information.
   7 **
   8 */
   9
  10 #ifdef  __cplusplus
  11
  12 #include <string>
  13 #include <vector>
  14 #include <list>
  15
  16 extern "C" {
  17 #endif
  18
  19 #if 0
  20 }
  21 #endif
  22
  23 #include        "unicode/unicode_config.h"
  24
  25 #include        <stdlib.h>
  26
  27 #include        <stdio.h>
  28 #if HAVE_WCHAR_H
  29 #include        <wchar.h>
  30 #endif
  31
  32 #if HAVE_STDDEF_H
  33 #include        <stddef.h>
  34 #endif
  35 #include        <stdint.h>
  36
  37 #include        <sys/types.h>
  38
  39 typedef uint32_t unicode_char;
  40
  41 /*
  42 ** The system default character set, from the locale.
  43 */
  44
  45 extern const char *unicode_default_chset();
  46
  47 /* Unicode upper/lower/title case conversion functions */
  48
  49 extern unicode_char unicode_uc(unicode_char);
  50 extern unicode_char unicode_lc(unicode_char);
  51 extern unicode_char unicode_tc(unicode_char);
  52
  53 /*
  54 ** Look up HTML 4.0/XHTML entity.
  55 **
  56 ** n="amp", etc...
  57 **
  58 ** Returns the unicode entity value, or 0 if no such entity is defined.
  59 */
  60
  61 unicode_char unicode_html40ent_lookup(const char *n);
  62
  63 /*
  64 **
  65 ** Return "width" of unicode character.
  66 **
  67 ** This is defined as follows: for characters having the F or W property in
  68 ** tr11 (EastAsianWidth), unicode_wcwidth() returns 2.
  69 **
   70 ** Otherwise, for characters having the BK, CR, LF, CM, NL, WJ, or ZW line
   71 ** breaking property as per tr14, unicode_wcwidth() returns 0. For all other
   72 ** cases, 1.
   73 **
   74 ** This provides a rough estimate of the "width" of the character if it's
   75 ** shown on a text console.
  76 */
  77
  78 extern int unicode_wcwidth(unicode_char c);
  79 extern size_t unicode_wcwidth_str(const unicode_char *c);
  80
  81 /*
  82 ** The unicode-ish isspace()
  83 */
  84 extern int unicode_isspace(unicode_char ch);
  85
  86 /* Internal unicode table lookup function */
  87
  88 extern uint8_t unicode_tab_lookup(unicode_char ch,
  89                                   const size_t *unicode_indextab,
  90                                   size_t unicode_indextab_sizeof,
  91                                   const uint8_t (*unicode_rangetab)[2],
  92                                   const uint8_t *unicode_classtab,
  93                                   uint8_t uclass);
  94
  95 /*
  96 ** Implementation of grapheme cluster boundary rules, as per tr29,
  97 ** including  GB9a and GB9b.
  98 **
  99 ** Returns non-zero if there's a grapheme break between the two referenced
 100 ** characters.
 101 */
 102
 103 int unicode_grapheme_break(unicode_char a, unicode_char b);
 104
 105 /*
 106 ** Implementation of line break rules, as per tr14.
 107 **
 108 ** Invoke unicode_lb_init() to initialize the linebreaking algorithm. The
 109 ** first parameter is a callback function that gets invoked with two
 110 ** arguments: UNICODE_LB_{MANDATORY|NONE|ALLOWED}, and a passthrough argument.
 111 ** The second parameter to unicode_lb_init() is the opaque passthrough
 112 ** pointer, that is passed as the second argument to the callback function
 113 ** with no further interpretation.
 114 **
 115 ** unicode_lb_init() returns an opaque handle. Invoke unicode_lb_next(),
 116 ** passing the handle and one unicode character. Repeatedly invoke
 117 ** unicode_lb_next() to specify the input string for the linebreaking
 118 ** algorithm, then invoke unicode_lb_end() to finish calculating the
 119 ** linebreaking algorithm, and deallocate the opaque linebreaking handle.
 120 **
 121 ** The callback function gets invoked once for each invocation of
 122 ** unicode_lb_next(). The contract is that before unicode_lb_end() returns,
 123 ** the callback function will get invoked the exact number of times that
  124 ** unicode_lb_next() was invoked, as long as each callback invocation
 125 ** returned 0; nothing more, nothing less. The first parameter to the callback
 126 ** function will be one of the following values:
 127 **
 128 ** UNICODE_LB_MANDATORY - a linebreak is MANDATORY before the corresponding
 129 ** character.
 130 ** UNICODE_LB_NONE - a linebreak is PROHIBITED before the corresponding
 131 ** character.
 132 ** UNICODE_LB_ALLOWED - a linebreak is OPTIONAL before the corresponding
 133 ** character (the preceding character is a space, or an equivalent).
 134 **
 135 ** The callback function should return 0. A non-zero value indicates an
 136 ** error, which gets propagated up to the caller. The contract that the
 137 ** callback function gets invoked the same number of times that
 138 ** unicode_lb_next() gets invoked is now broken.
 139 */
 140
 141 #define UNICODE_LB_MANDATORY    -1
 142 #define UNICODE_LB_NONE         0
 143 #define UNICODE_LB_ALLOWED      1
 144
 145 struct unicode_lb_info;
 146
 147 typedef struct unicode_lb_info *unicode_lb_info_t;
 148
 149 /*
 150 ** Allocate a linebreaking handle.
 151 */
 152 extern unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
 153                                          void *cb_arg);
 154
 155 /*
 156 ** Feed the next character through the linebreaking algorithm.
 157 ** A non-zero return code indicates that the callback function was invoked
 158 ** and it returned a non-zero return code (which is propagated as a return
 159 ** value). unicode_lb_end() must still be invoked, in this case.
 160 **
 161 ** A zero return code indicates that if the callback function was invoked,
 162 ** it returned 0.
 163 */
 164
 165 extern int unicode_lb_next(unicode_lb_info_t i, unicode_char ch);
 166
 167 /*
 168 ** Convenience function that invokes unicode_lb_next() with a list of
 169 ** unicode chars. Returns 0 if all invocations of unicode_lb_next() returned
 170 ** 0, or the first non-zero return value from unicode_lb_next().
 171 */
 172
 173 extern int unicode_lb_next_cnt(unicode_lb_info_t i,
 174                                const unicode_char *chars,
 175                                size_t cnt);
 176
 177 /*
 178 ** Finish the linebreaking algorithm.
 179 **
 180 ** A non-zero return code indicates that the callback function was invoked
 181 ** and it returned a non-zero return code (which is propagated as a return
 182 ** value).
 183 **
 184 ** A zero return code indicates that if the callback function was invoked,
 185 ** it returned 0, and that the callback function was invoked exactly the same
 186 ** number of times that unicode_lb_next() was invoked.
 187 **
  188 ** In all cases, the linebreak handle will no longer be valid when this
 189 ** function returns.
 190 */
 191
 192 extern int unicode_lb_end(unicode_lb_info_t i);
 193
 194 /*
 195 ** An alternative linebreak API where the callback function receives the
 196 ** original unicode character in addition to its linebreak value.
 197 **
  198 ** Use unicode_lbc_init(), unicode_lbc_next(), and unicode_lbc_end(), whose
 199 ** semantics are the same as their _lb_ counterparts.
 200 */
 201
 202 struct unicode_lbc_info;
 203
 204 typedef struct unicode_lbc_info *unicode_lbc_info_t;
 205
 206 extern unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char,
 207                                                           void *),
 208                                            void *cb_arg);
 209 extern int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch);
 210 extern int unicode_lbc_end(unicode_lbc_info_t i);
 211
 212 /*
 213 ** Set linebreaking options.
 214 **
 215 ** OPTIONS SUBJECT TO CHANGE.
 216 */
 217
 218 extern void unicode_lb_set_opts(unicode_lb_info_t i, int opts);
 219
 220 extern void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts);
 221
 222 /*
  223 ** Tailoring of LB24: Prevent pluses, as in "C++", from breaking.
 224 **
 225 ** Adds the following to LB24:
 226 **
 227 **            PR x PR
 228 **
 229 **            AL x PR
 230 **
 231 **            ID x PR
 232 **/
 233 #define UNICODE_LB_OPT_PRBREAK 0x0001
 234
 235
 236 /*
 237 ** Tailored / breaking rules.
 238 **
 239 ** Adds the following rule to LB13:
 240 **
 241 **            SY x EX
 242 **
 243 **            SY x AL
 244 **
 245 **            SY x ID
 246 **
 247 **            SP ÷ SY, which takes precedence over "x SY".
 248 */
 249 #define UNICODE_LB_OPT_SYBREAK 0x0002
 250
 251 /*
 252 ** Tailored / breaking rules.
 253 **
 254 ** This reclassifies U+2013 and U+2014 as class WJ, prohibiting breaks before
 255 ** and after mdash and ndash.
 256 */
 257 #define UNICODE_LB_OPT_DASHWJ 0x0004
 258
 259 /*
  260 ** Implementation of word break rules, as per tr29.
 261 **
 262 ** Invoke unicode_wb_init() to initialize the wordbreaking algorithm. The
 263 ** first parameter is a callback function that gets invoked with two
 264 ** arguments: an int flag, and a passthrough argument. The second parameter to
 265 ** unicode_wb_init() is the opaque passthrough pointer, that is passed as the
 266 ** second argument to the callback function with no further interpretation.
 267 **
 268 ** unicode_wb_init() returns an opaque handle. Invoke unicode_wb_next(),
 269 ** passing the handle and one unicode character. Repeatedly invoke
 270 ** unicode_wb_next() to specify the input string for the wordbreaking
 271 ** algorithm, then invoke unicode_wb_end() to finish calculating the
 272 ** wordbreaking algorithm, and deallocate the opaque wordbreaking handle.
 273 **
 274 ** The callback function gets invoked once for each invocation of
 275 ** unicode_wb_next(). The contract is that before unicode_wb_end() returns,
 276 ** the callback function will get invoked the exact number of times that
  277 ** unicode_wb_next() was invoked, as long as each callback invocation
 278 ** returned 0; nothing more, nothing less. The first parameter to the callback
 279 ** function will be an int. A non-zero value indicates that there is a word
 280 ** break between this character and the preceding one.
 281 **
 282 ** The callback function should return 0. A non-zero value indicates an
 283 ** error, which gets propagated up to the caller. The contract that the
 284 ** callback function gets invoked the same number of times that
  285 ** unicode_wb_next() gets invoked is now broken.
 286 */
 287
 288 struct unicode_wb_info;
 289
 290 typedef struct unicode_wb_info *unicode_wb_info_t;
 291
 292 /*
 293 ** Allocate a wordbreaking handle.
 294 */
 295 extern unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
 296                                          void *cb_arg);
 297
 298 /*
 299 ** Feed the next character through the wordbreaking algorithm.
 300 ** A non-zero return code indicates that the callback function was invoked
 301 ** and it returned a non-zero return code (which is propagated as a return
 302 ** value). unicode_wb_end() must still be invoked, in this case.
 303 **
 304 ** A zero return code indicates that if the callback function was invoked,
 305 ** it returned 0.
 306 */
 307
 308 extern int unicode_wb_next(unicode_wb_info_t i, unicode_char ch);
 309
 310 /*
 311 ** Convenience function that invokes unicode_wb_next() with a list of
 312 ** unicode chars. Returns 0 if all invocations of unicode_wb_next() returned
 313 ** 0, or the first non-zero return value from unicode_wb_next().
 314 */
 315
 316 extern int unicode_wb_next_cnt(unicode_wb_info_t i,
 317                                const unicode_char *chars,
 318                                size_t cnt);
 319
 320 /*
 321 ** Finish the wordbreaking algorithm.
 322 **
 323 ** A non-zero return code indicates that the callback function was invoked
 324 ** and it returned a non-zero return code (which is propagated as a return
 325 ** value).
 326 **
 327 ** A zero return code indicates that if the callback function was invoked,
 328 ** it returned 0, and that the callback function was invoked exactly the same
 329 ** number of times that unicode_wb_next() was invoked.
 330 **
  331 ** In all cases, the wordbreak handle will no longer be valid when this
 332 ** function returns.
 333 */
 334
 335 extern int unicode_wb_end(unicode_wb_info_t i);
 336
 337 /*
 338 ** Search for a word boundary.
 339 **
 340 ** Obtain a handle by calling unicode_wbscan_init(), then invoke
 341 ** unicode_wbscan_next() to provide a unicode stream, then invoke
 342 ** unicode_wbscan_end(). unicode_wbscan_end() returns the number of unicode
 343 ** characters from the beginning of the stream until the first word boundary.
 344 **
 345 ** You may prematurely stop calling unicode_wbscan_next() once it returns a
 346 ** non-0 value, which means that there is sufficient context to compute the
 347 ** first word boundary, and all further calls to unicode_wbscan_next() will
 348 ** be internal no-ops.
 349 */
 350
 351 struct unicode_wbscan_info;
 352
 353 typedef struct unicode_wbscan_info *unicode_wbscan_info_t;
 354
 355 unicode_wbscan_info_t unicode_wbscan_init();
 356
 357 int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch);
 358
 359 size_t unicode_wbscan_end(unicode_wbscan_info_t i);
 360
 361 /*
 362 ** A buffer that holds unicode characters, and dynamically grows as needed.
 363 */
 364
  365 struct unicode_buf {
  366         unicode_char *ptr;      /* The characters; read via unicode_buf_ptr() */
  367         size_t size,            /* Buffer size */
  368                 len,            /* Initialized chars in ptr; see unicode_buf_len() */
  369                 max;            /* Maximum growth size; see unicode_buf_init() */
  370 };
 371
 372 /*
 373 ** Initialize a buffer. Constructor.
 374 */
 375
 376 void unicode_buf_init(/* Initialize this structure. ptr, size, len cleared */
 377                       struct unicode_buf *p,
 378
 379                       /*
 380                       ** Maximum size the buffer can grow to. (size_t)-1
 381                       ** means unlimited.
 382                       */
 383                       size_t max);
  384 /*
  385 ** Like unicode_buf_init(), but also copies the contents of buffer b into the
  386 ** new buffer a. The new buffer's maximum size is exactly the number of
  387 ** characters in b, so the copy uses the minimum amount of heap space. Note:
  388 ** b is evaluated more than once, and unicode_buf_append()'s result is dropped.
  389 */
 390
 391 #define unicode_buf_init_copy(a,b)                              \
 392         do {                                                    \
 393                 unicode_buf_init((a), unicode_buf_len(b));      \
 394                 unicode_buf_append_buf((a),(b));                \
 395         } while (0)
 396
 397 /*
 398 ** Deinitialize the buffer. Destructor. Frees memory.
 399 */
 400
 401 void unicode_buf_deinit(struct unicode_buf *p);
 402
 403 /*
 404 ** Official way to access the characters in the unicode buffer.
 405 */
 406 #define unicode_buf_ptr(p) ((p)->ptr)
 407
 408 /*
 409 ** Official way of obtaining the number of characters in the unicode buffer.
 410 */
 411 #define unicode_buf_len(p) ((p)->len)
 412
 413 /*
 414 ** Remove all existing characters from an initialized buffer. Sets len to 0.
 415 */
 416
 417 #define unicode_buf_clear(p) ((p)->len=0)
 418
 419 /*
 420 ** Append characters to the existing characters in the unicode buffer.
 421 ** The buffer grows, if needed. If the buffer would exceed its maximum size,
 422 ** the extra characters get truncated.
 423 **
 424 ** Returns 0 if the characters were appended. -1 for a malloc failure.
 425 */
 426
 427 int unicode_buf_append(struct unicode_buf *p,   /* The buffer */
 428                        const unicode_char *uc,  /* Characters to append */
 429                        size_t l);               /* How many of them */
 430
 431 /*
 432 ** Convert an iso-8859-1 char string and invoke unicode_buf_append().
 433 */
 434
 435 void unicode_buf_append_char(struct unicode_buf *dst,
 436                              const char *str,
 437                              size_t cnt);
 438
 439 /*
 440 ** Remove some portion of the unicode buffer
 441 */
 442
 443 void unicode_buf_remove(struct unicode_buf *p, /* The buffer */
 444                         size_t pos, /* Offset in buffer */
 445                         size_t cnt); /* How many to remove */
 446
 447 /*
 448 ** Append the contents of an existing buffer to another one.
 449 */
 450
 451 #define unicode_buf_append_buf(a,b)                                     \
 452         unicode_buf_append((a), unicode_buf_ptr(b), unicode_buf_len(b))
 453
 454
 455 /*
 456 ** The equivalent of strcmp() for unicode buffers.
 457 */
 458
 459 int unicode_buf_cmp(const struct unicode_buf *a,
 460                     const struct unicode_buf *b);
 461
 462 /*
 463 ** The equivalent of unicode_buf_cmp, except that the second buffer is an
 464 ** iso-8859-1 string.
 465 */
 466
 467 int unicode_buf_cmp_str(const struct unicode_buf *p,
 468                         const char *c,  /* iso-8859-1 string */
 469                         size_t cl);     /* Number of chars in c */
 470
 471 /*
 472 ** A wrapper for iconv(3). This wrapper provides a different API for iconv(3).
 473 ** A handle gets created by libmail_u_convert_init().
 474 ** libmail_u_convert_init() receives a pointer to the output function
 475 ** which receives converted character text.
 476 **
 477 ** The output function receives a pointer to the converted character text, and
 478 ** the number of characters in the converted text.
 479 **
 480 ** The character text to convert gets passed, repeatedly, to
 481 ** libmail_u_convert(). Each call to libmail_u_convert() results in
 482 ** the output function being invoked, zero or more times, with the converted
 483 ** text. Finally, libmail_u_convert_deinit() stops the conversion and
 484 ** deallocates the conversion handle.
 485 **
 486 ** Internal buffering takes place. libmail_u_convert_deinit() may result
 487 ** in the output function being called one or more times, to receive the final
 488 ** part of the converted character stream.
 489 **
 490 ** The output function should return 0. A non-0 value causes
 491 ** libmail_u_convert() and/or libmail_u_convert_deinit() returning
 492 ** non-0.
 493 */
 494
 495 struct libmail_u_convert_hdr;
 496
 497 typedef struct libmail_u_convert_hdr *libmail_u_convert_handle_t;
 498
 499 /*
 500 ** libmail_u_convert_init() returns a non-NULL handle for the requested
 501 ** conversion, or NULL if the requested conversion is not available.
 502 */
 503
 504 libmail_u_convert_handle_t
 505 libmail_u_convert_init(/* Convert from this chset */
 506                        const char *src_chset,
 507
 508                        /* Convert to this chset */
 509                        const char *dst_chset,
 510
 511                        /* The output function */
 512
 513                        int (*output_func)(const char *, size_t, void *),
 514
 515                        /* Passthrough arg */
 516                        void *convert_arg);
 517
 518 /*
 519 ** Repeatedly pass the character text to convert to libmail_u_convert().
 520 **
 521 ** Returns non-0 if the output function returned non-0, or 0 if all invocations
 522 ** of the output function returned 0.
 523 */
 524
 525 int libmail_u_convert(/* The conversion handle */
 526                       libmail_u_convert_handle_t handle,
 527
 528                       /* Text to convert */
 529                       const char *text,
 530
 531                       /* Number of bytes to convert */
 532                       size_t cnt);
 533
 534 /*
 535 ** Finish character set conversion. The handle gets deallocated.
 536 **
 537 ** May still result in one or more invocations of the output function.
 538 ** Returns non-zero if any previous invocation of the output function returned
 539 ** non-zero (this includes any invocations of the output function resulting
 540 ** from this call, or prior libmail_u_convert() calls), or 0 if all
 541 ** invocations of the output function returned 0.
 542 **
 543 ** If the errptr is not NULL, *errptr is set to non-zero if there were any
 544 ** conversion errors -- if there was any text that could not be converted to
 545 ** the destination character text.
 546 */
 547
 548 int libmail_u_convert_deinit(libmail_u_convert_handle_t handle,
 549                              int *errptr);
 550
 551
 552 /*
 553 ** Specialization: save converted character text in a buffer.
 554 **
 555 ** Implementation: call libmail_u_convert_tocbuf_init() instead of
 556 ** libmail_u_convert_init(), then call libmail_u_convert() and
 557 ** libmail_u_convert_deinit(), as usual.
 558 **
 559 ** If libmail_u_convert_deinit() returns 0, *cbufptr_ret gets initialized to a
  560 ** malloc()ed buffer, and the number of converted characters (the size of the
  561 ** malloc()ed buffer) is placed into *cbufsize_ret. Both pointers are the ones
  562 ** that were passed to libmail_u_convert_tocbuf_init().
  563 **
  564 ** Note: if the converted string is an empty string, *cbufsize_ret is set to 0,
  565 ** but *cbufptr_ret still gets initialized (to a dummy malloced buffer).
 566 **
 567 ** The optional nullterminate places a trailing \0 character after the
 568 ** converted string (this is included in *cbufsize_ret).
 569 */
 570
 571 libmail_u_convert_handle_t
 572 libmail_u_convert_tocbuf_init(/* Convert from this chset */
 573                               const char *src_chset,
 574
 575                               /* Convert to this chset */
 576                               const char *dst_chset,
 577
 578                               /* malloced buffer */
 579                               char **cbufptr_ret,
 580
 581                               /* size of the malloced buffer */
 582                               size_t *cbufsize_ret,
 583
 584                               /* null terminate the resulting string */
 585                               int nullterminate
 586                               );
 587
 588
 589 /*
 590 ** Specialization: convert some character text to a unicode_char array.
 591 **
 592 ** This is like libmail_u_convert_tocbuf_init(), but converts to a unicode_char
 593 ** array.
 594 **
 595 ** The returned *ucsize_ret is initialized with the number of unicode_chars,
 596 ** rather than the byte count.
 597 **
 598 ** In all other ways, this function behaves identically to
 599 ** libmail_u_convert_tocbuf_init().
 600 */
 601
 602 libmail_u_convert_handle_t
 603 libmail_u_convert_tou_init(/* Convert from this chset */
 604                            const char *src_chset,
 605
 606                            /* malloc()ed buffer pointer, on exit. */
 607                            unicode_char **ucptr_ret,
 608
 609                            /* size of the malloc()ed buffer, upon exit */
 610                            size_t *ucsize_ret,
 611
 612                            /* If true, terminate with U+0x0000, for convenience */
 613                            int nullterminate
 614                            );
 615
 616 /*
 617 ** Specialization: convert a unicode_char array to some character text.
 618 **
 619 ** This is the opposite of libmail_u_convert_tou_init(). Call this to
 620 ** initialize the conversion handle, then use libmail_u_convert_uc()
 621 ** instead of libmail_u_convert.
 622 */
 623
 624 libmail_u_convert_handle_t
 625 libmail_u_convert_fromu_init(/* Convert to this chset */
 626                              const char *dst_chset,
 627
 628                              /* malloc()ed buffer pointer, on exit. */
 629                              char **cbufptr_ret,
 630
 631                              /* size of the malloc()ed buffer, upon exit */
 632                              size_t *cbufsize_ret,
 633
 634                              /* If true, terminate with U+0x0000, for convenience */
 635                              int nullterminate
 636                              );
 637
  638 int libmail_u_convert_uc(/* Handle from libmail_u_convert_fromu_init() */
  639                          libmail_u_convert_handle_t handle,
  640
  641                          /* Unicode characters to convert */
  642                          const unicode_char *text,
  643
  644                          /* NOTE(review): was "bytes"; likely a unicode_char count -- confirm */
  645                          size_t cnt);
 646
 647 /*
 648 ** Initialize conversion to UTF-8.
 649 **
 650 ** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the
 651 ** destination charset as UTF-8.
 652 */
 653
 654 libmail_u_convert_handle_t
 655 libmail_u_convert_tocbuf_toutf8_init(const char *src_chset,
 656                                      char **cbufptr_ret,
 657                                      size_t *cbufsize_ret,
 658                                      int nullterminate);
 659
 660 /*
 661 ** Initialize conversion from UTF-8.
 662 **
 663 ** This is a wrapper for libmail_u_convert_tocbuf_init() that specifies the
 664 ** source charset as UTF-8.
 665 */
 666
 667 libmail_u_convert_handle_t
 668 libmail_u_convert_tocbuf_fromutf8_init(const char *dst_chset,
 669                                        char **cbufptr_ret,
 670                                        size_t *cbufsize_ret,
 671                                        int nullterminate);
 672
  673 /*
  674 ** Convert a character string to UTF-8.
  675 **
  676 ** Returns a malloc-ed buffer holding the UTF-8 string, or NULL if an
  677 ** error occurred.
  678 */
  679 char *libmail_u_convert_toutf8(/* Text to convert to UTF-8 */
  680                                const char *text,
  681
  682                                /* Character set that text is converted from */
  683                                const char *charset,
  684
  685                                /*
  686                                ** If non-NULL, and a non-NULL pointer is
  687                                ** returned, *error is set to non-zero if
  688                                ** a character conversion error has occurred.
  689                                */
  690                                int *error);
 691
  692 /*
  693 ** Convert UTF-8 text to another character set.
  694 **
  695 ** Returns a malloc-ed buffer holding the string converted to the specified
  696 ** character set, or NULL if an error occurred.
  697 */
  698
  699 char *libmail_u_convert_fromutf8(/* A UTF-8 string */
  700                                  const char *text,
  701
  702                                  /*
  703                                  ** Convert the UTF-8 string to this character
  704                                  ** set.
  705                                  */
  706
  707                                  const char *charset,
  708
  709                                  /*
  710                                  ** If non-NULL, and a non-NULL pointer is
  711                                  ** returned, *error is set to non-zero if
  712                                  ** a character conversion error has occurred.
  713                                  */
  714                                  int *error);
 715
  716 /*
  717 ** Convert one charset to another charset, placing the result in a malloc-ed
  718 ** buffer.
  719 **
  720 ** Returns a malloc-ed buffer holding the string converted to the specified
  721 ** character set, or NULL if an error occurred.
  722 */
  723
  724 char *libmail_u_convert_tobuf(/* A string to convert */
  725                               const char *text,
  726
  727                               /*
  728                               ** String's charset.
  729                               */
  730
  731                               const char *charset,
  732
  733                               /*
  734                               ** Destination charset
  735                               */
  736                               const char *dstcharset,
  737
  738                               /*
  739                               ** If non-NULL, and a non-NULL pointer is
  740                               ** returned, *error is set to non-zero if
  741                               ** a character conversion error has occurred.
  742                               */
  743                               int *error);
 744
 745 /*
 746 ** Convenience function: call libmail_u_convert_tou_init(), feed the
 747 ** character string through libmail_u_convert(), then call
 748 ** libmail_u_convert_deinit().
 749 **
  750 ** If this function returns 0, *uc and *ucsize are set to a malloced buffer+size
 751 ** holding the unicode char array.
 752 */
 753
 754 int libmail_u_convert_tou_tobuf(/* Character text to convert */
 755                                 const char *text,
 756
 757                                 /* Number of characters */
 758                                 size_t text_l,
 759
 760                                 /* text's charset */
 761                                 const char *charset,
 762
 763                                 /*
 764                                 ** If this function returns 0, this gets
 765                                 ** initialized
 766                                 */
 767                                 unicode_char **uc,
 768
 769                                 /*
 770                                 ** Size of the allocated buffer
 771                                 */
 772                                 size_t *ucsize,
 773
 774                                 /*
 775                                 ** If not null and this function returns 0,
 776                                 ** this is set to non-0 if there
 777                                 ** was a conversion error (but the output
 778                                 ** buffer gets still allocated and
 779                                 ** initialized)
 780                                 */
 781                                 int *err);
 782
 783 /*
 784 ** Convenience function: call libmail_u_convert_fromu_init(), feed the
 785 ** unicode_array through libmail_u_convert_uc(), then call
 786 ** libmail_u_convert_deinit().
 787 **
  788 ** If this function returns 0, *c and *csize are set to a malloced buffer+size
  789 ** holding the converted character string.
 790 */
 791
  792 int libmail_u_convert_fromu_tobuf(/* Unicode array to convert to a char str */
  793                                   const unicode_char *utext,
  794
  795                                   /*
  796                                   ** Size of the unicode array.
  797                                   ** If this is (size_t)-1, utext is a
  798                                   ** 0-terminated array.
  799                                   */
  800                                   size_t utext_l,
  801
  802                                   /*
  803                                   ** Convert the unicode array to this charset.
  804                                   */
  805                                   const char *charset,
  806
  807                                   /*
  808                                   ** If libmail_u_convert_fromu_tobuf()
  809                                   ** returns 0, *c is set to a malloced
  810                                   ** buffer that holds the converted,
  811                                   ** 0-terminated string.
  812                                   */
  813                                   char **c,
  814
  815                                   /*
  816                                   ** Size of the initialized array, including
  817                                   ** the 0-terminator.
  818                                   */
  819                                   size_t *csize,
  820
  821                                   /*
  822                                   ** If libmail_u_convert_fromu_tobuf()
  823                                   ** returns 0 and this is not NULL,
  824                                   ** *err is set to non-0 if there was a
  825                                   ** conversion error to the requested
  826                                   ** character set.
  827                                   */
  828                                   int *err);
 829
 830 /*
 831 ** Convenience function: convert a string in a given character set
 832 ** to/from uppercase, lowercase, or something else.
 833 **
 834 ** This is done by calling libmail_u_convert_tou_tobuf() first,
 835 ** applying first_char_func and char_func, then using
 836 ** libmail_u_convert_fromu_tobuf().
 837 **
 838 ** A NULL return indicates that the requested conversion cannot be performed.
 839 */
 840
 841 char *libmail_u_convert_tocase( /* String to convert */
 842                                const char *str,
 843
 844                                /* String's character set */
 845
 846                                const char *charset,
 847
 848                                /*
 849                                ** Conversion of the first character in
 850                                ** str: unicode_uc, unicode_lc, or unicode_tc.
 851                                */
 852
 853                                unicode_char (*first_char_func)(unicode_char),
 854
 855                                /*
 856                                ** Conversion of the second and all remaining
 857                                ** characters in str. If NULL, same as
 858                                ** first_char_func.
 859                                */
 860                                unicode_char (*char_func)(unicode_char));
 861
 862
 863
 864 /* Either UCS-4BE or UCS-4LE, matching the native unicode_char endianness */
 865
 866 extern const char libmail_u_ucs4_native[];
 867
 868 /* Either UCS-2BE or UCS-2LE, matching the native unicode_char endianness */
 869
 870 extern const char libmail_u_ucs2_native[];
 871
 872 /*
 873 ** Modified-UTF7 encoding used for IMAP folder names. Pass it as the charset
 874 ** parameter.
 875 **
 876 ** The name can be followed by a " " and up to 15 characters to be escaped in
 877 ** addition to unicode chars.
 878 */
 879
 880 #define unicode_x_imap_modutf7 "x-imap-modutf7"
 881
 882 #if 0
 883 {
 884 #endif
 885
 886 #ifdef  __cplusplus
 887 }
 888 /* C++ overload of unicode_wcwidth() taking a vector of unicode_chars */
 889 extern size_t unicode_wcwidth(const std::vector<unicode_char> &uc);
 890
 891 namespace mail {
 892
 893         /*
 894         ** Interface to iconv.
 895         **
 896         ** Subclass and override converted(). Invoke begin(), then operator()
 897         ** repeatedly, then end().
 898         **
 899         ** converted() receives the converted text.
 900         */
 901
 902         class iconvert {
 903
 904                 libmail_u_convert_handle_t handle;
 905
 906         public:
 907                 iconvert();
 908                 virtual ~iconvert();
 909
 910                 /* Start conversion.
 911                 ** Returns false if the requested conversion cannot be done.
 912                 **/
 913
 914                 bool begin(/* Convert from */
 915                            const std::string &src_chset,
 916
 917                            /* Convert to */
 918                            const std::string &dst_chset);
 919
 920                 /* Feed iconv(3). Returns false if the conversion was aborted.
 921                  */
 922
 923                 bool operator()(const char *, size_t);
 924
 925                 bool operator()(const unicode_char *, size_t);
 926
 927                 /*
 928                 ** Get the results here. If the subclass returns a non-0
 929                 ** value, the conversion is aborted.
 930                 */
 931
 932                 virtual int converted(const char *, size_t);
 933
 934                 /*
 935                 ** End of conversion.
 936                 **
 937                 ** Returns true if all calls to converted() returned 0,
 938                 ** false if the conversion was aborted.
 939                 **
 940                 ** errflag is set to true if there was a character that could
 941                 ** not be converted, and passed to converted().
 942                 */
 943
 944                 bool end(bool &errflag)
 945                 {
 946                         return end(&errflag);
 947                 }
 948                 /* End of conversion, without an error flag (passes NULL) */
 949                 bool end()
 950                 {
 951                         return end(NULL);
 952                 }
 953
 954                 /* Convert between two different charsets */
 955
 956                 static std::string convert(const std::string &text,
 957                                            const std::string &charset,
 958                                            const std::string &dstcharset,
 959                                            bool &errflag);
 960
 961                 /* Convert between two charsets, discarding the error flag */
 962
 963                 static std::string convert(const std::string &text,
 964                                            const std::string &charset,
 965                                            const std::string &dstcharset)
 966                 {
 967                         bool dummy;
 968
 969                         return convert(text, charset, dstcharset, dummy);
 970                 }
 971
 972                 /* Convert from unicode to a charset */
 973
 974                 static std::string convert(const std::vector<unicode_char> &uc,
 975                                            const std::string &dstcharset,
 976                                            bool &errflag);
 977
 978                 /* Convert from unicode to a charset, discarding the error flag */
 979
 980                 static std::string convert(const std::vector<unicode_char> &uc,
 981                                            const std::string &dstcharset)
 982                 {
 983                         bool dummy;
 984
 985                         return convert(uc, dstcharset, dummy);
 986                 }
 987
 988                 /* Convert charset to unicode */
 989
 990                 static bool convert(const std::string &text,
 991                                     const std::string &charset,
 992                                     std::vector<unicode_char> &uc);
 993
 994
 995                 /* Convert to upper/lower/title case, discarding the error flag */
 996
 997                 static std::string
 998                         convert_tocase(/* Text string */
 999                                        const std::string &text,
1000
1001                                        /* Its charset */
1002                                        const std::string &charset,
1003
1004                                        /* First character: unicode_uc, unicode_lc, or unicode_tc */
1005                                        unicode_char (*first_char_func)(unicode_char),
1006
1007                                        /* If not NULL, second and subsequent chars */
1008                                        unicode_char (*char_func)(unicode_char)
1009                                        =NULL)
1010                 {
1011                         bool dummy;
1012
1013                         return convert_tocase(text, charset, dummy,
1014                                               first_char_func,
1015                                               char_func);
1016                 }
1017
1018                 /* Convert to upper/lower/title case, reporting errors in err */
1019
1020                 static std::string
1021                         convert_tocase(/* Text string */
1022                                        const std::string &text,
1023
1024                                        /* Its charset */
1025                                        const std::string &charset,
1026
1027                                        /* Set if there's a conversion error */
1028                                        bool &err,
1029
1030                                        /* First character: unicode_uc, unicode_lc, or unicode_tc */
1031                                        unicode_char (*first_char_func)(unicode_char),
1032
1033                                        /* If not NULL, second and subsequent chars */
1034                                        unicode_char (*char_func)(unicode_char)
1035                                        =NULL);
1036         private:
1037                 bool end(bool *);
1038
1039         public:
1040                 class tou;
1041                 class fromu;
1042         };
1043
1044         /* Convert output of iconvert to unicode_chars. */
1045
1046         class iconvert::tou : public iconvert {
1047
1048         public:
1049                 bool begin(const std::string &chset);
1050                 /* Receives the converted unicode_chars; subclasses override */
1051                 virtual int converted(const unicode_char *, size_t);
1052
1053                 using iconvert::operator();
1054         private:
1055                 int converted(const char *ptr, size_t cnt);
1056
1057         public:
1058                 template<typename iter_t> class to_iter_class;
1059                 /* Convert chars in [from_iter, to_iter) from chset to out_iter */
1060                 template<typename input_iter_t,
1061                         typename output_iter_t>
1062                         static output_iter_t convert(input_iter_t from_iter,
1063                                                      input_iter_t to_iter,
1064                                                      const std::string &chset,
1065                                                      output_iter_t out_iter);
1066                 /* Same, but out_buf is cleared, then receives the output */
1067                 template<typename input_iter_t>
1068                         static void convert(input_iter_t from_iter,
1069                                             input_iter_t to_iter,
1070                                             const std::string &chset,
1071                                             std::vector<unicode_char> &out_buf)
1072                 {
1073                         out_buf.clear();
1074                         std::back_insert_iterator<std::vector<unicode_char> >
1075                                 insert_iter(out_buf);
1076
1077                         convert(from_iter, to_iter, chset, insert_iter);
1078                 }
1079
1080                 static void convert(const std::string &str,
1081                                     const std::string &chset,
1082                                     std::vector<unicode_char> &out_buf);
1083         };
1084
1085         /* Helper class that saves unicode output into an output iterator */
1086
1087         template<typename iter_t>
1088                 class iconvert::tou::to_iter_class : public iconvert::tou {
1089
1090                 /* Where the next converted unicode_char gets written */
1091                 iter_t dest;
1092
1093         public:
1094                 to_iter_class(iter_t start) : dest(start) {}
1095
1096                 using tou::operator();
1097
1098                 /* The output iterator, advanced past everything written */
1099                 operator iter_t() const { return dest; }
1100
1101         private:
1102                 /*
1103                 ** Write cnt converted unicode_chars to the output iterator.
1104                 ** Always returns 0: the conversion is never aborted here.
1105                 */
1106                 int converted(const unicode_char *ptr, size_t cnt)
1107                 {
1108                         for (; cnt; --cnt, ++ptr, ++dest)
1109                                 *dest=*ptr;
1110
1111                         return 0;
1112                 }
1113         };
1114
1115         template<typename input_iter_t,
1116                 typename output_iter_t>
1117                 output_iter_t iconvert::tou::convert(input_iter_t from_iter,
1118                                                      input_iter_t to_iter,
1119                                                      const std::string &chset,
1120                                                      output_iter_t out_iter)
1121                 {
1122                         class to_iter_class<output_iter_t> sink(out_iter);
1123                         std::vector<char> chunk;
1124
1125                         /* Nothing is fed or ended if begin() refuses chset */
1126                         if (sink.begin(chset))
1127                         {
1128                                 while (from_iter != to_iter)
1129                                 {
1130                                         chunk.push_back(*from_iter++);
1131
1132                                         /* Hand over the input 32 chars at a time */
1133                                         if (chunk.size() >= 32)
1134                                         {
1135                                                 sink(&chunk[0], chunk.size());
1136                                                 chunk.clear();
1137                                         }
1138                                 }
1139
1140                                 if (!chunk.empty())
1141                                         sink(&chunk[0], chunk.size());
1142                                 sink.end();
1143                         }
1144                         return sink;
1145                 }
1146
1147         /* Convert output of iconvert from unicode_chars. */
1148
1149         class iconvert::fromu : public iconvert {
1150
1151         public:
1152                 bool begin(const std::string &chset);
1153
1154                 using iconvert::operator();
1155
1156                 template<typename iter_t> class to_iter_class;
1157                 /* Convert unicode_chars in [from_iter, to_iter) to chset */
1158                 template<typename input_iter_t,
1159                         typename output_iter_t>
1160                         static output_iter_t convert(input_iter_t from_iter,
1161                                                      input_iter_t to_iter,
1162                                                      const std::string &chset,
1163                                                      output_iter_t out_iter);
1164                 /* Same, but out_buf is emptied, then receives the output */
1165                 template<typename input_iter_t>
1166                         static void convert(input_iter_t from_iter,
1167                                             input_iter_t to_iter,
1168                                             const std::string &chset,
1169                                             std::string &out_buf)
1170                 {
1171                         out_buf="";
1172                         std::back_insert_iterator<std::string>
1173                                 insert_iter(out_buf);
1174
1175                         convert(from_iter, to_iter, chset, insert_iter);
1176                 }
1177
1178                 static void convert(const std::vector<unicode_char> &ubuf,
1179                                     const std::string &chset,
1180                                     std::string &out_buf);
1181
1182                 static std::string convert(const std::vector<unicode_char>
1183                                            &ubuf,
1184                                            const std::string &chset);
1185         };
1186
1187         /* Helper class that saves converted chars into an output iterator */
1188
1189         template<typename iter_t>
1190                 class iconvert::fromu::to_iter_class : public iconvert::fromu {
1191
1192                 /* Where the next converted char gets written */
1193                 iter_t dest;
1194
1195         public:
1196                 to_iter_class(iter_t start) : dest(start) {}
1197
1198                 using fromu::operator();
1199
1200                 /* The output iterator, advanced past everything written */
1201                 operator iter_t() const { return dest; }
1202
1203         private:
1204                 /*
1205                 ** Write cnt converted chars to the output iterator.
1206                 ** Always returns 0: the conversion is never aborted here.
1207                 */
1208                 int converted(const char *ptr, size_t cnt)
1209                 {
1210                         for (; cnt; --cnt, ++ptr, ++dest)
1211                                 *dest=*ptr;
1212
1213                         return 0;
1214                 }
1215         };
1216
1217         template<typename input_iter_t,
1218                 typename output_iter_t>
1219                 output_iter_t iconvert::fromu::convert(input_iter_t from_iter,
1220                                                        input_iter_t to_iter,
1221                                                        const std::string &chset,
1222                                                        output_iter_t out_iter)
1223                 {
1224                         class to_iter_class<output_iter_t> sink(out_iter);
1225                         std::vector<unicode_char> chunk;
1226
1227                         /* Nothing is fed or ended if begin() refuses chset */
1228                         if (sink.begin(chset))
1229                         {
1230                                 while (from_iter != to_iter)
1231                                 {
1232                                         chunk.push_back(*from_iter++);
1233
1234                                         /* Hand over 32 unicode_chars at a time */
1235                                         if (chunk.size() >= 32)
1236                                         {
1237                                                 sink(&chunk[0], chunk.size());
1238                                                 chunk.clear();
1239                                         }
1240                                 }
1241
1242                                 if (!chunk.empty())
1243                                         sink(&chunk[0], chunk.size());
1244                                 sink.end();
1245                         }
1246                         return sink;
1247                 }
1248
1249         /*
1250         ** Unicode linebreaking algorithm, tr14.
1251         */
1252         /* C-linkage callbacks; declared friends of the callback classes */
1253         extern "C" int linebreak_trampoline(int value, void *ptr);
1254         extern "C" int linebreakc_trampoline(int value, unicode_char ch,
1255                                              void *ptr);
1256
1257         /*
1258         ** Subclass linebreak_callback_base, implement operator()(int).
1259         **
1260         ** Use operator<< or operator()(iterator, iterator) to feed
1261         ** unicode_chars into the linebreaking algorithm. The subclass receives
1262         ** UNICODE_LB values, as they become available.
1263         */
1264
1265         class linebreak_callback_base {
1266
1267                 unicode_lb_info_t handle;
1268
1269                 int opts;
1270
1271                 linebreak_callback_base(const linebreak_callback_base &);
1272                 /* NOT IMPLEMENTED: no copy construction */
1273
1274                 linebreak_callback_base &operator=(const
1275                                                    linebreak_callback_base &);
1276                 /* NOT IMPLEMENTED: no copy assignment */
1277
1278         public:
1279                 linebreak_callback_base();
1280                 virtual ~linebreak_callback_base();
1281                 /* End of input; remaining linebreak values may be delivered */
1282                 void finish();
1283
1284                 void set_opts(int opts);
1285
1286                 friend int linebreak_trampoline(int, void *);
1287                 /* Feed one unicode_char */
1288                 linebreak_callback_base &operator<<(unicode_char uc);
1289                 /* Feed each unicode_char in [beg_iter, end_iter) */
1290                 template<typename iter_type>
1291                         linebreak_callback_base &operator()(iter_type beg_iter,
1292                                                             iter_type end_iter)
1293                 {
1294                         while (beg_iter != end_iter)
1295                                 operator<<(*beg_iter++);
1296                         return *this;
1297                 }
1298                 /* Feed a whole vector of unicode_chars */
1299                 linebreak_callback_base &operator<<(const
1300                                                     std::vector<unicode_char>
1301                                                     &vec)
1302                 {
1303                         return operator()(vec.begin(), vec.end());
1304                 }
1305         private:
1306                 virtual int operator()(int);
1307         };
1308
1309         class linebreak_callback_save_buf : public linebreak_callback_base {
1310
1311         public:
1312                 std::list<int> lb_buf;
1313                 /* ^ saved linebreak values; linebreak_iter pops from the front */
1314                 linebreak_callback_save_buf();
1315                 ~linebreak_callback_save_buf();
1316
1317         private:
1318                 int operator()(int value);
1319         };
1320
1321         /*
1322         ** Convert an input iterator sequence over unicode_chars into
1323         ** an input iterator sequence over linebreak values.
1324         */
1325
1326         template<typename input_t> class linebreak_iter
1327                 : public std::iterator<std::input_iterator_tag, int, void>
1328         {
1329                 mutable input_t iter_value, end_iter_value;
1330                 /* Owned buffer; NULL marks the end-of-sequence iterator */
1331                 mutable linebreak_callback_save_buf *buf;
1332                 /* Feed input until a value is buffered, or the input runs out */
1333                 void fill() const
1334                 {
1335                         if (buf == NULL)
1336                                 return;
1337
1338                         while (buf->lb_buf.empty())
1339                         {
1340                                 if (iter_value == end_iter_value)
1341                                 {
1342                                         buf->finish();
1343                                         if (buf->lb_buf.empty())
1344                                         {
1345                                                 delete buf;
1346                                                 buf=NULL;
1347                                         }
1348                                         break;
1349                                 }
1350
1351                                 buf->operator<<(*iter_value++);
1352                         }
1353                 }
1354                 /* Value saved by operator++(), handed out by postfix ++ */
1355                 mutable value_type bufvalue;
1356
1357         public:
1358                 linebreak_iter(const input_t &iter_valueArg,
1359                                const input_t &iter_endvalueArg)
1360                         : iter_value(iter_valueArg),
1361                         end_iter_value(iter_endvalueArg),
1362                         buf(new linebreak_callback_save_buf)
1363                         {
1364                         }
1365                 /* The end-of-sequence iterator */
1366                 linebreak_iter() : buf(NULL)
1367                 {
1368                 }
1369
1370                 void set_opts(int opts)
1371                 {
1372                         if (buf)
1373                                 buf->set_opts(opts);
1374                 }
1375
1376                 ~linebreak_iter()
1377                 {
1378                         delete buf;
1379                 }
1380
1381                 /* Copying transfers the buffer; v is left as an end iterator */
1382                 linebreak_iter(const linebreak_iter<input_t> &v) : buf(NULL)
1383                 {
1384                         operator=(v);
1385                 }
1386
1387                 linebreak_iter<input_t> &operator=(const
1388                                                    linebreak_iter<input_t> &v)
1389                 {
1390                         if (this == &v) /* self-assignment must not drop buf */
1391                                 return *this;
1392                         delete buf;
1393                         buf=v.buf;
1394                         iter_value=v.iter_value;
1395                         end_iter_value=v.end_iter_value;
1396                         v.buf=NULL;
1397                         return *this;
1398                 }
1399                 /* True only when both iterators have reached the end */
1400                 bool operator==(const linebreak_iter<input_t> &v) const
1401                 {
1402                         fill();
1403                         v.fill();
1404
1405                         return buf == NULL && v.buf == NULL;
1406                 }
1407
1408                 bool operator!=(const linebreak_iter<input_t> &v) const
1409                 {
1410                         return !operator==(v);
1411                 }
1412                 /* Next linebreak value; UNICODE_LB_MANDATORY once at the end */
1413                 value_type operator*() const
1414                 {
1415                         fill();
1416                         return buf == NULL ? UNICODE_LB_MANDATORY:
1417                                 buf->lb_buf.front();
1418                 }
1419
1420                 linebreak_iter<input_t> &operator++()
1421                 {
1422                         bufvalue=operator*();
1423
1424                         if (buf)
1425                                 buf->lb_buf.pop_front();
1426                         return *this;
1427                 }
1428                 /* Returns a pointer to the value that was just stepped over */
1429                 const value_type *operator++(int)
1430                 {
1431                         operator++();
1432                         return &bufvalue;
1433                 }
1434         };
1435
1436         /*
1437         ** Like linebreak_callback_base, except the subclass receives both
1438         ** the linebreaking value, and the unicode character.
1439         */
1440
1441         class linebreakc_callback_base {
1442
1443                 unicode_lbc_info_t handle;
1444
1445                 int opts;
1446
1447                 linebreakc_callback_base(const linebreakc_callback_base &);
1448                 /* NOT IMPLEMENTED: no copy construction */
1449
1450                 linebreakc_callback_base &operator=(const
1451                                                     linebreakc_callback_base
1452                                                     &);
1453                 /* NOT IMPLEMENTED: no copy assignment */
1454
1455
1456         public:
1457                 linebreakc_callback_base();
1458                 virtual ~linebreakc_callback_base();
1459                 /* End of input; remaining values may be delivered */
1460                 void finish();
1461
1462                 void set_opts(int opts);
1463
1464                 friend int linebreakc_trampoline(int, unicode_char, void *);
1465                 /* Feed one unicode_char */
1466                 linebreakc_callback_base &operator<<(unicode_char uc);
1467                 /* Feed each unicode_char in [beg_iter, end_iter) */
1468                 template<typename iter_type>
1469                         linebreakc_callback_base &operator()(iter_type beg_iter,
1470                                                             iter_type end_iter)
1471                 {
1472                         while (beg_iter != end_iter)
1473                                 operator<<(*beg_iter++);
1474                         return *this;
1475                 }
1476                 /* Feed a whole vector of unicode_chars */
1477                 linebreakc_callback_base &operator<<(const
1478                                                     std::vector<unicode_char>
1479                                                     &vec)
1480                 {
1481                         return operator()(vec.begin(), vec.end());
1482                 }
1483         private:
1484                 virtual int operator()(int, unicode_char);
1485         };
1486
1487         class linebreakc_callback_save_buf : public linebreakc_callback_base {
1488
1489         public:
1490                 std::list<std::pair<int, unicode_char> > lb_buf;
1491                 /* ^ saved (value, char) pairs; linebreakc_iter pops the front */
1492                 linebreakc_callback_save_buf();
1493                 ~linebreakc_callback_save_buf();
1494
1495         private:
1496                 int operator()(int, unicode_char);
1497         };
1498
1499
1500         /*
1501         ** Convert an input iterator sequence over unicode_chars into
1502         ** an input iterator sequence over std::pair<int, unicode_char>,
1503         ** the original unicode character, and the linebreaking value before
1504         ** the character.
1505         */
1506
1507         template<typename input_t> class linebreakc_iter
1508                 : public std::iterator<std::input_iterator_tag,
1509                 std::pair<int, unicode_char>, void>
1510         {
1511                 mutable input_t iter_value, end_iter_value;
1512                 /* Owned buffer; NULL marks the end-of-sequence iterator */
1513                 mutable linebreakc_callback_save_buf *buf;
1514                 /* Feed input until a pair is buffered, or the input runs out */
1515                 void fill() const
1516                 {
1517                         if (buf == NULL)
1518                                 return;
1519
1520                         while (buf->lb_buf.empty())
1521                         {
1522                                 if (iter_value == end_iter_value)
1523                                 {
1524                                         buf->finish();
1525                                         if (buf->lb_buf.empty())
1526                                         {
1527                                                 delete buf;
1528                                                 buf=NULL;
1529                                         }
1530                                         break;
1531                                 }
1532
1533                                 buf->operator<<(*iter_value);
1534                                 ++iter_value;
1535                         }
1536                 }
1537                 /* Value saved by operator++(), handed out by postfix ++ */
1538                 mutable value_type bufvalue;
1539
1540         public:
1541                 linebreakc_iter(const input_t &iter_valueArg,
1542                                 const input_t &iter_endvalueArg)
1543                         : iter_value(iter_valueArg),
1544                         end_iter_value(iter_endvalueArg),
1545                         buf(new linebreakc_callback_save_buf)
1546                         {
1547                         }
1548                 /* The end-of-sequence iterator */
1549                 linebreakc_iter() : buf(NULL)
1550                 {
1551                 }
1552
1553                 ~linebreakc_iter()
1554                 {
1555                         delete buf;
1556                 }
1557
1558                 /* Copying transfers the buffer; v is left as an end iterator */
1559                 linebreakc_iter(const linebreakc_iter<input_t> &v) : buf(NULL)
1560                 {
1561                         operator=(v);
1562                 }
1563
1564                 linebreakc_iter<input_t> &operator=(const
1565                                                    linebreakc_iter<input_t> &v)
1566                 {
1567                         if (this == &v) /* self-assignment must not drop buf */
1568                                 return *this;
1569                         delete buf;
1570                         buf=v.buf;
1571                         iter_value=v.iter_value;
1572                         end_iter_value=v.end_iter_value;
1573                         v.buf=NULL;
1574                         return *this;
1575                 }
1576                 /* True only when both iterators have reached the end */
1577                 bool operator==(const linebreakc_iter<input_t> &v) const
1578                 {
1579                         fill();
1580                         v.fill();
1581
1582                         return buf == NULL && v.buf == NULL;
1583                 }
1584
1585                 bool operator!=(const linebreakc_iter<input_t> &v) const
1586                 {
1587                         return !operator==(v);
1588                 }
1589                 /* Next pair; (UNICODE_LB_MANDATORY, 0) once at the end */
1590                 value_type operator*() const
1591                 {
1592                         fill();
1593                         return buf == NULL ?
1594                                 std::make_pair(UNICODE_LB_MANDATORY,
1595                                                (unicode_char)0):
1596                                 buf->lb_buf.front();
1597                 }
1598
1599                 linebreakc_iter<input_t> &operator++()
1600                 {
1601                         bufvalue=operator*();
1602
1603                         if (buf)
1604                                 buf->lb_buf.pop_front();
1605                         return *this;
1606                 }
1607                 /* Returns a pointer to the value that was just stepped over */
1608                 const value_type *operator++(int)
1609                 {
1610                         operator++();
1611                         return &bufvalue;
1612                 }
1613         };
1614
1615
1616         /*
1617         ** Subclass wordbreak_callback_base, implement operator()(bool).
1618         **
1619         ** Use operator<< or operator()(iterator, iterator) to feed
1620         ** unicode_chars into the wordbreaking algorithm. The subclass receives
1621         ** word flags, as they become available.
1622         */
1623         /* C-linkage callback; declared a friend of wordbreak_callback_base */
1624         extern "C" int wordbreak_trampoline(int value, void *ptr);
1625
1626         class wordbreak_callback_base {
1627
1628                 unicode_wb_info_t handle;
1629
1630                 wordbreak_callback_base(const wordbreak_callback_base &);
1631                 /* NOT IMPLEMENTED: no copy construction */
1632
1633                 wordbreak_callback_base &operator=(const
1634                                                    wordbreak_callback_base &);
1635                 /* NOT IMPLEMENTED: no copy assignment */
1636
1637         public:
1638                 wordbreak_callback_base();
1639                 virtual ~wordbreak_callback_base();
1640
1641                 void finish();
1642
1643                 friend int wordbreak_trampoline(int, void *);
1644                 /* Feed one unicode_char */
1645                 wordbreak_callback_base &operator<<(unicode_char uc);
1646                 /* Feed each unicode_char in [beg_iter, end_iter) */
1647                 template<typename iter_type>
1648                         wordbreak_callback_base &operator()(iter_type beg_iter,
1649                                                             iter_type end_iter)
1650                 {
1651                         while (beg_iter != end_iter)
1652                                 operator<<(*beg_iter++);
1653                         return *this;
1654                 }
1655                 /* Feed a whole vector of unicode_chars */
1656                 wordbreak_callback_base &operator<<(const
1657                                                     std::vector<unicode_char>
1658                                                     &vec)
1659                 {
1660                         return operator()(vec.begin(), vec.end());
1661                 }
1662         private:
1663                 virtual int operator()(bool);
1664         };
1665
1666         /*
1667         ** A C++ wrapper for unicode_wbscan.
1668         */
1669
1670         class wordbreakscan {
1671
1672                 unicode_wbscan_info_t handle;
1673
1674                 wordbreakscan(const wordbreakscan &);
1675                 /* NOT IMPLEMENTED: no copy construction */
1676
1677                 wordbreakscan &operator=(const wordbreakscan &);
1678                 /* NOT IMPLEMENTED: no copy assignment */
1679         public:
1680
1681                 wordbreakscan();
1682                 ~wordbreakscan();
1683
1684                 bool operator<<(unicode_char uc);
1685
1686                 size_t finish();
1687         };
1688
1689 }
1690 #endif
1691
1692 #endif