Imported Upstream version 0.66.1
[hcoop/debian/courier-authlib.git] / libs / unicode / unicode.c
1 /*
2 ** Copyright 2000-2011 Double Precision, Inc.
3 ** See COPYING for distribution information.
4 **
5 */
6
7 #include "unicode_config.h"
8 #include "unicode.h"
9 #include "../rfc822/rfc822hdr.h"
10 #include <string.h>
11 #include <ctype.h>
12 #include <stdlib.h>
13 #include <iconv.h>
14 #include <errno.h>
15 #if HAVE_LOCALE_H
16 #if HAVE_SETLOCALE
17 #include <locale.h>
18 #if USE_LIBCHARSET
19 #if HAVE_LOCALCHARSET_H
20 #include <localcharset.h>
21 #elif HAVE_LIBCHARSET_H
22 #include <libcharset.h>
23 #endif /* HAVE_LOCALCHARSET_H */
24 #elif HAVE_LANGINFO_CODESET
25 #include <langinfo.h>
26 #endif /* USE_LIBCHARSET */
27 #endif /* HAVE_SETLOCALE */
28 #endif /* HAVE_LOCALE_H */
29
/* Cached name of the default character set; empty until first resolved */
static char default_chset_buf[32];

/*
** Work out the default character set and cache it in default_chset_buf.
**
** Sources, in order of precedence:
**
**   1. the MM_CHARSET environment variable;
**   2. the CHARSET environment variable;
**   3. the codeset of the environment's locale (locale_charset() or
**      nl_langinfo(CODESET), whichever configure enabled);
**   4. the charset part of LANG ("xx_yy.CHARSET@modifier");
**   5. "US-ASCII".
**
** Names that do not fit into the cache buffer are truncated.
*/
static void init_default_chset()
{
	const char *old_locale=NULL;
	const char *chset;
	char *locale_cpy=NULL;
	char buf[sizeof(default_chset_buf)];

	chset=getenv("MM_CHARSET");

	if (chset == NULL)
		chset=getenv("CHARSET");

	if (chset == NULL)
	{
#if HAVE_LOCALE_H
#if HAVE_SETLOCALE
		/*
		** setlocale(LC_ALL, "") returns the name of the NEW locale.
		** The current locale therefore has to be queried, and copied,
		** before switching; otherwise the restore at the end of this
		** function would just re-select the environment's locale.
		*/
		old_locale=setlocale(LC_ALL, NULL);
		locale_cpy=old_locale ? strdup(old_locale):NULL;
		setlocale(LC_ALL, "");
#if USE_LIBCHARSET
		chset = locale_charset();
#elif HAVE_LANGINFO_CODESET
		chset=nl_langinfo(CODESET);
#endif
#endif
#endif
	}

	memset(buf, 0, sizeof(buf));

	/* Map GNU libc iconv oddity to us-ascii */

	if (chset &&
	    (strcmp(chset, "ANSI_X3.4") == 0 ||
	     strncmp(chset, "ANSI_X3.4-", 10) == 0))
		chset="US-ASCII";

	if (chset)
	{
		/* buf is all zeroes, so this is a bounded, terminated copy */
		strncat(buf, chset, sizeof(buf)-1);
	}
	else
	{
		const char *p=getenv("LANG");

		/* LANG is xx_yy.CHARSET@modifier */

		if (p && *p && (p=strchr(p, '.')) != NULL)
		{
			const char *q=strchr(++p, '@');
			size_t n;

			if (!q)
				q=p+strlen(p);

			n=(size_t)(q-p);

			if (n > sizeof(buf)-1)
				n=sizeof(buf)-1;

			memcpy(buf, p, n);
			buf[n]=0;
		}
		else
			strcpy(buf, "US-ASCII");
	}

	/*
	** The name is copied out before the locale is restored: the string
	** returned by the locale functions may not survive setlocale().
	*/
	memcpy(default_chset_buf, buf, sizeof(buf));

#if HAVE_LOCALE_H
#if HAVE_SETLOCALE
	if (locale_cpy)
	{
		setlocale(LC_ALL, locale_cpy);
		free(locale_cpy);
	}
#endif
#endif

}
109
110 const char *unicode_default_chset()
111 {
112 if (default_chset_buf[0] == 0)
113 init_default_chset();
114
115 return default_chset_buf;
116 }
117
118
119 /*****************************************************************************/
120
/* iconv name of UCS-4 in this machine's byte order */
#if WORDS_BIGENDIAN
const char libmail_u_ucs4_native[]="UCS-4BE";
#else
const char libmail_u_ucs4_native[]="UCS-4LE";
#endif
128
/* iconv name of UCS-2 in this machine's byte order */
#if WORDS_BIGENDIAN
const char libmail_u_ucs2_native[]="UCS-2BE";
#else
const char libmail_u_ucs2_native[]="UCS-2LE";
#endif
136
137 /* A stack of conversion modules */
138
/*
** Header shared by every conversion module. Each module embeds one of
** these and hands out its address as the conversion handle.
*/
struct libmail_u_convert_hdr {

	/* Feed cnt bytes of text into the module identified by ptr */
	int (*convert_handler)(void *ptr, const char *text, size_t cnt);

	/* Finish and destroy the module identified by ptr */
	int (*deinit_handler)(void *ptr, int *errptr);

	/* The module's own state, passed back to the handlers */
	void *ptr;

	/* Next module in the stack, if any */
	struct libmail_u_convert_hdr *next;
};
148
149 /* Decoding table for modified UTF7-encoding as used in imap */
150
/*
** Indexed by octet value: the 6-bit value of a modified-base64 digit,
** or -1 for octets that are not part of the alphabet.
**
** Explicitly "signed char": plain char is unsigned on some platforms,
** where -1 would be stored as 255 and a "< 0" test on a looked-up value
** could never detect an invalid digit.
*/
static const signed char mbase64_lookup[]={
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,63,-1,-1,-1,
	52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,
	-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,
	-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
	41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1};
168
/* Encoding alphabet: 6-bit value to modified-base64 digit */
static const char mbase64[]=
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+,";
171
172 /*
173 ** Conversion wrapper for converting to modified-utf7 IMAP encoding.
174 **
175 ** This is done by converting to UCS2, then stacking on a module that
176 ** takes that and converts UCS2 to modified-UTF7.
177 **
178 ** init_nottoimaputf7() returns an opaque stack for converting to ucs2.
179 */
180
181 static libmail_u_convert_handle_t
182 init_nottoimaputf7(const char *src_chset,
183 const char *dst_chset,
184 int (*output_func)(const char *, size_t, void *),
185 void *convert_arg);
186
187 /*
188 ** The to modified UTF7 module
189 */
190
191 struct libmail_u_convert_toimaputf7 {
192
193 struct libmail_u_convert_hdr hdr;
194
195 /* Accumulated output buffer */
196
197 char utf7encodebuf[1024];
198 size_t utf7encodebuf_cnt;
199
200 /* Accumulated bits for base64 encoding */
201 uint32_t utf7bits;
202
203 /* How many bits in utf7bits */
204 uint16_t utf7bitcount;
205
206 /* Flag: in base64mode */
207 uint16_t utfmode;
208
209 int errflag;
210
211 /* Any extra characters that should be munged */
212
213 char smapmunge[16];
214
215 /* Remembered output function */
216
217 int (*output_func)(const char *, size_t, void *);
218
219 /* Remembered arg to the output function */
220 void *convert_arg;
221 };
222
/*
** Macro - pass the accumulated output to the output function, then empty
** the buffer.
**
** NOTE: when the output function returns nonzero, that status is saved in
** errflag and this macro RETURNS it from the enclosing function.
*/
#define toimaputf7_encode_flush(p) do {					\
		int flush_rc_=(*(p)->output_func)((p)->utf7encodebuf,	\
					(p)->utf7encodebuf_cnt,	\
					(p)->convert_arg);		\
									\
		if (flush_rc_ != 0)					\
			return ((p)->errflag=flush_rc_);		\
									\
		(p)->utf7encodebuf_cnt=0;				\
	} while (0)
235
236 static int toimaputf7_encode_flushfinal(struct libmail_u_convert_toimaputf7 *p)
237 {
238 if (p->utf7encodebuf_cnt > 0)
239 toimaputf7_encode_flush(p);
240 return 0;
241 }
242
/*
** Macro - add one char to the output buffer, flushing the buffer first
** if it is full. Like the flush macro, this may RETURN from the enclosing
** function when the output function fails.
**
** There is deliberately no semicolon after "while (0)": the caller
** supplies it, which keeps "if (x) toimaputf7_encode_add(p, c); else ..."
** well formed.
*/

#define toimaputf7_encode_add(p,c) do {					\
		if ((p)->utf7encodebuf_cnt >= sizeof((p)->utf7encodebuf)) \
			toimaputf7_encode_flush((p));			\
									\
		(p)->utf7encodebuf[(p)->utf7encodebuf_cnt++]=(c);	\
	} while (0)
251
/* Forward declarations for the modified-UTF7 encoder module */
static int convert_utf7_handler(void *ptr, const char *text, size_t cnt);
static int deinit_toimaputf7(void *ptr, int *errptr);
static int do_convert_toutf7(const char *text, size_t cnt, void *arg);
256
257 /*
258 ** Create a conversion module stack
259 */
260
261 libmail_u_convert_handle_t
262 libmail_u_convert_init(const char *src_chset,
263 const char *dst_chset,
264 int (*output_func)(const char *, size_t, void *),
265 void *convert_arg)
266 {
267 struct libmail_u_convert_toimaputf7 *toutf7;
268 libmail_u_convert_handle_t h;
269 const char *smapmunge;
270 size_t l=strlen(unicode_x_imap_modutf7);
271
272 if (strncmp(dst_chset, unicode_x_imap_modutf7, l) == 0 &&
273 (dst_chset[l] == 0 || dst_chset[l] == ' '))
274 {
275 smapmunge=dst_chset + l;
276
277 if (*smapmunge)
278 ++smapmunge;
279 }
280 else
281 return init_nottoimaputf7(src_chset, dst_chset,
282 output_func,
283 convert_arg);
284
285 toutf7=malloc(sizeof(struct libmail_u_convert_toimaputf7));
286
287 if (!toutf7)
288 return NULL;
289
290 memset(toutf7, 0, sizeof(*toutf7));
291
292 h=init_nottoimaputf7(src_chset, libmail_u_ucs2_native,
293 do_convert_toutf7, toutf7);
294 if (!h)
295 {
296 free(toutf7);
297 return (NULL);
298 }
299
300 toutf7->output_func=output_func;
301 toutf7->convert_arg=convert_arg;
302
303 strncat(toutf7->smapmunge, smapmunge, sizeof(toutf7->smapmunge)-1);
304
305 toutf7->hdr.convert_handler=convert_utf7_handler;
306 toutf7->hdr.deinit_handler=deinit_toimaputf7;
307 toutf7->hdr.ptr=toutf7;
308 toutf7->hdr.next=h;
309 return &toutf7->hdr;
310 }
311
312 /* Passthrough to the wrapped stack */
313
314 static int convert_utf7_handler(void *ptr, const char *text, size_t cnt)
315 {
316 struct libmail_u_convert_toimaputf7 *toutf7=
317 (struct libmail_u_convert_toimaputf7 *)ptr;
318
319 return (*toutf7->hdr.next->convert_handler)(toutf7->hdr.next->ptr,
320 text, cnt);
321 }
322
323 static int utf7off(struct libmail_u_convert_toimaputf7 *toutf7)
324 {
325 if (!toutf7->utfmode)
326 return 0;
327 toutf7->utfmode=0;
328
329 if (toutf7->utf7bitcount > 0)
330 toimaputf7_encode_add(toutf7,
331 mbase64[(toutf7->utf7bits
332 << (6-toutf7->utf7bitcount))
333 & 63]);
334 toimaputf7_encode_add(toutf7, '-');
335 return 0;
336 }
337
338
339 static int do_convert_toutf7(const char *text, size_t cnt, void *arg)
340 {
341 struct libmail_u_convert_toimaputf7 *toutf7=
342 (struct libmail_u_convert_toimaputf7 *)arg;
343
344 /* We better be getting UCS-2 here! */
345
346 const uint16_t *utext=(const uint16_t *)text;
347 cnt /= 2;
348
349 while (cnt)
350 {
351 if (toutf7->errflag)
352 return toutf7->errflag;
353
354 if (*utext >= 0x20 && *utext <= 0x7F
355 && strchr( toutf7->smapmunge, (char)*utext) == NULL)
356
357 /*
358 && (!toutf7->smapmunge || (*utext != '.' && *utext != '/' &&
359 *utext != '~' && *utext != ':')))
360 */
361 {
362 if (utf7off(toutf7))
363 return toutf7->errflag;
364
365 toimaputf7_encode_add(toutf7, *utext);
366
367 if (*utext == '&')
368 toimaputf7_encode_add(toutf7, '-');
369
370 ++utext;
371 --cnt;
372 continue;
373 }
374
375 if (!toutf7->utfmode)
376 {
377 toutf7->utfmode=1;
378 toutf7->utf7bitcount=0;
379 toimaputf7_encode_add(toutf7, '&');
380 continue;
381 }
382
383 toutf7->utf7bits = (toutf7->utf7bits << 16) |
384 (((uint32_t)*utext) & 0xFFFF);
385 toutf7->utf7bitcount += 16;
386
387 ++utext;
388 --cnt;
389
390 /* If there's at least 6 bits, output base64-encoded char */
391
392 while (toutf7->utf7bitcount >= 6)
393 {
394 uint32_t v;
395 int n;
396
397 if (toutf7->errflag)
398 return toutf7->errflag;
399
400 v=toutf7->utf7bits;
401 n=toutf7->utf7bitcount-6;
402 toutf7->utf7bitcount -= 6;
403
404 if (n > 0)
405 v >>= n;
406
407 toimaputf7_encode_add(toutf7, mbase64[v & 63]);
408 }
409 }
410
411 return 0;
412 }
413
414 static int deinit_toimaputf7(void *ptr, int *errptr)
415 {
416 int rc;
417
418 struct libmail_u_convert_toimaputf7 *toutf7=
419 (struct libmail_u_convert_toimaputf7 *)ptr;
420
421 /* Flush out the downstream stack */
422 rc=(*toutf7->hdr.next->deinit_handler)(toutf7->hdr.next->ptr, errptr);
423
424 /* Make sure we're out of modified base64 */
425
426 if (rc == 0)
427 rc=utf7off(toutf7);
428
429 if (rc == 0 && toutf7->utf7encodebuf_cnt > 0)
430 rc=toimaputf7_encode_flushfinal(toutf7);
431
432 free(toutf7);
433 return rc;
434 }
435
436 /************/
437
438 /*
439 ** Convert from modified-utf7 IMAP encoding.
440 **
441 ** This module converts it to UCS-2, then this is attached to a stack that
442 ** converts UCS-2 to the requested charset.
443 */
444
445 static libmail_u_convert_handle_t
446 init_notfromimaputf7(const char *src_chset,
447 const char *dst_chset,
448 int (*output_func)(const char *, size_t, void *),
449 void *convert_arg);
450
451 struct libmail_u_convert_fromimaputf7 {
452
453 struct libmail_u_convert_hdr hdr;
454
455 /* Accumulated UCS-2 stream */
456 uint16_t convbuf[512];
457 size_t convbuf_cnt;
458
459 /* Accumulated base64 bits */
460 uint32_t modbits;
461
462 /* How many bits extracted from a base64 stream */
463
464 short modcnt;
465
466 /* Flag: seen the & */
467 char seenamp;
468
469 /* Flag: seen the &, and the next char wasn't - */
470
471 char inmod;
472 int errflag;
473 int converr;
474 };
475
/*
** Pass the accumulated UCS-2 stream to the next module in the stack and
** empty the buffer. The next module's status is stored in errflag,
** replacing whatever errflag held before.
*/

#define convert_fromutf7_flush(p) do {					\
		(p)->errflag=(p)->hdr.next->convert_handler(		\
			(p)->hdr.next->ptr,				\
			(const char *)(p)->convbuf,			\
			(p)->convbuf_cnt * sizeof(*(p)->convbuf));	\
		(p)->convbuf_cnt=0;					\
	} while (0)
486
/* Accumulate one UCS-2 unit, flushing the buffer first when it is full */

#define convert_fromutf7_add(p,c) do {					\
		if ((p)->convbuf_cnt >=					\
		    sizeof((p)->convbuf)/sizeof(*(p)->convbuf))		\
			convert_fromutf7_flush((p));			\
									\
		(p)->convbuf[(p)->convbuf_cnt++]=(c);			\
	} while (0)
495
496
/* Forward declarations for the modified-UTF7 decoder module */
static int convert_fromutf7(void *ptr, const char *text, size_t cnt);
static int deinit_fromutf7(void *ptr, int *errptr);
500
501 static libmail_u_convert_handle_t
502 init_nottoimaputf7(const char *src_chset,
503 const char *dst_chset,
504 int (*output_func)(const char *, size_t, void *),
505 void *convert_arg)
506 {
507 struct libmail_u_convert_fromimaputf7 *fromutf7;
508 libmail_u_convert_handle_t h;
509 size_t l=strlen(unicode_x_imap_modutf7);
510
511 if (strncmp(src_chset, unicode_x_imap_modutf7, l) == 0 &&
512 (src_chset[l] == 0 || src_chset[l] == ' '))
513 ;
514 else
515 return init_notfromimaputf7(src_chset, dst_chset,
516 output_func,
517 convert_arg);
518
519 fromutf7=(struct libmail_u_convert_fromimaputf7 *)
520 malloc(sizeof(struct libmail_u_convert_fromimaputf7));
521
522 if (!fromutf7)
523 return NULL;
524
525 memset(fromutf7, 0, sizeof(*fromutf7));
526
527 /* Create a stack for converting UCS-2 to the dest charset */
528
529 h=init_notfromimaputf7(libmail_u_ucs2_native, dst_chset,
530 output_func, convert_arg);
531
532 if (!h)
533 {
534 free(fromutf7);
535 return (NULL);
536 }
537
538 fromutf7->hdr.next=h;
539 fromutf7->hdr.convert_handler=convert_fromutf7;
540 fromutf7->hdr.deinit_handler=deinit_fromutf7;
541 fromutf7->hdr.ptr=fromutf7;
542 return &fromutf7->hdr;
543 }
544
545 static int convert_fromutf7(void *ptr,
546 const char *text, size_t cnt)
547 {
548 struct libmail_u_convert_fromimaputf7 *fromutf7=
549 (struct libmail_u_convert_fromimaputf7 *)ptr;
550 int bits;
551
552 while (cnt)
553 {
554 if (fromutf7->errflag)
555 return fromutf7->errflag;
556
557 if (!fromutf7->seenamp && *text == '&')
558 {
559 fromutf7->seenamp=1;
560 fromutf7->inmod=0;
561 fromutf7->modcnt=0;
562 ++text;
563 --cnt;
564 continue;
565 }
566
567 if (fromutf7->seenamp)
568 {
569 if (*text == '-')
570 {
571 convert_fromutf7_add(fromutf7, '&');
572 ++text;
573 --cnt;
574 fromutf7->seenamp=0;
575 continue;
576 }
577 fromutf7->seenamp=0;
578 fromutf7->inmod=1;
579 }
580
581 if (!fromutf7->inmod)
582 {
583 /* Not in the base64 encoded stream */
584
585 convert_fromutf7_add(fromutf7,
586 ((uint16_t)*text) & 0xFFFF);
587 ++text;
588 --cnt;
589 continue;
590 }
591
592 if (*text == '-')
593 {
594 /* End of the base64 encoded stream */
595 fromutf7->inmod=0;
596 ++text;
597 --cnt;
598 continue;
599 }
600
601 /* Got 6 more bits */
602
603 bits=mbase64_lookup[(unsigned char)*text];
604
605 ++text;
606 --cnt;
607
608 if (bits < 0)
609 {
610 errno=EILSEQ;
611 return fromutf7->errflag=-1;
612 }
613
614 fromutf7->modbits = (fromutf7->modbits << 6) | bits;
615 fromutf7->modcnt += 6;
616
617 if (fromutf7->modcnt >= 16)
618 {
619 /* Got a UCS-2 char */
620
621 int shiftcnt=fromutf7->modcnt - 16;
622 uint32_t v=fromutf7->modbits;
623
624 if (shiftcnt)
625 v >>= shiftcnt;
626
627 fromutf7->modcnt -= 16;
628
629 convert_fromutf7_add(fromutf7, v);
630 }
631 }
632 return 0;
633 }
634
635 static int deinit_fromutf7(void *ptr, int *errptr)
636 {
637 struct libmail_u_convert_fromimaputf7 *fromutf7=
638 (struct libmail_u_convert_fromimaputf7 *)ptr;
639 int rc;
640
641 if (fromutf7->seenamp || fromutf7->inmod)
642 {
643 if (fromutf7->errflag == 0)
644 {
645 fromutf7->errflag= -1;
646 errno=EILSEQ;
647 }
648 }
649
650 if (fromutf7->convbuf_cnt)
651 convert_fromutf7_flush(fromutf7);
652
653 rc=fromutf7->hdr.next->deinit_handler(fromutf7->hdr.next->ptr, errptr);
654
655 if (fromutf7->errflag && rc == 0)
656 rc=fromutf7->errflag;
657
658 if (errptr && fromutf7->converr)
659 *errptr=1;
660
661 free(fromutf7);
662 return rc;
663 }
664
665 /************/
666
667 /* A real conversion module, via iconv */
668
669 struct libmail_u_convert_iconv {
670
671 struct libmail_u_convert_hdr hdr;
672
673 iconv_t h;
674 int errflag; /* Accumulated errors */
675
676 int (*output_func)(const char *, size_t, void *);
677 void *convert_arg;
678
679 char buffer[1024]; /* Input buffer */
680 size_t bufcnt; /* Accumulated input in buffer */
681 char skipcnt; /* Skip this many bytes upon encountering EILSEQ */
682 char skipleft; /* How many bytes are currently left to skip */
683 char converr; /* Flag - an EILSEQ was encountered */
684 } ;
685
/* Forward declaration; the definition follows further down this file */
static int init_iconv(
	struct libmail_u_convert_iconv *h,
	const char *src_chset,
	const char *dst_chset,
	int (*output_func)(const char *text, size_t cnt, void *arg),
	void *convert_arg);
691
692 static libmail_u_convert_handle_t
693 init_notfromimaputf7(const char *src_chset,
694 const char *dst_chset,
695 int (*output_func)(const char *, size_t, void *),
696 void *convert_arg)
697 {
698
699
700 struct libmail_u_convert_iconv *h=
701 malloc(sizeof(struct libmail_u_convert_iconv));
702
703 if (!h)
704 return NULL;
705
706 memset(h, 0, sizeof(*h));
707
708 if (init_iconv(h, src_chset, dst_chset, output_func, convert_arg))
709 {
710 free(h);
711 return NULL;
712 }
713 return &h->hdr;
714 }
715
716 /* Run the stack */
717
718 int libmail_u_convert(libmail_u_convert_handle_t h,
719 const char *text, size_t cnt)
720 {
721 return (*h->convert_handler)(h->ptr, text, cnt);
722 }
723
724 /* Destroy the stack */
725
726 int libmail_u_convert_deinit(libmail_u_convert_handle_t h, int *errptr)
727 {
728 return (*h->deinit_handler)(h, errptr);
729 }
730
/* Forward declarations of the iconv module's handlers */
static int convert_iconv(void *ptr, const char *text, size_t cnt);
static int deinit_iconv(void *ptr, int *errptr);
734
735 /* Initialize a single conversion module, in the stack */
736
737 static int init_iconv(struct libmail_u_convert_iconv *h,
738 const char *src_chset,
739 const char *dst_chset,
740 int (*output_func)(const char *, size_t, void *),
741 void *convert_arg)
742 {
743 if ((h->h=iconv_open(dst_chset, src_chset)) == (iconv_t)-1)
744 return -1;
745
746 h->hdr.convert_handler=convert_iconv;
747 h->hdr.deinit_handler=deinit_iconv;
748 h->hdr.ptr=h;
749
750 h->output_func=output_func;
751 h->convert_arg=convert_arg;
752
753 /* Heuristically determine how many octets to skip upon an EILSEQ */
754
755 h->skipcnt=1;
756 switch (src_chset[0]) {
757 case 'u':
758 case 'U':
759 switch (src_chset[1]) {
760 case 'c':
761 case 'C':
762 switch (src_chset[2]) {
763 case 's':
764 case 'S':
765 if (src_chset[3] == '-')
766 switch (src_chset[4]) {
767 case '4':
768 /* UCS-4 */
769 h->skipcnt=4;
770 break;
771 case '2':
772 /* UCS-2 */
773 h->skipcnt=2;
774 break;
775 }
776 }
777 break;
778 case 't':
779 case 'T':
780 switch (src_chset[2]) {
781 case 'f':
782 case 'F':
783 if (src_chset[3] == '-')
784 switch (src_chset[4]) {
785 case '3':
786 /* UTF-32 */
787 h->skipcnt=4;
788 break;
789 case '1':
790 /* UTF-16 */
791 h->skipcnt=2;
792 break;
793 }
794 }
795 }
796 }
797
798 return 0;
799 }
800
/* Forward declarations of the iconv module's buffer helpers */
static void convert_flush(struct libmail_u_convert_iconv *h);
static void convert_flush_iconv(struct libmail_u_convert_iconv *h,
				const char **inbuf,
				size_t *inbytesleft);
804
805 /*
806 ** iconv conversion module. Accumulate input in an input buffer. When the
807 ** input buffer is full, invoke convert_flush().
808 */
809
810 static int convert_iconv(void *ptr,
811 const char *text, size_t cnt)
812 {
813 struct libmail_u_convert_iconv *h=(struct libmail_u_convert_iconv *)ptr;
814
815 while (cnt && h->errflag == 0)
816 {
817 if (h->bufcnt >= sizeof(h->buffer)-1)
818 {
819 convert_flush(h);
820
821 if (h->errflag)
822 break;
823 }
824
825 h->buffer[h->bufcnt++]= *text++;
826 --cnt;
827 }
828
829 return h->errflag;
830 }
831
832 /*
833 ** Finish an iconv conversion module. Invoke convert_flush() to flush any
834 ** buffered input. Invoke convert_flush_iconv() to return state to the initial
835 ** conversion state.
836 */
837
838 static int deinit_iconv(void *ptr, int *errptr)
839 {
840 int rc;
841 int converr;
842 struct libmail_u_convert_iconv *h=(struct libmail_u_convert_iconv *)ptr;
843 libmail_u_convert_handle_t next;
844
845 if (h->errflag == 0)
846 convert_flush(h);
847
848 if (h->bufcnt && h->errflag == 0)
849 h->converr=1;
850
851 if (h->errflag == 0)
852 convert_flush_iconv(h, NULL, NULL);
853
854 rc=h->errflag;
855 converr=h->converr != 0;
856 iconv_close(h->h);
857 next=h->hdr.next;
858 free(h);
859 if (errptr)
860 *errptr=converr;
861
862 /* If there's another module in the stack, clean that up */
863
864 if (next)
865 {
866 int converrnext;
867 int rcnext=libmail_u_convert_deinit(next, &converrnext);
868
869 if (converrnext && errptr && *errptr == 0)
870 *errptr=converr;
871
872 if (rcnext && rc == 0)
873 rc=rcnext;
874 }
875 return rc;
876 }
877
878 /*
879 ** Invoke convert_flush_iconv() to flush the input buffer. If there's
880 ** unconverted text remaining, reposition it at the beginning of the input
881 ** buffer.
882 */
883
884 static void convert_flush(struct libmail_u_convert_iconv *h)
885 {
886 const char *p;
887 size_t n;
888
889 if (h->bufcnt == 0 || h->errflag)
890 return;
891
892 p=h->buffer;
893 n=h->bufcnt;
894
895 convert_flush_iconv(h, &p, &n);
896
897 if (h->errflag)
898 return;
899
900 if (h->bufcnt == n)
901 n=0; /* Unexpected error, dunno what to do, punt */
902
903 h->bufcnt=0;
904
905 while (n)
906 {
907 h->buffer[h->bufcnt]= *p;
908
909 ++h->bufcnt;
910 ++p;
911 --n;
912 }
913 }
914
915 /*
916 ** Convert text via iconv.
917 */
918
919 static void convert_flush_iconv(struct libmail_u_convert_iconv *h,
920 const char **inbuf, size_t *inbytesleft)
921 {
922 int save_errno;
923
924 while (1)
925 {
926 char outbuf[1024];
927 char *outp;
928 size_t outleft;
929 size_t n;
930 size_t origin=0;
931
932 if (inbytesleft)
933 {
934 if ((origin=*inbytesleft) == 0)
935 return;
936
937 if (inbuf && h->skipleft && origin)
938 {
939 /* Skipping after an EILSEQ */
940
941 --h->skipleft;
942 --*inbytesleft;
943 ++*inbuf;
944 continue;
945 }
946
947 }
948
949 if (h->errflag)
950 {
951 /* Quietly eat everything after a previous error */
952
953 if (inbytesleft)
954 *inbytesleft=0;
955
956 return;
957 }
958
959 outp=outbuf;
960 outleft=sizeof(outbuf);
961
962 n=iconv(h->h, (char **)inbuf, inbytesleft, &outp, &outleft);
963
964 save_errno=errno;
965
966 /* Anything produced by iconv() gets pushed down the stack */
967
968 if (outp > outbuf)
969 {
970 int rc=(*h->output_func)(outbuf, outp-outbuf,
971 h->convert_arg);
972 if (rc)
973 {
974 h->errflag=rc;
975 return;
976 }
977 }
978
979 if (n != (size_t)-1)
980 {
981 /* iconv(3) reason #2 */
982
983 break;
984 }
985
986 if (inbytesleft == 0)
987 {
988 /*
989 ** An error when generating the shift sequence to
990 ** return to the initial state. We don't know what to
991 ** do, now.
992 */
993
994 errno=EINVAL;
995 h->errflag= -1;
996 return;
997 }
998
999 /*
1000 ** convert_flush() gets invoked when the 1024 char input buffer
1001 ** fills or to convert input that has been buffered when
1002 ** convert_chset_end() gets invoked.
1003 **
1004 ** A return code of EINVAL from iconv() is iconv() encountering
1005 ** an incomplete multibyte sequence.
1006 **
1007 ** If iconv() failed without consuming any input:
1008 **
1009 ** - iconv(3) reason #1, EILSEQ, invalid multibyte sequence
1010 ** that starts at the beginning of the string we wish to
1011 ** convert. Discard one character, and try again.
1012 **
1013 ** - iconv(3) reason #3, EINVAL, incomplete multibyte sequence.
1014 ** If it's possible to have an incomplete 1024 character long
1015 ** multibyte sequence, we're in trouble. Or we've encountered
1016 ** an EINVAL when flushing out the remaining buffered input,
1017 ** in convert_chset_end(). In either case, it's ok to sicard
1018 ** one character at a time, until we either reach the end,
1019 ** or get some other result.
1020 **
1021 ** - iconv(3) reason #4, E2BIG. If the 1024 character output
1022 ** buffer, above, is insufficient to produce the output from a
1023 ** single converted character, we're in trouble.
1024 */
1025
1026 if (*inbytesleft == origin)
1027 {
1028 h->skipleft=h->skipcnt;
1029 h->converr=1;
1030 }
1031
1032 /*
1033 ** Stopped at an incomplete multibyte sequence, try again on
1034 ** the next round.
1035 */
1036 else if (save_errno == EINVAL)
1037 break;
1038
1039 if (save_errno == EILSEQ)
1040 h->converr=1; /* Another possibility this can happen */
1041
1042 /*
1043 ** If we get here because of iconv(3) reason #4, filled out
1044 ** the output buffer, we should continue with the conversion.
1045 ** Otherwise, upon encountering any other error condition,
1046 ** reset the conversion state.
1047 */
1048 if (save_errno != E2BIG)
1049 iconv(h->h, NULL, NULL, NULL, NULL);
1050 }
1051 }
1052
1053 /*****************************************************************************/
1054
1055 /*
1056 ** A wrapper for libmail_u_convert() that collects the converted character
1057 ** text into a buffer. This is done by passing an output function to
1058 ** libmail_u_convert() that saves converted text in a linked-list
1059 ** of buffers.
1060 **
1061 ** Then, in the deinitialization function, the buffers get concatenated into
1062 ** the final character buffer.
1063 */
1064
/* One piece of converted text, in a singly-linked list of them */
struct libmail_u_convert_cbuf {

	/* The piece that follows this one, or NULL */
	struct libmail_u_convert_cbuf *next;

	/* The text of this piece */
	char *fragment;

	/* Number of bytes in fragment */
	size_t fragment_size;
};
1070
1071 struct libmail_u_convert_tocbuf {
1072 struct libmail_u_convert_hdr hdr;
1073
1074 char **cbufptr_ret;
1075 size_t *cbufsize_ret;
1076 int errflag;
1077 size_t tot_size;
1078 int nullterminate;
1079
1080 struct libmail_u_convert_cbuf *first, **last;
1081 };
1082
/* Forward declarations for the collect-into-a-buffer module */
static int save_tocbuf(const char *text, size_t cnt, void *ptr);
static int convert_tocbuf(void *ptr, const char *text, size_t cnt);
static int deinit_tocbuf(void *ptr, int *errptr);
1087
1088 libmail_u_convert_handle_t
1089 libmail_u_convert_tocbuf_init(const char *src_chset,
1090 const char *dst_chset,
1091 char **cbufptr_ret,
1092 size_t *cbufsize_ret,
1093 int nullterminate
1094 )
1095 {
1096 struct libmail_u_convert_tocbuf *p=
1097 malloc(sizeof(struct libmail_u_convert_tocbuf));
1098 libmail_u_convert_handle_t h;
1099
1100 if (!p)
1101 return NULL;
1102
1103 memset(p, 0, sizeof(*p));
1104
1105 h=libmail_u_convert_init(src_chset, dst_chset, save_tocbuf, p);
1106
1107 if (!h)
1108 {
1109 free(p);
1110 return NULL;
1111 }
1112
1113 p->cbufptr_ret=cbufptr_ret;
1114 p->cbufsize_ret=cbufsize_ret;
1115 p->last= &p->first;
1116 p->nullterminate=nullterminate;
1117 p->hdr.next=h;
1118 p->hdr.convert_handler=convert_tocbuf;
1119 p->hdr.deinit_handler=deinit_tocbuf;
1120 p->hdr.ptr=p;
1121 return &p->hdr;
1122 }
1123
1124 /* Capture the output of the conversion stack */
1125
1126 static int save_tocbuf(const char *text, size_t cnt, void *ptr)
1127 {
1128 struct libmail_u_convert_tocbuf *p=
1129 (struct libmail_u_convert_tocbuf *)ptr;
1130 struct libmail_u_convert_cbuf *fragment=
1131 malloc(sizeof(struct libmail_u_convert_cbuf)+cnt);
1132 size_t tot_size;
1133
1134 if (!fragment)
1135 {
1136 p->errflag=1;
1137 return 1;
1138 }
1139
1140 fragment->next=NULL;
1141 fragment->fragment=(char *)(fragment+1);
1142 if ((fragment->fragment_size=cnt) > 0)
1143 memcpy(fragment->fragment, text, cnt);
1144
1145 *(p->last)=fragment;
1146 p->last=&fragment->next;
1147
1148 tot_size=p->tot_size + cnt; /* Keep track of the total size saved */
1149
1150 if (tot_size < p->tot_size) /* Overflow? */
1151 {
1152 errno=E2BIG;
1153 return 1;
1154 }
1155 p->tot_size=tot_size;
1156 return 0;
1157 }
1158
1159 /* Punt converted text down the stack */
1160
1161 static int convert_tocbuf(void *ptr, const char *text, size_t cnt)
1162 {
1163 struct libmail_u_convert_tocbuf *p=
1164 (struct libmail_u_convert_tocbuf *)ptr;
1165
1166 return libmail_u_convert(p->hdr.next, text, cnt);
1167 }
1168
1169 /*
1170 ** Destroy the conversion stack. Destroy the downstream, then assemble the
1171 ** final array.
1172 */
1173
1174 static int deinit_tocbuf(void *ptr, int *errptr)
1175 {
1176 struct libmail_u_convert_tocbuf *p=
1177 (struct libmail_u_convert_tocbuf *)ptr;
1178 int rc=libmail_u_convert_deinit(p->hdr.next, errptr);
1179 struct libmail_u_convert_cbuf *bufptr;
1180
1181 if (rc == 0 && p->nullterminate)
1182 {
1183 char zero=0;
1184
1185 rc=save_tocbuf( &zero, sizeof(zero), p->hdr.ptr);
1186 }
1187
1188 if (rc == 0)
1189 {
1190 if (((*p->cbufptr_ret)=malloc(p->tot_size ? p->tot_size:1)) !=
1191 NULL)
1192 {
1193 size_t i=0;
1194
1195 for (bufptr=p->first; bufptr; bufptr=bufptr->next)
1196 {
1197 if (bufptr->fragment_size)
1198 memcpy(&(*p->cbufptr_ret)[i],
1199 bufptr->fragment,
1200 bufptr->fragment_size);
1201 i += bufptr->fragment_size;
1202 }
1203 (*p->cbufsize_ret)=i;
1204 }
1205 else
1206 {
1207 rc= -1;
1208 }
1209 }
1210
1211 for (bufptr=p->first; bufptr; )
1212 {
1213 struct libmail_u_convert_cbuf *b=bufptr;
1214
1215 bufptr=bufptr->next;
1216
1217 free(b);
1218 }
1219 free(p);
1220
1221 return rc;
1222 }
1223
1224 libmail_u_convert_handle_t
1225 libmail_u_convert_tocbuf_toutf8_init(const char *src_chset,
1226 char **cbufptr_ret,
1227 size_t *cbufsize_ret,
1228 int nullterminate
1229 )
1230 {
1231 return libmail_u_convert_tocbuf_init(src_chset, "utf-8",
1232 cbufptr_ret, cbufsize_ret,
1233 nullterminate);
1234 }
1235
1236 libmail_u_convert_handle_t
1237 libmail_u_convert_tocbuf_fromutf8_init(const char *dst_chset,
1238 char **cbufptr_ret,
1239 size_t *cbufsize_ret,
1240 int nullterminate
1241 )
1242 {
1243 return libmail_u_convert_tocbuf_init("utf-8", dst_chset,
1244 cbufptr_ret, cbufsize_ret,
1245 nullterminate);
1246 }
1247
1248 char *libmail_u_convert_toutf8(const char *text,
1249 const char *charset,
1250 int *error)
1251 {
1252 char *cbufptr;
1253 size_t cbufsize;
1254 libmail_u_convert_handle_t h=
1255 libmail_u_convert_tocbuf_toutf8_init(charset,
1256 &cbufptr,
1257 &cbufsize, 1);
1258
1259 if (!h)
1260 return NULL;
1261
1262 libmail_u_convert(h, text, strlen(text));
1263
1264 if (libmail_u_convert_deinit(h, error) == 0)
1265 return cbufptr;
1266
1267 return NULL;
1268 }
1269
1270 char *libmail_u_convert_fromutf8(const char *text,
1271 const char *charset,
1272 int *error)
1273 {
1274 char *cbufptr;
1275 size_t cbufsize;
1276 libmail_u_convert_handle_t h=
1277 libmail_u_convert_tocbuf_fromutf8_init(charset,
1278 &cbufptr,
1279 &cbufsize, 1);
1280
1281 if (!h)
1282 return NULL;
1283
1284 libmail_u_convert(h, text, strlen(text));
1285
1286 if (libmail_u_convert_deinit(h, error) == 0)
1287 return cbufptr;
1288
1289 return NULL;
1290 }
1291
1292 char *libmail_u_convert_tobuf(const char *text,
1293 const char *charset,
1294 const char *dstcharset,
1295 int *error)
1296 {
1297 char *cbufptr;
1298 size_t cbufsize;
1299 libmail_u_convert_handle_t h=
1300 libmail_u_convert_tocbuf_init(charset,
1301 dstcharset,
1302 &cbufptr,
1303 &cbufsize, 1);
1304
1305 if (!h)
1306 return NULL;
1307
1308 libmail_u_convert(h, text, strlen(text));
1309
1310 if (libmail_u_convert_deinit(h, error) == 0)
1311 return cbufptr;
1312
1313 return NULL;
1314 }
1315
1316 /*****************************************************************************/
1317
1318 /*
1319 ** Convert text to unicode_chars. Same basic approach as
1320 ** libmail_u_convert_tocbuf_init(). The output character set gets specified
1321 ** as UCS-4, the final output size is divided by 4, and the output buffer gets
1322 ** typed as a unicode_char array.
1323 */
1324
1325 struct libmail_u_convert_buf {
1326 struct libmail_u_convert_buf *next;
1327 unicode_char *fragment;
1328 size_t fragment_size;
1329 size_t max_fragment_size;
1330 };
1331
1332 struct libmail_u_convert_tou {
1333 struct libmail_u_convert_hdr hdr;
1334
1335 unicode_char **ucptr_ret;
1336 size_t *ucsize_ret;
1337 int errflag;
1338 size_t tot_size;
1339 int nullterminate;
1340
1341 struct libmail_u_convert_buf *first, *tail, **last;
1342 };
1343
/* Forward declarations for the collect-as-unicode_chars module */
static int save_unicode(const char *text, size_t cnt, void *ptr);
static int convert_tounicode(void *ptr, const char *text, size_t cnt);
static int deinit_tounicode(void *ptr, int *errptr);
1348
1349 libmail_u_convert_handle_t
1350 libmail_u_convert_tou_init(const char *src_chset,
1351 unicode_char **ucptr_ret,
1352 size_t *ucsize_ret,
1353 int nullterminate
1354 )
1355 {
1356 struct libmail_u_convert_tou *p=
1357 malloc(sizeof(struct libmail_u_convert_tou));
1358 libmail_u_convert_handle_t h;
1359
1360 if (!p)
1361 return NULL;
1362
1363 memset(p, 0, sizeof(*p));
1364
1365 h=libmail_u_convert_init(src_chset, libmail_u_ucs4_native,
1366 save_unicode, p);
1367
1368 if (!h)
1369 {
1370 free(p);
1371 return NULL;
1372 }
1373
1374 p->ucptr_ret=ucptr_ret;
1375 p->ucsize_ret=ucsize_ret;
1376 p->last= &p->first;
1377 p->nullterminate=nullterminate;
1378 p->hdr.next=h;
1379 p->hdr.convert_handler=convert_tounicode;
1380 p->hdr.deinit_handler=deinit_tounicode;
1381 p->hdr.ptr=p;
1382 return &p->hdr;
1383 }
1384
1385 libmail_u_convert_handle_t
1386 libmail_u_convert_fromu_init(const char *dst_chset,
1387 char **cbufptr_ret,
1388 size_t *csize_ret,
1389 int nullterminate
1390 )
1391 {
1392 return libmail_u_convert_tocbuf_init(libmail_u_ucs4_native,
1393 dst_chset,
1394 cbufptr_ret,
1395 csize_ret,
1396 nullterminate);
1397 }
1398
1399 int libmail_u_convert_uc(libmail_u_convert_handle_t handle,
1400 const unicode_char *text,
1401 size_t cnt)
1402 {
1403 return libmail_u_convert(handle, (const char *)text,
1404 cnt * sizeof(*text));
1405 }
1406
1407 /* Capture the output of the conversion stack */
1408
1409 static int save_unicode(const char *text, size_t cnt, void *ptr)
1410 {
1411 struct libmail_u_convert_tou *p=
1412 (struct libmail_u_convert_tou *)ptr;
1413 struct libmail_u_convert_buf *fragment;
1414 size_t tot_size;
1415
1416 cnt /= sizeof(unicode_char);
1417
1418 tot_size=p->tot_size + cnt*sizeof(unicode_char);
1419 /* Keep track of the total size saved */
1420
1421 if (p->tail)
1422 {
1423 size_t n=p->tail->max_fragment_size-p->tail->fragment_size;
1424
1425 if (n > cnt)
1426 n=cnt;
1427
1428 if (n)
1429 {
1430 memcpy(p->tail->fragment+p->tail->fragment_size,
1431 text, n*sizeof(unicode_char));
1432
1433 cnt -= n;
1434 text += n*sizeof(unicode_char);
1435 p->tail->fragment_size += n;
1436 }
1437 }
1438
1439 if (cnt > 0)
1440 {
1441 size_t cnt_alloc=cnt;
1442
1443 if (cnt_alloc < 16)
1444 cnt_alloc=16;
1445
1446 if ((fragment=malloc(sizeof(struct libmail_u_convert_buf)
1447 +cnt_alloc*sizeof(unicode_char)))
1448 == NULL)
1449 {
1450 p->errflag=1;
1451 return 1;
1452 }
1453
1454 fragment->next=NULL;
1455 fragment->fragment=(unicode_char *)(fragment+1);
1456 fragment->max_fragment_size=cnt_alloc;
1457 fragment->fragment_size=cnt;
1458 memcpy(fragment->fragment, text, cnt*sizeof(unicode_char));
1459
1460 *(p->last)=fragment;
1461 p->last=&fragment->next;
1462 p->tail=fragment;
1463 }
1464
1465 if (tot_size < p->tot_size) /* Overflow? */
1466 {
1467 errno=E2BIG;
1468 return 1;
1469 }
1470 p->tot_size=tot_size;
1471 return 0;
1472 }
1473
1474 /* Punt converted text down the stack */
1475
1476 static int convert_tounicode(void *ptr,
1477 const char *text, size_t cnt)
1478 {
1479 struct libmail_u_convert_tou *p=
1480 (struct libmail_u_convert_tou *)ptr;
1481
1482 return libmail_u_convert(p->hdr.next, text, cnt);
1483 }
1484
1485 /*
1486 ** Destroy the conversion stack. Destroy the downstream, then assemble the
1487 ** final array.
1488 */
1489
1490 static int deinit_tounicode(void *ptr, int *errptr)
1491 {
1492 struct libmail_u_convert_tou *p=
1493 (struct libmail_u_convert_tou *)ptr;
1494 int rc=libmail_u_convert_deinit(p->hdr.next, errptr);
1495 struct libmail_u_convert_buf *bufptr;
1496
1497 if (rc == 0 && p->nullterminate)
1498 {
1499 unicode_char zero=0;
1500
1501 rc=save_unicode( (const char *)&zero, sizeof(zero),
1502 p->hdr.ptr);
1503 }
1504
1505 if (rc == 0)
1506 {
1507 if (((*p->ucptr_ret)=malloc(p->tot_size ? p->tot_size:1)) !=
1508 NULL)
1509 {
1510 size_t i=0;
1511
1512 for (bufptr=p->first; bufptr; bufptr=bufptr->next)
1513 {
1514 if (bufptr->fragment_size)
1515 memcpy(&(*p->ucptr_ret)[i],
1516 bufptr->fragment,
1517 bufptr->fragment_size
1518 *sizeof(*bufptr->fragment));
1519 i += bufptr->fragment_size;
1520 }
1521 (*p->ucsize_ret)=i;
1522 }
1523 else
1524 {
1525 rc= -1;
1526 }
1527 }
1528
1529 for (bufptr=p->first; bufptr; )
1530 {
1531 struct libmail_u_convert_buf *b=bufptr;
1532
1533 bufptr=bufptr->next;
1534
1535 free(b);
1536 }
1537 free(p);
1538
1539 return rc;
1540 }
1541
1542 int libmail_u_convert_tou_tobuf(const char *text,
1543 size_t text_l,
1544 const char *charset,
1545 unicode_char **uc,
1546 size_t *ucsize,
1547 int *err)
1548 {
1549 libmail_u_convert_handle_t h;
1550
1551 if ((h=libmail_u_convert_tou_init(charset, uc, ucsize, 0)) == NULL)
1552 return -1;
1553
1554 if (libmail_u_convert(h, text, text_l) < 0)
1555 {
1556 libmail_u_convert_deinit(h, NULL);
1557 return -1;
1558 }
1559
1560 if (libmail_u_convert_deinit(h, err))
1561 return -1;
1562
1563 return 0;
1564 }
1565
1566 int libmail_u_convert_fromu_tobuf(const unicode_char *utext,
1567 size_t utext_l,
1568 const char *charset,
1569 char **c,
1570 size_t *csize,
1571 int *err)
1572 {
1573 libmail_u_convert_handle_t h;
1574
1575 if (utext_l == (size_t)-1)
1576 {
1577 for (utext_l=0; utext[utext_l]; ++utext_l)
1578 ;
1579 }
1580
1581 if ((h=libmail_u_convert_fromu_init(charset, c, csize, 1)) == NULL)
1582 return -1;
1583
1584 if (libmail_u_convert_uc(h, utext, utext_l) < 0)
1585 {
1586 libmail_u_convert_deinit(h, NULL);
1587 return -1;
1588 }
1589
1590 if (libmail_u_convert_deinit(h, err))
1591 return -1;
1592
1593 return 0;
1594 }
1595
1596 char *libmail_u_convert_tocase(const char *str,
1597 const char *charset,
1598 unicode_char (*first_char_func)(unicode_char),
1599 unicode_char (*char_func)(unicode_char))
1600 {
1601 unicode_char *uc;
1602 size_t ucsize;
1603 size_t i;
1604 int err;
1605 char *c;
1606 size_t csize;
1607
1608 if (libmail_u_convert_tou_tobuf(str, strlen(str),
1609 charset, &uc, &ucsize, &err))
1610 return NULL;
1611
1612 if (err)
1613 {
1614 free(uc);
1615 return NULL;
1616 }
1617
1618 for (i=0; i<ucsize; ++i)
1619 {
1620 uc[i]=(*first_char_func)(uc[i]);
1621
1622 if (char_func)
1623 first_char_func=char_func;
1624 }
1625
1626 if (libmail_u_convert_fromu_tobuf(uc, ucsize,
1627 charset,
1628 &c, &csize, &err))
1629 {
1630 free(uc);
1631 return NULL;
1632 }
1633
1634 free(uc);
1635
1636 if (err)
1637 {
1638 free(c);
1639 return NULL;
1640 }
1641
1642 return c;
1643 }