GOOPS cosmetics
[bpt/guile.git] / lib / striconveh.c
1 /* Character set conversion with error handling.
2 Copyright (C) 2001-2014 Free Software Foundation, Inc.
3 Written by Bruno Haible and Simon Josefsson.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17
18 #include <config.h>
19
20 /* Specification. */
21 #include "striconveh.h"
22
23 #include <errno.h>
24 #include <stdbool.h>
25 #include <stdlib.h>
26 #include <string.h>
27
28 #if HAVE_ICONV
29 # include <iconv.h>
30 # include "unistr.h"
31 #endif
32
33 #include "c-strcase.h"
34 #include "c-strcaseeq.h"
35
36 #ifndef SIZE_MAX
37 # define SIZE_MAX ((size_t) -1)
38 #endif
39
40
41 #if HAVE_ICONV
42
43 /* The caller must provide an iconveh_t, not just an iconv_t, because when a
44 conversion error occurs, we may have to determine the Unicode representation
45 of the inconvertible character. */
46
47 int
48 iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
49 {
50 iconv_t cd;
51 iconv_t cd1;
52 iconv_t cd2;
53
54 /* Avoid glibc-2.1 bug with EUC-KR. */
55 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
56 && !defined _LIBICONV_VERSION
57 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
58 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
59 {
60 errno = EINVAL;
61 return -1;
62 }
63 # endif
64
65 cd = iconv_open (to_codeset, from_codeset);
66
67 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
68 cd1 = (iconv_t)(-1);
69 else
70 {
71 cd1 = iconv_open ("UTF-8", from_codeset);
72 if (cd1 == (iconv_t)(-1))
73 {
74 int saved_errno = errno;
75 if (cd != (iconv_t)(-1))
76 iconv_close (cdp->cd);
77 errno = saved_errno;
78 return -1;
79 }
80 }
81
82 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
83 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
84 && !defined __UCLIBC__) \
85 || _LIBICONV_VERSION >= 0x0105
86 || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
87 # endif
88 )
89 cd2 = (iconv_t)(-1);
90 else
91 {
92 cd2 = iconv_open (to_codeset, "UTF-8");
93 if (cd2 == (iconv_t)(-1))
94 {
95 int saved_errno = errno;
96 if (cd1 != (iconv_t)(-1))
97 iconv_close (cd1);
98 if (cd != (iconv_t)(-1))
99 iconv_close (cd);
100 errno = saved_errno;
101 return -1;
102 }
103 }
104
105 cdp->cd = cd;
106 cdp->cd1 = cd1;
107 cdp->cd2 = cd2;
108 return 0;
109 }
110
111 int
112 iconveh_close (const iconveh_t *cd)
113 {
114 if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
115 {
116 /* Return -1, but preserve the errno from iconv_close. */
117 int saved_errno = errno;
118 if (cd->cd1 != (iconv_t)(-1))
119 iconv_close (cd->cd1);
120 if (cd->cd != (iconv_t)(-1))
121 iconv_close (cd->cd);
122 errno = saved_errno;
123 return -1;
124 }
125 if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
126 {
127 /* Return -1, but preserve the errno from iconv_close. */
128 int saved_errno = errno;
129 if (cd->cd != (iconv_t)(-1))
130 iconv_close (cd->cd);
131 errno = saved_errno;
132 return -1;
133 }
134 if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
135 return -1;
136 return 0;
137 }
138
/* iconv_carefully behaves like iconv, with two differences: it stops at the
   first conversion error, and it stores in *INCREMENTED whether the input
   pointer has already been advanced past the location of that error.  */
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
/* Irix iconv() inserts a NUL byte if it cannot convert.
   NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv and GNU libc are known to prefer to fail rather
   than doing a lossy conversion.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *src_pos = *inbuf;
  const char *const src_end = src_pos + *inbytesleft;
  char *dst_pos = *outbuf;
  size_t dst_left = *outbytesleft;
  const char *step_start;
  size_t status;

  do
    {
      size_t chunk;

      step_start = src_pos;
      status = (size_t)(-1);

      /* Offer iconv 1 byte, then 2, then 3, ... for as long as it answers
         EINVAL without having consumed anything.  */
      for (chunk = 1; src_pos + chunk <= src_end; chunk++)
        {
          status = iconv (cd,
                          (ICONV_CONST char **) &src_pos, &chunk,
                          &dst_pos, &dst_left);
          if (status != (size_t)(-1) || errno != EINVAL)
            break;
          /* iconv can eat up a shift sequence but give EINVAL while attempting
             to convert the first character.  E.g. libiconv does this.  */
          if (src_pos > step_start)
            {
              status = 0;
              break;
            }
        }

      /* Publish the output position only after a clean step.  */
      if (status == 0)
        {
          *outbuf = dst_pos;
          *outbytesleft = dst_left;
        }
    }
  while (status == 0 && src_pos < src_end);

  *inbuf = src_pos;
  *inbytesleft = src_end - src_pos;

  if (status == (size_t)(-1) || status == 0)
    {
      *incremented = false;
      return status;
    }

  /* A positive count from iconv is reported as EILSEQ.
     iconv() has already incremented the input pointer.  We cannot go back to
     a previous position, otherwise the state inside CD would become invalid,
     if FROM_CODESET is a stateful encoding.  So, tell the caller that
     *INBUF has already been incremented.  */
  *incremented = (src_pos > step_start);
  errno = EILSEQ;
  return (size_t)(-1);
}
# else
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     (*(incremented) = false, \
      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif
214
215 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
216 converting one character or one shift sequence. */
217 static size_t
218 iconv_carefully_1 (iconv_t cd,
219 const char **inbuf, size_t *inbytesleft,
220 char **outbuf, size_t *outbytesleft,
221 bool *incremented)
222 {
223 const char *inptr_before = *inbuf;
224 const char *inptr = inptr_before;
225 const char *inptr_end = inptr_before + *inbytesleft;
226 char *outptr = *outbuf;
227 size_t outsize = *outbytesleft;
228 size_t res = (size_t)(-1);
229 size_t insize;
230
231 for (insize = 1; inptr_before + insize <= inptr_end; insize++)
232 {
233 inptr = inptr_before;
234 res = iconv (cd,
235 (ICONV_CONST char **) &inptr, &insize,
236 &outptr, &outsize);
237 if (!(res == (size_t)(-1) && errno == EINVAL))
238 break;
239 /* iconv can eat up a shift sequence but give EINVAL while attempting
240 to convert the first character. E.g. libiconv does this. */
241 if (inptr > inptr_before)
242 {
243 res = 0;
244 break;
245 }
246 }
247
248 *inbuf = inptr;
249 *inbytesleft = inptr_end - inptr;
250 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
251 /* Irix iconv() inserts a NUL byte if it cannot convert.
252 NetBSD iconv() inserts a question mark if it cannot convert.
253 Only GNU libiconv and GNU libc are known to prefer to fail rather
254 than doing a lossy conversion. */
255 if (res != (size_t)(-1) && res > 0)
256 {
257 /* iconv() has already incremented INPTR. We cannot go back to a
258 previous INPTR, otherwise the state inside CD would become invalid,
259 if FROM_CODESET is a stateful encoding. So, tell the caller that
260 *INBUF has already been incremented. */
261 *incremented = (inptr > inptr_before);
262 errno = EILSEQ;
263 return (size_t)(-1);
264 }
265 # endif
266
267 if (res != (size_t)(-1))
268 {
269 *outbuf = outptr;
270 *outbytesleft = outsize;
271 }
272 *incremented = false;
273 return res;
274 }
275
276 /* utf8conv_carefully is like iconv, except that
277 - it converts from UTF-8 to UTF-8,
278 - it stops as soon as it encounters a conversion error, and it returns
279 in *INCREMENTED a boolean telling whether it has incremented the input
280 pointers past the error location,
281 - if one_character_only is true, it stops after converting one
282 character. */
283 static size_t
284 utf8conv_carefully (bool one_character_only,
285 const char **inbuf, size_t *inbytesleft,
286 char **outbuf, size_t *outbytesleft,
287 bool *incremented)
288 {
289 const char *inptr = *inbuf;
290 size_t insize = *inbytesleft;
291 char *outptr = *outbuf;
292 size_t outsize = *outbytesleft;
293 size_t res;
294
295 res = 0;
296 do
297 {
298 ucs4_t uc;
299 int n;
300 int m;
301
302 n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
303 if (n < 0)
304 {
305 errno = (n == -2 ? EINVAL : EILSEQ);
306 n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
307 inptr += n;
308 insize -= n;
309 res = (size_t)(-1);
310 *incremented = true;
311 break;
312 }
313 if (outsize == 0)
314 {
315 errno = E2BIG;
316 res = (size_t)(-1);
317 *incremented = false;
318 break;
319 }
320 m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
321 if (m == -2)
322 {
323 errno = E2BIG;
324 res = (size_t)(-1);
325 *incremented = false;
326 break;
327 }
328 inptr += n;
329 insize -= n;
330 if (m == -1)
331 {
332 errno = EILSEQ;
333 res = (size_t)(-1);
334 *incremented = true;
335 break;
336 }
337 outptr += m;
338 outsize -= m;
339 }
340 while (!one_character_only && insize > 0);
341
342 *inbuf = inptr;
343 *inbytesleft = insize;
344 *outbuf = outptr;
345 *outbytesleft = outsize;
346 return res;
347 }
348
/* Convert the SRCLEN bytes at SRC and store the converted bytes in
   *RESULTP, with their count in *LENGTHP.
   CD is the direct conversion descriptor.  When it is (iconv_t)(-1), or when
   it reports EILSEQ while CD2 is not (iconv_t)(-1) and HANDLER is not
   iconveh_error, the conversion is redone from the start of SRC in two steps
   through UTF-8, using CD1 for the first step and CD2 for the second.  A CD1
   or CD2 equal to (iconv_t)(-1) means that this side is treated as UTF-8 and
   handled by utf8conv_carefully.
   HANDLER selects what happens on EILSEQ: with iconveh_error the function
   fails; otherwise input that is invalid in the source encoding is replaced
   by a question mark, and a character that cannot be converted to the target
   encoding is replaced by a question mark or, for iconveh_escape_sequence,
   by an escape of the form \uXXXX or \UXXXXXXXX.
   EXTRA_ALLOC is a number of bytes kept in reserve behind the converted
   bytes in the result buffer; it is not counted in *LENGTHP.
   OFFSETS, if not NULL, is an array of SRCLEN elements.  It is filled with
   (size_t)(-1); then, before a conversion step that starts at input position
   I, OFFSETS[I] is set to the output length reached so far, but only if that
   length differs from the last one recorded.
   On entry, *RESULTP and *LENGTHP may describe a caller-provided buffer and
   its size; it is used when it is large enough, otherwise the result comes
   from malloc or realloc.
   Return 0 if successful, or -1 with errno set.  */
349 static int
350 mem_cd_iconveh_internal (const char *src, size_t srclen,
351 iconv_t cd, iconv_t cd1, iconv_t cd2,
352 enum iconv_ilseq_handler handler,
353 size_t extra_alloc,
354 size_t *offsets,
355 char **resultp, size_t *lengthp)
356 {
357 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
358 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
359 Instead, we have to start afresh from the beginning of SRC. */
360 /* Use a temporary buffer, so that for small strings, a single malloc()
361 call will be sufficient. */
362 # define tmpbufsize 4096
363 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
364 libiconv's UCS-4-INTERNAL encoding. */
365 union { unsigned int align; char buf[tmpbufsize]; } tmp;
366 # define tmpbuf tmp.buf
367
368 char *initial_result;
369 char *result;
370 size_t allocated;
371 size_t length;
372 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
373
374 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
375 {
376 initial_result = *resultp;
377 allocated = *lengthp;
378 }
379 else
380 {
381 initial_result = tmpbuf;
382 allocated = sizeof (tmpbuf);
383 }
384 result = initial_result;
385
386 /* Test whether a direct conversion is possible at all. */
387 if (cd == (iconv_t)(-1))
388 goto indirectly;
389
390 if (offsets != NULL)
391 {
392 size_t i;
393
394 for (i = 0; i < srclen; i++)
395 offsets[i] = (size_t)(-1);
396
397 last_length = (size_t)(-1);
398 }
399 length = 0;
400
401 /* First, try a direct conversion, and see whether a conversion error
402 occurs at all. */
403 {
404 const char *inptr = src;
405 size_t insize = srclen;
406
407 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
408 # if defined _LIBICONV_VERSION \
409 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
410 || defined __sun)
411 /* Set to the initial state. */
412 iconv (cd, NULL, NULL, NULL, NULL);
413 # endif
414
415 while (insize > 0)
416 {
417 char *outptr = result + length;
418 size_t outsize = allocated - extra_alloc - length;
419 bool incremented;
420 size_t res;
421 bool grow;
422
423 if (offsets != NULL)
424 {
425 if (length != last_length) /* ensure that offset[] be increasing */
426 {
427 offsets[inptr - src] = length;
428 last_length = length;
429 }
430 res = iconv_carefully_1 (cd,
431 &inptr, &insize,
432 &outptr, &outsize,
433 &incremented);
434 }
435 else
436 /* Use iconv_carefully instead of iconv here, because:
437 - If TO_CODESET is UTF-8, we can do the error handling in this
438 loop, no need for a second loop,
439 - With iconv() implementations other than GNU libiconv and GNU
440 libc, if we use iconv() in a big swoop, checking for an E2BIG
441 return, we lose the number of irreversible conversions. */
442 res = iconv_carefully (cd,
443 &inptr, &insize,
444 &outptr, &outsize,
445 &incremented);
446
447 length = outptr - result;
/* Ask for a bigger buffer as soon as more than half of it is in use. */
448 grow = (length + extra_alloc > allocated / 2);
449 if (res == (size_t)(-1))
450 {
451 if (errno == E2BIG)
452 grow = true;
453 else if (errno == EINVAL)
454 break;
455 else if (errno == EILSEQ && handler != iconveh_error)
456 {
457 if (cd2 == (iconv_t)(-1))
458 {
459 /* TO_CODESET is UTF-8. */
460 /* Error handling can produce up to 1 byte of output. */
461 if (length + 1 + extra_alloc > allocated)
462 {
463 char *memory;
464
465 allocated = 2 * allocated;
466 if (length + 1 + extra_alloc > allocated)
467 abort ();
468 if (result == initial_result)
469 memory = (char *) malloc (allocated);
470 else
471 memory = (char *) realloc (result, allocated);
472 if (memory == NULL)
473 {
474 if (result != initial_result)
475 free (result);
476 errno = ENOMEM;
477 return -1;
478 }
479 if (result == initial_result)
480 memcpy (memory, initial_result, length);
481 result = memory;
482 grow = false;
483 }
484 /* The input is invalid in FROM_CODESET. Eat up one byte
485 and emit a question mark. */
486 if (!incremented)
487 {
488 if (insize == 0)
489 abort ();
490 inptr++;
491 insize--;
492 }
493 result[length] = '?';
494 length++;
495 }
496 else
497 goto indirectly;
498 }
499 else
500 {
501 if (result != initial_result)
502 {
503 int saved_errno = errno;
504 free (result);
505 errno = saved_errno;
506 }
507 return -1;
508 }
509 }
510 if (insize == 0)
511 break;
512 if (grow)
513 {
514 char *memory;
515
516 allocated = 2 * allocated;
517 if (result == initial_result)
518 memory = (char *) malloc (allocated);
519 else
520 memory = (char *) realloc (result, allocated);
521 if (memory == NULL)
522 {
523 if (result != initial_result)
524 free (result);
525 errno = ENOMEM;
526 return -1;
527 }
528 if (result == initial_result)
529 memcpy (memory, initial_result, length);
530 result = memory;
531 }
532 }
533 }
534
535 /* Now get the conversion state back to the initial state.
536 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
537 #if defined _LIBICONV_VERSION \
538 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
539 || defined __sun)
540 for (;;)
541 {
542 char *outptr = result + length;
543 size_t outsize = allocated - extra_alloc - length;
544 size_t res;
545
546 res = iconv (cd, NULL, NULL, &outptr, &outsize);
547 length = outptr - result;
548 if (res == (size_t)(-1))
549 {
550 if (errno == E2BIG)
551 {
552 char *memory;
553
554 allocated = 2 * allocated;
555 if (result == initial_result)
556 memory = (char *) malloc (allocated);
557 else
558 memory = (char *) realloc (result, allocated);
559 if (memory == NULL)
560 {
561 if (result != initial_result)
562 free (result);
563 errno = ENOMEM;
564 return -1;
565 }
566 if (result == initial_result)
567 memcpy (memory, initial_result, length);
568 result = memory;
569 }
570 else
571 {
572 if (result != initial_result)
573 {
574 int saved_errno = errno;
575 free (result);
576 errno = saved_errno;
577 }
578 return -1;
579 }
580 }
581 else
582 break;
583 }
584 #endif
585
586 /* The direct conversion succeeded. */
587 goto done;
588
589 indirectly:
590 /* The direct conversion failed.
591 Use a conversion through UTF-8. */
592 if (offsets != NULL)
593 {
594 size_t i;
595
596 for (i = 0; i < srclen; i++)
597 offsets[i] = (size_t)(-1);
598
599 last_length = (size_t)(-1);
600 }
601 length = 0;
602 {
/* SLOWLY selects the one-character-at-a-time converters for step 1
   (iconv_carefully_1, or utf8conv_carefully with one_character_only). */
603 const bool slowly = (offsets != NULL || handler == iconveh_error);
604 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
605 char utf8buf[utf8bufsize + 1];
606 size_t utf8len = 0;
607 const char *in1ptr = src;
608 size_t in1size = srclen;
609 bool do_final_flush1 = true;
610 bool do_final_flush2 = true;
611
612 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
613 # if defined _LIBICONV_VERSION \
614 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
615 || defined __sun)
616 /* Set to the initial state. */
617 if (cd1 != (iconv_t)(-1))
618 iconv (cd1, NULL, NULL, NULL, NULL);
619 if (cd2 != (iconv_t)(-1))
620 iconv (cd2, NULL, NULL, NULL, NULL);
621 # endif
622
623 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
624 {
625 char *out1ptr = utf8buf + utf8len;
626 size_t out1size = utf8bufsize - utf8len;
627 bool incremented1;
628 size_t res1;
629 int errno1;
630
631 /* Conversion step 1: from FROM_CODESET to UTF-8. */
632 if (in1size > 0)
633 {
634 if (offsets != NULL
635 && length != last_length) /* ensure that offset[] be increasing */
636 {
637 offsets[in1ptr - src] = length;
638 last_length = length;
639 }
640 if (cd1 != (iconv_t)(-1))
641 {
642 if (slowly)
643 res1 = iconv_carefully_1 (cd1,
644 &in1ptr, &in1size,
645 &out1ptr, &out1size,
646 &incremented1);
647 else
648 res1 = iconv_carefully (cd1,
649 &in1ptr, &in1size,
650 &out1ptr, &out1size,
651 &incremented1);
652 }
653 else
654 {
655 /* FROM_CODESET is UTF-8. */
656 res1 = utf8conv_carefully (slowly,
657 &in1ptr, &in1size,
658 &out1ptr, &out1size,
659 &incremented1);
660 }
661 }
662 else if (do_final_flush1)
663 {
664 /* Now get the conversion state of CD1 back to the initial state.
665 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
666 # if defined _LIBICONV_VERSION \
667 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
668 || defined __sun)
669 if (cd1 != (iconv_t)(-1))
670 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
671 else
672 # endif
673 res1 = 0;
674 do_final_flush1 = false;
675 incremented1 = true;
676 }
677 else
678 {
679 res1 = 0;
680 incremented1 = true;
681 }
682 if (res1 == (size_t)(-1)
683 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
684 {
685 if (result != initial_result)
686 {
687 int saved_errno = errno;
688 free (result);
689 errno = saved_errno;
690 }
691 return -1;
692 }
693 if (res1 == (size_t)(-1)
694 && errno == EILSEQ && handler != iconveh_error)
695 {
696 /* The input is invalid in FROM_CODESET. Eat up one byte and
697 emit a question mark. Room for the question mark was allocated
698 at the end of utf8buf. */
699 if (!incremented1)
700 {
701 if (in1size == 0)
702 abort ();
703 in1ptr++;
704 in1size--;
705 }
706 *out1ptr++ = '?';
707 res1 = 0;
708 }
709 errno1 = errno;
710 utf8len = out1ptr - utf8buf;
711
/* Run step 2 after every step 1 when offsets are recorded, and otherwise
   when the input is exhausted, when utf8buf is more than half full, or
   when step 1 ran out of room in utf8buf. */
712 if (offsets != NULL
713 || in1size == 0
714 || utf8len > utf8bufsize / 2
715 || (res1 == (size_t)(-1) && errno1 == E2BIG))
716 {
717 /* Conversion step 2: from UTF-8 to TO_CODESET. */
718 const char *in2ptr = utf8buf;
719 size_t in2size = utf8len;
720
721 while (in2size > 0
722 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
723 {
724 char *out2ptr = result + length;
725 size_t out2size = allocated - extra_alloc - length;
726 bool incremented2;
727 size_t res2;
728 bool grow;
729
730 if (in2size > 0)
731 {
732 if (cd2 != (iconv_t)(-1))
733 res2 = iconv_carefully (cd2,
734 &in2ptr, &in2size,
735 &out2ptr, &out2size,
736 &incremented2);
737 else
738 /* TO_CODESET is UTF-8. */
739 res2 = utf8conv_carefully (false,
740 &in2ptr, &in2size,
741 &out2ptr, &out2size,
742 &incremented2);
743 }
744 else /* in1size == 0 && !do_final_flush1
745 && in2size == 0 && do_final_flush2 */
746 {
747 /* Now get the conversion state of CD2 back to the initial
748 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
749 # if defined _LIBICONV_VERSION \
750 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
751 || defined __sun)
752 if (cd2 != (iconv_t)(-1))
753 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
754 else
755 # endif
756 res2 = 0;
757 do_final_flush2 = false;
758 incremented2 = true;
759 }
760
761 length = out2ptr - result;
762 grow = (length + extra_alloc > allocated / 2);
763 if (res2 == (size_t)(-1))
764 {
765 if (errno == E2BIG)
766 grow = true;
767 else if (errno == EINVAL)
768 break;
769 else if (errno == EILSEQ && handler != iconveh_error)
770 {
771 /* Error handling can produce up to 10 bytes of ASCII
772 output. But TO_CODESET may be UCS-2, UTF-16 or
773 UCS-4, so use CD2 here as well. */
774 char scratchbuf[10];
775 size_t scratchlen;
776 ucs4_t uc;
777 const char *inptr;
778 size_t insize;
779 size_t res;
780
781 if (incremented2)
782 {
783 if (u8_prev (&uc, (const uint8_t *) in2ptr,
784 (const uint8_t *) utf8buf)
785 == NULL)
786 abort ();
787 }
788 else
789 {
790 int n;
791 if (in2size == 0)
792 abort ();
793 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
794 in2size);
795 in2ptr += n;
796 in2size -= n;
797 }
798
799 if (handler == iconveh_escape_sequence)
800 {
801 static char hex[16] = "0123456789ABCDEF";
802 scratchlen = 0;
803 scratchbuf[scratchlen++] = '\\';
804 if (uc < 0x10000)
805 scratchbuf[scratchlen++] = 'u';
806 else
807 {
808 scratchbuf[scratchlen++] = 'U';
809 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
810 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
811 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
812 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
813 }
814 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
815 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
816 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
817 scratchbuf[scratchlen++] = hex[uc & 15];
818 }
819 else
820 {
821 scratchbuf[0] = '?';
822 scratchlen = 1;
823 }
824
825 inptr = scratchbuf;
826 insize = scratchlen;
827 if (cd2 != (iconv_t)(-1))
828 res = iconv (cd2,
829 (ICONV_CONST char **) &inptr, &insize,
830 &out2ptr, &out2size);
831 else
832 {
833 /* TO_CODESET is UTF-8. */
834 if (out2size >= insize)
835 {
836 memcpy (out2ptr, inptr, insize);
837 out2ptr += insize;
838 out2size -= insize;
839 inptr += insize;
840 insize = 0;
841 res = 0;
842 }
843 else
844 {
845 errno = E2BIG;
846 res = (size_t)(-1);
847 }
848 }
849 length = out2ptr - result;
850 if (res == (size_t)(-1) && errno == E2BIG)
851 {
852 char *memory;
853
854 allocated = 2 * allocated;
855 if (length + 1 + extra_alloc > allocated)
856 abort ();
857 if (result == initial_result)
858 memory = (char *) malloc (allocated);
859 else
860 memory = (char *) realloc (result, allocated);
861 if (memory == NULL)
862 {
863 if (result != initial_result)
864 free (result);
865 errno = ENOMEM;
866 return -1;
867 }
868 if (result == initial_result)
869 memcpy (memory, initial_result, length);
870 result = memory;
871 grow = false;
872
873 out2ptr = result + length;
874 out2size = allocated - extra_alloc - length;
875 if (cd2 != (iconv_t)(-1))
876 res = iconv (cd2,
877 (ICONV_CONST char **) &inptr,
878 &insize,
879 &out2ptr, &out2size);
880 else
881 {
882 /* TO_CODESET is UTF-8. */
883 if (!(out2size >= insize))
884 abort ();
885 memcpy (out2ptr, inptr, insize);
886 out2ptr += insize;
887 out2size -= insize;
888 inptr += insize;
889 insize = 0;
890 res = 0;
891 }
892 length = out2ptr - result;
893 }
894 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
895 /* Irix iconv() inserts a NUL byte if it cannot convert.
896 NetBSD iconv() inserts a question mark if it cannot
897 convert.
898 Only GNU libiconv and GNU libc are known to prefer
899 to fail rather than doing a lossy conversion. */
900 if (res != (size_t)(-1) && res > 0)
901 {
902 errno = EILSEQ;
903 res = (size_t)(-1);
904 }
905 # endif
906 if (res == (size_t)(-1))
907 {
908 /* Failure converting the ASCII replacement. */
909 if (result != initial_result)
910 {
911 int saved_errno = errno;
912 free (result);
913 errno = saved_errno;
914 }
915 return -1;
916 }
917 }
918 else
919 {
920 if (result != initial_result)
921 {
922 int saved_errno = errno;
923 free (result);
924 errno = saved_errno;
925 }
926 return -1;
927 }
928 }
929 if (!(in2size > 0
930 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
931 break;
932 if (grow)
933 {
934 char *memory;
935
936 allocated = 2 * allocated;
937 if (result == initial_result)
938 memory = (char *) malloc (allocated);
939 else
940 memory = (char *) realloc (result, allocated);
941 if (memory == NULL)
942 {
943 if (result != initial_result)
944 free (result);
945 errno = ENOMEM;
946 return -1;
947 }
948 if (result == initial_result)
949 memcpy (memory, initial_result, length);
950 result = memory;
951 }
952 }
953
954 /* Move the remaining bytes to the beginning of utf8buf. */
955 if (in2size > 0)
956 memmove (utf8buf, in2ptr, in2size);
957 utf8len = in2size;
958 }
959
/* Step 1 failed.  On EINVAL, stop reading input.  EILSEQ can only still be
   pending here when HANDLER is iconveh_error, because otherwise res1 was
   reset to 0 above; in that case fail. */
960 if (res1 == (size_t)(-1))
961 {
962 if (errno1 == EINVAL)
963 in1size = 0;
964 else if (errno1 == EILSEQ)
965 {
966 if (result != initial_result)
967 free (result);
968 errno = errno1;
969 return -1;
970 }
971 }
972 }
973 # undef utf8bufsize
974 }
975
976 done:
977 /* Now the final memory allocation. */
978 if (result == tmpbuf)
979 {
980 size_t memsize = length + extra_alloc;
981
982 if (*resultp != NULL && *lengthp >= memsize)
983 result = *resultp;
984 else
985 {
986 char *memory;
987
988 memory = (char *) malloc (memsize > 0 ? memsize : 1);
989 if (memory != NULL)
990 result = memory;
991 else
992 {
993 errno = ENOMEM;
994 return -1;
995 }
996 }
997 memcpy (result, tmpbuf, length);
998 }
999 else if (result != *resultp && length + extra_alloc < allocated)
1000 {
1001 /* Shrink the allocated memory if possible. */
1002 size_t memsize = length + extra_alloc;
1003 char *memory;
1004
1005 memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1006 if (memory != NULL)
1007 result = memory;
1008 }
1009 *resultp = result;
1010 *lengthp = length;
1011 return 0;
1012 # undef tmpbuf
1013 # undef tmpbufsize
1014 }
1015
1016 int
1017 mem_cd_iconveh (const char *src, size_t srclen,
1018 const iconveh_t *cd,
1019 enum iconv_ilseq_handler handler,
1020 size_t *offsets,
1021 char **resultp, size_t *lengthp)
1022 {
1023 return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1024 handler, 0, offsets, resultp, lengthp);
1025 }
1026
1027 char *
1028 str_cd_iconveh (const char *src,
1029 const iconveh_t *cd,
1030 enum iconv_ilseq_handler handler)
1031 {
1032 /* For most encodings, a trailing NUL byte in the input will be converted
1033 to a trailing NUL byte in the output. But not for UTF-7. So that this
1034 function is usable for UTF-7, we have to exclude the NUL byte from the
1035 conversion and add it by hand afterwards. */
1036 char *result = NULL;
1037 size_t length = 0;
1038 int retval = mem_cd_iconveh_internal (src, strlen (src),
1039 cd->cd, cd->cd1, cd->cd2, handler, 1,
1040 NULL, &result, &length);
1041
1042 if (retval < 0)
1043 {
1044 if (result != NULL)
1045 {
1046 int saved_errno = errno;
1047 free (result);
1048 errno = saved_errno;
1049 }
1050 return NULL;
1051 }
1052
1053 /* Add the terminating NUL byte. */
1054 result[length] = '\0';
1055
1056 return result;
1057 }
1058
1059 #endif
1060
1061 int
1062 mem_iconveh (const char *src, size_t srclen,
1063 const char *from_codeset, const char *to_codeset,
1064 enum iconv_ilseq_handler handler,
1065 size_t *offsets,
1066 char **resultp, size_t *lengthp)
1067 {
1068 if (srclen == 0)
1069 {
1070 /* Nothing to convert. */
1071 *lengthp = 0;
1072 return 0;
1073 }
1074 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1075 {
1076 char *result;
1077
1078 if (*resultp != NULL && *lengthp >= srclen)
1079 result = *resultp;
1080 else
1081 {
1082 result = (char *) malloc (srclen);
1083 if (result == NULL)
1084 {
1085 errno = ENOMEM;
1086 return -1;
1087 }
1088 }
1089 memcpy (result, src, srclen);
1090 *resultp = result;
1091 *lengthp = srclen;
1092 return 0;
1093 }
1094 else
1095 {
1096 #if HAVE_ICONV
1097 iconveh_t cd;
1098 char *result;
1099 size_t length;
1100 int retval;
1101
1102 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1103 return -1;
1104
1105 result = *resultp;
1106 length = *lengthp;
1107 retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1108 &result, &length);
1109
1110 if (retval < 0)
1111 {
1112 /* Close cd, but preserve the errno from str_cd_iconv. */
1113 int saved_errno = errno;
1114 iconveh_close (&cd);
1115 errno = saved_errno;
1116 }
1117 else
1118 {
1119 if (iconveh_close (&cd) < 0)
1120 {
1121 /* Return -1, but free the allocated memory, and while doing
1122 that, preserve the errno from iconveh_close. */
1123 int saved_errno = errno;
1124 if (result != *resultp && result != NULL)
1125 free (result);
1126 errno = saved_errno;
1127 return -1;
1128 }
1129 *resultp = result;
1130 *lengthp = length;
1131 }
1132 return retval;
1133 #else
1134 /* This is a different error code than if iconv_open existed but didn't
1135 support from_codeset and to_codeset, so that the caller can emit
1136 an error message such as
1137 "iconv() is not supported. Installing GNU libiconv and
1138 then reinstalling this package would fix this." */
1139 errno = ENOSYS;
1140 return -1;
1141 #endif
1142 }
1143 }
1144
1145 char *
1146 str_iconveh (const char *src,
1147 const char *from_codeset, const char *to_codeset,
1148 enum iconv_ilseq_handler handler)
1149 {
1150 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1151 {
1152 char *result = strdup (src);
1153
1154 if (result == NULL)
1155 errno = ENOMEM;
1156 return result;
1157 }
1158 else
1159 {
1160 #if HAVE_ICONV
1161 iconveh_t cd;
1162 char *result;
1163
1164 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1165 return NULL;
1166
1167 result = str_cd_iconveh (src, &cd, handler);
1168
1169 if (result == NULL)
1170 {
1171 /* Close cd, but preserve the errno from str_cd_iconv. */
1172 int saved_errno = errno;
1173 iconveh_close (&cd);
1174 errno = saved_errno;
1175 }
1176 else
1177 {
1178 if (iconveh_close (&cd) < 0)
1179 {
1180 /* Return NULL, but free the allocated memory, and while doing
1181 that, preserve the errno from iconveh_close. */
1182 int saved_errno = errno;
1183 free (result);
1184 errno = saved_errno;
1185 return NULL;
1186 }
1187 }
1188 return result;
1189 #else
1190 /* This is a different error code than if iconv_open existed but didn't
1191 support from_codeset and to_codeset, so that the caller can emit
1192 an error message such as
1193 "iconv() is not supported. Installing GNU libiconv and
1194 then reinstalling this package would fix this." */
1195 errno = ENOSYS;
1196 return NULL;
1197 #endif
1198 }
1199 }