Explicitly use Gnulib's `verify' module.
[bpt/guile.git] / lib / striconveh.c
CommitLineData
24d56127
LC
1/* Character set conversion with error handling.
2 Copyright (C) 2001-2008 Free Software Foundation, Inc.
3 Written by Bruno Haible and Simon Josefsson.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17
18#include <config.h>
19
20/* Specification. */
21#include "striconveh.h"
22
23#include <errno.h>
24#include <stdbool.h>
25#include <stdlib.h>
26#include <string.h>
27
28#if HAVE_ICONV
29# include <iconv.h>
30# include "unistr.h"
31#endif
32
33#include "c-strcase.h"
34#include "c-strcaseeq.h"
35
36#ifndef SIZE_MAX
37# define SIZE_MAX ((size_t) -1)
38#endif
39
40
41#if HAVE_ICONV
42
43/* The caller must provide CD, CD1, CD2, not just CD, because when a conversion
44 error occurs, we may have to determine the Unicode representation of the
45 inconvertible character. */
46
47/* iconv_carefully is like iconv, except that it stops as soon as it encounters
48 a conversion error, and it returns in *INCREMENTED a boolean telling whether
49 it has incremented the input pointers past the error location. */
50# if !defined _LIBICONV_VERSION && !defined __GLIBC__
51/* Irix iconv() inserts a NUL byte if it cannot convert.
52 NetBSD iconv() inserts a question mark if it cannot convert.
53 Only GNU libiconv and GNU libc are known to prefer to fail rather
54 than doing a lossy conversion. */
55static size_t
56iconv_carefully (iconv_t cd,
57 const char **inbuf, size_t *inbytesleft,
58 char **outbuf, size_t *outbytesleft,
59 bool *incremented)
60{
61 const char *inptr = *inbuf;
62 const char *inptr_end = inptr + *inbytesleft;
63 char *outptr = *outbuf;
64 size_t outsize = *outbytesleft;
65 const char *inptr_before;
66 size_t res;
67
68 do
69 {
70 size_t insize;
71
72 inptr_before = inptr;
73 res = (size_t)(-1);
74
75 for (insize = 1; inptr + insize <= inptr_end; insize++)
76 {
77 res = iconv (cd,
78 (ICONV_CONST char **) &inptr, &insize,
79 &outptr, &outsize);
80 if (!(res == (size_t)(-1) && errno == EINVAL))
81 break;
82 /* iconv can eat up a shift sequence but give EINVAL while attempting
83 to convert the first character. E.g. libiconv does this. */
84 if (inptr > inptr_before)
85 {
86 res = 0;
87 break;
88 }
89 }
90
91 if (res == 0)
92 {
93 *outbuf = outptr;
94 *outbytesleft = outsize;
95 }
96 }
97 while (res == 0 && inptr < inptr_end);
98
99 *inbuf = inptr;
100 *inbytesleft = inptr_end - inptr;
101 if (res != (size_t)(-1) && res > 0)
102 {
103 /* iconv() has already incremented INPTR. We cannot go back to a
104 previous INPTR, otherwise the state inside CD would become invalid,
105 if FROM_CODESET is a stateful encoding. So, tell the caller that
106 *INBUF has already been incremented. */
107 *incremented = (inptr > inptr_before);
108 errno = EILSEQ;
109 return (size_t)(-1);
110 }
111 else
112 {
113 *incremented = false;
114 return res;
115 }
116}
117# else
/* Variant used when _LIBICONV_VERSION or __GLIBC__ is defined: iconv() is
   called directly on the whole input, after storing false in *INCREMENTED.
   INBUF is cast to the pointer type that this platform's iconv() expects.  */
118# define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
119 (*(incremented) = false, \
120 iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
121# endif
122
123/* iconv_carefully_1 is like iconv_carefully, except that it stops after
124 converting one character or one shift sequence. */
125static size_t
126iconv_carefully_1 (iconv_t cd,
127 const char **inbuf, size_t *inbytesleft,
128 char **outbuf, size_t *outbytesleft,
129 bool *incremented)
130{
131 const char *inptr_before = *inbuf;
132 const char *inptr = inptr_before;
133 const char *inptr_end = inptr_before + *inbytesleft;
134 char *outptr = *outbuf;
135 size_t outsize = *outbytesleft;
136 size_t res = (size_t)(-1);
137 size_t insize;
138
139 for (insize = 1; inptr_before + insize <= inptr_end; insize++)
140 {
141 inptr = inptr_before;
142 res = iconv (cd,
143 (ICONV_CONST char **) &inptr, &insize,
144 &outptr, &outsize);
145 if (!(res == (size_t)(-1) && errno == EINVAL))
146 break;
147 /* iconv can eat up a shift sequence but give EINVAL while attempting
148 to convert the first character. E.g. libiconv does this. */
149 if (inptr > inptr_before)
150 {
151 res = 0;
152 break;
153 }
154 }
155
156 *inbuf = inptr;
157 *inbytesleft = inptr_end - inptr;
158# if !defined _LIBICONV_VERSION && !defined __GLIBC__
159 /* Irix iconv() inserts a NUL byte if it cannot convert.
160 NetBSD iconv() inserts a question mark if it cannot convert.
161 Only GNU libiconv and GNU libc are known to prefer to fail rather
162 than doing a lossy conversion. */
163 if (res != (size_t)(-1) && res > 0)
164 {
165 /* iconv() has already incremented INPTR. We cannot go back to a
166 previous INPTR, otherwise the state inside CD would become invalid,
167 if FROM_CODESET is a stateful encoding. So, tell the caller that
168 *INBUF has already been incremented. */
169 *incremented = (inptr > inptr_before);
170 errno = EILSEQ;
171 return (size_t)(-1);
172 }
173# endif
174
175 if (res != (size_t)(-1))
176 {
177 *outbuf = outptr;
178 *outbytesleft = outsize;
179 }
180 *incremented = false;
181 return res;
182}
183
184/* utf8conv_carefully is like iconv, except that
185 - it converts from UTF-8 to UTF-8,
186 - it stops as soon as it encounters a conversion error, and it returns
187 in *INCREMENTED a boolean telling whether it has incremented the input
188 pointers past the error location,
189 - if one_character_only is true, it stops after converting one
190 character. */
191static size_t
192utf8conv_carefully (bool one_character_only,
193 const char **inbuf, size_t *inbytesleft,
194 char **outbuf, size_t *outbytesleft,
195 bool *incremented)
196{
197 const char *inptr = *inbuf;
198 size_t insize = *inbytesleft;
199 char *outptr = *outbuf;
200 size_t outsize = *outbytesleft;
201 size_t res;
202
203 res = 0;
204 do
205 {
206 ucs4_t uc;
207 int n;
208 int m;
209
210 n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
211 if (n < 0)
212 {
213 errno = (n == -2 ? EINVAL : EILSEQ);
214 n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
215 inptr += n;
216 insize -= n;
217 res = (size_t)(-1);
218 *incremented = true;
219 break;
220 }
221 if (outsize == 0)
222 {
223 errno = E2BIG;
224 res = (size_t)(-1);
225 *incremented = false;
226 break;
227 }
228 m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
229 if (m == -2)
230 {
231 errno = E2BIG;
232 res = (size_t)(-1);
233 *incremented = false;
234 break;
235 }
236 inptr += n;
237 insize -= n;
238 if (m == -1)
239 {
240 errno = EILSEQ;
241 res = (size_t)(-1);
242 *incremented = true;
243 break;
244 }
245 outptr += m;
246 outsize -= m;
247 }
248 while (!one_character_only && insize > 0);
249
250 *inbuf = inptr;
251 *inbytesleft = insize;
252 *outbuf = outptr;
253 *outbytesleft = outsize;
254 return res;
255}
256
257static int
258mem_cd_iconveh_internal (const char *src, size_t srclen,
259 iconv_t cd, iconv_t cd1, iconv_t cd2,
260 enum iconv_ilseq_handler handler,
261 size_t extra_alloc,
262 size_t *offsets,
263 char **resultp, size_t *lengthp)
264{
265 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
266 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
267 Instead, we have to start afresh from the beginning of SRC. */
268 /* Use a temporary buffer, so that for small strings, a single malloc()
269 call will be sufficient. */
270# define tmpbufsize 4096
271 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
272 libiconv's UCS-4-INTERNAL encoding. */
273 union { unsigned int align; char buf[tmpbufsize]; } tmp;
274# define tmpbuf tmp.buf
275
276 char *initial_result;
277 char *result;
278 size_t allocated;
279 size_t length;
280 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
281
282 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
283 {
284 initial_result = *resultp;
285 allocated = *lengthp;
286 }
287 else
288 {
289 initial_result = tmpbuf;
290 allocated = sizeof (tmpbuf);
291 }
292 result = initial_result;
293
294 /* Test whether a direct conversion is possible at all. */
295 if (cd == (iconv_t)(-1))
296 goto indirectly;
297
298 if (offsets != NULL)
299 {
300 size_t i;
301
302 for (i = 0; i < srclen; i++)
303 offsets[i] = (size_t)(-1);
304
305 last_length = (size_t)(-1);
306 }
307 length = 0;
308
309 /* First, try a direct conversion, and see whether a conversion error
310 occurs at all. */
311 {
312 const char *inptr = src;
313 size_t insize = srclen;
314
315 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
316# if defined _LIBICONV_VERSION \
317 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
318 /* Set to the initial state. */
319 iconv (cd, NULL, NULL, NULL, NULL);
320# endif
321
322 while (insize > 0)
323 {
324 char *outptr = result + length;
325 size_t outsize = allocated - extra_alloc - length;
326 bool incremented;
327 size_t res;
328 bool grow;
329
330 if (offsets != NULL)
331 {
332 if (length != last_length) /* ensure that offset[] be increasing */
333 {
334 offsets[inptr - src] = length;
335 last_length = length;
336 }
337 res = iconv_carefully_1 (cd,
338 &inptr, &insize,
339 &outptr, &outsize,
340 &incremented);
341 }
342 else
343 /* Use iconv_carefully instead of iconv here, because:
344 - If TO_CODESET is UTF-8, we can do the error handling in this
345 loop, no need for a second loop,
346 - With iconv() implementations other than GNU libiconv and GNU
347 libc, if we use iconv() in a big swoop, checking for an E2BIG
348 return, we lose the number of irreversible conversions. */
349 res = iconv_carefully (cd,
350 &inptr, &insize,
351 &outptr, &outsize,
352 &incremented);
353
354 length = outptr - result;
355 grow = (length + extra_alloc > allocated / 2);
356 if (res == (size_t)(-1))
357 {
358 if (errno == E2BIG)
359 grow = true;
360 else if (errno == EINVAL)
361 break;
362 else if (errno == EILSEQ && handler != iconveh_error)
363 {
364 if (cd2 == (iconv_t)(-1))
365 {
366 /* TO_CODESET is UTF-8. */
367 /* Error handling can produce up to 1 byte of output. */
368 if (length + 1 + extra_alloc > allocated)
369 {
370 char *memory;
371
372 allocated = 2 * allocated;
373 if (length + 1 + extra_alloc > allocated)
374 abort ();
375 if (result == initial_result)
376 memory = (char *) malloc (allocated);
377 else
378 memory = (char *) realloc (result, allocated);
379 if (memory == NULL)
380 {
381 if (result != initial_result)
382 free (result);
383 errno = ENOMEM;
384 return -1;
385 }
386 if (result == initial_result)
387 memcpy (memory, initial_result, length);
388 result = memory;
389 grow = false;
390 }
391 /* The input is invalid in FROM_CODESET. Eat up one byte
392 and emit a question mark. */
393 if (!incremented)
394 {
395 if (insize == 0)
396 abort ();
397 inptr++;
398 insize--;
399 }
400 result[length] = '?';
401 length++;
402 }
403 else
404 goto indirectly;
405 }
406 else
407 {
408 if (result != initial_result)
409 {
410 int saved_errno = errno;
411 free (result);
412 errno = saved_errno;
413 }
414 return -1;
415 }
416 }
417 if (insize == 0)
418 break;
419 if (grow)
420 {
421 char *memory;
422
423 allocated = 2 * allocated;
424 if (result == initial_result)
425 memory = (char *) malloc (allocated);
426 else
427 memory = (char *) realloc (result, allocated);
428 if (memory == NULL)
429 {
430 if (result != initial_result)
431 free (result);
432 errno = ENOMEM;
433 return -1;
434 }
435 if (result == initial_result)
436 memcpy (memory, initial_result, length);
437 result = memory;
438 }
439 }
440 }
441
442 /* Now get the conversion state back to the initial state.
443 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
444#if defined _LIBICONV_VERSION \
445 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
446 for (;;)
447 {
448 char *outptr = result + length;
449 size_t outsize = allocated - extra_alloc - length;
450 size_t res;
451
452 res = iconv (cd, NULL, NULL, &outptr, &outsize);
453 length = outptr - result;
454 if (res == (size_t)(-1))
455 {
456 if (errno == E2BIG)
457 {
458 char *memory;
459
460 allocated = 2 * allocated;
461 if (result == initial_result)
462 memory = (char *) malloc (allocated);
463 else
464 memory = (char *) realloc (result, allocated);
465 if (memory == NULL)
466 {
467 if (result != initial_result)
468 free (result);
469 errno = ENOMEM;
470 return -1;
471 }
472 if (result == initial_result)
473 memcpy (memory, initial_result, length);
474 result = memory;
475 }
476 else
477 {
478 if (result != initial_result)
479 {
480 int saved_errno = errno;
481 free (result);
482 errno = saved_errno;
483 }
484 return -1;
485 }
486 }
487 else
488 break;
489 }
490#endif
491
492 /* The direct conversion succeeded. */
493 goto done;
494
495 indirectly:
496 /* The direct conversion failed.
497 Use a conversion through UTF-8. */
498 if (offsets != NULL)
499 {
500 size_t i;
501
502 for (i = 0; i < srclen; i++)
503 offsets[i] = (size_t)(-1);
504
505 last_length = (size_t)(-1);
506 }
507 length = 0;
508 {
509 const bool slowly = (offsets != NULL || handler == iconveh_error);
510# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
511 char utf8buf[utf8bufsize + 1];
512 size_t utf8len = 0;
513 const char *in1ptr = src;
514 size_t in1size = srclen;
515 bool do_final_flush1 = true;
516 bool do_final_flush2 = true;
517
518 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
519# if defined _LIBICONV_VERSION \
520 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
521 /* Set to the initial state. */
522 if (cd1 != (iconv_t)(-1))
523 iconv (cd1, NULL, NULL, NULL, NULL);
524 if (cd2 != (iconv_t)(-1))
525 iconv (cd2, NULL, NULL, NULL, NULL);
526# endif
527
528 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
529 {
530 char *out1ptr = utf8buf + utf8len;
531 size_t out1size = utf8bufsize - utf8len;
532 bool incremented1;
533 size_t res1;
534 int errno1;
535
536 /* Conversion step 1: from FROM_CODESET to UTF-8. */
537 if (in1size > 0)
538 {
539 if (offsets != NULL
540 && length != last_length) /* ensure that offset[] be increasing */
541 {
542 offsets[in1ptr - src] = length;
543 last_length = length;
544 }
545 if (cd1 != (iconv_t)(-1))
546 {
547 if (slowly)
548 res1 = iconv_carefully_1 (cd1,
549 &in1ptr, &in1size,
550 &out1ptr, &out1size,
551 &incremented1);
552 else
553 res1 = iconv_carefully (cd1,
554 &in1ptr, &in1size,
555 &out1ptr, &out1size,
556 &incremented1);
557 }
558 else
559 {
560 /* FROM_CODESET is UTF-8. */
561 res1 = utf8conv_carefully (slowly,
562 &in1ptr, &in1size,
563 &out1ptr, &out1size,
564 &incremented1);
565 }
566 }
567 else if (do_final_flush1)
568 {
569 /* Now get the conversion state of CD1 back to the initial state.
570 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
571# if defined _LIBICONV_VERSION \
572 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
573 if (cd1 != (iconv_t)(-1))
574 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
575 else
576# endif
577 res1 = 0;
578 do_final_flush1 = false;
579 incremented1 = true;
580 }
581 else
582 {
583 res1 = 0;
584 incremented1 = true;
585 }
586 if (res1 == (size_t)(-1)
587 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
588 {
589 if (result != initial_result)
590 {
591 int saved_errno = errno;
592 free (result);
593 errno = saved_errno;
594 }
595 return -1;
596 }
597 if (res1 == (size_t)(-1)
598 && errno == EILSEQ && handler != iconveh_error)
599 {
600 /* The input is invalid in FROM_CODESET. Eat up one byte and
601 emit a question mark. Room for the question mark was allocated
602 at the end of utf8buf. */
603 if (!incremented1)
604 {
605 if (in1size == 0)
606 abort ();
607 in1ptr++;
608 in1size--;
609 }
610 utf8buf[utf8len++] = '?';
611 }
612 errno1 = errno;
613 utf8len = out1ptr - utf8buf;
614
615 if (offsets != NULL
616 || in1size == 0
617 || utf8len > utf8bufsize / 2
618 || (res1 == (size_t)(-1) && errno1 == E2BIG))
619 {
620 /* Conversion step 2: from UTF-8 to TO_CODESET. */
621 const char *in2ptr = utf8buf;
622 size_t in2size = utf8len;
623
624 while (in2size > 0
625 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
626 {
627 char *out2ptr = result + length;
628 size_t out2size = allocated - extra_alloc - length;
629 bool incremented2;
630 size_t res2;
631 bool grow;
632
633 if (in2size > 0)
634 {
635 if (cd2 != (iconv_t)(-1))
636 res2 = iconv_carefully (cd2,
637 &in2ptr, &in2size,
638 &out2ptr, &out2size,
639 &incremented2);
640 else
641 /* TO_CODESET is UTF-8. */
642 res2 = utf8conv_carefully (false,
643 &in2ptr, &in2size,
644 &out2ptr, &out2size,
645 &incremented2);
646 }
647 else /* in1size == 0 && !do_final_flush1
648 && in2size == 0 && do_final_flush2 */
649 {
650 /* Now get the conversion state of CD1 back to the initial
651 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
652# if defined _LIBICONV_VERSION \
653 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
654 if (cd2 != (iconv_t)(-1))
655 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
656 else
657# endif
658 res2 = 0;
659 do_final_flush2 = false;
660 incremented2 = true;
661 }
662
663 length = out2ptr - result;
664 grow = (length + extra_alloc > allocated / 2);
665 if (res2 == (size_t)(-1))
666 {
667 if (errno == E2BIG)
668 grow = true;
669 else if (errno == EINVAL)
670 break;
671 else if (errno == EILSEQ && handler != iconveh_error)
672 {
673 /* Error handling can produce up to 10 bytes of ASCII
674 output. But TO_CODESET may be UCS-2, UTF-16 or
675 UCS-4, so use CD2 here as well. */
676 char scratchbuf[10];
677 size_t scratchlen;
678 ucs4_t uc;
679 const char *inptr;
680 size_t insize;
681 size_t res;
682
683 if (incremented2)
684 {
685 if (u8_prev (&uc, (const uint8_t *) in2ptr,
686 (const uint8_t *) utf8buf)
687 == NULL)
688 abort ();
689 }
690 else
691 {
692 int n;
693 if (in2size == 0)
694 abort ();
695 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
696 in2size);
697 in2ptr += n;
698 in2size -= n;
699 }
700
701 if (handler == iconveh_escape_sequence)
702 {
703 static char hex[16] = "0123456789ABCDEF";
704 scratchlen = 0;
705 scratchbuf[scratchlen++] = '\\';
706 if (uc < 0x10000)
707 scratchbuf[scratchlen++] = 'u';
708 else
709 {
710 scratchbuf[scratchlen++] = 'U';
711 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
712 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
713 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
714 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
715 }
716 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
717 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
718 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
719 scratchbuf[scratchlen++] = hex[uc & 15];
720 }
721 else
722 {
723 scratchbuf[0] = '?';
724 scratchlen = 1;
725 }
726
727 inptr = scratchbuf;
728 insize = scratchlen;
729 if (cd2 != (iconv_t)(-1))
730 res = iconv (cd2,
731 (ICONV_CONST char **) &inptr, &insize,
732 &out2ptr, &out2size);
733 else
734 {
735 /* TO_CODESET is UTF-8. */
736 if (out2size >= insize)
737 {
738 memcpy (out2ptr, inptr, insize);
739 out2ptr += insize;
740 out2size -= insize;
741 inptr += insize;
742 insize = 0;
743 res = 0;
744 }
745 else
746 {
747 errno = E2BIG;
748 res = (size_t)(-1);
749 }
750 }
751 length = out2ptr - result;
752 if (res == (size_t)(-1) && errno == E2BIG)
753 {
754 char *memory;
755
756 allocated = 2 * allocated;
757 if (length + 1 + extra_alloc > allocated)
758 abort ();
759 if (result == initial_result)
760 memory = (char *) malloc (allocated);
761 else
762 memory = (char *) realloc (result, allocated);
763 if (memory == NULL)
764 {
765 if (result != initial_result)
766 free (result);
767 errno = ENOMEM;
768 return -1;
769 }
770 if (result == initial_result)
771 memcpy (memory, initial_result, length);
772 result = memory;
773 grow = false;
774
775 out2ptr = result + length;
776 out2size = allocated - extra_alloc - length;
777 if (cd2 != (iconv_t)(-1))
778 res = iconv (cd2,
779 (ICONV_CONST char **) &inptr,
780 &insize,
781 &out2ptr, &out2size);
782 else
783 {
784 /* TO_CODESET is UTF-8. */
785 if (!(out2size >= insize))
786 abort ();
787 memcpy (out2ptr, inptr, insize);
788 out2ptr += insize;
789 out2size -= insize;
790 inptr += insize;
791 insize = 0;
792 res = 0;
793 }
794 length = out2ptr - result;
795 }
796# if !defined _LIBICONV_VERSION && !defined __GLIBC__
797 /* Irix iconv() inserts a NUL byte if it cannot convert.
798 NetBSD iconv() inserts a question mark if it cannot
799 convert.
800 Only GNU libiconv and GNU libc are known to prefer
801 to fail rather than doing a lossy conversion. */
802 if (res != (size_t)(-1) && res > 0)
803 {
804 errno = EILSEQ;
805 res = (size_t)(-1);
806 }
807# endif
808 if (res == (size_t)(-1))
809 {
810 /* Failure converting the ASCII replacement. */
811 if (result != initial_result)
812 {
813 int saved_errno = errno;
814 free (result);
815 errno = saved_errno;
816 }
817 return -1;
818 }
819 }
820 else
821 {
822 if (result != initial_result)
823 {
824 int saved_errno = errno;
825 free (result);
826 errno = saved_errno;
827 }
828 return -1;
829 }
830 }
831 if (!(in2size > 0
832 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
833 break;
834 if (grow)
835 {
836 char *memory;
837
838 allocated = 2 * allocated;
839 if (result == initial_result)
840 memory = (char *) malloc (allocated);
841 else
842 memory = (char *) realloc (result, allocated);
843 if (memory == NULL)
844 {
845 if (result != initial_result)
846 free (result);
847 errno = ENOMEM;
848 return -1;
849 }
850 if (result == initial_result)
851 memcpy (memory, initial_result, length);
852 result = memory;
853 }
854 }
855
856 /* Move the remaining bytes to the beginning of utf8buf. */
857 if (in2size > 0)
858 memmove (utf8buf, in2ptr, in2size);
859 utf8len = in2size;
860 }
861
862 if (res1 == (size_t)(-1))
863 {
864 if (errno1 == EINVAL)
865 in1size = 0;
866 else if (errno1 == EILSEQ)
867 {
868 if (result != initial_result)
869 free (result);
870 errno = errno1;
871 return -1;
872 }
873 }
874 }
875# undef utf8bufsize
876 }
877
878 done:
879 /* Now the final memory allocation. */
880 if (result == tmpbuf)
881 {
882 size_t memsize = length + extra_alloc;
883 char *memory;
884
885 memory = (char *) malloc (memsize > 0 ? memsize : 1);
886 if (memory != NULL)
887 {
888 memcpy (memory, tmpbuf, length);
889 result = memory;
890 }
891 else
892 {
893 errno = ENOMEM;
894 return -1;
895 }
896 }
897 else if (result != *resultp && length + extra_alloc < allocated)
898 {
899 /* Shrink the allocated memory if possible. */
900 size_t memsize = length + extra_alloc;
901 char *memory;
902
903 memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
904 if (memory != NULL)
905 result = memory;
906 }
907 *resultp = result;
908 *lengthp = length;
909 return 0;
910# undef tmpbuf
911# undef tmpbufsize
912}
913
914int
915mem_cd_iconveh (const char *src, size_t srclen,
916 iconv_t cd, iconv_t cd1, iconv_t cd2,
917 enum iconv_ilseq_handler handler,
918 size_t *offsets,
919 char **resultp, size_t *lengthp)
920{
921 return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
922 offsets, resultp, lengthp);
923}
924
925char *
926str_cd_iconveh (const char *src,
927 iconv_t cd, iconv_t cd1, iconv_t cd2,
928 enum iconv_ilseq_handler handler)
929{
930 /* For most encodings, a trailing NUL byte in the input will be converted
931 to a trailing NUL byte in the output. But not for UTF-7. So that this
932 function is usable for UTF-7, we have to exclude the NUL byte from the
933 conversion and add it by hand afterwards. */
934 char *result = NULL;
935 size_t length = 0;
936 int retval = mem_cd_iconveh_internal (src, strlen (src),
937 cd, cd1, cd2, handler, 1, NULL,
938 &result, &length);
939
940 if (retval < 0)
941 {
942 if (result != NULL)
943 {
944 int saved_errno = errno;
945 free (result);
946 errno = saved_errno;
947 }
948 return NULL;
949 }
950
951 /* Add the terminating NUL byte. */
952 result[length] = '\0';
953
954 return result;
955}
956
957#endif
958
959int
960mem_iconveh (const char *src, size_t srclen,
961 const char *from_codeset, const char *to_codeset,
962 enum iconv_ilseq_handler handler,
963 size_t *offsets,
964 char **resultp, size_t *lengthp)
965{
966 if (srclen == 0)
967 {
968 /* Nothing to convert. */
969 *lengthp = 0;
970 return 0;
971 }
972 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
973 {
974 char *result;
975
976 if (*resultp != NULL && *lengthp >= srclen)
977 result = *resultp;
978 else
979 {
980 result = (char *) malloc (srclen);
981 if (result == NULL)
982 {
983 errno = ENOMEM;
984 return -1;
985 }
986 }
987 memcpy (result, src, srclen);
988 *resultp = result;
989 *lengthp = srclen;
990 return 0;
991 }
992 else
993 {
994#if HAVE_ICONV
995 iconv_t cd;
996 iconv_t cd1;
997 iconv_t cd2;
998 char *result;
999 size_t length;
1000 int retval;
1001
1002 /* Avoid glibc-2.1 bug with EUC-KR. */
1003# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1004 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
1005 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
1006 {
1007 errno = EINVAL;
1008 return -1;
1009 }
1010# endif
1011
1012 cd = iconv_open (to_codeset, from_codeset);
1013
1014 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
1015 cd1 = (iconv_t)(-1);
1016 else
1017 {
1018 cd1 = iconv_open ("UTF-8", from_codeset);
1019 if (cd1 == (iconv_t)(-1))
1020 {
1021 int saved_errno = errno;
1022 if (cd != (iconv_t)(-1))
1023 iconv_close (cd);
1024 errno = saved_errno;
1025 return -1;
1026 }
1027 }
1028
1029 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
1030# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
1031 || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
1032# endif
1033 )
1034 cd2 = (iconv_t)(-1);
1035 else
1036 {
1037 cd2 = iconv_open (to_codeset, "UTF-8");
1038 if (cd2 == (iconv_t)(-1))
1039 {
1040 int saved_errno = errno;
1041 if (cd1 != (iconv_t)(-1))
1042 iconv_close (cd1);
1043 if (cd != (iconv_t)(-1))
1044 iconv_close (cd);
1045 errno = saved_errno;
1046 return -1;
1047 }
1048 }
1049
1050 result = *resultp;
1051 length = *lengthp;
1052 retval = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets,
1053 &result, &length);
1054
1055 if (retval < 0)
1056 {
1057 /* Close cd, cd1, cd2, but preserve the errno from str_cd_iconv. */
1058 int saved_errno = errno;
1059 if (cd2 != (iconv_t)(-1))
1060 iconv_close (cd2);
1061 if (cd1 != (iconv_t)(-1))
1062 iconv_close (cd1);
1063 if (cd != (iconv_t)(-1))
1064 iconv_close (cd);
1065 errno = saved_errno;
1066 }
1067 else
1068 {
1069 if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
1070 {
1071 /* Return -1, but free the allocated memory, and while doing
1072 that, preserve the errno from iconv_close. */
1073 int saved_errno = errno;
1074 if (cd1 != (iconv_t)(-1))
1075 iconv_close (cd1);
1076 if (cd != (iconv_t)(-1))
1077 iconv_close (cd);
1078 if (result != *resultp && result != NULL)
1079 free (result);
1080 errno = saved_errno;
1081 return -1;
1082 }
1083 if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
1084 {
1085 /* Return -1, but free the allocated memory, and while doing
1086 that, preserve the errno from iconv_close. */
1087 int saved_errno = errno;
1088 if (cd != (iconv_t)(-1))
1089 iconv_close (cd);
1090 if (result != *resultp && result != NULL)
1091 free (result);
1092 errno = saved_errno;
1093 return -1;
1094 }
1095 if (cd != (iconv_t)(-1) && iconv_close (cd) < 0)
1096 {
1097 /* Return -1, but free the allocated memory, and while doing
1098 that, preserve the errno from iconv_close. */
1099 int saved_errno = errno;
1100 if (result != *resultp && result != NULL)
1101 free (result);
1102 errno = saved_errno;
1103 return -1;
1104 }
1105 *resultp = result;
1106 *lengthp = length;
1107 }
1108 return retval;
1109#else
1110 /* This is a different error code than if iconv_open existed but didn't
1111 support from_codeset and to_codeset, so that the caller can emit
1112 an error message such as
1113 "iconv() is not supported. Installing GNU libiconv and
1114 then reinstalling this package would fix this." */
1115 errno = ENOSYS;
1116 return -1;
1117#endif
1118 }
1119}
1120
1121char *
1122str_iconveh (const char *src,
1123 const char *from_codeset, const char *to_codeset,
1124 enum iconv_ilseq_handler handler)
1125{
1126 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1127 {
1128 char *result = strdup (src);
1129
1130 if (result == NULL)
1131 errno = ENOMEM;
1132 return result;
1133 }
1134 else
1135 {
1136#if HAVE_ICONV
1137 iconv_t cd;
1138 iconv_t cd1;
1139 iconv_t cd2;
1140 char *result;
1141
1142 /* Avoid glibc-2.1 bug with EUC-KR. */
1143# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1144 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
1145 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
1146 {
1147 errno = EINVAL;
1148 return NULL;
1149 }
1150# endif
1151
1152 cd = iconv_open (to_codeset, from_codeset);
1153
1154 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
1155 cd1 = (iconv_t)(-1);
1156 else
1157 {
1158 cd1 = iconv_open ("UTF-8", from_codeset);
1159 if (cd1 == (iconv_t)(-1))
1160 {
1161 int saved_errno = errno;
1162 if (cd != (iconv_t)(-1))
1163 iconv_close (cd);
1164 errno = saved_errno;
1165 return NULL;
1166 }
1167 }
1168
1169 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
1170# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
1171 || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
1172# endif
1173 )
1174 cd2 = (iconv_t)(-1);
1175 else
1176 {
1177 cd2 = iconv_open (to_codeset, "UTF-8");
1178 if (cd2 == (iconv_t)(-1))
1179 {
1180 int saved_errno = errno;
1181 if (cd1 != (iconv_t)(-1))
1182 iconv_close (cd1);
1183 if (cd != (iconv_t)(-1))
1184 iconv_close (cd);
1185 errno = saved_errno;
1186 return NULL;
1187 }
1188 }
1189
1190 result = str_cd_iconveh (src, cd, cd1, cd2, handler);
1191
1192 if (result == NULL)
1193 {
1194 /* Close cd, cd1, cd2, but preserve the errno from str_cd_iconv. */
1195 int saved_errno = errno;
1196 if (cd2 != (iconv_t)(-1))
1197 iconv_close (cd2);
1198 if (cd1 != (iconv_t)(-1))
1199 iconv_close (cd1);
1200 if (cd != (iconv_t)(-1))
1201 iconv_close (cd);
1202 errno = saved_errno;
1203 }
1204 else
1205 {
1206 if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
1207 {
1208 /* Return NULL, but free the allocated memory, and while doing
1209 that, preserve the errno from iconv_close. */
1210 int saved_errno = errno;
1211 if (cd1 != (iconv_t)(-1))
1212 iconv_close (cd1);
1213 if (cd != (iconv_t)(-1))
1214 iconv_close (cd);
1215 free (result);
1216 errno = saved_errno;
1217 return NULL;
1218 }
1219 if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
1220 {
1221 /* Return NULL, but free the allocated memory, and while doing
1222 that, preserve the errno from iconv_close. */
1223 int saved_errno = errno;
1224 if (cd != (iconv_t)(-1))
1225 iconv_close (cd);
1226 free (result);
1227 errno = saved_errno;
1228 return NULL;
1229 }
1230 if (cd != (iconv_t)(-1) && iconv_close (cd) < 0)
1231 {
1232 /* Return NULL, but free the allocated memory, and while doing
1233 that, preserve the errno from iconv_close. */
1234 int saved_errno = errno;
1235 free (result);
1236 errno = saved_errno;
1237 return NULL;
1238 }
1239 }
1240 return result;
1241#else
1242 /* This is a different error code than if iconv_open existed but didn't
1243 support from_codeset and to_codeset, so that the caller can emit
1244 an error message such as
1245 "iconv() is not supported. Installing GNU libiconv and
1246 then reinstalling this package would fix this." */
1247 errno = ENOSYS;
1248 return NULL;
1249#endif
1250 }
1251}