Merge remote-tracking branch 'local-2.0/stable-2.0'
[bpt/guile.git] / lib / striconveh.c
CommitLineData
24d56127 1/* Character set conversion with error handling.
f0007cad 2 Copyright (C) 2001-2012 Free Software Foundation, Inc.
24d56127
LC
3 Written by Bruno Haible and Simon Josefsson.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17
18#include <config.h>
19
20/* Specification. */
21#include "striconveh.h"
22
23#include <errno.h>
24#include <stdbool.h>
25#include <stdlib.h>
26#include <string.h>
27
28#if HAVE_ICONV
29# include <iconv.h>
30# include "unistr.h"
31#endif
32
33#include "c-strcase.h"
34#include "c-strcaseeq.h"
35
36#ifndef SIZE_MAX
37# define SIZE_MAX ((size_t) -1)
38#endif
39
40
41#if HAVE_ICONV
42
8912421c
LC
43/* The caller must provide an iconveh_t, not just an iconv_t, because when a
44 conversion error occurs, we may have to determine the Unicode representation
45 of the inconvertible character. */
46
47int
48iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
49{
50 iconv_t cd;
51 iconv_t cd1;
52 iconv_t cd2;
53
54 /* Avoid glibc-2.1 bug with EUC-KR. */
0f00f2c3
LC
55# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
56 && !defined _LIBICONV_VERSION
8912421c
LC
57 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
58 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
59 {
60 errno = EINVAL;
61 return -1;
62 }
63# endif
64
65 cd = iconv_open (to_codeset, from_codeset);
66
67 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
68 cd1 = (iconv_t)(-1);
69 else
70 {
71 cd1 = iconv_open ("UTF-8", from_codeset);
72 if (cd1 == (iconv_t)(-1))
1cd4fffc
LC
73 {
74 int saved_errno = errno;
75 if (cd != (iconv_t)(-1))
76 iconv_close (cdp->cd);
77 errno = saved_errno;
78 return -1;
79 }
8912421c
LC
80 }
81
82 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
0f00f2c3
LC
83# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
84 && !defined __UCLIBC__) \
85 || _LIBICONV_VERSION >= 0x0105
8912421c
LC
86 || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
87# endif
88 )
89 cd2 = (iconv_t)(-1);
90 else
91 {
92 cd2 = iconv_open (to_codeset, "UTF-8");
93 if (cd2 == (iconv_t)(-1))
1cd4fffc
LC
94 {
95 int saved_errno = errno;
96 if (cd1 != (iconv_t)(-1))
97 iconv_close (cd1);
98 if (cd != (iconv_t)(-1))
99 iconv_close (cd);
100 errno = saved_errno;
101 return -1;
102 }
8912421c
LC
103 }
104
105 cdp->cd = cd;
106 cdp->cd1 = cd1;
107 cdp->cd2 = cd2;
108 return 0;
109}
110
111int
112iconveh_close (const iconveh_t *cd)
113{
114 if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
115 {
116 /* Return -1, but preserve the errno from iconv_close. */
117 int saved_errno = errno;
118 if (cd->cd1 != (iconv_t)(-1))
1cd4fffc 119 iconv_close (cd->cd1);
8912421c 120 if (cd->cd != (iconv_t)(-1))
1cd4fffc 121 iconv_close (cd->cd);
8912421c
LC
122 errno = saved_errno;
123 return -1;
124 }
125 if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
126 {
127 /* Return -1, but preserve the errno from iconv_close. */
128 int saved_errno = errno;
129 if (cd->cd != (iconv_t)(-1))
1cd4fffc 130 iconv_close (cd->cd);
8912421c
LC
131 errno = saved_errno;
132 return -1;
133 }
134 if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
135 return -1;
136 return 0;
137}
24d56127
LC
138
/* iconv_carefully behaves like iconv, but stops at the first conversion
   error and reports through *INCREMENTED whether the input pointers have
   already been moved past the location of that error.  */
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
/* Irix iconv() inserts a NUL byte if it cannot convert.
   NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv and GNU libc are known to prefer to fail rather
   than doing a lossy conversion.  Therefore the input is fed to iconv()
   in the smallest pieces it accepts, and a positive return value (a lossy
   conversion) is turned into an EILSEQ failure.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *cursor = *inbuf;
  const char *const limit = cursor + *inbytesleft;
  char *dst = *outbuf;
  size_t dst_room = *outbytesleft;
  const char *step_start;
  size_t rc;

  for (;;)
    {
      size_t chunk;

      step_start = cursor;
      rc = (size_t)(-1);

      /* Offer iconv() a growing number of bytes until it stops
         answering EINVAL.  iconv() updates both CURSOR and CHUNK.  */
      for (chunk = 1; cursor + chunk <= limit; chunk++)
        {
          rc = iconv (cd,
                      (ICONV_CONST char **) &cursor, &chunk,
                      &dst, &dst_room);
          if (rc != (size_t)(-1) || errno != EINVAL)
            break;
          /* iconv can eat up a shift sequence but give EINVAL while
             attempting to convert the first character.  E.g. libiconv
             does this.  Count that as a successful step.  */
          if (cursor > step_start)
            {
              rc = 0;
              break;
            }
        }

      if (rc != 0)
        break;

      /* Only a clean step is committed to the caller's output state.  */
      *outbuf = dst;
      *outbytesleft = dst_room;

      if (!(cursor < limit))
        break;
    }

  *inbuf = cursor;
  *inbytesleft = limit - cursor;

  if (rc == (size_t)(-1) || rc == 0)
    {
      *incremented = false;
      return rc;
    }

  /* iconv() has already incremented the input pointer.  We cannot go back
     to a previous position, otherwise the state inside CD would become
     invalid, if FROM_CODESET is a stateful encoding.  So, tell the caller
     that *INBUF has already been incremented.  */
  *incremented = (cursor > step_start);
  errno = EILSEQ;
  return (size_t)(-1);
}
# else
# define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
  (*(incremented) = false, \
   iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif
214
215/* iconv_carefully_1 is like iconv_carefully, except that it stops after
216 converting one character or one shift sequence. */
217static size_t
218iconv_carefully_1 (iconv_t cd,
1cd4fffc
LC
219 const char **inbuf, size_t *inbytesleft,
220 char **outbuf, size_t *outbytesleft,
221 bool *incremented)
24d56127
LC
222{
223 const char *inptr_before = *inbuf;
224 const char *inptr = inptr_before;
225 const char *inptr_end = inptr_before + *inbytesleft;
226 char *outptr = *outbuf;
227 size_t outsize = *outbytesleft;
228 size_t res = (size_t)(-1);
229 size_t insize;
230
231 for (insize = 1; inptr_before + insize <= inptr_end; insize++)
232 {
233 inptr = inptr_before;
234 res = iconv (cd,
1cd4fffc
LC
235 (ICONV_CONST char **) &inptr, &insize,
236 &outptr, &outsize);
24d56127 237 if (!(res == (size_t)(-1) && errno == EINVAL))
1cd4fffc 238 break;
24d56127 239 /* iconv can eat up a shift sequence but give EINVAL while attempting
1cd4fffc 240 to convert the first character. E.g. libiconv does this. */
24d56127 241 if (inptr > inptr_before)
1cd4fffc
LC
242 {
243 res = 0;
244 break;
245 }
24d56127
LC
246 }
247
248 *inbuf = inptr;
249 *inbytesleft = inptr_end - inptr;
0f00f2c3 250# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
24d56127
LC
251 /* Irix iconv() inserts a NUL byte if it cannot convert.
252 NetBSD iconv() inserts a question mark if it cannot convert.
253 Only GNU libiconv and GNU libc are known to prefer to fail rather
254 than doing a lossy conversion. */
255 if (res != (size_t)(-1) && res > 0)
256 {
257 /* iconv() has already incremented INPTR. We cannot go back to a
1cd4fffc
LC
258 previous INPTR, otherwise the state inside CD would become invalid,
259 if FROM_CODESET is a stateful encoding. So, tell the caller that
260 *INBUF has already been incremented. */
24d56127
LC
261 *incremented = (inptr > inptr_before);
262 errno = EILSEQ;
263 return (size_t)(-1);
264 }
265# endif
266
267 if (res != (size_t)(-1))
268 {
269 *outbuf = outptr;
270 *outbytesleft = outsize;
271 }
272 *incremented = false;
273 return res;
274}
275
276/* utf8conv_carefully is like iconv, except that
277 - it converts from UTF-8 to UTF-8,
278 - it stops as soon as it encounters a conversion error, and it returns
279 in *INCREMENTED a boolean telling whether it has incremented the input
280 pointers past the error location,
281 - if one_character_only is true, it stops after converting one
282 character. */
283static size_t
284utf8conv_carefully (bool one_character_only,
1cd4fffc
LC
285 const char **inbuf, size_t *inbytesleft,
286 char **outbuf, size_t *outbytesleft,
287 bool *incremented)
24d56127
LC
288{
289 const char *inptr = *inbuf;
290 size_t insize = *inbytesleft;
291 char *outptr = *outbuf;
292 size_t outsize = *outbytesleft;
293 size_t res;
294
295 res = 0;
296 do
297 {
298 ucs4_t uc;
299 int n;
300 int m;
301
302 n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
303 if (n < 0)
1cd4fffc
LC
304 {
305 errno = (n == -2 ? EINVAL : EILSEQ);
306 n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
307 inptr += n;
308 insize -= n;
309 res = (size_t)(-1);
310 *incremented = true;
311 break;
312 }
24d56127 313 if (outsize == 0)
1cd4fffc
LC
314 {
315 errno = E2BIG;
316 res = (size_t)(-1);
317 *incremented = false;
318 break;
319 }
24d56127
LC
320 m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
321 if (m == -2)
1cd4fffc
LC
322 {
323 errno = E2BIG;
324 res = (size_t)(-1);
325 *incremented = false;
326 break;
327 }
24d56127
LC
328 inptr += n;
329 insize -= n;
330 if (m == -1)
1cd4fffc
LC
331 {
332 errno = EILSEQ;
333 res = (size_t)(-1);
334 *incremented = true;
335 break;
336 }
24d56127
LC
337 outptr += m;
338 outsize -= m;
339 }
340 while (!one_character_only && insize > 0);
341
342 *inbuf = inptr;
343 *inbytesleft = insize;
344 *outbuf = outptr;
345 *outbytesleft = outsize;
346 return res;
347}
348
/* Convert the SRCLEN bytes starting at SRC, using three descriptors:
     CD   converts FROM_CODESET directly to TO_CODESET, or is (iconv_t)(-1),
     CD1  converts FROM_CODESET to UTF-8, or is (iconv_t)(-1) when
          FROM_CODESET is UTF-8,
     CD2  converts UTF-8 to TO_CODESET, or is (iconv_t)(-1) when TO_CODESET
          is UTF-8.
   The direct conversion through CD is tried first.  If CD is (iconv_t)(-1),
   or if an EILSEQ occurs that cannot be handled in place, the conversion is
   restarted from the beginning of SRC, going through UTF-8 with CD1 and CD2.
   HANDLER selects the reaction to an EILSEQ:
     - iconveh_error: fail with errno EILSEQ,
     - otherwise input that is invalid in FROM_CODESET is replaced by '?',
       and a character that cannot be converted to TO_CODESET is replaced by
       a \uXXXX or \UXXXXXXXX escape if HANDLER is iconveh_escape_sequence,
       else by '?'.
   EXTRA_ALLOC is a number of bytes kept free behind the converted output in
   the result buffer.
   OFFSETS, if not NULL, is an array of SRCLEN elements.  All elements are
   first set to (size_t)(-1); then, before each conversion step, the element
   at the current input position is set to the current output length, unless
   the output length has not changed since the last element that was set.
   On entry, *RESULTP and *LENGTHP may describe a buffer supplied by the
   caller (*RESULTP may be NULL).  On success, *RESULTP is the buffer that
   holds the output (the caller's buffer if it was large enough, otherwise
   memory obtained from malloc/realloc) and *LENGTHP its length, excluding
   EXTRA_ALLOC.
   Return 0 on success.  On failure, return -1 with errno set; memory
   allocated by this function has then been freed and *RESULTP and *LENGTHP
   have not been modified.  */
349static int
350mem_cd_iconveh_internal (const char *src, size_t srclen,
1cd4fffc
LC
351 iconv_t cd, iconv_t cd1, iconv_t cd2,
352 enum iconv_ilseq_handler handler,
353 size_t extra_alloc,
354 size_t *offsets,
355 char **resultp, size_t *lengthp)
24d56127
LC
356{
357 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
358 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
359 Instead, we have to start afresh from the beginning of SRC. */
360 /* Use a temporary buffer, so that for small strings, a single malloc()
361 call will be sufficient. */
362# define tmpbufsize 4096
363 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
364 libiconv's UCS-4-INTERNAL encoding. */
365 union { unsigned int align; char buf[tmpbufsize]; } tmp;
366# define tmpbuf tmp.buf
367
368 char *initial_result;
369 char *result;
370 size_t allocated;
371 size_t length;
372 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
373
/* Work in the caller's buffer only if it is at least as large as tmpbuf;
   otherwise start in the on-stack tmpbuf and move to the heap on demand. */
374 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
375 {
376 initial_result = *resultp;
377 allocated = *lengthp;
378 }
379 else
380 {
381 initial_result = tmpbuf;
382 allocated = sizeof (tmpbuf);
383 }
384 result = initial_result;
385
386 /* Test whether a direct conversion is possible at all. */
387 if (cd == (iconv_t)(-1))
388 goto indirectly;
389
390 if (offsets != NULL)
391 {
392 size_t i;
393
394 for (i = 0; i < srclen; i++)
1cd4fffc 395 offsets[i] = (size_t)(-1);
24d56127
LC
396
397 last_length = (size_t)(-1);
398 }
399 length = 0;
400
401 /* First, try a direct conversion, and see whether a conversion error
402 occurs at all. */
403 {
404 const char *inptr = src;
405 size_t insize = srclen;
406
407 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
408# if defined _LIBICONV_VERSION \
0f00f2c3
LC
409 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
410 || defined __sun)
24d56127
LC
411 /* Set to the initial state. */
412 iconv (cd, NULL, NULL, NULL, NULL);
413# endif
414
415 while (insize > 0)
416 {
1cd4fffc
LC
417 char *outptr = result + length;
418 size_t outsize = allocated - extra_alloc - length;
419 bool incremented;
420 size_t res;
421 bool grow;
422
423 if (offsets != NULL)
424 {
425 if (length != last_length) /* ensure that offset[] be increasing */
426 {
427 offsets[inptr - src] = length;
428 last_length = length;
429 }
430 res = iconv_carefully_1 (cd,
431 &inptr, &insize,
432 &outptr, &outsize,
433 &incremented);
434 }
435 else
436 /* Use iconv_carefully instead of iconv here, because:
437 - If TO_CODESET is UTF-8, we can do the error handling in this
438 loop, no need for a second loop,
439 - With iconv() implementations other than GNU libiconv and GNU
440 libc, if we use iconv() in a big swoop, checking for an E2BIG
441 return, we lose the number of irreversible conversions. */
442 res = iconv_carefully (cd,
443 &inptr, &insize,
444 &outptr, &outsize,
445 &incremented);
446
447 length = outptr - result;
/* Grow pre-emptively once more than half of the buffer is in use. */
448 grow = (length + extra_alloc > allocated / 2);
449 if (res == (size_t)(-1))
450 {
451 if (errno == E2BIG)
452 grow = true;
453 else if (errno == EINVAL)
454 break;
455 else if (errno == EILSEQ && handler != iconveh_error)
456 {
457 if (cd2 == (iconv_t)(-1))
458 {
459 /* TO_CODESET is UTF-8. */
460 /* Error handling can produce up to 1 byte of output. */
461 if (length + 1 + extra_alloc > allocated)
462 {
463 char *memory;
464
465 allocated = 2 * allocated;
466 if (length + 1 + extra_alloc > allocated)
467 abort ();
468 if (result == initial_result)
469 memory = (char *) malloc (allocated);
470 else
471 memory = (char *) realloc (result, allocated);
472 if (memory == NULL)
473 {
474 if (result != initial_result)
475 free (result);
476 errno = ENOMEM;
477 return -1;
478 }
479 if (result == initial_result)
480 memcpy (memory, initial_result, length);
481 result = memory;
482 grow = false;
483 }
484 /* The input is invalid in FROM_CODESET. Eat up one byte
485 and emit a question mark. */
486 if (!incremented)
487 {
488 if (insize == 0)
489 abort ();
490 inptr++;
491 insize--;
492 }
493 result[length] = '?';
494 length++;
495 }
496 else
497 goto indirectly;
498 }
499 else
500 {
501 if (result != initial_result)
502 {
503 int saved_errno = errno;
504 free (result);
505 errno = saved_errno;
506 }
507 return -1;
508 }
509 }
510 if (insize == 0)
511 break;
512 if (grow)
513 {
514 char *memory;
515
516 allocated = 2 * allocated;
517 if (result == initial_result)
518 memory = (char *) malloc (allocated);
519 else
520 memory = (char *) realloc (result, allocated);
521 if (memory == NULL)
522 {
523 if (result != initial_result)
524 free (result);
525 errno = ENOMEM;
526 return -1;
527 }
528 if (result == initial_result)
529 memcpy (memory, initial_result, length);
530 result = memory;
531 }
24d56127
LC
532 }
533 }
534
535 /* Now get the conversion state back to the initial state.
536 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
537#if defined _LIBICONV_VERSION \
0f00f2c3
LC
538 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
539 || defined __sun)
24d56127
LC
540 for (;;)
541 {
542 char *outptr = result + length;
543 size_t outsize = allocated - extra_alloc - length;
544 size_t res;
545
546 res = iconv (cd, NULL, NULL, &outptr, &outsize);
547 length = outptr - result;
548 if (res == (size_t)(-1))
1cd4fffc
LC
549 {
550 if (errno == E2BIG)
551 {
552 char *memory;
553
554 allocated = 2 * allocated;
555 if (result == initial_result)
556 memory = (char *) malloc (allocated);
557 else
558 memory = (char *) realloc (result, allocated);
559 if (memory == NULL)
560 {
561 if (result != initial_result)
562 free (result);
563 errno = ENOMEM;
564 return -1;
565 }
566 if (result == initial_result)
567 memcpy (memory, initial_result, length);
568 result = memory;
569 }
570 else
571 {
572 if (result != initial_result)
573 {
574 int saved_errno = errno;
575 free (result);
576 errno = saved_errno;
577 }
578 return -1;
579 }
580 }
24d56127 581 else
1cd4fffc 582 break;
24d56127
LC
583 }
584#endif
585
586 /* The direct conversion succeeded. */
587 goto done;
588
589 indirectly:
590 /* The direct conversion failed.
591 Use a conversion through UTF-8. */
592 if (offsets != NULL)
593 {
594 size_t i;
595
596 for (i = 0; i < srclen; i++)
1cd4fffc 597 offsets[i] = (size_t)(-1);
24d56127
LC
598
599 last_length = (size_t)(-1);
600 }
601 length = 0;
602 {
603 const bool slowly = (offsets != NULL || handler == iconveh_error);
604# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
605 char utf8buf[utf8bufsize + 1];
606 size_t utf8len = 0;
607 const char *in1ptr = src;
608 size_t in1size = srclen;
609 bool do_final_flush1 = true;
610 bool do_final_flush2 = true;
611
612 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
613# if defined _LIBICONV_VERSION \
0f00f2c3
LC
614 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
615 || defined __sun)
24d56127
LC
616 /* Set to the initial state. */
617 if (cd1 != (iconv_t)(-1))
618 iconv (cd1, NULL, NULL, NULL, NULL);
619 if (cd2 != (iconv_t)(-1))
620 iconv (cd2, NULL, NULL, NULL, NULL);
621# endif
622
623 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
624 {
1cd4fffc
LC
625 char *out1ptr = utf8buf + utf8len;
626 size_t out1size = utf8bufsize - utf8len;
627 bool incremented1;
628 size_t res1;
629 int errno1;
630
631 /* Conversion step 1: from FROM_CODESET to UTF-8. */
632 if (in1size > 0)
633 {
634 if (offsets != NULL
635 && length != last_length) /* ensure that offset[] be increasing */
636 {
637 offsets[in1ptr - src] = length;
638 last_length = length;
639 }
640 if (cd1 != (iconv_t)(-1))
641 {
642 if (slowly)
643 res1 = iconv_carefully_1 (cd1,
644 &in1ptr, &in1size,
645 &out1ptr, &out1size,
646 &incremented1);
647 else
648 res1 = iconv_carefully (cd1,
649 &in1ptr, &in1size,
650 &out1ptr, &out1size,
651 &incremented1);
652 }
653 else
654 {
655 /* FROM_CODESET is UTF-8. */
656 res1 = utf8conv_carefully (slowly,
657 &in1ptr, &in1size,
658 &out1ptr, &out1size,
659 &incremented1);
660 }
661 }
662 else if (do_final_flush1)
663 {
664 /* Now get the conversion state of CD1 back to the initial state.
665 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
24d56127 666# if defined _LIBICONV_VERSION \
0f00f2c3
LC
667 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
668 || defined __sun)
1cd4fffc
LC
669 if (cd1 != (iconv_t)(-1))
670 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
671 else
24d56127 672# endif
1cd4fffc
LC
673 res1 = 0;
674 do_final_flush1 = false;
675 incremented1 = true;
676 }
677 else
678 {
679 res1 = 0;
680 incremented1 = true;
681 }
682 if (res1 == (size_t)(-1)
683 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
684 {
685 if (result != initial_result)
686 {
687 int saved_errno = errno;
688 free (result);
689 errno = saved_errno;
690 }
691 return -1;
692 }
693 if (res1 == (size_t)(-1)
694 && errno == EILSEQ && handler != iconveh_error)
695 {
696 /* The input is invalid in FROM_CODESET. Eat up one byte and
697 emit a question mark. Room for the question mark was allocated
698 at the end of utf8buf. */
699 if (!incremented1)
700 {
701 if (in1size == 0)
702 abort ();
703 in1ptr++;
704 in1size--;
705 }
706 *out1ptr++ = '?';
707 res1 = 0;
708 }
709 errno1 = errno;
710 utf8len = out1ptr - utf8buf;
711
/* Run step 2 when offsets are being recorded, when the input is exhausted,
   when utf8buf is more than half full, or when step 1 ran out of room. */
712 if (offsets != NULL
713 || in1size == 0
714 || utf8len > utf8bufsize / 2
715 || (res1 == (size_t)(-1) && errno1 == E2BIG))
716 {
717 /* Conversion step 2: from UTF-8 to TO_CODESET. */
718 const char *in2ptr = utf8buf;
719 size_t in2size = utf8len;
720
721 while (in2size > 0
722 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
723 {
724 char *out2ptr = result + length;
725 size_t out2size = allocated - extra_alloc - length;
726 bool incremented2;
727 size_t res2;
728 bool grow;
729
730 if (in2size > 0)
731 {
732 if (cd2 != (iconv_t)(-1))
733 res2 = iconv_carefully (cd2,
734 &in2ptr, &in2size,
735 &out2ptr, &out2size,
736 &incremented2);
737 else
738 /* TO_CODESET is UTF-8. */
739 res2 = utf8conv_carefully (false,
740 &in2ptr, &in2size,
741 &out2ptr, &out2size,
742 &incremented2);
743 }
744 else /* in1size == 0 && !do_final_flush1
745 && in2size == 0 && do_final_flush2 */
746 {
747 /* Now get the conversion state of CD2 back to the initial
748 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
24d56127 749# if defined _LIBICONV_VERSION \
0f00f2c3
LC
750 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
751 || defined __sun)
1cd4fffc
LC
752 if (cd2 != (iconv_t)(-1))
753 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
754 else
24d56127 755# endif
1cd4fffc
LC
756 res2 = 0;
757 do_final_flush2 = false;
758 incremented2 = true;
759 }
760
761 length = out2ptr - result;
762 grow = (length + extra_alloc > allocated / 2);
763 if (res2 == (size_t)(-1))
764 {
765 if (errno == E2BIG)
766 grow = true;
767 else if (errno == EINVAL)
768 break;
769 else if (errno == EILSEQ && handler != iconveh_error)
770 {
771 /* Error handling can produce up to 10 bytes of ASCII
772 output. But TO_CODESET may be UCS-2, UTF-16 or
773 UCS-4, so use CD2 here as well. */
774 char scratchbuf[10];
775 size_t scratchlen;
776 ucs4_t uc;
777 const char *inptr;
778 size_t insize;
779 size_t res;
780
781 if (incremented2)
782 {
783 if (u8_prev (&uc, (const uint8_t *) in2ptr,
784 (const uint8_t *) utf8buf)
785 == NULL)
786 abort ();
787 }
788 else
789 {
790 int n;
791 if (in2size == 0)
792 abort ();
793 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
794 in2size);
795 in2ptr += n;
796 in2size -= n;
797 }
798
799 if (handler == iconveh_escape_sequence)
800 {
801 static char hex[16] = "0123456789ABCDEF";
802 scratchlen = 0;
803 scratchbuf[scratchlen++] = '\\';
804 if (uc < 0x10000)
805 scratchbuf[scratchlen++] = 'u';
806 else
807 {
808 scratchbuf[scratchlen++] = 'U';
809 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
810 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
811 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
812 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
813 }
814 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
815 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
816 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
817 scratchbuf[scratchlen++] = hex[uc & 15];
818 }
819 else
820 {
821 scratchbuf[0] = '?';
822 scratchlen = 1;
823 }
824
825 inptr = scratchbuf;
826 insize = scratchlen;
827 if (cd2 != (iconv_t)(-1))
828 res = iconv (cd2,
829 (ICONV_CONST char **) &inptr, &insize,
830 &out2ptr, &out2size);
831 else
832 {
833 /* TO_CODESET is UTF-8. */
834 if (out2size >= insize)
835 {
836 memcpy (out2ptr, inptr, insize);
837 out2ptr += insize;
838 out2size -= insize;
839 inptr += insize;
840 insize = 0;
841 res = 0;
842 }
843 else
844 {
845 errno = E2BIG;
846 res = (size_t)(-1);
847 }
848 }
849 length = out2ptr - result;
850 if (res == (size_t)(-1) && errno == E2BIG)
851 {
852 char *memory;
853
854 allocated = 2 * allocated;
855 if (length + 1 + extra_alloc > allocated)
856 abort ();
857 if (result == initial_result)
858 memory = (char *) malloc (allocated);
859 else
860 memory = (char *) realloc (result, allocated);
861 if (memory == NULL)
862 {
863 if (result != initial_result)
864 free (result);
865 errno = ENOMEM;
866 return -1;
867 }
868 if (result == initial_result)
869 memcpy (memory, initial_result, length);
870 result = memory;
871 grow = false;
872
873 out2ptr = result + length;
874 out2size = allocated - extra_alloc - length;
875 if (cd2 != (iconv_t)(-1))
876 res = iconv (cd2,
877 (ICONV_CONST char **) &inptr,
878 &insize,
879 &out2ptr, &out2size);
880 else
881 {
882 /* TO_CODESET is UTF-8. */
883 if (!(out2size >= insize))
884 abort ();
885 memcpy (out2ptr, inptr, insize);
886 out2ptr += insize;
887 out2size -= insize;
888 inptr += insize;
889 insize = 0;
890 res = 0;
891 }
892 length = out2ptr - result;
893 }
0f00f2c3 894# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
1cd4fffc
LC
895 /* Irix iconv() inserts a NUL byte if it cannot convert.
896 NetBSD iconv() inserts a question mark if it cannot
897 convert.
898 Only GNU libiconv and GNU libc are known to prefer
899 to fail rather than doing a lossy conversion. */
900 if (res != (size_t)(-1) && res > 0)
901 {
902 errno = EILSEQ;
903 res = (size_t)(-1);
904 }
24d56127 905# endif
1cd4fffc
LC
906 if (res == (size_t)(-1))
907 {
908 /* Failure converting the ASCII replacement. */
909 if (result != initial_result)
910 {
911 int saved_errno = errno;
912 free (result);
913 errno = saved_errno;
914 }
915 return -1;
916 }
917 }
918 else
919 {
920 if (result != initial_result)
921 {
922 int saved_errno = errno;
923 free (result);
924 errno = saved_errno;
925 }
926 return -1;
927 }
928 }
929 if (!(in2size > 0
930 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
931 break;
932 if (grow)
933 {
934 char *memory;
935
936 allocated = 2 * allocated;
937 if (result == initial_result)
938 memory = (char *) malloc (allocated);
939 else
940 memory = (char *) realloc (result, allocated);
941 if (memory == NULL)
942 {
943 if (result != initial_result)
944 free (result);
945 errno = ENOMEM;
946 return -1;
947 }
948 if (result == initial_result)
949 memcpy (memory, initial_result, length);
950 result = memory;
951 }
952 }
953
954 /* Move the remaining bytes to the beginning of utf8buf. */
955 if (in2size > 0)
956 memmove (utf8buf, in2ptr, in2size);
957 utf8len = in2size;
958 }
959
/* Act on the step 1 error saved in errno1 only now, after the UTF-8 text
   produced before it has been offered to step 2: EINVAL ends the input,
   EILSEQ (still pending only when it was not replaced by '?') fails. */
960 if (res1 == (size_t)(-1))
961 {
962 if (errno1 == EINVAL)
963 in1size = 0;
964 else if (errno1 == EILSEQ)
965 {
966 if (result != initial_result)
967 free (result);
968 errno = errno1;
969 return -1;
970 }
971 }
24d56127
LC
972 }
973# undef utf8bufsize
974 }
975
976 done:
977 /* Now the final memory allocation. */
/* If the output still lives in the on-stack tmpbuf, copy it into the
   caller's buffer when that is big enough, else into a fresh heap block. */
978 if (result == tmpbuf)
979 {
980 size_t memsize = length + extra_alloc;
24d56127 981
9157d901
LC
982 if (*resultp != NULL && *lengthp >= memsize)
983 result = *resultp;
24d56127 984 else
1cd4fffc 985 {
9157d901
LC
986 char *memory;
987
988 memory = (char *) malloc (memsize > 0 ? memsize : 1);
989 if (memory != NULL)
990 result = memory;
991 else
992 {
993 errno = ENOMEM;
994 return -1;
995 }
24d56127 996 }
9157d901 997 memcpy (result, tmpbuf, length);
24d56127
LC
998 }
999 else if (result != *resultp && length + extra_alloc < allocated)
1000 {
1001 /* Shrink the allocated memory if possible. */
1002 size_t memsize = length + extra_alloc;
1003 char *memory;
1004
1005 memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1006 if (memory != NULL)
1cd4fffc 1007 result = memory;
24d56127
LC
1008 }
1009 *resultp = result;
1010 *lengthp = length;
1011 return 0;
1012# undef tmpbuf
1013# undef tmpbufsize
1014}
1015
1016int
1017mem_cd_iconveh (const char *src, size_t srclen,
1cd4fffc
LC
1018 const iconveh_t *cd,
1019 enum iconv_ilseq_handler handler,
1020 size_t *offsets,
1021 char **resultp, size_t *lengthp)
24d56127 1022{
8912421c 1023 return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1cd4fffc 1024 handler, 0, offsets, resultp, lengthp);
24d56127
LC
1025}
1026
1027char *
1028str_cd_iconveh (const char *src,
1cd4fffc
LC
1029 const iconveh_t *cd,
1030 enum iconv_ilseq_handler handler)
24d56127
LC
1031{
1032 /* For most encodings, a trailing NUL byte in the input will be converted
1033 to a trailing NUL byte in the output. But not for UTF-7. So that this
1034 function is usable for UTF-7, we have to exclude the NUL byte from the
1035 conversion and add it by hand afterwards. */
1036 char *result = NULL;
1037 size_t length = 0;
1038 int retval = mem_cd_iconveh_internal (src, strlen (src),
1cd4fffc
LC
1039 cd->cd, cd->cd1, cd->cd2, handler, 1,
1040 NULL, &result, &length);
24d56127
LC
1041
1042 if (retval < 0)
1043 {
1044 if (result != NULL)
1cd4fffc
LC
1045 {
1046 int saved_errno = errno;
1047 free (result);
1048 errno = saved_errno;
1049 }
24d56127
LC
1050 return NULL;
1051 }
1052
1053 /* Add the terminating NUL byte. */
1054 result[length] = '\0';
1055
1056 return result;
1057}
1058
1059#endif
1060
1061int
1062mem_iconveh (const char *src, size_t srclen,
1cd4fffc
LC
1063 const char *from_codeset, const char *to_codeset,
1064 enum iconv_ilseq_handler handler,
1065 size_t *offsets,
1066 char **resultp, size_t *lengthp)
24d56127
LC
1067{
1068 if (srclen == 0)
1069 {
1070 /* Nothing to convert. */
1071 *lengthp = 0;
1072 return 0;
1073 }
1074 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1075 {
1076 char *result;
1077
1078 if (*resultp != NULL && *lengthp >= srclen)
1cd4fffc 1079 result = *resultp;
24d56127 1080 else
1cd4fffc
LC
1081 {
1082 result = (char *) malloc (srclen);
1083 if (result == NULL)
1084 {
1085 errno = ENOMEM;
1086 return -1;
1087 }
1088 }
24d56127
LC
1089 memcpy (result, src, srclen);
1090 *resultp = result;
1091 *lengthp = srclen;
1092 return 0;
1093 }
1094 else
1095 {
1096#if HAVE_ICONV
8912421c 1097 iconveh_t cd;
24d56127
LC
1098 char *result;
1099 size_t length;
1100 int retval;
1101
8912421c 1102 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1cd4fffc 1103 return -1;
24d56127
LC
1104
1105 result = *resultp;
1106 length = *lengthp;
8912421c 1107 retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1cd4fffc 1108 &result, &length);
24d56127
LC
1109
1110 if (retval < 0)
1cd4fffc
LC
1111 {
1112 /* Close cd, but preserve the errno from str_cd_iconv. */
1113 int saved_errno = errno;
1114 iconveh_close (&cd);
1115 errno = saved_errno;
1116 }
24d56127 1117 else
1cd4fffc
LC
1118 {
1119 if (iconveh_close (&cd) < 0)
1120 {
1121 /* Return -1, but free the allocated memory, and while doing
1122 that, preserve the errno from iconveh_close. */
1123 int saved_errno = errno;
1124 if (result != *resultp && result != NULL)
1125 free (result);
1126 errno = saved_errno;
1127 return -1;
1128 }
1129 *resultp = result;
1130 *lengthp = length;
1131 }
24d56127
LC
1132 return retval;
1133#else
1134 /* This is a different error code than if iconv_open existed but didn't
1cd4fffc
LC
1135 support from_codeset and to_codeset, so that the caller can emit
1136 an error message such as
1137 "iconv() is not supported. Installing GNU libiconv and
1138 then reinstalling this package would fix this." */
24d56127
LC
1139 errno = ENOSYS;
1140 return -1;
1141#endif
1142 }
1143}
1144
1145char *
1146str_iconveh (const char *src,
1cd4fffc
LC
1147 const char *from_codeset, const char *to_codeset,
1148 enum iconv_ilseq_handler handler)
24d56127
LC
1149{
1150 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1151 {
1152 char *result = strdup (src);
1153
1154 if (result == NULL)
1cd4fffc 1155 errno = ENOMEM;
24d56127
LC
1156 return result;
1157 }
1158 else
1159 {
1160#if HAVE_ICONV
8912421c 1161 iconveh_t cd;
24d56127
LC
1162 char *result;
1163
8912421c 1164 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1cd4fffc 1165 return NULL;
24d56127 1166
8912421c 1167 result = str_cd_iconveh (src, &cd, handler);
24d56127
LC
1168
1169 if (result == NULL)
1cd4fffc
LC
1170 {
1171 /* Close cd, but preserve the errno from str_cd_iconv. */
1172 int saved_errno = errno;
1173 iconveh_close (&cd);
1174 errno = saved_errno;
1175 }
24d56127 1176 else
1cd4fffc
LC
1177 {
1178 if (iconveh_close (&cd) < 0)
1179 {
1180 /* Return NULL, but free the allocated memory, and while doing
1181 that, preserve the errno from iconveh_close. */
1182 int saved_errno = errno;
1183 free (result);
1184 errno = saved_errno;
1185 return NULL;
1186 }
1187 }
24d56127
LC
1188 return result;
1189#else
1190 /* This is a different error code than if iconv_open existed but didn't
1cd4fffc
LC
1191 support from_codeset and to_codeset, so that the caller can emit
1192 an error message such as
1193 "iconv() is not supported. Installing GNU libiconv and
1194 then reinstalling this package would fix this." */
24d56127
LC
1195 errno = ENOSYS;
1196 return NULL;
1197#endif
1198 }
1199}