Use Gnulib's `warning' module.
[bpt/guile.git] / lib / striconveh.c
CommitLineData
24d56127 1/* Character set conversion with error handling.
8912421c 2 Copyright (C) 2001-2009 Free Software Foundation, Inc.
24d56127
LC
3 Written by Bruno Haible and Simon Josefsson.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17
18#include <config.h>
19
20/* Specification. */
21#include "striconveh.h"
22
23#include <errno.h>
24#include <stdbool.h>
25#include <stdlib.h>
26#include <string.h>
27
28#if HAVE_ICONV
29# include <iconv.h>
30# include "unistr.h"
31#endif
32
33#include "c-strcase.h"
34#include "c-strcaseeq.h"
35
36#ifndef SIZE_MAX
37# define SIZE_MAX ((size_t) -1)
38#endif
39
40
41#if HAVE_ICONV
42
8912421c
LC
43/* The caller must provide an iconveh_t, not just an iconv_t, because when a
44 conversion error occurs, we may have to determine the Unicode representation
45 of the inconvertible character. */
46
47int
48iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
49{
50 iconv_t cd;
51 iconv_t cd1;
52 iconv_t cd2;
53
54 /* Avoid glibc-2.1 bug with EUC-KR. */
55# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
56 if (c_strcasecmp (from_codeset, "EUC-KR") == 0
57 || c_strcasecmp (to_codeset, "EUC-KR") == 0)
58 {
59 errno = EINVAL;
60 return -1;
61 }
62# endif
63
64 cd = iconv_open (to_codeset, from_codeset);
65
66 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
67 cd1 = (iconv_t)(-1);
68 else
69 {
70 cd1 = iconv_open ("UTF-8", from_codeset);
71 if (cd1 == (iconv_t)(-1))
72 {
73 int saved_errno = errno;
74 if (cd != (iconv_t)(-1))
75 iconv_close (cdp->cd);
76 errno = saved_errno;
77 return -1;
78 }
79 }
80
81 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
82# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
83 || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
84# endif
85 )
86 cd2 = (iconv_t)(-1);
87 else
88 {
89 cd2 = iconv_open (to_codeset, "UTF-8");
90 if (cd2 == (iconv_t)(-1))
91 {
92 int saved_errno = errno;
93 if (cd1 != (iconv_t)(-1))
94 iconv_close (cd1);
95 if (cd != (iconv_t)(-1))
96 iconv_close (cd);
97 errno = saved_errno;
98 return -1;
99 }
100 }
101
102 cdp->cd = cd;
103 cdp->cd1 = cd1;
104 cdp->cd2 = cd2;
105 return 0;
106}
107
108int
109iconveh_close (const iconveh_t *cd)
110{
111 if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
112 {
113 /* Return -1, but preserve the errno from iconv_close. */
114 int saved_errno = errno;
115 if (cd->cd1 != (iconv_t)(-1))
116 iconv_close (cd->cd1);
117 if (cd->cd != (iconv_t)(-1))
118 iconv_close (cd->cd);
119 errno = saved_errno;
120 return -1;
121 }
122 if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
123 {
124 /* Return -1, but preserve the errno from iconv_close. */
125 int saved_errno = errno;
126 if (cd->cd != (iconv_t)(-1))
127 iconv_close (cd->cd);
128 errno = saved_errno;
129 return -1;
130 }
131 if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
132 return -1;
133 return 0;
134}
24d56127
LC
135
/* iconv_carefully behaves like iconv, but stops at the first conversion
   error.  *INCREMENTED tells the caller whether the input pointer has
   already been moved past the location of the error.  */
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
/* Irix iconv() inserts a NUL byte if it cannot convert.
   NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv and GNU libc are known to prefer to fail rather
   than doing a lossy conversion.  Hence, on other implementations, feed
   iconv() small pieces and treat a positive return value (a lossy
   conversion) as EILSEQ.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *in = *inbuf;
  const char *const in_end = in + *inbytesleft;
  char *out = *outbuf;
  size_t out_avail = *outbytesleft;
  const char *piece_start;
  size_t rc;

  for (;;)
    {
      size_t take = 1;

      piece_start = in;
      rc = (size_t)(-1);

      /* Offer 1, 2, 3, ... bytes until iconv stops reporting EINVAL.  */
      while (in + take <= in_end)
        {
          rc = iconv (cd,
                      (ICONV_CONST char **) &in, &take,
                      &out, &out_avail);
          if (rc != (size_t)(-1) || errno != EINVAL)
            break;
          /* iconv can eat up a shift sequence but give EINVAL while
             attempting to convert the first character.  E.g. libiconv
             does this.  Count that as progress.  */
          if (in > piece_start)
            {
              rc = 0;
              break;
            }
          take++;
        }

      if (rc != 0)
        break;

      /* Commit the output of this piece only when it converted cleanly.  */
      *outbuf = out;
      *outbytesleft = out_avail;

      if (in >= in_end)
        break;
    }

  *inbuf = in;
  *inbytesleft = in_end - in;

  if (rc == (size_t)(-1) || rc == 0)
    {
      *incremented = false;
      return rc;
    }

  /* A lossy conversion happened.  iconv() has already incremented the input
     pointer, and we cannot go back: the state inside CD would become
     invalid if FROM_CODESET is a stateful encoding.  So, tell the caller
     that *INBUF has already been incremented.  */
  *incremented = (in > piece_start);
  errno = EILSEQ;
  return (size_t)(-1);
}
# else
/* This branch is a plain iconv() call that always reports
   *INCREMENTED = false.  */
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     (*(incremented) = false, \
      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif
211
212/* iconv_carefully_1 is like iconv_carefully, except that it stops after
213 converting one character or one shift sequence. */
214static size_t
215iconv_carefully_1 (iconv_t cd,
216 const char **inbuf, size_t *inbytesleft,
217 char **outbuf, size_t *outbytesleft,
218 bool *incremented)
219{
220 const char *inptr_before = *inbuf;
221 const char *inptr = inptr_before;
222 const char *inptr_end = inptr_before + *inbytesleft;
223 char *outptr = *outbuf;
224 size_t outsize = *outbytesleft;
225 size_t res = (size_t)(-1);
226 size_t insize;
227
228 for (insize = 1; inptr_before + insize <= inptr_end; insize++)
229 {
230 inptr = inptr_before;
231 res = iconv (cd,
232 (ICONV_CONST char **) &inptr, &insize,
233 &outptr, &outsize);
234 if (!(res == (size_t)(-1) && errno == EINVAL))
235 break;
236 /* iconv can eat up a shift sequence but give EINVAL while attempting
237 to convert the first character. E.g. libiconv does this. */
238 if (inptr > inptr_before)
239 {
240 res = 0;
241 break;
242 }
243 }
244
245 *inbuf = inptr;
246 *inbytesleft = inptr_end - inptr;
247# if !defined _LIBICONV_VERSION && !defined __GLIBC__
248 /* Irix iconv() inserts a NUL byte if it cannot convert.
249 NetBSD iconv() inserts a question mark if it cannot convert.
250 Only GNU libiconv and GNU libc are known to prefer to fail rather
251 than doing a lossy conversion. */
252 if (res != (size_t)(-1) && res > 0)
253 {
254 /* iconv() has already incremented INPTR. We cannot go back to a
255 previous INPTR, otherwise the state inside CD would become invalid,
256 if FROM_CODESET is a stateful encoding. So, tell the caller that
257 *INBUF has already been incremented. */
258 *incremented = (inptr > inptr_before);
259 errno = EILSEQ;
260 return (size_t)(-1);
261 }
262# endif
263
264 if (res != (size_t)(-1))
265 {
266 *outbuf = outptr;
267 *outbytesleft = outsize;
268 }
269 *incremented = false;
270 return res;
271}
272
273/* utf8conv_carefully is like iconv, except that
274 - it converts from UTF-8 to UTF-8,
275 - it stops as soon as it encounters a conversion error, and it returns
276 in *INCREMENTED a boolean telling whether it has incremented the input
277 pointers past the error location,
278 - if one_character_only is true, it stops after converting one
279 character. */
280static size_t
281utf8conv_carefully (bool one_character_only,
282 const char **inbuf, size_t *inbytesleft,
283 char **outbuf, size_t *outbytesleft,
284 bool *incremented)
285{
286 const char *inptr = *inbuf;
287 size_t insize = *inbytesleft;
288 char *outptr = *outbuf;
289 size_t outsize = *outbytesleft;
290 size_t res;
291
292 res = 0;
293 do
294 {
295 ucs4_t uc;
296 int n;
297 int m;
298
299 n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
300 if (n < 0)
301 {
302 errno = (n == -2 ? EINVAL : EILSEQ);
303 n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
304 inptr += n;
305 insize -= n;
306 res = (size_t)(-1);
307 *incremented = true;
308 break;
309 }
310 if (outsize == 0)
311 {
312 errno = E2BIG;
313 res = (size_t)(-1);
314 *incremented = false;
315 break;
316 }
317 m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
318 if (m == -2)
319 {
320 errno = E2BIG;
321 res = (size_t)(-1);
322 *incremented = false;
323 break;
324 }
325 inptr += n;
326 insize -= n;
327 if (m == -1)
328 {
329 errno = EILSEQ;
330 res = (size_t)(-1);
331 *incremented = true;
332 break;
333 }
334 outptr += m;
335 outsize -= m;
336 }
337 while (!one_character_only && insize > 0);
338
339 *inbuf = inptr;
340 *inbytesleft = insize;
341 *outbuf = outptr;
342 *outbytesleft = outsize;
343 return res;
344}
345
/* Convert SRCLEN bytes at SRC and deliver the result through *RESULTP and
   *LENGTHP.
   CD is tried first as a direct converter.  If it is (iconv_t)(-1), or if it
   reports EILSEQ while HANDLER != iconveh_error and CD2 != (iconv_t)(-1),
   the conversion restarts from the beginning of SRC and goes through UTF-8,
   with CD1 for the first step and CD2 for the second.  A CD1 or CD2 equal
   to (iconv_t)(-1) means that the corresponding side is UTF-8; that step is
   then done by utf8conv_carefully instead of iconv.
   HANDLER selects what happens on EILSEQ: with iconveh_error the function
   fails; otherwise input that is invalid in the source encoding becomes
   '?', and a character that cannot be converted to the target becomes
   \uXXXX or \UXXXXXXXX if HANDLER == iconveh_escape_sequence, else '?'.
   EXTRA_ALLOC is a number of bytes kept available in the result beyond
   the converted length.
   OFFSETS, if not NULL, is written at indices 0 .. SRCLEN-1: every entry is
   first set to (size_t)(-1); then the entry for each input position where a
   conversion call starts receives the output length reached at that point
   (only if that length differs from the previously recorded one).
   On entry, *RESULTP / *LENGTHP may describe a caller buffer; it is used
   only if *RESULTP != NULL and *LENGTHP >= sizeof (tmpbuf).
   Return 0 on success.  Return -1 with errno set on failure; a buffer that
   this function obtained from malloc/realloc is freed in that case.  */
346static int
347mem_cd_iconveh_internal (const char *src, size_t srclen,
348 iconv_t cd, iconv_t cd1, iconv_t cd2,
349 enum iconv_ilseq_handler handler,
350 size_t extra_alloc,
351 size_t *offsets,
352 char **resultp, size_t *lengthp)
353{
354 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
355 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
356 Instead, we have to start afresh from the beginning of SRC. */
357 /* Use a temporary buffer, so that for small strings, a single malloc()
358 call will be sufficient. */
359# define tmpbufsize 4096
360 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
361 libiconv's UCS-4-INTERNAL encoding. */
362 union { unsigned int align; char buf[tmpbufsize]; } tmp;
363# define tmpbuf tmp.buf
364
365 char *initial_result;
366 char *result;
367 size_t allocated;
368 size_t length;
369 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
370
/* Start in the caller's buffer when it is at least as large as the stack
   buffer, otherwise in the stack buffer.  RESULT == INITIAL_RESULT later
   means "no heap block has been allocated by this function yet".  */
371 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
372 {
373 initial_result = *resultp;
374 allocated = *lengthp;
375 }
376 else
377 {
378 initial_result = tmpbuf;
379 allocated = sizeof (tmpbuf);
380 }
381 result = initial_result;
382
383 /* Test whether a direct conversion is possible at all. */
384 if (cd == (iconv_t)(-1))
385 goto indirectly;
386
387 if (offsets != NULL)
388 {
389 size_t i;
390
391 for (i = 0; i < srclen; i++)
392 offsets[i] = (size_t)(-1);
393
394 last_length = (size_t)(-1);
395 }
396 length = 0;
397
398 /* First, try a direct conversion, and see whether a conversion error
399 occurs at all. */
400 {
401 const char *inptr = src;
402 size_t insize = srclen;
403
404 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
405# if defined _LIBICONV_VERSION \
406 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
407 /* Set to the initial state. */
408 iconv (cd, NULL, NULL, NULL, NULL);
409# endif
410
411 while (insize > 0)
412 {
413 char *outptr = result + length;
414 size_t outsize = allocated - extra_alloc - length;
415 bool incremented;
416 size_t res;
417 bool grow;
418
419 if (offsets != NULL)
420 {
421 if (length != last_length) /* ensure that offset[] be increasing */
422 {
423 offsets[inptr - src] = length;
424 last_length = length;
425 }
426 res = iconv_carefully_1 (cd,
427 &inptr, &insize,
428 &outptr, &outsize,
429 &incremented);
430 }
431 else
432 /* Use iconv_carefully instead of iconv here, because:
433 - If TO_CODESET is UTF-8, we can do the error handling in this
434 loop, no need for a second loop,
435 - With iconv() implementations other than GNU libiconv and GNU
436 libc, if we use iconv() in a big swoop, checking for an E2BIG
437 return, we lose the number of irreversible conversions. */
438 res = iconv_carefully (cd,
439 &inptr, &insize,
440 &outptr, &outsize,
441 &incremented);
442
443 length = outptr - result;
/* Request a larger buffer as soon as more than half of it is in use.  */
444 grow = (length + extra_alloc > allocated / 2);
445 if (res == (size_t)(-1))
446 {
447 if (errno == E2BIG)
448 grow = true;
/* EINVAL: leave the loop; the remaining input is not converted.  */
449 else if (errno == EINVAL)
450 break;
451 else if (errno == EILSEQ && handler != iconveh_error)
452 {
453 if (cd2 == (iconv_t)(-1))
454 {
455 /* TO_CODESET is UTF-8. */
456 /* Error handling can produce up to 1 byte of output. */
457 if (length + 1 + extra_alloc > allocated)
458 {
459 char *memory;
460
461 allocated = 2 * allocated;
462 if (length + 1 + extra_alloc > allocated)
463 abort ();
464 if (result == initial_result)
465 memory = (char *) malloc (allocated);
466 else
467 memory = (char *) realloc (result, allocated);
468 if (memory == NULL)
469 {
470 if (result != initial_result)
471 free (result);
472 errno = ENOMEM;
473 return -1;
474 }
475 if (result == initial_result)
476 memcpy (memory, initial_result, length);
477 result = memory;
478 grow = false;
479 }
480 /* The input is invalid in FROM_CODESET. Eat up one byte
481 and emit a question mark. */
482 if (!incremented)
483 {
484 if (insize == 0)
485 abort ();
486 inptr++;
487 insize--;
488 }
489 result[length] = '?';
490 length++;
491 }
492 else
493 goto indirectly;
494 }
495 else
496 {
497 if (result != initial_result)
498 {
499 int saved_errno = errno;
500 free (result);
501 errno = saved_errno;
502 }
503 return -1;
504 }
505 }
506 if (insize == 0)
507 break;
508 if (grow)
509 {
510 char *memory;
511
512 allocated = 2 * allocated;
513 if (result == initial_result)
514 memory = (char *) malloc (allocated);
515 else
516 memory = (char *) realloc (result, allocated);
517 if (memory == NULL)
518 {
519 if (result != initial_result)
520 free (result);
521 errno = ENOMEM;
522 return -1;
523 }
524 if (result == initial_result)
525 memcpy (memory, initial_result, length);
526 result = memory;
527 }
528 }
529 }
530
531 /* Now get the conversion state back to the initial state.
532 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
533#if defined _LIBICONV_VERSION \
534 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
/* Repeat the flush call, doubling the buffer on E2BIG, until it succeeds
   or fails with a different error.  */
535 for (;;)
536 {
537 char *outptr = result + length;
538 size_t outsize = allocated - extra_alloc - length;
539 size_t res;
540
541 res = iconv (cd, NULL, NULL, &outptr, &outsize);
542 length = outptr - result;
543 if (res == (size_t)(-1))
544 {
545 if (errno == E2BIG)
546 {
547 char *memory;
548
549 allocated = 2 * allocated;
550 if (result == initial_result)
551 memory = (char *) malloc (allocated);
552 else
553 memory = (char *) realloc (result, allocated);
554 if (memory == NULL)
555 {
556 if (result != initial_result)
557 free (result);
558 errno = ENOMEM;
559 return -1;
560 }
561 if (result == initial_result)
562 memcpy (memory, initial_result, length);
563 result = memory;
564 }
565 else
566 {
567 if (result != initial_result)
568 {
569 int saved_errno = errno;
570 free (result);
571 errno = saved_errno;
572 }
573 return -1;
574 }
575 }
576 else
577 break;
578 }
579#endif
580
581 /* The direct conversion succeeded. */
582 goto done;
583
584 indirectly:
585 /* The direct conversion failed.
586 Use a conversion through UTF-8. */
587 if (offsets != NULL)
588 {
589 size_t i;
590
591 for (i = 0; i < srclen; i++)
592 offsets[i] = (size_t)(-1);
593
594 last_length = (size_t)(-1);
595 }
/* Output produced by the direct attempt is discarded: writing restarts at
   offset 0 of RESULT.  */
596 length = 0;
597 {
/* SLOWLY makes step 1 convert a single character per call; it is set when
   offsets are recorded or when HANDLER is iconveh_error.  */
598 const bool slowly = (offsets != NULL || handler == iconveh_error);
599# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
600 char utf8buf[utf8bufsize + 1];
601 size_t utf8len = 0;
602 const char *in1ptr = src;
603 size_t in1size = srclen;
604 bool do_final_flush1 = true;
605 bool do_final_flush2 = true;
606
607 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
608# if defined _LIBICONV_VERSION \
609 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
610 /* Set to the initial state. */
611 if (cd1 != (iconv_t)(-1))
612 iconv (cd1, NULL, NULL, NULL, NULL);
613 if (cd2 != (iconv_t)(-1))
614 iconv (cd2, NULL, NULL, NULL, NULL);
615# endif
616
617 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
618 {
619 char *out1ptr = utf8buf + utf8len;
620 size_t out1size = utf8bufsize - utf8len;
621 bool incremented1;
622 size_t res1;
623 int errno1;
624
625 /* Conversion step 1: from FROM_CODESET to UTF-8. */
626 if (in1size > 0)
627 {
628 if (offsets != NULL
629 && length != last_length) /* ensure that offset[] be increasing */
630 {
631 offsets[in1ptr - src] = length;
632 last_length = length;
633 }
634 if (cd1 != (iconv_t)(-1))
635 {
636 if (slowly)
637 res1 = iconv_carefully_1 (cd1,
638 &in1ptr, &in1size,
639 &out1ptr, &out1size,
640 &incremented1);
641 else
642 res1 = iconv_carefully (cd1,
643 &in1ptr, &in1size,
644 &out1ptr, &out1size,
645 &incremented1);
646 }
647 else
648 {
649 /* FROM_CODESET is UTF-8. */
650 res1 = utf8conv_carefully (slowly,
651 &in1ptr, &in1size,
652 &out1ptr, &out1size,
653 &incremented1);
654 }
655 }
656 else if (do_final_flush1)
657 {
658 /* Now get the conversion state of CD1 back to the initial state.
659 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
660# if defined _LIBICONV_VERSION \
661 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
662 if (cd1 != (iconv_t)(-1))
663 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
664 else
665# endif
666 res1 = 0;
667 do_final_flush1 = false;
668 incremented1 = true;
669 }
670 else
671 {
672 res1 = 0;
673 incremented1 = true;
674 }
675 if (res1 == (size_t)(-1)
676 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
677 {
678 if (result != initial_result)
679 {
680 int saved_errno = errno;
681 free (result);
682 errno = saved_errno;
683 }
684 return -1;
685 }
686 if (res1 == (size_t)(-1)
687 && errno == EILSEQ && handler != iconveh_error)
688 {
689 /* The input is invalid in FROM_CODESET. Eat up one byte and
690 emit a question mark. Room for the question mark was allocated
691 at the end of utf8buf. */
692 if (!incremented1)
693 {
694 if (in1size == 0)
695 abort ();
696 in1ptr++;
697 in1size--;
698 }
8912421c
LC
699 *out1ptr++ = '?';
700 res1 = 0;
24d56127
LC
701 }
/* Keep step 1's errno: step 2 below may overwrite errno before the result
   of step 1 is examined at the end of this iteration.  */
702 errno1 = errno;
703 utf8len = out1ptr - utf8buf;
704
/* Run step 2 when offsets are being recorded, when the input is used up,
   when utf8buf is more than half full, or when step 1 reported E2BIG.  */
705 if (offsets != NULL
706 || in1size == 0
707 || utf8len > utf8bufsize / 2
708 || (res1 == (size_t)(-1) && errno1 == E2BIG))
709 {
710 /* Conversion step 2: from UTF-8 to TO_CODESET. */
711 const char *in2ptr = utf8buf;
712 size_t in2size = utf8len;
713
714 while (in2size > 0
715 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
716 {
717 char *out2ptr = result + length;
718 size_t out2size = allocated - extra_alloc - length;
719 bool incremented2;
720 size_t res2;
721 bool grow;
722
723 if (in2size > 0)
724 {
725 if (cd2 != (iconv_t)(-1))
726 res2 = iconv_carefully (cd2,
727 &in2ptr, &in2size,
728 &out2ptr, &out2size,
729 &incremented2);
730 else
731 /* TO_CODESET is UTF-8. */
732 res2 = utf8conv_carefully (false,
733 &in2ptr, &in2size,
734 &out2ptr, &out2size,
735 &incremented2);
736 }
737 else /* in1size == 0 && !do_final_flush1
738 && in2size == 0 && do_final_flush2 */
739 {
740 /* Now get the conversion state of CD1 back to the initial
741 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
/* NOTE(review): the call below flushes CD2, although the comment above
   says CD1.  */
742# if defined _LIBICONV_VERSION \
743 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
744 if (cd2 != (iconv_t)(-1))
745 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
746 else
747# endif
748 res2 = 0;
749 do_final_flush2 = false;
750 incremented2 = true;
751 }
752
753 length = out2ptr - result;
754 grow = (length + extra_alloc > allocated / 2);
755 if (res2 == (size_t)(-1))
756 {
757 if (errno == E2BIG)
758 grow = true;
759 else if (errno == EINVAL)
760 break;
761 else if (errno == EILSEQ && handler != iconveh_error)
762 {
763 /* Error handling can produce up to 10 bytes of ASCII
764 output. But TO_CODESET may be UCS-2, UTF-16 or
765 UCS-4, so use CD2 here as well. */
766 char scratchbuf[10];
767 size_t scratchlen;
768 ucs4_t uc;
769 const char *inptr;
770 size_t insize;
771 size_t res;
772
/* Fetch the offending character into UC: it is the one before IN2PTR if
   the input pointer already moved past it, else the one at IN2PTR, which
   is then skipped.  */
773 if (incremented2)
774 {
775 if (u8_prev (&uc, (const uint8_t *) in2ptr,
776 (const uint8_t *) utf8buf)
777 == NULL)
778 abort ();
779 }
780 else
781 {
782 int n;
783 if (in2size == 0)
784 abort ();
785 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
786 in2size);
787 in2ptr += n;
788 in2size -= n;
789 }
790
/* Build the replacement text: a backslash, then 'u' and 4 hex digits for
   UC < 0x10000, or 'U' and 8 hex digits otherwise; or a single '?'.  */
791 if (handler == iconveh_escape_sequence)
792 {
793 static char hex[16] = "0123456789ABCDEF";
794 scratchlen = 0;
795 scratchbuf[scratchlen++] = '\\';
796 if (uc < 0x10000)
797 scratchbuf[scratchlen++] = 'u';
798 else
799 {
800 scratchbuf[scratchlen++] = 'U';
801 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
802 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
803 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
804 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
805 }
806 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
807 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
808 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
809 scratchbuf[scratchlen++] = hex[uc & 15];
810 }
811 else
812 {
813 scratchbuf[0] = '?';
814 scratchlen = 1;
815 }
816
817 inptr = scratchbuf;
818 insize = scratchlen;
819 if (cd2 != (iconv_t)(-1))
820 res = iconv (cd2,
821 (ICONV_CONST char **) &inptr, &insize,
822 &out2ptr, &out2size);
823 else
824 {
825 /* TO_CODESET is UTF-8. */
826 if (out2size >= insize)
827 {
828 memcpy (out2ptr, inptr, insize);
829 out2ptr += insize;
830 out2size -= insize;
831 inptr += insize;
832 insize = 0;
833 res = 0;
834 }
835 else
836 {
837 errno = E2BIG;
838 res = (size_t)(-1);
839 }
840 }
841 length = out2ptr - result;
/* The replacement did not fit: double the buffer once and retry with
   whatever part of the replacement is still pending.  */
842 if (res == (size_t)(-1) && errno == E2BIG)
843 {
844 char *memory;
845
846 allocated = 2 * allocated;
847 if (length + 1 + extra_alloc > allocated)
848 abort ();
849 if (result == initial_result)
850 memory = (char *) malloc (allocated);
851 else
852 memory = (char *) realloc (result, allocated);
853 if (memory == NULL)
854 {
855 if (result != initial_result)
856 free (result);
857 errno = ENOMEM;
858 return -1;
859 }
860 if (result == initial_result)
861 memcpy (memory, initial_result, length);
862 result = memory;
863 grow = false;
864
865 out2ptr = result + length;
866 out2size = allocated - extra_alloc - length;
867 if (cd2 != (iconv_t)(-1))
868 res = iconv (cd2,
869 (ICONV_CONST char **) &inptr,
870 &insize,
871 &out2ptr, &out2size);
872 else
873 {
874 /* TO_CODESET is UTF-8. */
875 if (!(out2size >= insize))
876 abort ();
877 memcpy (out2ptr, inptr, insize);
878 out2ptr += insize;
879 out2size -= insize;
880 inptr += insize;
881 insize = 0;
882 res = 0;
883 }
884 length = out2ptr - result;
885 }
886# if !defined _LIBICONV_VERSION && !defined __GLIBC__
887 /* Irix iconv() inserts a NUL byte if it cannot convert.
888 NetBSD iconv() inserts a question mark if it cannot
889 convert.
890 Only GNU libiconv and GNU libc are known to prefer
891 to fail rather than doing a lossy conversion. */
892 if (res != (size_t)(-1) && res > 0)
893 {
894 errno = EILSEQ;
895 res = (size_t)(-1);
896 }
897# endif
898 if (res == (size_t)(-1))
899 {
900 /* Failure converting the ASCII replacement. */
901 if (result != initial_result)
902 {
903 int saved_errno = errno;
904 free (result);
905 errno = saved_errno;
906 }
907 return -1;
908 }
909 }
910 else
911 {
912 if (result != initial_result)
913 {
914 int saved_errno = errno;
915 free (result);
916 errno = saved_errno;
917 }
918 return -1;
919 }
920 }
921 if (!(in2size > 0
922 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
923 break;
924 if (grow)
925 {
926 char *memory;
927
928 allocated = 2 * allocated;
929 if (result == initial_result)
930 memory = (char *) malloc (allocated);
931 else
932 memory = (char *) realloc (result, allocated);
933 if (memory == NULL)
934 {
935 if (result != initial_result)
936 free (result);
937 errno = ENOMEM;
938 return -1;
939 }
940 if (result == initial_result)
941 memcpy (memory, initial_result, length);
942 result = memory;
943 }
944 }
945
946 /* Move the remaining bytes to the beginning of utf8buf. */
947 if (in2size > 0)
948 memmove (utf8buf, in2ptr, in2size);
949 utf8len = in2size;
950 }
951
/* Evaluate step 1's outcome: EINVAL drops the rest of the input; EILSEQ
   (still set only when HANDLER is iconveh_error) makes the function fail.  */
952 if (res1 == (size_t)(-1))
953 {
954 if (errno1 == EINVAL)
955 in1size = 0;
956 else if (errno1 == EILSEQ)
957 {
958 if (result != initial_result)
959 free (result);
960 errno = errno1;
961 return -1;
962 }
963 }
964 }
965# undef utf8bufsize
966 }
967
968 done:
969 /* Now the final memory allocation. */
/* The data is still in the stack buffer: copy it into a heap block of
   LENGTH + EXTRA_ALLOC bytes (at least 1).  */
970 if (result == tmpbuf)
971 {
972 size_t memsize = length + extra_alloc;
973 char *memory;
974
975 memory = (char *) malloc (memsize > 0 ? memsize : 1);
976 if (memory != NULL)
977 {
978 memcpy (memory, tmpbuf, length);
979 result = memory;
980 }
981 else
982 {
983 errno = ENOMEM;
984 return -1;
985 }
986 }
987 else if (result != *resultp && length + extra_alloc < allocated)
988 {
989 /* Shrink the allocated memory if possible. */
990 size_t memsize = length + extra_alloc;
991 char *memory;
992
993 memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
/* A failed shrink is not an error: the larger block is kept.  */
994 if (memory != NULL)
995 result = memory;
996 }
997 *resultp = result;
998 *lengthp = length;
999 return 0;
1000# undef tmpbuf
1001# undef tmpbufsize
1002}
1003
1004int
1005mem_cd_iconveh (const char *src, size_t srclen,
8912421c 1006 const iconveh_t *cd,
24d56127
LC
1007 enum iconv_ilseq_handler handler,
1008 size_t *offsets,
1009 char **resultp, size_t *lengthp)
1010{
8912421c
LC
1011 return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1012 handler, 0, offsets, resultp, lengthp);
24d56127
LC
1013}
1014
1015char *
1016str_cd_iconveh (const char *src,
8912421c 1017 const iconveh_t *cd,
24d56127
LC
1018 enum iconv_ilseq_handler handler)
1019{
1020 /* For most encodings, a trailing NUL byte in the input will be converted
1021 to a trailing NUL byte in the output. But not for UTF-7. So that this
1022 function is usable for UTF-7, we have to exclude the NUL byte from the
1023 conversion and add it by hand afterwards. */
1024 char *result = NULL;
1025 size_t length = 0;
1026 int retval = mem_cd_iconveh_internal (src, strlen (src),
8912421c
LC
1027 cd->cd, cd->cd1, cd->cd2, handler, 1,
1028 NULL, &result, &length);
24d56127
LC
1029
1030 if (retval < 0)
1031 {
1032 if (result != NULL)
1033 {
1034 int saved_errno = errno;
1035 free (result);
1036 errno = saved_errno;
1037 }
1038 return NULL;
1039 }
1040
1041 /* Add the terminating NUL byte. */
1042 result[length] = '\0';
1043
1044 return result;
1045}
1046
1047#endif
1048
1049int
1050mem_iconveh (const char *src, size_t srclen,
1051 const char *from_codeset, const char *to_codeset,
1052 enum iconv_ilseq_handler handler,
1053 size_t *offsets,
1054 char **resultp, size_t *lengthp)
1055{
1056 if (srclen == 0)
1057 {
1058 /* Nothing to convert. */
1059 *lengthp = 0;
1060 return 0;
1061 }
1062 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1063 {
1064 char *result;
1065
1066 if (*resultp != NULL && *lengthp >= srclen)
1067 result = *resultp;
1068 else
1069 {
1070 result = (char *) malloc (srclen);
1071 if (result == NULL)
1072 {
1073 errno = ENOMEM;
1074 return -1;
1075 }
1076 }
1077 memcpy (result, src, srclen);
1078 *resultp = result;
1079 *lengthp = srclen;
1080 return 0;
1081 }
1082 else
1083 {
1084#if HAVE_ICONV
8912421c 1085 iconveh_t cd;
24d56127
LC
1086 char *result;
1087 size_t length;
1088 int retval;
1089
8912421c
LC
1090 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1091 return -1;
24d56127
LC
1092
1093 result = *resultp;
1094 length = *lengthp;
8912421c 1095 retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
24d56127
LC
1096 &result, &length);
1097
1098 if (retval < 0)
1099 {
8912421c 1100 /* Close cd, but preserve the errno from str_cd_iconv. */
24d56127 1101 int saved_errno = errno;
8912421c 1102 iconveh_close (&cd);
24d56127
LC
1103 errno = saved_errno;
1104 }
1105 else
1106 {
8912421c 1107 if (iconveh_close (&cd) < 0)
24d56127
LC
1108 {
1109 /* Return -1, but free the allocated memory, and while doing
8912421c 1110 that, preserve the errno from iconveh_close. */
24d56127
LC
1111 int saved_errno = errno;
1112 if (result != *resultp && result != NULL)
1113 free (result);
1114 errno = saved_errno;
1115 return -1;
1116 }
1117 *resultp = result;
1118 *lengthp = length;
1119 }
1120 return retval;
1121#else
1122 /* This is a different error code than if iconv_open existed but didn't
1123 support from_codeset and to_codeset, so that the caller can emit
1124 an error message such as
1125 "iconv() is not supported. Installing GNU libiconv and
1126 then reinstalling this package would fix this." */
1127 errno = ENOSYS;
1128 return -1;
1129#endif
1130 }
1131}
1132
1133char *
1134str_iconveh (const char *src,
1135 const char *from_codeset, const char *to_codeset,
1136 enum iconv_ilseq_handler handler)
1137{
1138 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1139 {
1140 char *result = strdup (src);
1141
1142 if (result == NULL)
1143 errno = ENOMEM;
1144 return result;
1145 }
1146 else
1147 {
1148#if HAVE_ICONV
8912421c 1149 iconveh_t cd;
24d56127
LC
1150 char *result;
1151
8912421c
LC
1152 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1153 return NULL;
24d56127 1154
8912421c 1155 result = str_cd_iconveh (src, &cd, handler);
24d56127
LC
1156
1157 if (result == NULL)
1158 {
8912421c 1159 /* Close cd, but preserve the errno from str_cd_iconv. */
24d56127 1160 int saved_errno = errno;
8912421c 1161 iconveh_close (&cd);
24d56127
LC
1162 errno = saved_errno;
1163 }
1164 else
1165 {
8912421c 1166 if (iconveh_close (&cd) < 0)
24d56127
LC
1167 {
1168 /* Return NULL, but free the allocated memory, and while doing
8912421c 1169 that, preserve the errno from iconveh_close. */
24d56127
LC
1170 int saved_errno = errno;
1171 free (result);
1172 errno = saved_errno;
1173 return NULL;
1174 }
1175 }
1176 return result;
1177#else
1178 /* This is a different error code than if iconv_open existed but didn't
1179 support from_codeset and to_codeset, so that the caller can emit
1180 an error message such as
1181 "iconv() is not supported. Installing GNU libiconv and
1182 then reinstalling this package would fix this." */
1183 errno = ENOSYS;
1184 return NULL;
1185#endif
1186 }
1187}