Update Gnulib to v0.1-77-gd9361da
[bpt/guile.git] / lib / iconv.c
CommitLineData
24d56127 1/* Character set conversion.
5e69ceb7 2 Copyright (C) 1999-2001, 2007, 2009-2014 Free Software Foundation, Inc.
24d56127
LC
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public License along
005de2e8 15 with this program; if not, see <http://www.gnu.org/licenses/>. */
24d56127
LC
16
17#include <config.h>
18
19/* Specification. */
20#include <iconv.h>
21
22#include <stddef.h>
23
24#if REPLACE_ICONV_UTF
25# include <errno.h>
26# include <stdint.h>
27# include <stdlib.h>
28# include "unistr.h"
29# ifndef uintptr_t
30# define uintptr_t unsigned long
31# endif
32#endif
33
34#if REPLACE_ICONV_UTF
35
36/* UTF-{16,32}{BE,LE} converters taken from GNU libiconv 1.11. */
37
38/* Return code if invalid. (xxx_mbtowc) */
39# define RET_ILSEQ -1
40/* Return code if no bytes were read. (xxx_mbtowc) */
41# define RET_TOOFEW -2
42
43/* Return code if invalid. (xxx_wctomb) */
44# define RET_ILUNI -1
45/* Return code if output buffer is too small. (xxx_wctomb, xxx_reset) */
46# define RET_TOOSMALL -2
47
48/*
49 * UTF-16BE
50 */
51
52/* Specification: RFC 2781 */
53
54static int
55utf16be_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
56{
57 if (n >= 2)
58 {
59 ucs4_t wc = (s[0] << 8) + s[1];
60 if (wc >= 0xd800 && wc < 0xdc00)
1cd4fffc
LC
61 {
62 if (n >= 4)
63 {
64 ucs4_t wc2 = (s[2] << 8) + s[3];
65 if (!(wc2 >= 0xdc00 && wc2 < 0xe000))
66 return RET_ILSEQ;
67 *pwc = 0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00);
68 return 4;
69 }
70 }
24d56127 71 else if (wc >= 0xdc00 && wc < 0xe000)
1cd4fffc
LC
72 {
73 return RET_ILSEQ;
74 }
24d56127 75 else
1cd4fffc
LC
76 {
77 *pwc = wc;
78 return 2;
79 }
24d56127
LC
80 }
81 return RET_TOOFEW;
82}
83
84static int
85utf16be_wctomb (unsigned char *r, ucs4_t wc, size_t n)
86{
87 if (!(wc >= 0xd800 && wc < 0xe000))
88 {
89 if (wc < 0x10000)
1cd4fffc
LC
90 {
91 if (n >= 2)
92 {
93 r[0] = (unsigned char) (wc >> 8);
94 r[1] = (unsigned char) wc;
95 return 2;
96 }
97 else
98 return RET_TOOSMALL;
99 }
24d56127 100 else if (wc < 0x110000)
1cd4fffc
LC
101 {
102 if (n >= 4)
103 {
104 ucs4_t wc1 = 0xd800 + ((wc - 0x10000) >> 10);
105 ucs4_t wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
106 r[0] = (unsigned char) (wc1 >> 8);
107 r[1] = (unsigned char) wc1;
108 r[2] = (unsigned char) (wc2 >> 8);
109 r[3] = (unsigned char) wc2;
110 return 4;
111 }
112 else
113 return RET_TOOSMALL;
114 }
24d56127
LC
115 }
116 return RET_ILUNI;
117}
118
119/*
120 * UTF-16LE
121 */
122
123/* Specification: RFC 2781 */
124
125static int
126utf16le_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
127{
128 if (n >= 2)
129 {
130 ucs4_t wc = s[0] + (s[1] << 8);
131 if (wc >= 0xd800 && wc < 0xdc00)
1cd4fffc
LC
132 {
133 if (n >= 4)
134 {
135 ucs4_t wc2 = s[2] + (s[3] << 8);
136 if (!(wc2 >= 0xdc00 && wc2 < 0xe000))
137 return RET_ILSEQ;
138 *pwc = 0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00);
139 return 4;
140 }
141 }
24d56127 142 else if (wc >= 0xdc00 && wc < 0xe000)
1cd4fffc
LC
143 {
144 return RET_ILSEQ;
145 }
24d56127 146 else
1cd4fffc
LC
147 {
148 *pwc = wc;
149 return 2;
150 }
24d56127
LC
151 }
152 return RET_TOOFEW;
153}
154
155static int
156utf16le_wctomb (unsigned char *r, ucs4_t wc, size_t n)
157{
158 if (!(wc >= 0xd800 && wc < 0xe000))
159 {
160 if (wc < 0x10000)
1cd4fffc
LC
161 {
162 if (n >= 2)
163 {
164 r[0] = (unsigned char) wc;
165 r[1] = (unsigned char) (wc >> 8);
166 return 2;
167 }
168 else
169 return RET_TOOSMALL;
170 }
24d56127 171 else if (wc < 0x110000)
1cd4fffc
LC
172 {
173 if (n >= 4)
174 {
175 ucs4_t wc1 = 0xd800 + ((wc - 0x10000) >> 10);
176 ucs4_t wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
177 r[0] = (unsigned char) wc1;
178 r[1] = (unsigned char) (wc1 >> 8);
179 r[2] = (unsigned char) wc2;
180 r[3] = (unsigned char) (wc2 >> 8);
181 return 4;
182 }
183 else
184 return RET_TOOSMALL;
185 }
24d56127
LC
186 }
187 return RET_ILUNI;
188}
189
190/*
191 * UTF-32BE
192 */
193
194/* Specification: Unicode 3.1 Standard Annex #19 */
195
196static int
197utf32be_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
198{
199 if (n >= 4)
200 {
201 ucs4_t wc = (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + s[3];
202 if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
1cd4fffc
LC
203 {
204 *pwc = wc;
205 return 4;
206 }
24d56127 207 else
1cd4fffc 208 return RET_ILSEQ;
24d56127
LC
209 }
210 return RET_TOOFEW;
211}
212
213static int
214utf32be_wctomb (unsigned char *r, ucs4_t wc, size_t n)
215{
216 if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
217 {
218 if (n >= 4)
1cd4fffc
LC
219 {
220 r[0] = 0;
221 r[1] = (unsigned char) (wc >> 16);
222 r[2] = (unsigned char) (wc >> 8);
223 r[3] = (unsigned char) wc;
224 return 4;
225 }
24d56127 226 else
1cd4fffc 227 return RET_TOOSMALL;
24d56127
LC
228 }
229 return RET_ILUNI;
230}
231
232/*
233 * UTF-32LE
234 */
235
236/* Specification: Unicode 3.1 Standard Annex #19 */
237
238static int
239utf32le_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
240{
241 if (n >= 4)
242 {
243 ucs4_t wc = s[0] + (s[1] << 8) + (s[2] << 16) + (s[3] << 24);
244 if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
1cd4fffc
LC
245 {
246 *pwc = wc;
247 return 4;
248 }
24d56127 249 else
1cd4fffc 250 return RET_ILSEQ;
24d56127
LC
251 }
252 return RET_TOOFEW;
253}
254
255static int
256utf32le_wctomb (unsigned char *r, ucs4_t wc, size_t n)
257{
258 if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
259 {
260 if (n >= 4)
1cd4fffc
LC
261 {
262 r[0] = (unsigned char) wc;
263 r[1] = (unsigned char) (wc >> 8);
264 r[2] = (unsigned char) (wc >> 16);
265 r[3] = 0;
266 return 4;
24d56127
LC
267 }
268 else
1cd4fffc 269 return RET_TOOSMALL;
24d56127
LC
270 }
271 return RET_ILUNI;
272}
273
274#endif
275
276size_t
277rpl_iconv (iconv_t cd,
1cd4fffc
LC
278 ICONV_CONST char **inbuf, size_t *inbytesleft,
279 char **outbuf, size_t *outbytesleft)
24d56127
LC
280#undef iconv
281{
282#if REPLACE_ICONV_UTF
283 switch ((uintptr_t) cd)
284 {
285 {
1cd4fffc
LC
286 int (*xxx_wctomb) (unsigned char *, ucs4_t, size_t);
287
288 case (uintptr_t) _ICONV_UTF8_UTF16BE:
289 xxx_wctomb = utf16be_wctomb;
290 goto loop_from_utf8;
291 case (uintptr_t) _ICONV_UTF8_UTF16LE:
292 xxx_wctomb = utf16le_wctomb;
293 goto loop_from_utf8;
294 case (uintptr_t) _ICONV_UTF8_UTF32BE:
295 xxx_wctomb = utf32be_wctomb;
296 goto loop_from_utf8;
297 case (uintptr_t) _ICONV_UTF8_UTF32LE:
298 xxx_wctomb = utf32le_wctomb;
299 goto loop_from_utf8;
24d56127
LC
300
301 loop_from_utf8:
1cd4fffc
LC
302 if (inbuf == NULL || *inbuf == NULL)
303 return 0;
304 {
305 ICONV_CONST char *inptr = *inbuf;
306 size_t inleft = *inbytesleft;
307 char *outptr = *outbuf;
308 size_t outleft = *outbytesleft;
309 size_t res = 0;
310 while (inleft > 0)
311 {
312 ucs4_t uc;
313 int m = u8_mbtoucr (&uc, (const uint8_t *) inptr, inleft);
314 if (m <= 0)
315 {
316 if (m == -1)
317 {
318 errno = EILSEQ;
319 res = (size_t)(-1);
320 break;
321 }
322 if (m == -2)
323 {
324 errno = EINVAL;
325 res = (size_t)(-1);
326 break;
327 }
328 abort ();
329 }
330 else
331 {
332 int n = xxx_wctomb ((uint8_t *) outptr, uc, outleft);
333 if (n < 0)
334 {
335 if (n == RET_ILUNI)
336 {
337 errno = EILSEQ;
338 res = (size_t)(-1);
339 break;
340 }
341 if (n == RET_TOOSMALL)
342 {
343 errno = E2BIG;
344 res = (size_t)(-1);
345 break;
346 }
347 abort ();
348 }
349 else
350 {
351 inptr += m;
352 inleft -= m;
353 outptr += n;
354 outleft -= n;
355 }
356 }
357 }
358 *inbuf = inptr;
359 *inbytesleft = inleft;
360 *outbuf = outptr;
361 *outbytesleft = outleft;
362 return res;
363 }
24d56127
LC
364 }
365
366 {
1cd4fffc
LC
367 int (*xxx_mbtowc) (ucs4_t *, const unsigned char *, size_t);
368
369 case (uintptr_t) _ICONV_UTF16BE_UTF8:
370 xxx_mbtowc = utf16be_mbtowc;
371 goto loop_to_utf8;
372 case (uintptr_t) _ICONV_UTF16LE_UTF8:
373 xxx_mbtowc = utf16le_mbtowc;
374 goto loop_to_utf8;
375 case (uintptr_t) _ICONV_UTF32BE_UTF8:
376 xxx_mbtowc = utf32be_mbtowc;
377 goto loop_to_utf8;
378 case (uintptr_t) _ICONV_UTF32LE_UTF8:
379 xxx_mbtowc = utf32le_mbtowc;
380 goto loop_to_utf8;
24d56127
LC
381
382 loop_to_utf8:
1cd4fffc
LC
383 if (inbuf == NULL || *inbuf == NULL)
384 return 0;
385 {
386 ICONV_CONST char *inptr = *inbuf;
387 size_t inleft = *inbytesleft;
388 char *outptr = *outbuf;
389 size_t outleft = *outbytesleft;
390 size_t res = 0;
391 while (inleft > 0)
392 {
393 ucs4_t uc;
394 int m = xxx_mbtowc (&uc, (const uint8_t *) inptr, inleft);
395 if (m <= 0)
396 {
397 if (m == RET_ILSEQ)
398 {
399 errno = EILSEQ;
400 res = (size_t)(-1);
401 break;
402 }
403 if (m == RET_TOOFEW)
404 {
405 errno = EINVAL;
406 res = (size_t)(-1);
407 break;
408 }
409 abort ();
410 }
411 else
412 {
413 int n = u8_uctomb ((uint8_t *) outptr, uc, outleft);
414 if (n < 0)
415 {
416 if (n == -1)
417 {
418 errno = EILSEQ;
419 res = (size_t)(-1);
420 break;
421 }
422 if (n == -2)
423 {
424 errno = E2BIG;
425 res = (size_t)(-1);
426 break;
427 }
428 abort ();
429 }
430 else
431 {
432 inptr += m;
433 inleft -= m;
434 outptr += n;
435 outleft -= n;
436 }
437 }
438 }
439 *inbuf = inptr;
440 *inbytesleft = inleft;
441 *outbuf = outptr;
442 *outbytesleft = outleft;
443 return res;
444 }
24d56127
LC
445 }
446 }
447#endif
448 return iconv (cd, inbuf, inbytesleft, outbuf, outbytesleft);
449}