Merge remote-tracking branch 'local-2.0/stable-2.0'
[bpt/guile.git] / lib / iconv.c
1 /* Character set conversion.
2 Copyright (C) 1999-2001, 2007, 2009-2011 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public License along
15 with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
17
18 #include <config.h>
19
20 /* Specification. */
21 #include <iconv.h>
22
23 #include <stddef.h>
24
25 #if REPLACE_ICONV_UTF
26 # include <errno.h>
27 # include <stdint.h>
28 # include <stdlib.h>
29 # include "unistr.h"
30 # ifndef uintptr_t
31 # define uintptr_t unsigned long
32 # endif
33 #endif
34
35 #if REPLACE_ICONV_UTF
36
37 /* UTF-{16,32}{BE,LE} converters taken from GNU libiconv 1.11. */
38
39 /* Return code if invalid. (xxx_mbtowc) */
40 # define RET_ILSEQ -1
41 /* Return code if no bytes were read. (xxx_mbtowc) */
42 # define RET_TOOFEW -2
43
44 /* Return code if invalid. (xxx_wctomb) */
45 # define RET_ILUNI -1
46 /* Return code if output buffer is too small. (xxx_wctomb, xxx_reset) */
47 # define RET_TOOSMALL -2
48
49 /*
50 * UTF-16BE
51 */
52
53 /* Specification: RFC 2781 */
54
55 static int
56 utf16be_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
57 {
58 if (n >= 2)
59 {
60 ucs4_t wc = (s[0] << 8) + s[1];
61 if (wc >= 0xd800 && wc < 0xdc00)
62 {
63 if (n >= 4)
64 {
65 ucs4_t wc2 = (s[2] << 8) + s[3];
66 if (!(wc2 >= 0xdc00 && wc2 < 0xe000))
67 return RET_ILSEQ;
68 *pwc = 0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00);
69 return 4;
70 }
71 }
72 else if (wc >= 0xdc00 && wc < 0xe000)
73 {
74 return RET_ILSEQ;
75 }
76 else
77 {
78 *pwc = wc;
79 return 2;
80 }
81 }
82 return RET_TOOFEW;
83 }
84
85 static int
86 utf16be_wctomb (unsigned char *r, ucs4_t wc, size_t n)
87 {
88 if (!(wc >= 0xd800 && wc < 0xe000))
89 {
90 if (wc < 0x10000)
91 {
92 if (n >= 2)
93 {
94 r[0] = (unsigned char) (wc >> 8);
95 r[1] = (unsigned char) wc;
96 return 2;
97 }
98 else
99 return RET_TOOSMALL;
100 }
101 else if (wc < 0x110000)
102 {
103 if (n >= 4)
104 {
105 ucs4_t wc1 = 0xd800 + ((wc - 0x10000) >> 10);
106 ucs4_t wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
107 r[0] = (unsigned char) (wc1 >> 8);
108 r[1] = (unsigned char) wc1;
109 r[2] = (unsigned char) (wc2 >> 8);
110 r[3] = (unsigned char) wc2;
111 return 4;
112 }
113 else
114 return RET_TOOSMALL;
115 }
116 }
117 return RET_ILUNI;
118 }
119
120 /*
121 * UTF-16LE
122 */
123
124 /* Specification: RFC 2781 */
125
126 static int
127 utf16le_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
128 {
129 if (n >= 2)
130 {
131 ucs4_t wc = s[0] + (s[1] << 8);
132 if (wc >= 0xd800 && wc < 0xdc00)
133 {
134 if (n >= 4)
135 {
136 ucs4_t wc2 = s[2] + (s[3] << 8);
137 if (!(wc2 >= 0xdc00 && wc2 < 0xe000))
138 return RET_ILSEQ;
139 *pwc = 0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00);
140 return 4;
141 }
142 }
143 else if (wc >= 0xdc00 && wc < 0xe000)
144 {
145 return RET_ILSEQ;
146 }
147 else
148 {
149 *pwc = wc;
150 return 2;
151 }
152 }
153 return RET_TOOFEW;
154 }
155
156 static int
157 utf16le_wctomb (unsigned char *r, ucs4_t wc, size_t n)
158 {
159 if (!(wc >= 0xd800 && wc < 0xe000))
160 {
161 if (wc < 0x10000)
162 {
163 if (n >= 2)
164 {
165 r[0] = (unsigned char) wc;
166 r[1] = (unsigned char) (wc >> 8);
167 return 2;
168 }
169 else
170 return RET_TOOSMALL;
171 }
172 else if (wc < 0x110000)
173 {
174 if (n >= 4)
175 {
176 ucs4_t wc1 = 0xd800 + ((wc - 0x10000) >> 10);
177 ucs4_t wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
178 r[0] = (unsigned char) wc1;
179 r[1] = (unsigned char) (wc1 >> 8);
180 r[2] = (unsigned char) wc2;
181 r[3] = (unsigned char) (wc2 >> 8);
182 return 4;
183 }
184 else
185 return RET_TOOSMALL;
186 }
187 }
188 return RET_ILUNI;
189 }
190
191 /*
192 * UTF-32BE
193 */
194
195 /* Specification: Unicode 3.1 Standard Annex #19 */
196
197 static int
198 utf32be_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
199 {
200 if (n >= 4)
201 {
202 ucs4_t wc = (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + s[3];
203 if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
204 {
205 *pwc = wc;
206 return 4;
207 }
208 else
209 return RET_ILSEQ;
210 }
211 return RET_TOOFEW;
212 }
213
214 static int
215 utf32be_wctomb (unsigned char *r, ucs4_t wc, size_t n)
216 {
217 if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
218 {
219 if (n >= 4)
220 {
221 r[0] = 0;
222 r[1] = (unsigned char) (wc >> 16);
223 r[2] = (unsigned char) (wc >> 8);
224 r[3] = (unsigned char) wc;
225 return 4;
226 }
227 else
228 return RET_TOOSMALL;
229 }
230 return RET_ILUNI;
231 }
232
233 /*
234 * UTF-32LE
235 */
236
237 /* Specification: Unicode 3.1 Standard Annex #19 */
238
239 static int
240 utf32le_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
241 {
242 if (n >= 4)
243 {
244 ucs4_t wc = s[0] + (s[1] << 8) + (s[2] << 16) + (s[3] << 24);
245 if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
246 {
247 *pwc = wc;
248 return 4;
249 }
250 else
251 return RET_ILSEQ;
252 }
253 return RET_TOOFEW;
254 }
255
256 static int
257 utf32le_wctomb (unsigned char *r, ucs4_t wc, size_t n)
258 {
259 if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
260 {
261 if (n >= 4)
262 {
263 r[0] = (unsigned char) wc;
264 r[1] = (unsigned char) (wc >> 8);
265 r[2] = (unsigned char) (wc >> 16);
266 r[3] = 0;
267 return 4;
268 }
269 else
270 return RET_TOOSMALL;
271 }
272 return RET_ILUNI;
273 }
274
275 #endif
276
277 size_t
278 rpl_iconv (iconv_t cd,
279 ICONV_CONST char **inbuf, size_t *inbytesleft,
280 char **outbuf, size_t *outbytesleft)
281 #undef iconv
282 {
283 #if REPLACE_ICONV_UTF
284 switch ((uintptr_t) cd)
285 {
286 {
287 int (*xxx_wctomb) (unsigned char *, ucs4_t, size_t);
288
289 case (uintptr_t) _ICONV_UTF8_UTF16BE:
290 xxx_wctomb = utf16be_wctomb;
291 goto loop_from_utf8;
292 case (uintptr_t) _ICONV_UTF8_UTF16LE:
293 xxx_wctomb = utf16le_wctomb;
294 goto loop_from_utf8;
295 case (uintptr_t) _ICONV_UTF8_UTF32BE:
296 xxx_wctomb = utf32be_wctomb;
297 goto loop_from_utf8;
298 case (uintptr_t) _ICONV_UTF8_UTF32LE:
299 xxx_wctomb = utf32le_wctomb;
300 goto loop_from_utf8;
301
302 loop_from_utf8:
303 if (inbuf == NULL || *inbuf == NULL)
304 return 0;
305 {
306 ICONV_CONST char *inptr = *inbuf;
307 size_t inleft = *inbytesleft;
308 char *outptr = *outbuf;
309 size_t outleft = *outbytesleft;
310 size_t res = 0;
311 while (inleft > 0)
312 {
313 ucs4_t uc;
314 int m = u8_mbtoucr (&uc, (const uint8_t *) inptr, inleft);
315 if (m <= 0)
316 {
317 if (m == -1)
318 {
319 errno = EILSEQ;
320 res = (size_t)(-1);
321 break;
322 }
323 if (m == -2)
324 {
325 errno = EINVAL;
326 res = (size_t)(-1);
327 break;
328 }
329 abort ();
330 }
331 else
332 {
333 int n = xxx_wctomb ((uint8_t *) outptr, uc, outleft);
334 if (n < 0)
335 {
336 if (n == RET_ILUNI)
337 {
338 errno = EILSEQ;
339 res = (size_t)(-1);
340 break;
341 }
342 if (n == RET_TOOSMALL)
343 {
344 errno = E2BIG;
345 res = (size_t)(-1);
346 break;
347 }
348 abort ();
349 }
350 else
351 {
352 inptr += m;
353 inleft -= m;
354 outptr += n;
355 outleft -= n;
356 }
357 }
358 }
359 *inbuf = inptr;
360 *inbytesleft = inleft;
361 *outbuf = outptr;
362 *outbytesleft = outleft;
363 return res;
364 }
365 }
366
367 {
368 int (*xxx_mbtowc) (ucs4_t *, const unsigned char *, size_t);
369
370 case (uintptr_t) _ICONV_UTF16BE_UTF8:
371 xxx_mbtowc = utf16be_mbtowc;
372 goto loop_to_utf8;
373 case (uintptr_t) _ICONV_UTF16LE_UTF8:
374 xxx_mbtowc = utf16le_mbtowc;
375 goto loop_to_utf8;
376 case (uintptr_t) _ICONV_UTF32BE_UTF8:
377 xxx_mbtowc = utf32be_mbtowc;
378 goto loop_to_utf8;
379 case (uintptr_t) _ICONV_UTF32LE_UTF8:
380 xxx_mbtowc = utf32le_mbtowc;
381 goto loop_to_utf8;
382
383 loop_to_utf8:
384 if (inbuf == NULL || *inbuf == NULL)
385 return 0;
386 {
387 ICONV_CONST char *inptr = *inbuf;
388 size_t inleft = *inbytesleft;
389 char *outptr = *outbuf;
390 size_t outleft = *outbytesleft;
391 size_t res = 0;
392 while (inleft > 0)
393 {
394 ucs4_t uc;
395 int m = xxx_mbtowc (&uc, (const uint8_t *) inptr, inleft);
396 if (m <= 0)
397 {
398 if (m == RET_ILSEQ)
399 {
400 errno = EILSEQ;
401 res = (size_t)(-1);
402 break;
403 }
404 if (m == RET_TOOFEW)
405 {
406 errno = EINVAL;
407 res = (size_t)(-1);
408 break;
409 }
410 abort ();
411 }
412 else
413 {
414 int n = u8_uctomb ((uint8_t *) outptr, uc, outleft);
415 if (n < 0)
416 {
417 if (n == -1)
418 {
419 errno = EILSEQ;
420 res = (size_t)(-1);
421 break;
422 }
423 if (n == -2)
424 {
425 errno = E2BIG;
426 res = (size_t)(-1);
427 break;
428 }
429 abort ();
430 }
431 else
432 {
433 inptr += m;
434 inleft -= m;
435 outptr += n;
436 outleft -= n;
437 }
438 }
439 }
440 *inbuf = inptr;
441 *inbytesleft = inleft;
442 *outbuf = outptr;
443 *outbytesleft = outleft;
444 return res;
445 }
446 }
447 }
448 #endif
449 return iconv (cd, inbuf, inbytesleft, outbuf, outbytesleft);
450 }