Merge remote-tracking branch 'local-2.0/stable-2.0'
[bpt/guile.git] / lib / mbrtowc.c
CommitLineData
eb4a14ed
LC
/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2012 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#include <config.h>
19
20/* Specification. */
21#include <wchar.h>
22
23#if GNULIB_defined_mbstate_t
24/* Implement mbrtowc() on top of mbtowc(). */
25
26# include <errno.h>
27# include <stdlib.h>
28
29# include "localcharset.h"
30# include "streq.h"
31# include "verify.h"
32
33
34verify (sizeof (mbstate_t) >= 4);
35
36static char internal_state[4];
37
38size_t
39mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
40{
41 char *pstate = (char *)ps;
42
43 if (s == NULL)
44 {
45 pwc = NULL;
46 s = "";
47 n = 1;
48 }
49
50 if (n == 0)
51 return (size_t)(-2);
52
53 /* Here n > 0. */
54
55 if (pstate == NULL)
56 pstate = internal_state;
57
58 {
59 size_t nstate = pstate[0];
60 char buf[4];
61 const char *p;
62 size_t m;
63
64 switch (nstate)
65 {
66 case 0:
67 p = s;
68 m = n;
69 break;
70 case 3:
71 buf[2] = pstate[3];
72 /*FALLTHROUGH*/
73 case 2:
74 buf[1] = pstate[2];
75 /*FALLTHROUGH*/
76 case 1:
77 buf[0] = pstate[1];
78 p = buf;
79 m = nstate;
80 buf[m++] = s[0];
81 if (n >= 2 && m < 4)
82 {
83 buf[m++] = s[1];
84 if (n >= 3 && m < 4)
85 buf[m++] = s[2];
86 }
87 break;
88 default:
89 errno = EINVAL;
90 return (size_t)(-1);
91 }
92
93 /* Here m > 0. */
94
95# if __GLIBC__ || defined __UCLIBC__
96 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
97 mbtowc (NULL, NULL, 0);
98# endif
99 {
100 int res = mbtowc (pwc, p, m);
101
102 if (res >= 0)
103 {
104 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
105 abort ();
106 if (nstate >= (res > 0 ? res : 1))
107 abort ();
108 res -= nstate;
109 pstate[0] = 0;
110 return res;
111 }
112
113 /* mbtowc does not distinguish between invalid and incomplete multibyte
114 sequences. But mbrtowc needs to make this distinction.
115 There are two possible approaches:
116 - Use iconv() and its return value.
117 - Use built-in knowledge about the possible encodings.
118 Given the low quality of implementation of iconv() on the systems that
119 lack mbrtowc(), we use the second approach.
120 The possible encodings are:
121 - 8-bit encodings,
122 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
123 - UTF-8.
124 Use specialized code for each. */
125 if (m >= 4 || m >= MB_CUR_MAX)
126 goto invalid;
127 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
128 {
129 const char *encoding = locale_charset ();
130
131 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
132 {
133 /* Cf. unistr/u8-mblen.c. */
134 unsigned char c = (unsigned char) p[0];
135
136 if (c >= 0xc2)
137 {
138 if (c < 0xe0)
139 {
140 if (m == 1)
141 goto incomplete;
142 }
143 else if (c < 0xf0)
144 {
145 if (m == 1)
146 goto incomplete;
147 if (m == 2)
148 {
149 unsigned char c2 = (unsigned char) p[1];
150
151 if ((c2 ^ 0x80) < 0x40
152 && (c >= 0xe1 || c2 >= 0xa0)
153 && (c != 0xed || c2 < 0xa0))
154 goto incomplete;
155 }
156 }
157 else if (c <= 0xf4)
158 {
159 if (m == 1)
160 goto incomplete;
161 else /* m == 2 || m == 3 */
162 {
163 unsigned char c2 = (unsigned char) p[1];
164
165 if ((c2 ^ 0x80) < 0x40
166 && (c >= 0xf1 || c2 >= 0x90)
167 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
168 {
169 if (m == 2)
170 goto incomplete;
171 else /* m == 3 */
172 {
173 unsigned char c3 = (unsigned char) p[2];
174
175 if ((c3 ^ 0x80) < 0x40)
176 goto incomplete;
177 }
178 }
179 }
180 }
181 }
182 goto invalid;
183 }
184
185 /* As a reference for this code, you can use the GNU libiconv
186 implementation. Look for uses of the RET_TOOFEW macro. */
187
188 if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
189 {
190 if (m == 1)
191 {
192 unsigned char c = (unsigned char) p[0];
193
194 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
195 goto incomplete;
196 }
197 if (m == 2)
198 {
199 unsigned char c = (unsigned char) p[0];
200
201 if (c == 0x8f)
202 {
203 unsigned char c2 = (unsigned char) p[1];
204
205 if (c2 >= 0xa1 && c2 < 0xff)
206 goto incomplete;
207 }
208 }
209 goto invalid;
210 }
211 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
212 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
213 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
214 {
215 if (m == 1)
216 {
217 unsigned char c = (unsigned char) p[0];
218
219 if (c >= 0xa1 && c < 0xff)
220 goto incomplete;
221 }
222 goto invalid;
223 }
224 if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
225 {
226 if (m == 1)
227 {
228 unsigned char c = (unsigned char) p[0];
229
230 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
231 goto incomplete;
232 }
233 else /* m == 2 || m == 3 */
234 {
235 unsigned char c = (unsigned char) p[0];
236
237 if (c == 0x8e)
238 goto incomplete;
239 }
240 goto invalid;
241 }
242 if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
243 {
244 if (m == 1)
245 {
246 unsigned char c = (unsigned char) p[0];
247
248 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
249 goto incomplete;
250 }
251 else /* m == 2 || m == 3 */
252 {
253 unsigned char c = (unsigned char) p[0];
254
255 if (c >= 0x90 && c <= 0xe3)
256 {
257 unsigned char c2 = (unsigned char) p[1];
258
259 if (c2 >= 0x30 && c2 <= 0x39)
260 {
261 if (m == 2)
262 goto incomplete;
263 else /* m == 3 */
264 {
265 unsigned char c3 = (unsigned char) p[2];
266
267 if (c3 >= 0x81 && c3 <= 0xfe)
268 goto incomplete;
269 }
270 }
271 }
272 }
273 goto invalid;
274 }
275 if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
276 {
277 if (m == 1)
278 {
279 unsigned char c = (unsigned char) p[0];
280
281 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
282 || (c >= 0xf0 && c <= 0xf9))
283 goto incomplete;
284 }
285 goto invalid;
286 }
287
288 /* An unknown multibyte encoding. */
289 goto incomplete;
290 }
291
292 incomplete:
293 {
294 size_t k = nstate;
295 /* Here 0 <= k < m < 4. */
296 pstate[++k] = s[0];
297 if (k < m)
298 {
299 pstate[++k] = s[1];
300 if (k < m)
301 pstate[++k] = s[2];
302 }
303 if (k != m)
304 abort ();
305 }
306 pstate[0] = m;
307 return (size_t)(-2);
308
309 invalid:
310 errno = EILSEQ;
311 /* The conversion state is undefined, says POSIX. */
312 return (size_t)(-1);
313 }
314 }
315}
316
317#else
318/* Override the system's mbrtowc() function. */
319
320# undef mbrtowc
321
/* Wrapper around the system's mbrtowc() that works around the defects
   indicated by the MBRTOWC_*_BUG configuration macros.

   PWC  where to store the wide character; may be NULL.
   S    the input bytes; NULL is equivalent to the call
        mbrtowc (NULL, "", 1, ps).
   N    the number of bytes available at S.
   PS   the conversion state; may be NULL.

   Returns what mbrtowc() is specified to return: the number of bytes that
   complete the character (0 for the null wide character), (size_t)(-2) for
   an incomplete character, or (size_t)(-1) on error.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_NUL_RETVAL_BUG
  /* Normalize a null S here rather than relying on the system function.
     The MBRTOWC_NUL_RETVAL_BUG code below needs this too: with a null S the
     system function ignores its first argument, so the local 'wc' would be
     read uninitialized and copied to the caller's PWC.  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

  /* Zero input bytes can never complete a character.  (When S is null, N is
     ignored, so that case must still reach the system function.)  */
  if (s != NULL && n == 0)
    return (size_t)(-2);

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            wchar_t wc;
            size_t ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                if (pwc != NULL)
                  *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    /* Decode into a local so that the result can be inspected even when the
       caller passed PWC == NULL, and force the return value to 0 for the
       null wide character.  */
    wchar_t wc;
    size_t ret = mbrtowc (&wc, s, n, ps);

    if (ret != (size_t)(-1) && ret != (size_t)(-2))
      {
        if (pwc != NULL)
          *pwc = wc;
        if (wc == 0)
          ret = 0;
      }
    return ret;
  }
# else
  {
#  if MBRTOWC_NULL_ARG1_BUG
    /* The system function cannot cope with PWC == NULL; give it a scratch
       location to store into.  */
    wchar_t dummy;

    if (pwc == NULL)
      pwc = &dummy;
#  endif

    return mbrtowc (pwc, s, n, ps);
  }
# endif
}
395
396#endif