Commit | Line | Data |
---|---|---|
7f8e40b7 | 1 | /* Convert multibyte character to wide character. |
61cd9dc9 | 2 | Copyright (C) 1999-2002, 2005-2010 Free Software Foundation, Inc. |
7f8e40b7 NJ |
3 | Written by Bruno Haible <bruno@clisp.org>, 2008. |
4 | ||
5 | This program is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU Lesser General Public License as published by | |
7 | the Free Software Foundation; either version 3 of the License, or | |
8 | (at your option) any later version. | |
9 | ||
10 | This program is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public License | |
16 | along with this program. If not, see <http://www.gnu.org/licenses/>. */ | |
17 | ||
18 | #include <config.h> | |
19 | ||
20 | /* Specification. */ | |
21 | #include <wchar.h> | |
22 | ||
23 | #if GNULIB_defined_mbstate_t | |
24 | /* Implement mbrtowc() on top of mbtowc(). */ | |
25 | ||
26 | # include <errno.h> | |
27 | # include <stdlib.h> | |
28 | ||
29 | # include "localcharset.h" | |
30 | # include "streq.h" | |
31 | # include "verify.h" | |
32 | ||
33 | ||
/* Build-time assertion: mbrtowc() below reinterprets an mbstate_t as an array
   of char and uses its first 4 bytes (one count byte plus up to 3 buffered
   input bytes), so the type must be at least that large.  */
34 | verify (sizeof (mbstate_t) >= 4); | |
35 | ||
/* Conversion state used by mbrtowc() below when the caller passes a null
   state pointer.  Same layout as the caller-supplied state: byte 0 is the
   number of buffered bytes, bytes 1..3 are the buffered bytes.  */
36 | static char internal_state[4]; | |
37 | ||
/* Convert the multibyte character at S (at most N bytes) to a wide character,
   implemented on top of mbtowc().

   PWC  where to store the wide character; may be NULL.
   S    input bytes; NULL is handled like the call mbrtowc (NULL, "", 1, PS).
   N    number of bytes available at S.
   PS   conversion state; NULL selects the file-level internal_state.

   The state is accessed as an array of char: byte 0 is the number of input
   bytes buffered by earlier calls (0..3), bytes 1..3 are those bytes.

   Returns
     - (size_t)(-2) if N is 0, or if the bytes seen so far are judged to be
       an incomplete character (the bytes are then saved in the state);
     - (size_t)(-1) with errno = EINVAL if the state's count byte is not
       in 0..3;
     - (size_t)(-1) with errno = EILSEQ if the bytes are judged invalid;
     - otherwise the result of mbtowc() minus the number of bytes that had
       been buffered by earlier calls, after resetting the count byte to 0.  */
38 | size_t | |
39 | mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) | |
40 | { | |
41 | char *pstate = (char *)ps; | |
42 | ||
43 | if (pstate == NULL) | |
44 | pstate = internal_state; | |
45 | ||
/* A null S is processed as the one-byte input "" with no output location.  */
46 | if (s == NULL) | |
47 | { | |
48 | pwc = NULL; | |
49 | s = ""; | |
50 | n = 1; | |
51 | } | |
52 | ||
53 | if (n == 0) | |
54 | return (size_t)(-2); | |
55 | ||
56 | /* Here n > 0. */ | |
57 | { | |
58 | size_t nstate = pstate[0]; | |
59 | char buf[4]; | |
60 | const char *p; | |
61 | size_t m; | |
62 | ||
/* Choose the bytes to hand to mbtowc(): P points at them, M is their count.
   With an empty state that is S itself.  Otherwise the buffered bytes are
   copied into buf, followed by up to 3 new bytes from S (bounded by N and by
   the 4-byte size of buf).  */
63 | switch (nstate) | |
64 | { | |
65 | case 0: | |
1cd4fffc LC |
66 | p = s; |
67 | m = n; | |
68 | break; | |
7f8e40b7 | 69 | case 3: |
1cd4fffc LC |
70 | buf[2] = pstate[3]; |
71 | /*FALLTHROUGH*/ | |
7f8e40b7 | 72 | case 2: |
1cd4fffc LC |
73 | buf[1] = pstate[2]; |
74 | /*FALLTHROUGH*/ | |
7f8e40b7 | 75 | case 1: |
1cd4fffc LC |
76 | buf[0] = pstate[1]; |
77 | p = buf; | |
78 | m = nstate; | |
79 | buf[m++] = s[0]; | |
80 | if (n >= 2 && m < 4) | |
81 | { | |
82 | buf[m++] = s[1]; | |
83 | if (n >= 3 && m < 4) | |
84 | buf[m++] = s[2]; | |
85 | } | |
86 | break; | |
7f8e40b7 | 87 | default: |
1cd4fffc LC |
88 | errno = EINVAL; |
89 | return (size_t)(-1); | |
7f8e40b7 NJ |
90 | } |
91 | ||
f240aacb | 92 | /* Here m > 0. */ |
7f8e40b7 NJ |
93 | |
94 | # if __GLIBC__ | |
95 | /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */ | |
96 | mbtowc (NULL, NULL, 0); | |
97 | # endif | |
98 | { | |
99 | int res = mbtowc (pwc, p, m); | |
100 | ||
101 | if (res >= 0) | |
1cd4fffc LC |
102 | { |
/* Sanity checks on the mbtowc() result: it must be 0 exactly when the stored
   wide character is null, and the character must extend past the bytes that
   were already buffered.  Either violation aborts the program.  */
103 | if (pwc != NULL && ((*pwc == 0) != (res == 0))) | |
104 | abort (); | |
105 | if (nstate >= (res > 0 ? res : 1)) | |
106 | abort (); | |
/* Report only the bytes taken from S in this call, and mark the state as
   holding no buffered bytes.  */
107 | res -= nstate; | |
108 | pstate[0] = 0; | |
109 | return res; | |
110 | } | |
7f8e40b7 NJ |
111 | |
112 | /* mbtowc does not distinguish between invalid and incomplete multibyte | |
1cd4fffc LC |
113 | sequences. But mbrtowc needs to make this distinction. |
114 | There are two possible approaches: | |
115 | - Use iconv() and its return value. | |
116 | - Use built-in knowledge about the possible encodings. | |
117 | Given the low quality of implementation of iconv() on the systems that | |
118 | lack mbrtowc(), we use the second approach. | |
119 | The possible encodings are: | |
120 | - 8-bit encodings, | |
121 | - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, | |
122 | - UTF-8. | |
123 | Use specialized code for each. */ | |
7f8e40b7 | 124 | if (m >= 4 || m >= MB_CUR_MAX) |
1cd4fffc | 125 | goto invalid; |
7f8e40b7 NJ |
126 | /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ |
127 | { | |
1cd4fffc LC |
128 | const char *encoding = locale_charset (); |
129 | ||
130 | if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) | |
131 | { | |
132 | /* Cf. unistr/u8-mblen.c. */ | |
133 | unsigned char c = (unsigned char) p[0]; | |
134 | ||
135 | if (c >= 0xc2) | |
136 | { | |
137 | if (c < 0xe0) | |
138 | { | |
139 | if (m == 1) | |
140 | goto incomplete; | |
141 | } | |
142 | else if (c < 0xf0) | |
143 | { | |
144 | if (m == 1) | |
145 | goto incomplete; | |
146 | if (m == 2) | |
147 | { | |
148 | unsigned char c2 = (unsigned char) p[1]; | |
149 | ||
150 | if ((c2 ^ 0x80) < 0x40 | |
151 | && (c >= 0xe1 || c2 >= 0xa0) | |
152 | && (c != 0xed || c2 < 0xa0)) | |
153 | goto incomplete; | |
154 | } | |
155 | } | |
156 | else if (c <= 0xf4) | |
157 | { | |
158 | if (m == 1) | |
159 | goto incomplete; | |
160 | else /* m == 2 || m == 3 */ | |
161 | { | |
162 | unsigned char c2 = (unsigned char) p[1]; | |
163 | ||
164 | if ((c2 ^ 0x80) < 0x40 | |
165 | && (c >= 0xf1 || c2 >= 0x90) | |
166 | && (c < 0xf4 || (c == 0xf4 && c2 < 0x90))) | |
167 | { | |
168 | if (m == 2) | |
169 | goto incomplete; | |
170 | else /* m == 3 */ | |
171 | { | |
172 | unsigned char c3 = (unsigned char) p[2]; | |
173 | ||
174 | if ((c3 ^ 0x80) < 0x40) | |
175 | goto incomplete; | |
176 | } | |
177 | } | |
178 | } | |
179 | } | |
180 | } | |
181 | goto invalid; | |
182 | } | |
183 | ||
184 | /* As a reference for this code, you can use the GNU libiconv | |
185 | implementation. Look for uses of the RET_TOOFEW macro. */ | |
186 | ||
187 | if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) | |
188 | { | |
189 | if (m == 1) | |
190 | { | |
191 | unsigned char c = (unsigned char) p[0]; | |
192 | ||
193 | if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) | |
194 | goto incomplete; | |
195 | } | |
196 | if (m == 2) | |
197 | { | |
198 | unsigned char c = (unsigned char) p[0]; | |
199 | ||
200 | if (c == 0x8f) | |
201 | { | |
202 | unsigned char c2 = (unsigned char) p[1]; | |
203 | ||
204 | if (c2 >= 0xa1 && c2 < 0xff) | |
205 | goto incomplete; | |
206 | } | |
207 | } | |
208 | goto invalid; | |
209 | } | |
210 | if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) | |
211 | || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) | |
212 | || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) | |
213 | { | |
214 | if (m == 1) | |
215 | { | |
216 | unsigned char c = (unsigned char) p[0]; | |
217 | ||
218 | if (c >= 0xa1 && c < 0xff) | |
219 | goto incomplete; | |
220 | } | |
221 | goto invalid; | |
222 | } | |
223 | if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) | |
224 | { | |
225 | if (m == 1) | |
226 | { | |
227 | unsigned char c = (unsigned char) p[0]; | |
228 | ||
229 | if ((c >= 0xa1 && c < 0xff) || c == 0x8e) | |
230 | goto incomplete; | |
231 | } | |
232 | else /* m == 2 || m == 3 */ | |
233 | { | |
234 | unsigned char c = (unsigned char) p[0]; | |
235 | ||
236 | if (c == 0x8e) | |
237 | goto incomplete; | |
238 | } | |
239 | goto invalid; | |
240 | } | |
241 | if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) | |
242 | { | |
243 | if (m == 1) | |
244 | { | |
245 | unsigned char c = (unsigned char) p[0]; | |
246 | ||
247 | if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) | |
248 | goto incomplete; | |
249 | } | |
250 | else /* m == 2 || m == 3 */ | |
251 | { | |
252 | unsigned char c = (unsigned char) p[0]; | |
253 | ||
254 | if (c >= 0x90 && c <= 0xe3) | |
255 | { | |
256 | unsigned char c2 = (unsigned char) p[1]; | |
257 | ||
258 | if (c2 >= 0x30 && c2 <= 0x39) | |
259 | { | |
260 | if (m == 2) | |
261 | goto incomplete; | |
262 | else /* m == 3 */ | |
263 | { | |
264 | unsigned char c3 = (unsigned char) p[2]; | |
265 | ||
266 | if (c3 >= 0x81 && c3 <= 0xfe) | |
267 | goto incomplete; | |
268 | } | |
269 | } | |
270 | } | |
271 | } | |
272 | goto invalid; | |
273 | } | |
274 | if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) | |
275 | { | |
276 | if (m == 1) | |
277 | { | |
278 | unsigned char c = (unsigned char) p[0]; | |
279 | ||
280 | if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) | |
281 | || (c >= 0xf0 && c <= 0xf9)) | |
282 | goto incomplete; | |
283 | } | |
284 | goto invalid; | |
285 | } | |
286 | ||
287 | /* An unknown multibyte encoding. */ | |
288 | goto incomplete; | |
7f8e40b7 NJ |
289 | } |
290 | ||
/* Incomplete character: append the bytes consumed from S in this call after
   the bytes already buffered, then record the new total M in the count byte
   so that the next call can resume from them.  */
291 | incomplete: | |
292 | { | |
1cd4fffc LC |
293 | size_t k = nstate; |
294 | /* Here 0 <= k < m < 4. */ | |
295 | pstate[++k] = s[0]; | |
296 | if (k < m) | |
297 | { | |
298 | pstate[++k] = s[1]; | |
299 | if (k < m) | |
300 | pstate[++k] = s[2]; | |
301 | } | |
302 | if (k != m) | |
303 | abort (); | |
7f8e40b7 NJ |
304 | } |
305 | pstate[0] = m; | |
306 | return (size_t)(-2); | |
307 | ||
308 | invalid: | |
309 | errno = EILSEQ; | |
310 | /* The conversion state is undefined, says POSIX. */ | |
311 | return (size_t)(-1); | |
312 | } | |
313 | } | |
314 | } | |
315 | ||
316 | #else | |
317 | /* Override the system's mbrtowc() function. */ | |
318 | ||
319 | # undef mbrtowc | |
320 | ||
/* Wrapper around the system's mbrtowc() that compensates for known defects.

   PWC  where to store the wide character; may be NULL.
   S    input bytes; may be NULL.
   N    number of bytes available at S.
   PS   conversion state; may be NULL.

   Returns what mbrtowc() is specified to return: 0 for the null wide
   character, the number of bytes consumed for any other complete character,
   (size_t)(-2) for an incomplete character, and (size_t)(-1) on error as
   reported by the system function.

   The workarounds compiled in are selected by the build-time macros
   MBRTOWC_NULL_ARG_BUG, MBRTOWC_RETVAL_BUG and MBRTOWC_NUL_RETVAL_BUG.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
# if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
  /* Handle a null S here rather than relying on the system function:
     process it as the one-byte input "" with no output location.  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

  /* Zero bytes of input can never complete a character, so the answer is
     always "incomplete".  Some system implementations return 0 here instead
     (e.g. glibc bug 16950), so do not delegate this case.  A null S is
     excluded because N is ignored for it.  The state is left untouched.  */
  if (s != NULL && n == 0)
    return (size_t)(-2);

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We can not call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* A character is in progress: parse the rest of it byte for byte,
           counting the bytes ourselves instead of trusting the system
           function's return value.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            wchar_t wc;
            size_t ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                if (pwc != NULL)
                  *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All N bytes were consumed and the character is still incomplete.  */
        return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    /* Convert into a local so the result can be inspected even when PWC is
       NULL, and force the return value to 0 for the null wide character.  */
    wchar_t wc;
    size_t ret = mbrtowc (&wc, s, n, ps);

    if (ret != (size_t)(-1) && ret != (size_t)(-2))
      {
        if (pwc != NULL)
          *pwc = wc;
        if (wc == 0)
          ret = 0;
      }
    return ret;
  }
# else
  return mbrtowc (pwc, s, n, ps);
# endif
}
385 | ||
386 | #endif |