Commit | Line | Data |
---|---|---|
8d138742 CE |
1 | |
2 | /* | |
3 | ** Copyright 2000-2002 Double Precision, Inc. | |
4 | ** See COPYING for distribution information. | |
5 | ** | |
6 | ** $Id: utf8.c,v 1.4 2002/11/18 00:54:22 mrsam Exp $ | |
7 | */ | |
8 | ||
9 | #include "unicode.h" | |
10 | #include <stdio.h> | |
11 | #include <stdlib.h> | |
12 | #include <string.h> | |
13 | ||
14 | unicode_char *unicode_utf8_tou(const char *cp, int *ip) | |
15 | { | |
16 | size_t l; | |
17 | size_t n=1; | |
18 | unicode_char *p, uc; | |
19 | ||
20 | for (l=0; cp[l]; ++n) | |
21 | { | |
22 | if ((cp[l] & 0x80) == 0) | |
23 | { | |
24 | ++l; | |
25 | continue; | |
26 | } | |
27 | ||
28 | if ((cp[l] & 0xE0) == 0xC0) | |
29 | { | |
30 | if ((cp[l+1] & 0xC0) == 0x80) | |
31 | { | |
32 | l += 2; | |
33 | continue; | |
34 | } | |
35 | } | |
36 | ||
37 | if ((cp[l] & 0xF0) == 0xE0) | |
38 | { | |
39 | if ((cp[l+1] & 0xC0) == 0x80 && | |
40 | (cp[l+2] & 0xC0) == 0x80) | |
41 | { | |
42 | l += 3; | |
43 | continue; | |
44 | } | |
45 | } | |
46 | ||
47 | if ((cp[l] & 0xF8) == 0xF0) | |
48 | { | |
49 | if ((cp[l+1] & 0xC0) == 0x80 && | |
50 | (cp[l+2] & 0xC0) == 0x80 && | |
51 | (cp[l+3] & 0xC0) == 0x80) | |
52 | { | |
53 | l += 4; | |
54 | continue; | |
55 | } | |
56 | } | |
57 | ||
58 | if ((cp[l] & 0xFC) == 0xF8) | |
59 | { | |
60 | if ((cp[l+1] & 0xC0) == 0x80 && | |
61 | (cp[l+2] & 0xC0) == 0x80 && | |
62 | (cp[l+3] & 0xC0) == 0x80 && | |
63 | (cp[l+4] & 0xC0) == 0x80) | |
64 | { | |
65 | l += 5; | |
66 | continue; | |
67 | } | |
68 | } | |
69 | ||
70 | if ((cp[l] & 0xFE) == 0xFC) | |
71 | { | |
72 | if ((cp[l+1] & 0xC0) == 0x80 && | |
73 | (cp[l+2] & 0xC0) == 0x80 && | |
74 | (cp[l+3] & 0xC0) == 0x80 && | |
75 | (cp[l+4] & 0xC0) == 0x80 && | |
76 | (cp[l+5] & 0xC0) == 0x80) | |
77 | { | |
78 | l += 6; | |
79 | continue; | |
80 | } | |
81 | } | |
82 | ||
83 | if (ip) | |
84 | { | |
85 | *ip= l; | |
86 | return (0); | |
87 | } | |
88 | ++l; | |
89 | } | |
90 | if (ip) | |
91 | *ip = -1; | |
92 | if ((p=malloc(n*sizeof(unicode_char))) == 0) | |
93 | return (0); | |
94 | n=0; | |
95 | ||
96 | for (l=0; cp[l]; p[n++]=uc) | |
97 | { | |
98 | if ((cp[l] & 0x80) == 0) | |
99 | { | |
100 | uc=cp[l]; | |
101 | ++l; | |
102 | continue; | |
103 | } | |
104 | ||
105 | if ((cp[l] & 0xE0) == 0xC0) | |
106 | { | |
107 | if ((cp[l+1] & 0xC0) == 0x80) | |
108 | { | |
109 | uc=cp[l] & 0x1F; | |
110 | uc <<= 6; uc |= cp[l+1] & 0x3F; | |
111 | l += 2; | |
112 | continue; | |
113 | } | |
114 | } | |
115 | ||
116 | if ((cp[l] & 0xF0) == 0xE0) | |
117 | { | |
118 | if ((cp[l+1] & 0xC0) == 0x80 && | |
119 | (cp[l+2] & 0xC0) == 0x80) | |
120 | { | |
121 | uc=cp[l] & 0x0F; | |
122 | uc <<= 6; uc |= cp[l+1] & 0x3F; | |
123 | uc <<= 6; uc |= cp[l+2] & 0x3F; | |
124 | l += 3; | |
125 | continue; | |
126 | } | |
127 | } | |
128 | ||
129 | if ((cp[l] & 0xF8) == 0xF0) | |
130 | { | |
131 | if ((cp[l+1] & 0xC0) == 0x80 && | |
132 | (cp[l+2] & 0xC0) == 0x80 && | |
133 | (cp[l+3] & 0xC0) == 0x80) | |
134 | { | |
135 | uc=cp[l] & 0x07; | |
136 | uc <<= 6; uc |= cp[l+1] & 0x3F; | |
137 | uc <<= 6; uc |= cp[l+2] & 0x3F; | |
138 | uc <<= 6; uc |= cp[l+3] & 0x3F; | |
139 | l += 4; | |
140 | continue; | |
141 | } | |
142 | } | |
143 | ||
144 | if ((cp[l] & 0xFC) == 0xF8) | |
145 | { | |
146 | if ((cp[l+1] & 0xC0) == 0x80 && | |
147 | (cp[l+2] & 0xC0) == 0x80 && | |
148 | (cp[l+3] & 0xC0) == 0x80 && | |
149 | (cp[l+4] & 0xC0) == 0x80) | |
150 | { | |
151 | uc=cp[l] & 0x03; | |
152 | uc <<= 6; uc |= cp[l+1] & 0x3F; | |
153 | uc <<= 6; uc |= cp[l+2] & 0x3F; | |
154 | uc <<= 6; uc |= cp[l+3] & 0x3F; | |
155 | uc <<= 6; uc |= cp[l+4] & 0x3F; | |
156 | l += 5; | |
157 | continue; | |
158 | } | |
159 | } | |
160 | ||
161 | if ((cp[l] & 0xFE) == 0xFC) | |
162 | { | |
163 | if ((cp[l+1] & 0xC0) == 0x80 && | |
164 | (cp[l+2] & 0xC0) == 0x80 && | |
165 | (cp[l+3] & 0xC0) == 0x80 && | |
166 | (cp[l+4] & 0xC0) == 0x80 && | |
167 | (cp[l+5] & 0xC0) == 0x80) | |
168 | { | |
169 | uc=cp[l] & 0x01; | |
170 | uc <<= 6; uc |= cp[l+1] & 0x3F; | |
171 | uc <<= 6; uc |= cp[l+2] & 0x3F; | |
172 | uc <<= 6; uc |= cp[l+3] & 0x3F; | |
173 | uc <<= 6; uc |= cp[l+4] & 0x3F; | |
174 | uc <<= 6; uc |= cp[l+5] & 0x3F; | |
175 | l += 6; | |
176 | continue; | |
177 | } | |
178 | } | |
179 | uc=cp[l]; | |
180 | ++l; | |
181 | } | |
182 | p[n]=0; | |
183 | return (p); | |
184 | } | |
185 | ||
186 | char *unicode_utf8_fromu(const unicode_char *cp, int *ip) | |
187 | { | |
188 | char *p=0; | |
189 | int pass; | |
190 | size_t l=0; | |
191 | ||
192 | for (pass=0; pass<2; pass++) | |
193 | { | |
194 | if (pass) | |
195 | { | |
196 | p=malloc(l+1); | |
197 | if (!p) | |
198 | { | |
199 | if (ip) *ip= -1; | |
200 | return (0); | |
201 | } | |
202 | } | |
203 | ||
204 | l=unicode_utf8_fromu_pass(cp, p); | |
205 | if (pass) | |
206 | p[l]=0; | |
207 | } | |
208 | return (p); | |
209 | } | |
210 | ||
211 | ||
212 | size_t unicode_utf8_fromu_pass(const unicode_char *cp, char *p) | |
213 | { | |
214 | size_t l=0; | |
215 | unicode_char uc; | |
216 | ||
217 | l=0; | |
218 | ||
219 | while (cp && *cp) | |
220 | { | |
221 | uc= *cp++; | |
222 | ||
223 | if ((unicode_char)uc == | |
224 | (unicode_char)(uc & 0x007F)) | |
225 | { | |
226 | if (p) | |
227 | { | |
228 | p[l]= (char)uc; | |
229 | } | |
230 | ++l; | |
231 | continue; | |
232 | } | |
233 | ||
234 | if ((unicode_char)uc == | |
235 | (unicode_char)(uc & 0x07FF)) | |
236 | { | |
237 | if (p) | |
238 | { | |
239 | p[l+1]=(char)(uc & 0x3F) | 0x80; | |
240 | uc >>= 6; | |
241 | p[l]= (char)(uc & 0x1F) | 0xC0; | |
242 | } | |
243 | l += 2; | |
244 | continue; | |
245 | } | |
246 | ||
247 | if ((unicode_char)uc == | |
248 | (unicode_char)(uc & 0x00FFFF)) | |
249 | { | |
250 | if (p) | |
251 | { | |
252 | p[l+2]=(char)(uc & 0x3F) | 0x80; | |
253 | uc >>= 6; | |
254 | p[l+1]=(char)(uc & 0x3F) | 0x80; | |
255 | uc >>= 6; | |
256 | p[l]= (char)(uc & 0x0F) | 0xE0; | |
257 | } | |
258 | l += 3; | |
259 | continue; | |
260 | } | |
261 | ||
262 | if ((unicode_char)uc == | |
263 | (unicode_char)(uc & 0x001FFFFF)) | |
264 | { | |
265 | if (p) | |
266 | { | |
267 | p[l+3]=(char)(uc & 0x3F) | 0x80; | |
268 | uc >>= 6; | |
269 | p[l+2]=(char)(uc & 0x3F) | 0x80; | |
270 | uc >>= 6; | |
271 | p[l+1]=(char)(uc & 0x3F) | 0x80; | |
272 | uc >>= 6; | |
273 | p[l]= (char)(uc & 0x07) | 0xF0; | |
274 | } | |
275 | l += 4; | |
276 | continue; | |
277 | } | |
278 | ||
279 | if ((unicode_char)uc == | |
280 | (unicode_char)(uc & 0x03FFFFFF)) | |
281 | { | |
282 | if (p) | |
283 | { | |
284 | p[l+4]=(char)(uc & 0x3F) | 0x80; | |
285 | uc >>= 6; | |
286 | p[l+3]=(char)(uc & 0x3F) | 0x80; | |
287 | uc >>= 6; | |
288 | p[l+2]=(char)(uc & 0x3F) | 0x80; | |
289 | uc >>= 6; | |
290 | p[l+1]=(char)(uc & 0x3F) | 0x80; | |
291 | uc >>= 6; | |
292 | p[l]= (char)(uc & 0x03) | 0xF8; | |
293 | } | |
294 | l += 5; | |
295 | continue; | |
296 | } | |
297 | ||
298 | if (p) | |
299 | { | |
300 | p[l+5]=(char)(uc & 0x3F) | 0x80; | |
301 | uc >>= 6; | |
302 | p[l+4]=(char)(uc & 0x3F) | 0x80; | |
303 | uc >>= 6; | |
304 | p[l+3]=(char)(uc & 0x3F) | 0x80; | |
305 | uc >>= 6; | |
306 | p[l+2]=(char)(uc & 0x3F) | 0x80; | |
307 | uc >>= 6; | |
308 | p[l+1]=(char)(uc & 0x3F) | 0x80; | |
309 | uc >>= 6; | |
310 | p[l]= (char)(uc & 0x01) | 0xFC; | |
311 | } | |
312 | l += 6; | |
313 | } | |
314 | return l; | |
315 | } |