Imported Upstream version 0.63.0
[hcoop/debian/courier-authlib.git] / unicode / gb2312.c
CommitLineData
8d138742
CE
1/*
2** Copyright 2000-2002 Double Precision, Inc.
3** See COPYING for distribution information.
4**
5** $Id: gb2312.c,v 1.13 2004/05/23 14:28:24 mrsam Exp $
6*/
7
8#include "gb2312.h"
9#include <stdio.h>
10#include <stdlib.h>
11#include <string.h>
12
13static const unicode_char * const gb2312[]= {
14 gb2312_a1,
15 gb2312_a2,
16 gb2312_a3,
17 gb2312_a4,
18 gb2312_a5,
19 gb2312_a6,
20 gb2312_a7,
21 gb2312_a8,
22 gb2312_a9,
23 NULL,
24 NULL,
25 NULL,
26 NULL,
27 NULL,
28 NULL,
29 gb2312_b0,
30 gb2312_b1,
31 gb2312_b2,
32 gb2312_b3,
33 gb2312_b4,
34 gb2312_b5,
35 gb2312_b6,
36 gb2312_b7,
37 gb2312_b8,
38 gb2312_b9,
39 gb2312_ba,
40 gb2312_bb,
41 gb2312_bc,
42 gb2312_bd,
43 gb2312_be,
44 gb2312_bf,
45 gb2312_c0,
46 gb2312_c1,
47 gb2312_c2,
48 gb2312_c3,
49 gb2312_c4,
50 gb2312_c5,
51 gb2312_c6,
52 gb2312_c7,
53 gb2312_c8,
54 gb2312_c9,
55 gb2312_ca,
56 gb2312_cb,
57 gb2312_cc,
58 gb2312_cd,
59 gb2312_ce,
60 gb2312_cf,
61 gb2312_d0,
62 gb2312_d1,
63 gb2312_d2,
64 gb2312_d3,
65 gb2312_d4,
66 gb2312_d5,
67 gb2312_d6,
68 gb2312_d7,
69 gb2312_d8,
70 gb2312_d9,
71 gb2312_da,
72 gb2312_db,
73 gb2312_dc,
74 gb2312_dd,
75 gb2312_de,
76 gb2312_df,
77 gb2312_e0,
78 gb2312_e1,
79 gb2312_e2,
80 gb2312_e3,
81 gb2312_e4,
82 gb2312_e5,
83 gb2312_e6,
84 gb2312_e7,
85 gb2312_e8,
86 gb2312_e9,
87 gb2312_ea,
88 gb2312_eb,
89 gb2312_ec,
90 gb2312_ed,
91 gb2312_ee,
92 gb2312_ef,
93 gb2312_f0,
94 gb2312_f1,
95 gb2312_f2,
96 gb2312_f3,
97 gb2312_f4,
98 gb2312_f5,
99 gb2312_f6,
100 gb2312_f7,
101 NULL,
102 NULL,
103 NULL,
104 NULL,
105 NULL,
106 NULL,
107 NULL};
108
109static unicode_char *c2u(const struct unicode_info *u,
110 const char *cp, int *err)
111{
112 size_t i, cnt;
113 unicode_char *uc;
114
115 if (err)
116 *err= -1;
117
118 /*
119 ** Count the number of potential unicode characters first.
120 */
121
122 for (i=cnt=0; cp[i]; i++)
123 {
124 if ( (int)(unsigned char)cp[i] < 0xA1 ||
125 (int)(unsigned char)cp[i] > 0xFE ||
126 cp[i+1] == 0)
127 {
128 ++cnt;
129 continue;
130 }
131
132 ++i;
133 ++cnt;
134 }
135
136 uc=malloc((cnt+1)*sizeof(unicode_char));
137 if (!uc)
138 return (NULL);
139
140 i=cnt=0;
141 while (cp[i])
142 {
143 int a=(int)(unsigned char)cp[i], b;
144
145 if ( a >= 0xA1 && a <= 0xFE && cp[i+1])
146 {
147 unicode_char ucv;
148 b=(int)(unsigned char)cp[i+1];
149
150 if (0xA1 <= b && b <= 0xFE
151 && gb2312[a-0xA1]
152 && (ucv=gb2312[a-0xA1][b-0xA1]))
153 uc[cnt++]= ucv;
154 else if (err)
155 {
156 *err = i;
157 free(uc);
158 return NULL;
159 }
160 else
161 uc[cnt++] = (unicode_char)0xFFFD;
162 i += 2;
163 }
164 else if (a < (unsigned)0x80)
165 {
166 uc[cnt++]=a;
167 i += 1;
168 }
169 else if (err)
170 {
171 *err=i;
172 free(uc);
173 return (NULL);
174 }
175 else
176 {
177 uc[cnt++]= 0xFFFD;
178 i += 1;
179 }
180 }
181 uc[cnt]=0;
182
183 return (uc);
184}
185
186static unsigned revlookup(unicode_char c)
187{
188 unsigned j;
189 unsigned bucket;
190 unsigned uc;
191
192 bucket=c % gb2312_revhash_size;
193 uc=0;
194
195 for (j=gb2312_revtable_index[ bucket ];
196 j < sizeof(gb2312_revtable_uc)/sizeof(gb2312_revtable_uc[0]);
197 ++j)
198 {
199 unicode_char uuc=gb2312_revtable_uc[j];
200
201 if (uuc == c)
202 return (gb2312_revtable_octets[j]);
203
204 if ((uuc % gb2312_revhash_size) != bucket)
205 break;
206 }
207 return (0);
208}
209
210static char *u2c(const struct unicode_info *u,
211 const unicode_char *cp, int *err)
212{
213 size_t cnt, i;
214 char *s;
215
216 if (err)
217 *err= -1;
218 /*
219 ** Figure out the size of the octet string. Unicodes < 0x7f will
220 ** map to a single byte, unicodes >= 0x80 will map to two bytes.
221 */
222
223 for (i=cnt=0; cp[i]; i++)
224 {
225 if (cp[i] > 0x7f)
226 ++cnt;
227 ++cnt;
228 }
229
230 s=malloc(cnt+1);
231 if (!s)
232 return (NULL);
233 cnt=0;
234
235 for (i=0; cp[i]; i++)
236 {
237 unsigned uc;
238
239 /* US-ASCII or GB 1988 (ISO 646 PRC version) */
240 if (cp[i] < (unicode_char)0x0080)
241 {
242 s[cnt++]= (char)cp[i];
243 continue;
244 }
245
246 /* For compatibility: 2 characters replaced by GB 1988 */
247 if (cp[i] == (unicode_char)0x00A5) /* YEN SIGN == yuan sign */
248 {
249 s[cnt++] = 0x24;
250 continue;
251 }
252 if (cp[i] == (unicode_char)0x203E) /* OVERLINE */
253 {
254 s[cnt++] = 0x7E;
255 continue;
256 }
257
258 uc=revlookup(cp[i]);
259
260 if (!uc)
261 {
262 if (err)
263 {
264 *err=i;
265 free(s);
266 return (NULL);
267 }
268 s[cnt++] = '?';
269 }
270 else
271 {
272 s[cnt++]= (char)(uc >> 8);
273 s[cnt++]= (char)(uc & 0x00FF);
274 }
275 }
276 s[cnt]=0;
277 return (s);
278}
279
280static char *toupper_func(const struct unicode_info *u,
281 const char *cp, int *ip)
282{
283 unicode_char *uc=c2u(u, cp, ip);
284 char *s;
285
286 unsigned i;
287
288 if (!uc)
289 return (NULL);
290
291 for (i=0; uc[i]; i++)
292 {
293 unicode_char c=unicode_uc(uc[i]);
294
295 if (revlookup(c))
296 uc[i]=c;
297 }
298
299 s=u2c(u, uc, NULL);
300 free(uc);
301 return (s);
302}
303
304static char *tolower_func(const struct unicode_info *u,
305 const char *cp, int *ip)
306{
307 unicode_char *uc=c2u(u, cp, ip);
308 char *s;
309
310 unsigned i;
311
312 if (!uc)
313 return (NULL);
314
315 for (i=0; uc[i]; i++)
316 {
317 unicode_char c=unicode_lc(uc[i]);
318
319 if (revlookup(c))
320 uc[i]=c;
321 }
322
323 s=u2c(u, uc, NULL);
324 free(uc);
325 return (s);
326}
327
328static char *totitle_func(const struct unicode_info *u,
329 const char *cp, int *ip)
330{
331 unicode_char *uc=c2u(u, cp, ip);
332 char *s;
333
334 unsigned i;
335
336 if (!uc)
337 return (NULL);
338
339 for (i=0; uc[i]; i++)
340 {
341 unicode_char c=unicode_tc(uc[i]);
342
343 if (revlookup(c))
344 uc[i]=c;
345 }
346
347 s=u2c(u, uc, NULL);
348 free(uc);
349 return (s);
350}
351
352const struct unicode_info unicode_GB2312 = {
353 "GB2312",
354 UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_USASCII |
355 UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64,
356 c2u,
357 u2c,
358 toupper_func,
359 tolower_func,
360 totitle_func};
361
362#if 0
363
364int main()
365{
366 FILE *fp=popen("gunzip -cd <Unihan-3.2.0.txt.gz", "r");
367 char buf[4000];
368 unicode_char *uc;
369 char *s, *p;
370 int dummyi;
371
372 if (!fp)
373 return (0);
374
375 while (fgets(buf, sizeof(buf), fp))
376 {
377 unsigned a, b, c;
378 int dummy;
379
380 if (sscanf(buf, "U+%4x kIRG_GSource 0-%4x", &b, &a) != 2)
381 continue;
382 a |= 0x8080;
383
384 printf("0x%04x 0x%04x: ", a, b);
385
386 buf[0]= a / 256;
387 buf[1]= a % 256;
388 buf[2]=0;
389
390 uc=c2u(buf, &dummy);
391 if (!uc)
392 {
393 printf("c2u failure: %d\n", dummy);
394 return (1);
395 }
396 if (uc[0] != b || uc[1])
397 {
398 printf("c2u failure: got 0x%04x, expected 0x%04x\n",
399 (unsigned)uc[0], (unsigned)b);
400 return (1);
401 }
402 s=u2c(uc, &dummy);
403 free(uc);
404 if (!s)
405 {
406 printf("u2c failure: %d\n", dummy);
407 return (1);
408 }
409
410 c=0;
411 if (!s[0] || !s[1] || s[2] ||
412 (c=(int)(unsigned char)s[0] * 256 +
413 (unsigned char)s[1]) != a)
414 {
415 printf("u2c failure: got 0x%04x, expected 0x%04x\n",
416 c, a);
417 return (1);
418 }
419
420 p=toupper_func(s, NULL);
421 if (!p)
422 {
423 printf("toupper failure\n");
424 return (1);
425 }
426 if (strcmp(p, s))
427 printf("toupper ");
428 free(p);
429
430 p=tolower_func(s, NULL);
431 if (!p)
432 {
433 printf("tolower failure\n");
434 return (1);
435 }
436 if (strcmp(p, s))
437 printf("tolower ");
438 free(p);
439
440 p=totitle_func(s, NULL);
441 if (!p)
442 {
443 printf("totitle failure\n");
444 return (1);
445 }
446 if (strcmp(p, s))
447 printf("totitle ");
448 free(p);
449
450 free(s);
451 printf("ok\n");
452 }
453 fclose(fp);
454
455 buf[0]=0x40;
456 buf[1]=0;
457 uc=c2u(buf, NULL);
458
459 if (!uc)
460 {
461 printf("us-ascii c2u failure\n");
462 return (1);
463 }
464 s=u2c(uc, NULL);
465 free(uc);
466 if (!s)
467 {
468 printf("us-asccu u2c failure\n");
469 return (1);
470 }
471 free(s);
472
473 buf[0]=0x40;
474 buf[1]=0xF0;
475 buf[2]=0;
476
477 uc=c2u(buf, NULL);
478 if (!uc)
479 {
480 printf("fallback failed\n");
481 return (1);
482 }
483 printf("fallback: %04x %04x\n", (unsigned)uc[0],
484 (unsigned)uc[1]);
485
486 s=u2c(uc, NULL);
487 free(uc);
488
489 if (!s)
490 {
491 printf("fallback-reverse failed\n");
492 return (1);
493 }
494 printf("fallback: %02x %02x\n", (int)(unsigned char)s[0],
495 (int)(unsigned char)s[1]);
496 free(s);
497
498 buf[0]=0xB2;
499 buf[1]=0x20;
500 buf[2]=0;
501
502 uc=c2u(buf, &dummyi);
503
504 if (uc)
505 {
506 printf("abort failed\n");
507 return (1);
508 }
509
510 printf("aborted at index %d\n", dummyi);
511
512 {
513 static unicode_char testing[]={0x0040, 0x1000, 0};
514
515 uc=testing;
516
517 s=u2c(uc, NULL);
518
519 if (!s)
520 {
521 printf("abort-fallback failed\n");
522 return (1);
523 }
524 printf("abort-fallback: %02x %02x\n", (int)(unsigned char)s[0],
525 (int)(unsigned char)s[1]);
526 free(s);
527
528 uc=testing;
529 }
530
531 s=u2c(uc, &dummyi);
532
533 if (s)
534 {
535 printf("abort-abort failed\n");
536 return (1);
537 }
538
539 printf("abort-aborted: index %d\n", dummyi);
540 return (0);
541}
542#endif