Imported Upstream version 0.63.0
[hcoop/debian/courier-authlib.git] / unicode / big5.c
1 /*
2 ** Copyright 2000-2002 Double Precision, Inc.
3 ** See COPYING for distribution information.
4 **
5 ** $Id: big5.c,v 1.14 2004/05/23 14:28:24 mrsam Exp $
6 */
7
8 #include "big5.h"
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12
/*
** Bit flag for the "compat" argument of the conversion routines in this
** file: when set, the HKSCS extension ranges are accepted.
*/
#define BIG5_HKSCS_EXTENSION 0x01
14
15 static const unicode_char * const big5fwdlo[]= {
16 NULL,
17 NULL,
18 NULL,
19 NULL,
20 NULL,
21 NULL,
22 NULL,
23 big5_88_lo,
24 big5_89_lo,
25 big5_8a_lo,
26 big5_8b_lo,
27 big5_8c_lo,
28 big5_8d_lo,
29 big5_8e_lo,
30 big5_8f_lo,
31 big5_90_lo,
32 big5_91_lo,
33 big5_92_lo,
34 big5_93_lo,
35 big5_94_lo,
36 big5_95_lo,
37 big5_96_lo,
38 big5_97_lo,
39 big5_98_lo,
40 big5_99_lo,
41 big5_9a_lo,
42 big5_9b_lo,
43 big5_9c_lo,
44 big5_9d_lo,
45 big5_9e_lo,
46 big5_9f_lo,
47 big5_a0_lo,
48 big5_a1_lo,
49 big5_a2_lo,
50 big5_a3_lo,
51 big5_a4_lo,
52 big5_a5_lo,
53 big5_a6_lo,
54 big5_a7_lo,
55 big5_a8_lo,
56 big5_a9_lo,
57 big5_aa_lo,
58 big5_ab_lo,
59 big5_ac_lo,
60 big5_ad_lo,
61 big5_ae_lo,
62 big5_af_lo,
63 big5_b0_lo,
64 big5_b1_lo,
65 big5_b2_lo,
66 big5_b3_lo,
67 big5_b4_lo,
68 big5_b5_lo,
69 big5_b6_lo,
70 big5_b7_lo,
71 big5_b8_lo,
72 big5_b9_lo,
73 big5_ba_lo,
74 big5_bb_lo,
75 big5_bc_lo,
76 big5_bd_lo,
77 big5_be_lo,
78 big5_bf_lo,
79 big5_c0_lo,
80 big5_c1_lo,
81 big5_c2_lo,
82 big5_c3_lo,
83 big5_c4_lo,
84 big5_c5_lo,
85 big5_c6_lo,
86 big5_c7_lo,
87 big5_c8_lo,
88 big5_c9_lo,
89 big5_ca_lo,
90 big5_cb_lo,
91 big5_cc_lo,
92 big5_cd_lo,
93 big5_ce_lo,
94 big5_cf_lo,
95 big5_d0_lo,
96 big5_d1_lo,
97 big5_d2_lo,
98 big5_d3_lo,
99 big5_d4_lo,
100 big5_d5_lo,
101 big5_d6_lo,
102 big5_d7_lo,
103 big5_d8_lo,
104 big5_d9_lo,
105 big5_da_lo,
106 big5_db_lo,
107 big5_dc_lo,
108 big5_dd_lo,
109 big5_de_lo,
110 big5_df_lo,
111 big5_e0_lo,
112 big5_e1_lo,
113 big5_e2_lo,
114 big5_e3_lo,
115 big5_e4_lo,
116 big5_e5_lo,
117 big5_e6_lo,
118 big5_e7_lo,
119 big5_e8_lo,
120 big5_e9_lo,
121 big5_ea_lo,
122 big5_eb_lo,
123 big5_ec_lo,
124 big5_ed_lo,
125 big5_ee_lo,
126 big5_ef_lo,
127 big5_f0_lo,
128 big5_f1_lo,
129 big5_f2_lo,
130 big5_f3_lo,
131 big5_f4_lo,
132 big5_f5_lo,
133 big5_f6_lo,
134 big5_f7_lo,
135 big5_f8_lo,
136 big5_f9_lo,
137 big5_fa_lo,
138 big5_fb_lo,
139 big5_fc_lo,
140 big5_fd_lo,
141 big5_fe_lo};
142
143 static const unicode_char * const big5fwdhi[]= {
144 NULL,
145 NULL,
146 NULL,
147 NULL,
148 NULL,
149 NULL,
150 NULL,
151 big5_88_hi,
152 big5_89_hi,
153 big5_8a_hi,
154 big5_8b_hi,
155 big5_8c_hi,
156 big5_8d_hi,
157 big5_8e_hi,
158 big5_8f_hi,
159 big5_90_hi,
160 big5_91_hi,
161 big5_92_hi,
162 big5_93_hi,
163 big5_94_hi,
164 big5_95_hi,
165 big5_96_hi,
166 big5_97_hi,
167 big5_98_hi,
168 big5_99_hi,
169 big5_9a_hi,
170 big5_9b_hi,
171 big5_9c_hi,
172 big5_9d_hi,
173 big5_9e_hi,
174 big5_9f_hi,
175 big5_a0_hi,
176 big5_a1_hi,
177 big5_a2_hi,
178 big5_a3_hi,
179 big5_a4_hi,
180 big5_a5_hi,
181 big5_a6_hi,
182 big5_a7_hi,
183 big5_a8_hi,
184 big5_a9_hi,
185 big5_aa_hi,
186 big5_ab_hi,
187 big5_ac_hi,
188 big5_ad_hi,
189 big5_ae_hi,
190 big5_af_hi,
191 big5_b0_hi,
192 big5_b1_hi,
193 big5_b2_hi,
194 big5_b3_hi,
195 big5_b4_hi,
196 big5_b5_hi,
197 big5_b6_hi,
198 big5_b7_hi,
199 big5_b8_hi,
200 big5_b9_hi,
201 big5_ba_hi,
202 big5_bb_hi,
203 big5_bc_hi,
204 big5_bd_hi,
205 big5_be_hi,
206 big5_bf_hi,
207 big5_c0_hi,
208 big5_c1_hi,
209 big5_c2_hi,
210 big5_c3_hi,
211 big5_c4_hi,
212 big5_c5_hi,
213 big5_c6_hi,
214 big5_c7_hi,
215 big5_c8_hi,
216 big5_c9_hi,
217 big5_ca_hi,
218 big5_cb_hi,
219 big5_cc_hi,
220 big5_cd_hi,
221 big5_ce_hi,
222 big5_cf_hi,
223 big5_d0_hi,
224 big5_d1_hi,
225 big5_d2_hi,
226 big5_d3_hi,
227 big5_d4_hi,
228 big5_d5_hi,
229 big5_d6_hi,
230 big5_d7_hi,
231 big5_d8_hi,
232 big5_d9_hi,
233 big5_da_hi,
234 big5_db_hi,
235 big5_dc_hi,
236 big5_dd_hi,
237 big5_de_hi,
238 big5_df_hi,
239 big5_e0_hi,
240 big5_e1_hi,
241 big5_e2_hi,
242 big5_e3_hi,
243 big5_e4_hi,
244 big5_e5_hi,
245 big5_e6_hi,
246 big5_e7_hi,
247 big5_e8_hi,
248 big5_e9_hi,
249 big5_ea_hi,
250 big5_eb_hi,
251 big5_ec_hi,
252 big5_ed_hi,
253 big5_ee_hi,
254 big5_ef_hi,
255 big5_f0_hi,
256 big5_f1_hi,
257 big5_f2_hi,
258 big5_f3_hi,
259 big5_f4_hi,
260 big5_f5_hi,
261 big5_f6_hi,
262 big5_f7_hi,
263 big5_f8_hi,
264 big5_f9_hi,
265 big5_fa_hi,
266 big5_fb_hi,
267 big5_fc_hi,
268 big5_fd_hi,
269 big5_fe_hi};
270
271 static unicode_char *c2u_doconv(const struct unicode_info *u,
272 const char *cp, int *err, int compat)
273 {
274 size_t i, cnt;
275 unicode_char *uc;
276
277 if (err)
278 *err= -1;
279
280 /*
281 ** Count the number of potential unicode characters first.
282 */
283
284 for (i=cnt=0; cp[i]; i++)
285 {
286 if ((int)(unsigned char)cp[i] < 0x88 ||
287 (int)(unsigned char)cp[i] > 0xFE ||
288 cp[i+1] == 0)
289 {
290 ++cnt;
291 continue;
292 }
293
294 ++i;
295 ++cnt;
296 }
297
298 uc=malloc((cnt+1)*sizeof(unicode_char));
299 if (!uc)
300 return (NULL);
301
302 i=cnt=0;
303 while (cp[i])
304 {
305 unsigned int a=(int)(unsigned char)cp[i], b;
306
307 /* 2-byte Character */
308 if ((unsigned)0x88 <= a && a <= (unsigned)0xFE && cp[i+1])
309 {
310 unicode_char ucv;
311 b=(int)(unsigned char)cp[i+1];
312
313 /* ranges extended by HKSCS */
314 if (!(compat & BIG5_HKSCS_EXTENSION)
315 && (a < (unsigned)0xA1
316 || (a == (unsigned)0xC6
317 && (unsigned)0xBF <= b && b <= (unsigned)0xD7)))
318 ucv = (unicode_char)0xFFFD;
319 /* 0xXX40-0xXX7E */
320 else if (0x40 <= b && b <= 0x7E
321 && big5fwdlo[a-0x81]
322 && (ucv=big5fwdlo[a-0x81][b-0x40]))
323 ;
324 /* 0xXXA1-0xXXFE */
325 else if ((unsigned)0xA1 <= b && b <= (unsigned)0xFE
326 && big5fwdhi[a-0x81]
327 && (ucv=big5fwdhi[a-0x81][b-0xA1]))
328 ;
329 /* Not found */
330 else
331 ucv = (unicode_char)0xFFFD;
332
333 /* mapped to PUA by HKSCS extension */
334 if (!(compat & BIG5_HKSCS_EXTENSION)
335 && (unicode_char)0xE000 <= ucv
336 && ucv <= (unicode_char)0xF8FF)
337 ucv = (unicode_char)0xFFFD;
338
339 if (ucv == (unicode_char)0xFFFD && err)
340 {
341 *err = i;
342 free(uc);
343 return NULL;
344 }
345 uc[cnt++] = ucv;
346 i += 2;
347 }
348 /* US-ASCII */
349 else if (a < (unsigned)0x80)
350 {
351 uc[cnt++]=a;
352 i += 1;
353 }
354 /* Not Found */
355 else if (err)
356 {
357 *err=i;
358 free(uc);
359 return (NULL);
360 }
361 else
362 {
363 uc[cnt++] = (unicode_char)0xFFFD;
364 i += 1;
365 }
366 }
367 uc[cnt]=0;
368
369 return (uc);
370 }
371
372 static unicode_char *c2u_eten(const struct unicode_info *u,
373 const char *cp, int *err)
374 {
375 return c2u_doconv(u, cp, err, 0);
376 }
377
378 static unicode_char *c2u_hkscs(const struct unicode_info *u,
379 const char *cp, int *err)
380 {
381 return c2u_doconv(u, cp, err, BIG5_HKSCS_EXTENSION);
382 }
383
384 static unsigned revlookup(unicode_char c)
385 {
386 unsigned j;
387 unsigned bucket;
388 unsigned uc;
389
390 bucket=c % big5_revhash_size;
391 uc=0;
392
393 for (j=big5_revtable_index[ bucket ];
394 j < sizeof(big5_revtable_uc)/sizeof(big5_revtable_uc[0]);
395 ++j)
396 {
397 unicode_char uuc=big5_revtable_uc[j];
398
399 if (uuc == c)
400 return (big5_revtable_octets[j]);
401
402 if ((uuc % big5_revhash_size) != bucket)
403 break;
404 }
405 return (0);
406 }
407
408 static char *u2c_doconv(const struct unicode_info *u,
409 const unicode_char *cp, int *err, int compat)
410 {
411 size_t cnt, i;
412 char *s;
413
414 if (err)
415 *err= -1;
416 /*
417 ** Figure out the size of the octet string. Unicodes < 0x7f will
418 ** map to a single byte, unicodes >= 0x80 will map to two bytes.
419 */
420
421 for (i=cnt=0; cp[i]; i++)
422 {
423 if (cp[i] > 0x7f)
424 ++cnt;
425 ++cnt;
426 }
427
428 s=malloc(cnt+1);
429 if (!s)
430 return (NULL);
431 cnt=0;
432
433 for (i=0; cp[i]; i++)
434 {
435 unsigned uc;
436
437 /* US-ASCII */
438 if (cp[i] < (unicode_char)0x0080)
439 {
440 s[cnt++]= (char)cp[i];
441 continue;
442 }
443 /* PUA by HKSCS */
444 if (!(compat & BIG5_HKSCS_EXTENSION)
445 && (unicode_char)0xE000 <= cp[i]
446 && cp[i] <= (unicode_char)0xF8FF)
447 {
448 if (err)
449 {
450 *err=i;
451 free(s);
452 return (NULL);
453 }
454 s[cnt++] = '?';
455 continue;
456 }
457
458 uc=revlookup(cp[i]);
459
460 if (!uc
461 || (!(compat & BIG5_HKSCS_EXTENSION)
462 && (uc < (unsigned)0xA140
463 || ((unsigned)0xC6BF <= uc && uc <= (unsigned)0xC6D7))))
464 {
465 if (err)
466 {
467 *err=i;
468 free(s);
469 return (NULL);
470 }
471 s[cnt++] = '?';
472 }
473 else
474 {
475 s[cnt++]= (char)(uc >> 8);
476 s[cnt++]= (char)(uc & 0x00FF);
477 }
478 }
479 s[cnt]=0;
480 return (s);
481 }
482
483 static char *u2c_eten(const struct unicode_info *u,
484 const unicode_char *cp, int *err)
485 {
486 return u2c_doconv(u, cp, err, 0);
487 }
488
489 static char *u2c_hkscs(const struct unicode_info *u,
490 const unicode_char *cp, int *err)
491 {
492 return u2c_doconv(u, cp, err, BIG5_HKSCS_EXTENSION);
493 }
494
495 static char *toupper_func(const struct unicode_info *u,
496 const char *cp, int *ip)
497 {
498 unicode_char *uc=(*u->c2u)(u, cp, ip);
499 char *s;
500
501 unsigned i;
502
503 if (!uc)
504 return (NULL);
505
506 for (i=0; uc[i]; i++)
507 {
508 unicode_char c=unicode_uc(uc[i]);
509
510 if (revlookup(c))
511 uc[i]=c;
512 }
513
514 s=(*u->u2c)(u, uc, NULL);
515 free(uc);
516 return (s);
517 }
518
519 static char *tolower_func(const struct unicode_info *u,
520 const char *cp, int *ip)
521 {
522 unicode_char *uc=(*u->c2u)(u, cp, ip);
523 char *s;
524
525 unsigned i;
526
527 if (!uc)
528 return (NULL);
529
530 for (i=0; uc[i]; i++)
531 {
532 unicode_char c=unicode_lc(uc[i]);
533
534 if (revlookup(c))
535 uc[i]=c;
536 }
537
538 s=(*u->u2c)(u, uc, NULL);
539 free(uc);
540 return (s);
541 }
542
543 static char *totitle_func(const struct unicode_info *u,
544 const char *cp, int *ip)
545 {
546 unicode_char *uc=(*u->c2u)(u, cp, ip);
547 char *s;
548
549 unsigned i;
550
551 if (!uc)
552 return (NULL);
553
554 for (i=0; uc[i]; i++)
555 {
556 unicode_char c=unicode_tc(uc[i]);
557
558 if (revlookup(c))
559 uc[i]=c;
560 }
561
562 s=(*u->u2c)(u, uc, NULL);
563 free(uc);
564 return (s);
565 }
566
567 const struct unicode_info unicode_BIG5_ETEN = {
568 "BIG5",
569 UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_USASCII |
570 UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64,
571 c2u_eten,
572 u2c_eten,
573 toupper_func,
574 tolower_func,
575 totitle_func};
576
577 const struct unicode_info unicode_BIG5_HKSCS = {
578 "BIG5-HKSCS",
579 UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_USASCII |
580 UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64,
581 c2u_hkscs,
582 u2c_hkscs,
583 toupper_func,
584 tolower_func,
585 totitle_func};
586
587 #if 0
588
589 int main()
590 {
591 FILE *fp=popen("gunzip -cd <Unihan-3.2.0.txt.gz", "r");
592 char buf[4000];
593 unicode_char *uc;
594 char *s, *p;
595 int dummyi;
596
597 if (!fp)
598 return (0);
599
600 while (fgets(buf, sizeof(buf), fp))
601 {
602 unsigned a, b, c;
603 int dummy;
604
605 if (sscanf(buf, "U+%4x kBigFive %4x", &b, &a) != 2)
606 continue;
607 printf("0x%04x 0x%04x: ", a, b);
608
609 buf[0]= a / 256;
610 buf[1]= a % 256;
611 buf[2]=0;
612
613 uc=c2u(buf, &dummy);
614 if (!uc)
615 {
616 printf("c2u failure: %d\n", dummy);
617 return (1);
618 }
619 if (uc[0] != b || uc[1])
620 {
621 printf("c2u failure: got 0x%04x, expected 0x%04x\n",
622 (unsigned)uc[0], (unsigned)b);
623 return (1);
624 }
625 s=u2c(uc, &dummy);
626 if (s == NULL && uc[0] == 0xfffd)
627 {
628 free(uc);
629 printf("Ok\n");
630 continue; /* Unmapped */
631 }
632 free(uc);
633 if (!s)
634 {
635 printf("u2c failure: %d\n", dummy);
636 return (1);
637 }
638
639 c=0;
640 if (!s[0] || !s[1] || s[2] ||
641 (c=(int)(unsigned char)s[0] * 256 +
642 (unsigned char)s[1]) != a)
643 {
644 printf("u2c failure: got 0x%04x, expected 0x%04x\n",
645 c, a);
646 return (1);
647 }
648
649 p=toupper_func(s, NULL);
650 if (!p)
651 {
652 printf("toupper failure\n");
653 return (1);
654 }
655 if (strcmp(p, s))
656 printf("toupper ");
657 free(p);
658
659 p=tolower_func(s, NULL);
660 if (!p)
661 {
662 printf("tolower failure\n");
663 return (1);
664 }
665 if (strcmp(p, s))
666 printf("tolower ");
667 free(p);
668
669 p=totitle_func(s, NULL);
670 if (!p)
671 {
672 printf("totitle failure\n");
673 return (1);
674 }
675 if (strcmp(p, s))
676 printf("totitle ");
677 free(p);
678
679 free(s);
680 printf("ok\n");
681 }
682 fclose(fp);
683
684 buf[0]=0x40;
685 buf[1]=0;
686 uc=c2u(buf, NULL);
687
688 if (!uc)
689 {
690 printf("us-ascii c2u failure\n");
691 return (1);
692 }
693 s=u2c(uc, NULL);
694 free(uc);
695 if (!s)
696 {
697 printf("us-ascii u2c failure\n");
698 return (1);
699 }
700 free(s);
701
702 buf[0]=0xA2;
703 buf[1]=0x40;
704 buf[2]=0;
705
706 uc=c2u(buf, NULL);
707 if (!uc)
708 {
709 printf("fallback failed\n");
710 return (1);
711 }
712 printf("fallback: %04x %04x\n", (unsigned)uc[0],
713 (unsigned)uc[1]);
714
715 s=u2c(uc, NULL);
716 free(uc);
717
718 if (!s)
719 {
720 printf("fallback-reverse failed\n");
721 return (1);
722 }
723 printf("fallback: %02x %02x\n", (int)(unsigned char)s[0],
724 (int)(unsigned char)s[1]);
725 free(s);
726
727 buf[0]=0xA2;
728 buf[1]=0x40;
729 buf[2]=0;
730
731 uc=c2u(buf, &dummyi);
732
733 if (uc)
734 {
735 printf("abort failed\n");
736 return (1);
737 }
738
739 printf("aborted at index %d\n", dummyi);
740
741 {
742 static unicode_char testing[]={0x0040, 0x1000, 0};
743
744 uc=testing;
745
746 s=u2c(uc, NULL);
747
748 if (!s)
749 {
750 printf("abort-fallback failed\n");
751 return (1);
752 }
753 printf("abort-fallback: %02x %02x\n", (int)(unsigned char)s[0],
754 (int)(unsigned char)s[1]);
755 free(s);
756
757 uc=testing;
758 }
759
760 s=u2c(uc, &dummyi);
761
762 if (s)
763 {
764 printf("abort-abort failed\n");
765 return (1);
766 }
767
768 printf("abort-aborted: index %d\n", dummyi);
769 return (0);
770 }
771 #endif