2 ** Copyright 2000-2002 Double Precision, Inc.
3 ** See COPYING for distribution information.
5 ** $Id: big5.c,v 1.14 2004/05/23 14:28:24 mrsam Exp $
13 #define BIG5_HKSCS_EXTENSION 1
15 static const unicode_char
* const big5fwdlo
[]= {
143 static const unicode_char
* const big5fwdhi
[]= {
271 static unicode_char
*c2u_doconv(const struct unicode_info
*u
,
272 const char *cp
, int *err
, int compat
)
281 ** Count the number of potential unicode characters first.
284 for (i
=cnt
=0; cp
[i
]; i
++)
286 if ((int)(unsigned char)cp
[i
] < 0x88 ||
287 (int)(unsigned char)cp
[i
] > 0xFE ||
298 uc
=malloc((cnt
+1)*sizeof(unicode_char
));
305 unsigned int a
=(int)(unsigned char)cp
[i
], b
;
307 /* 2-byte Character */
308 if ((unsigned)0x88 <= a
&& a
<= (unsigned)0xFE && cp
[i
+1])
311 b
=(int)(unsigned char)cp
[i
+1];
313 /* ranges extended by HKSCS */
314 if (!(compat
& BIG5_HKSCS_EXTENSION
)
315 && (a
< (unsigned)0xA1
316 || (a
== (unsigned)0xC6
317 && (unsigned)0xBF <= b
&& b
<= (unsigned)0xD7)))
318 ucv
= (unicode_char
)0xFFFD;
320 else if (0x40 <= b
&& b
<= 0x7E
322 && (ucv
=big5fwdlo
[a
-0x81][b
-0x40]))
325 else if ((unsigned)0xA1 <= b
&& b
<= (unsigned)0xFE
327 && (ucv
=big5fwdhi
[a
-0x81][b
-0xA1]))
331 ucv
= (unicode_char
)0xFFFD;
333 /* mapped to PUA by HKSCS extension */
334 if (!(compat
& BIG5_HKSCS_EXTENSION
)
335 && (unicode_char
)0xE000 <= ucv
336 && ucv
<= (unicode_char
)0xF8FF)
337 ucv
= (unicode_char
)0xFFFD;
339 if (ucv
== (unicode_char
)0xFFFD && err
)
349 else if (a
< (unsigned)0x80)
363 uc
[cnt
++] = (unicode_char
)0xFFFD;
372 static unicode_char
*c2u_eten(const struct unicode_info
*u
,
373 const char *cp
, int *err
)
375 return c2u_doconv(u
, cp
, err
, 0);
378 static unicode_char
*c2u_hkscs(const struct unicode_info
*u
,
379 const char *cp
, int *err
)
381 return c2u_doconv(u
, cp
, err
, BIG5_HKSCS_EXTENSION
);
384 static unsigned revlookup(unicode_char c
)
390 bucket
=c
% big5_revhash_size
;
393 for (j
=big5_revtable_index
[ bucket
];
394 j
< sizeof(big5_revtable_uc
)/sizeof(big5_revtable_uc
[0]);
397 unicode_char uuc
=big5_revtable_uc
[j
];
400 return (big5_revtable_octets
[j
]);
402 if ((uuc
% big5_revhash_size
) != bucket
)
408 static char *u2c_doconv(const struct unicode_info
*u
,
409 const unicode_char
*cp
, int *err
, int compat
)
417 ** Figure out the size of the octet string. Unicodes < 0x7f will
418 ** map to a single byte, unicodes >= 0x80 will map to two bytes.
421 for (i
=cnt
=0; cp
[i
]; i
++)
433 for (i
=0; cp
[i
]; i
++)
438 if (cp
[i
] < (unicode_char
)0x0080)
440 s
[cnt
++]= (char)cp
[i
];
444 if (!(compat
& BIG5_HKSCS_EXTENSION
)
445 && (unicode_char
)0xE000 <= cp
[i
]
446 && cp
[i
] <= (unicode_char
)0xF8FF)
461 || (!(compat
& BIG5_HKSCS_EXTENSION
)
462 && (uc
< (unsigned)0xA140
463 || ((unsigned)0xC6BF <= uc
&& uc
<= (unsigned)0xC6D7))))
475 s
[cnt
++]= (char)(uc
>> 8);
476 s
[cnt
++]= (char)(uc
& 0x00FF);
483 static char *u2c_eten(const struct unicode_info
*u
,
484 const unicode_char
*cp
, int *err
)
486 return u2c_doconv(u
, cp
, err
, 0);
489 static char *u2c_hkscs(const struct unicode_info
*u
,
490 const unicode_char
*cp
, int *err
)
492 return u2c_doconv(u
, cp
, err
, BIG5_HKSCS_EXTENSION
);
495 static char *toupper_func(const struct unicode_info
*u
,
496 const char *cp
, int *ip
)
498 unicode_char
*uc
=(*u
->c2u
)(u
, cp
, ip
);
506 for (i
=0; uc
[i
]; i
++)
508 unicode_char c
=unicode_uc(uc
[i
]);
514 s
=(*u
->u2c
)(u
, uc
, NULL
);
519 static char *tolower_func(const struct unicode_info
*u
,
520 const char *cp
, int *ip
)
522 unicode_char
*uc
=(*u
->c2u
)(u
, cp
, ip
);
530 for (i
=0; uc
[i
]; i
++)
532 unicode_char c
=unicode_lc(uc
[i
]);
538 s
=(*u
->u2c
)(u
, uc
, NULL
);
543 static char *totitle_func(const struct unicode_info
*u
,
544 const char *cp
, int *ip
)
546 unicode_char
*uc
=(*u
->c2u
)(u
, cp
, ip
);
554 for (i
=0; uc
[i
]; i
++)
556 unicode_char c
=unicode_tc(uc
[i
]);
562 s
=(*u
->u2c
)(u
, uc
, NULL
);
567 const struct unicode_info unicode_BIG5_ETEN
= {
569 UNICODE_MB
| UNICODE_REPLACEABLE
| UNICODE_USASCII
|
570 UNICODE_HEADER_BASE64
| UNICODE_BODY_BASE64
,
577 const struct unicode_info unicode_BIG5_HKSCS
= {
579 UNICODE_MB
| UNICODE_REPLACEABLE
| UNICODE_USASCII
|
580 UNICODE_HEADER_BASE64
| UNICODE_BODY_BASE64
,
591 FILE *fp
=popen("gunzip -cd <Unihan-3.2.0.txt.gz", "r");
600 while (fgets(buf
, sizeof(buf
), fp
))
605 if (sscanf(buf
, "U+%4x kBigFive %4x", &b
, &a
) != 2)
607 printf("0x%04x 0x%04x: ", a
, b
);
616 printf("c2u failure: %d\n", dummy
);
619 if (uc
[0] != b
|| uc
[1])
621 printf("c2u failure: got 0x%04x, expected 0x%04x\n",
622 (unsigned)uc
[0], (unsigned)b
);
626 if (s
== NULL
&& uc
[0] == 0xfffd)
630 continue; /* Unmapped */
635 printf("u2c failure: %d\n", dummy
);
640 if (!s
[0] || !s
[1] || s
[2] ||
641 (c
=(int)(unsigned char)s
[0] * 256 +
642 (unsigned char)s
[1]) != a
)
644 printf("u2c failure: got 0x%04x, expected 0x%04x\n",
649 p
=toupper_func(s
, NULL
);
652 printf("toupper failure\n");
659 p
=tolower_func(s
, NULL
);
662 printf("tolower failure\n");
669 p
=totitle_func(s
, NULL
);
672 printf("totitle failure\n");
690 printf("us-ascii c2u failure\n");
697 printf("us-ascii u2c failure\n");
709 printf("fallback failed\n");
712 printf("fallback: %04x %04x\n", (unsigned)uc
[0],
720 printf("fallback-reverse failed\n");
723 printf("fallback: %02x %02x\n", (int)(unsigned char)s
[0],
724 (int)(unsigned char)s
[1]);
731 uc
=c2u(buf
, &dummyi
);
735 printf("abort failed\n");
739 printf("aborted at index %d\n", dummyi
);
742 static unicode_char testing
[]={0x0040, 0x1000, 0};
750 printf("abort-fallback failed\n");
753 printf("abort-fallback: %02x %02x\n", (int)(unsigned char)s
[0],
754 (int)(unsigned char)s
[1]);
764 printf("abort-abort failed\n");
768 printf("abort-aborted: index %d\n", dummyi
);