+++ /dev/null
-/*
-** Copyright 2000-2002 Double Precision, Inc.
-** See COPYING for distribution information.
-**
-** $Id: gb2312.c,v 1.13 2004/05/23 14:28:24 mrsam Exp $
-*/
-
-#include "gb2312.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-/*
-** Lead-byte dispatch table.  Entry [lead - 0xA1] points at the row of
-** unicode values for that lead byte, or is NULL when there is no row.
-** c2u() indexes a row with (trail byte - 0xA1).
-*/
-static const unicode_char * const gb2312[]= {
-	/* 0xA1 - 0xA9 */
-	gb2312_a1, gb2312_a2, gb2312_a3, gb2312_a4, gb2312_a5,
-	gb2312_a6, gb2312_a7, gb2312_a8, gb2312_a9,
-
-	/* 0xAA - 0xAF: no rows */
-	NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* 0xB0 - 0xBF */
-	gb2312_b0, gb2312_b1, gb2312_b2, gb2312_b3,
-	gb2312_b4, gb2312_b5, gb2312_b6, gb2312_b7,
-	gb2312_b8, gb2312_b9, gb2312_ba, gb2312_bb,
-	gb2312_bc, gb2312_bd, gb2312_be, gb2312_bf,
-
-	/* 0xC0 - 0xCF */
-	gb2312_c0, gb2312_c1, gb2312_c2, gb2312_c3,
-	gb2312_c4, gb2312_c5, gb2312_c6, gb2312_c7,
-	gb2312_c8, gb2312_c9, gb2312_ca, gb2312_cb,
-	gb2312_cc, gb2312_cd, gb2312_ce, gb2312_cf,
-
-	/* 0xD0 - 0xDF */
-	gb2312_d0, gb2312_d1, gb2312_d2, gb2312_d3,
-	gb2312_d4, gb2312_d5, gb2312_d6, gb2312_d7,
-	gb2312_d8, gb2312_d9, gb2312_da, gb2312_db,
-	gb2312_dc, gb2312_dd, gb2312_de, gb2312_df,
-
-	/* 0xE0 - 0xEF */
-	gb2312_e0, gb2312_e1, gb2312_e2, gb2312_e3,
-	gb2312_e4, gb2312_e5, gb2312_e6, gb2312_e7,
-	gb2312_e8, gb2312_e9, gb2312_ea, gb2312_eb,
-	gb2312_ec, gb2312_ed, gb2312_ee, gb2312_ef,
-
-	/* 0xF0 - 0xF7 */
-	gb2312_f0, gb2312_f1, gb2312_f2, gb2312_f3,
-	gb2312_f4, gb2312_f5, gb2312_f6, gb2312_f7,
-
-	/* 0xF8 - 0xFE: no rows */
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL
-};
-
-/*
-** Convert a NUL-terminated GB2312 octet string into a malloc()ed,
-** 0-terminated array of unicode characters.
-**
-** Bytes below 0x80 are copied through unchanged.  A byte in the range
-** 0xA1-0xFE that is followed by another (non-NUL) byte is taken as the
-** start of a two-byte sequence and looked up in the gb2312 table.
-**
-** When err is not NULL it is first set to -1; if some octet cannot be
-** converted, *err receives its byte offset and NULL is returned.  When
-** err is NULL, unconvertible input is replaced by U+FFFD instead.
-**
-** NULL is also returned when malloc() fails (*err then stays at -1).
-** The result must be released with free().  The u argument is unused.
-*/
-static unicode_char *c2u(const struct unicode_info *u,
-			 const char *cp, int *err)
-{
-	size_t pos, nout;
-	unicode_char *out;
-
-	if (err)
-		*err= -1;
-
-	/*
-	** First pass: every one- or two-byte sequence yields exactly one
-	** output slot, so walk the input the same way the converter does.
-	*/
-
-	nout=0;
-	pos=0;
-	while (cp[pos])
-	{
-		int lead=(int)(unsigned char)cp[pos];
-
-		if (lead >= 0xA1 && lead <= 0xFE && cp[pos+1] != 0)
-			pos += 2;
-		else
-			pos += 1;
-		++nout;
-	}
-
-	out=malloc((nout+1)*sizeof(unicode_char));
-	if (!out)
-		return (NULL);
-
-	/* Second pass: the actual conversion. */
-
-	nout=0;
-	for (pos=0; cp[pos]; )
-	{
-		int lead=(int)(unsigned char)cp[pos];
-
-		if (lead >= 0xA1 && lead <= 0xFE && cp[pos+1])
-		{
-			int trail=(int)(unsigned char)cp[pos+1];
-			const unicode_char *row=gb2312[lead-0xA1];
-			unicode_char ucv=0;
-
-			if (0xA1 <= trail && trail <= 0xFE && row)
-				ucv=row[trail-0xA1];
-
-			if (!ucv)
-			{
-				/* Unmapped pair: report it, or substitute. */
-				if (err)
-				{
-					*err = pos;
-					free(out);
-					return NULL;
-				}
-				ucv=(unicode_char)0xFFFD;
-			}
-
-			out[nout++]=ucv;
-			pos += 2;
-			continue;
-		}
-
-		if (lead < (unsigned)0x80)
-		{
-			out[nout++]=lead;
-			pos += 1;
-			continue;
-		}
-
-		/* A stray high-bit octet that cannot start a pair. */
-		if (err)
-		{
-			*err=pos;
-			free(out);
-			return (NULL);
-		}
-
-		out[nout++]= 0xFFFD;
-		pos += 1;
-	}
-	out[nout]=0;
-
-	return (out);
-}
-
-/*
-** Reverse lookup: find unicode character c in gb2312_revtable_uc and
-** return the matching entry of gb2312_revtable_octets, or 0 if c is
-** not present.
-**
-** The scan starts at gb2312_revtable_index[c % gb2312_revhash_size] and
-** stops at the first entry that hashes to a different bucket, or at the
-** end of the table.
-*/
-static unsigned revlookup(unicode_char c)
-{
-	unsigned home=c % gb2312_revhash_size;
-	unsigned slot=gb2312_revtable_index[ home ];
-
-	while (slot < sizeof(gb2312_revtable_uc)/sizeof(gb2312_revtable_uc[0]))
-	{
-		unicode_char candidate=gb2312_revtable_uc[slot];
-
-		if (candidate == c)
-			return (gb2312_revtable_octets[slot]);
-
-		/* Ran past the entries belonging to this bucket. */
-		if ((candidate % gb2312_revhash_size) != home)
-			return (0);
-
-		++slot;
-	}
-	return (0);
-}
-
-/*
-** Convert a 0-terminated array of unicode characters into a malloc()ed,
-** NUL-terminated GB2312 octet string.
-**
-** When err is not NULL it is first set to -1; if a character has no
-** mapping, *err receives its index in cp and NULL is returned.  When
-** err is NULL, an unmappable character is written as '?'.
-**
-** NULL is also returned when malloc() fails (*err then stays at -1).
-** The result must be released with free().  The u argument is unused.
-*/
-static char *u2c(const struct unicode_info *u,
-		 const unicode_char *cp, int *err)
-{
-	size_t need, in, out;
-	char *buf;
-
-	if (err)
-		*err= -1;
-
-	/*
-	** Size the output: anything up to 0x7f takes one octet, anything
-	** above it is budgeted two octets (it may end up using only one).
-	*/
-
-	need=0;
-	for (in=0; cp[in]; in++)
-		need += (cp[in] > 0x7f) ? 2:1;
-
-	buf=malloc(need+1);
-	if (!buf)
-		return (NULL);
-
-	out=0;
-	for (in=0; cp[in]; in++)
-	{
-		unicode_char ch=cp[in];
-		unsigned octets;
-
-		if (ch < (unicode_char)0x0080)
-		{
-			/* US-ASCII or GB 1988 (ISO 646 PRC version) */
-			buf[out++]= (char)ch;
-		}
-		else if (ch == (unicode_char)0x00A5)
-		{
-			/* Compatibility: YEN SIGN == yuan sign in GB 1988 */
-			buf[out++] = 0x24;
-		}
-		else if (ch == (unicode_char)0x203E)
-		{
-			/* Compatibility: OVERLINE in GB 1988 */
-			buf[out++] = 0x7E;
-		}
-		else if ((octets=revlookup(ch)) != 0)
-		{
-			/* High octet first, then low octet. */
-			buf[out++]= (char)(octets >> 8);
-			buf[out++]= (char)(octets & 0x00FF);
-		}
-		else if (err)
-		{
-			*err=in;
-			free(buf);
-			return (NULL);
-		}
-		else
-		{
-			buf[out++] = '?';
-		}
-	}
-	buf[out]=0;
-	return (buf);
-}
-
-/*
-** Return a malloc()ed copy of the GB2312 string cp with each character
-** passed through unicode_uc().  A converted character is only used when
-** revlookup() can map it back; otherwise the original is kept.
-**
-** ip is handed to c2u() as its error pointer; NULL is returned when
-** c2u() or the final u2c() returns NULL.
-*/
-static char *toupper_func(const struct unicode_info *u,
-			  const char *cp, int *ip)
-{
-	unicode_char *wide=c2u(u, cp, ip);
-	unicode_char *p;
-	char *result;
-
-	if (!wide)
-		return (NULL);
-
-	for (p=wide; *p; ++p)
-	{
-		unicode_char upper=unicode_uc(*p);
-
-		if (revlookup(upper))
-			*p=upper;
-	}
-
-	result=u2c(u, wide, NULL);
-	free(wide);
-	return (result);
-}
-
-/*
-** Return a malloc()ed copy of the GB2312 string cp with each character
-** passed through unicode_lc().  A converted character is only used when
-** revlookup() can map it back; otherwise the original is kept.
-**
-** ip is handed to c2u() as its error pointer; NULL is returned when
-** c2u() or the final u2c() returns NULL.
-*/
-static char *tolower_func(const struct unicode_info *u,
-			  const char *cp, int *ip)
-{
-	unicode_char *wide=c2u(u, cp, ip);
-	unicode_char *p;
-	char *result;
-
-	if (!wide)
-		return (NULL);
-
-	for (p=wide; *p; ++p)
-	{
-		unicode_char lower=unicode_lc(*p);
-
-		if (revlookup(lower))
-			*p=lower;
-	}
-
-	result=u2c(u, wide, NULL);
-	free(wide);
-	return (result);
-}
-
-/*
-** Return a malloc()ed copy of the GB2312 string cp with each character
-** passed through unicode_tc().  A converted character is only used when
-** revlookup() can map it back; otherwise the original is kept.
-**
-** ip is handed to c2u() as its error pointer; NULL is returned when
-** c2u() or the final u2c() returns NULL.
-*/
-static char *totitle_func(const struct unicode_info *u,
-			  const char *cp, int *ip)
-{
-	unicode_char *wide=c2u(u, cp, ip);
-	unicode_char *p;
-	char *result;
-
-	if (!wide)
-		return (NULL);
-
-	for (p=wide; *p; ++p)
-	{
-		unicode_char title=unicode_tc(*p);
-
-		if (revlookup(title))
-			*p=title;
-	}
-
-	result=u2c(u, wide, NULL);
-	free(wide);
-	return (result);
-}
-
-/*
-** Exported descriptor for GB2312: the "GB2312" label, a set of
-** UNICODE_* flag bits, and the conversion and case-mapping handlers
-** defined in this file.
-*/
-const struct unicode_info unicode_GB2312 = {
-	"GB2312",
-	UNICODE_MB
-		| UNICODE_REPLACEABLE
-		| UNICODE_USASCII
-		| UNICODE_HEADER_BASE64
-		| UNICODE_BODY_BASE64,
-	c2u,		/* GB2312 octets to unicode */
-	u2c,		/* unicode to GB2312 octets */
-	toupper_func,	/* maps characters through unicode_uc() */
-	tolower_func,	/* maps characters through unicode_lc() */
-	totitle_func	/* maps characters through unicode_tc() */
-};
-
-#if 0
-
-/*
-** Self-test, normally compiled out by the surrounding #if 0.
-**
-** Reads kIRG_GSource entries from a gzipped Unihan-3.2.0.txt in the
-** current directory and checks that each one survives a c2u()/u2c()
-** round trip and the three case-mapping functions.  It then exercises
-** the US-ASCII path, the replacement path (err == NULL) and the abort
-** path (err != NULL) in both directions.
-**
-** Returns 0 on success, or when the data file cannot be opened, and 1
-** on the first failed check.
-**
-** All converter calls pass &unicode_GB2312 to match the three-argument
-** signatures, and the popen() stream is closed with pclose().
-*/
-int main(void)
-{
-	FILE *fp=popen("gunzip -cd <Unihan-3.2.0.txt.gz", "r");
-	char buf[4000];
-	unicode_char *uc;
-	char *s, *p;
-	int dummyi;
-
-	if (!fp)
-		return (0);
-
-	while (fgets(buf, sizeof(buf), fp))
-	{
-		unsigned a, b, c;
-		int dummy;
-
-		if (sscanf(buf, "U+%4x kIRG_GSource 0-%4x", &b, &a) != 2)
-			continue;
-		/* Set the high bit of both octets of the scanned code. */
-		a |= 0x8080;
-
-		printf("0x%04x 0x%04x: ", a, b);
-
-		buf[0]= a / 256;
-		buf[1]= a % 256;
-		buf[2]=0;
-
-		uc=c2u(&unicode_GB2312, buf, &dummy);
-		if (!uc)
-		{
-			printf("c2u failure: %d\n", dummy);
-			return (1);
-		}
-		if (uc[0] != b || uc[1])
-		{
-			printf("c2u failure: got 0x%04x, expected 0x%04x\n",
-			       (unsigned)uc[0], (unsigned)b);
-			return (1);
-		}
-		s=u2c(&unicode_GB2312, uc, &dummy);
-		free(uc);
-		if (!s)
-		{
-			printf("u2c failure: %d\n", dummy);
-			return (1);
-		}
-
-		c=0;
-		if (!s[0] || !s[1] || s[2] ||
-		    (c=(int)(unsigned char)s[0] * 256 +
-		     (unsigned char)s[1]) != a)
-		{
-			printf("u2c failure: got 0x%04x, expected 0x%04x\n",
-			       c, a);
-			return (1);
-		}
-
-		p=toupper_func(&unicode_GB2312, s, NULL);
-		if (!p)
-		{
-			printf("toupper failure\n");
-			return (1);
-		}
-		if (strcmp(p, s))
-			printf("toupper ");
-		free(p);
-
-		p=tolower_func(&unicode_GB2312, s, NULL);
-		if (!p)
-		{
-			printf("tolower failure\n");
-			return (1);
-		}
-		if (strcmp(p, s))
-			printf("tolower ");
-		free(p);
-
-		p=totitle_func(&unicode_GB2312, s, NULL);
-		if (!p)
-		{
-			printf("totitle failure\n");
-			return (1);
-		}
-		if (strcmp(p, s))
-			printf("totitle ");
-		free(p);
-
-		free(s);
-		printf("ok\n");
-	}
-	/* A stream obtained from popen() must be closed with pclose(). */
-	pclose(fp);
-
-	/* Plain US-ASCII must convert in both directions. */
-	buf[0]=0x40;
-	buf[1]=0;
-	uc=c2u(&unicode_GB2312, buf, NULL);
-
-	if (!uc)
-	{
-		printf("us-ascii c2u failure\n");
-		return (1);
-	}
-	s=u2c(&unicode_GB2312, uc, NULL);
-	free(uc);
-	if (!s)
-	{
-		printf("us-asccu u2c failure\n");
-		return (1);
-	}
-	free(s);
-
-	/* A stray high-bit octet with err == NULL must be substituted. */
-	buf[0]=0x40;
-	buf[1]=0xF0;
-	buf[2]=0;
-
-	uc=c2u(&unicode_GB2312, buf, NULL);
-	if (!uc)
-	{
-		printf("fallback failed\n");
-		return (1);
-	}
-	printf("fallback: %04x %04x\n", (unsigned)uc[0],
-	       (unsigned)uc[1]);
-
-	s=u2c(&unicode_GB2312, uc, NULL);
-	free(uc);
-
-	if (!s)
-	{
-		printf("fallback-reverse failed\n");
-		return (1);
-	}
-	printf("fallback: %02x %02x\n", (int)(unsigned char)s[0],
-	       (int)(unsigned char)s[1]);
-	free(s);
-
-	/* An invalid trail byte with err != NULL must abort. */
-	buf[0]=0xB2;
-	buf[1]=0x20;
-	buf[2]=0;
-
-	uc=c2u(&unicode_GB2312, buf, &dummyi);
-
-	if (uc)
-	{
-		printf("abort failed\n");
-		return (1);
-	}
-
-	printf("aborted at index %d\n", dummyi);
-
-	{
-		static unicode_char testing[]={0x0040, 0x1000, 0};
-
-		uc=testing;
-
-		/* err == NULL: the unmappable character is substituted. */
-		s=u2c(&unicode_GB2312, uc, NULL);
-
-		if (!s)
-		{
-			printf("abort-fallback failed\n");
-			return (1);
-		}
-		printf("abort-fallback: %02x %02x\n", (int)(unsigned char)s[0],
-		       (int)(unsigned char)s[1]);
-		free(s);
-
-		uc=testing;
-	}
-
-	/* err != NULL: the same input must now be rejected. */
-	s=u2c(&unicode_GB2312, uc, &dummyi);
-
-	if (s)
-	{
-		printf("abort-abort failed\n");
-		return (1);
-	}
-
-	printf("abort-aborted: index %d\n", dummyi);
-	return (0);
-}
-#endif