2 ** Copyright 2000-2002 Double Precision, Inc.
3 ** See COPYING for distribution information.
5 ** $Id: gb2312.c,v 1.13 2004/05/23 14:28:24 mrsam Exp $
13 static const unicode_char
* const gb2312
[]= {
109 static unicode_char
*c2u(const struct unicode_info
*u
,
110 const char *cp
, int *err
)
119 ** Count the number of potential unicode characters first.
122 for (i
=cnt
=0; cp
[i
]; i
++)
124 if ( (int)(unsigned char)cp
[i
] < 0xA1 ||
125 (int)(unsigned char)cp
[i
] > 0xFE ||
136 uc
=malloc((cnt
+1)*sizeof(unicode_char
));
143 int a
=(int)(unsigned char)cp
[i
], b
;
145 if ( a
>= 0xA1 && a
<= 0xFE && cp
[i
+1])
148 b
=(int)(unsigned char)cp
[i
+1];
150 if (0xA1 <= b
&& b
<= 0xFE
152 && (ucv
=gb2312
[a
-0xA1][b
-0xA1]))
161 uc
[cnt
++] = (unicode_char
)0xFFFD;
164 else if (a
< (unsigned)0x80)
186 static unsigned revlookup(unicode_char c
)
192 bucket
=c
% gb2312_revhash_size
;
195 for (j
=gb2312_revtable_index
[ bucket
];
196 j
< sizeof(gb2312_revtable_uc
)/sizeof(gb2312_revtable_uc
[0]);
199 unicode_char uuc
=gb2312_revtable_uc
[j
];
202 return (gb2312_revtable_octets
[j
]);
204 if ((uuc
% gb2312_revhash_size
) != bucket
)
210 static char *u2c(const struct unicode_info
*u
,
211 const unicode_char
*cp
, int *err
)
219 ** Figure out the size of the octet string. Unicodes < 0x7f will
220 ** map to a single byte, unicodes >= 0x80 will map to two bytes.
223 for (i
=cnt
=0; cp
[i
]; i
++)
235 for (i
=0; cp
[i
]; i
++)
239 /* US-ASCII or GB 1988 (ISO 646 PRC version) */
240 if (cp
[i
] < (unicode_char
)0x0080)
242 s
[cnt
++]= (char)cp
[i
];
246 /* For compatibility: 2 characters replaced by GB 1988 */
247 if (cp
[i
] == (unicode_char
)0x00A5) /* YEN SIGN == yuan sign */
252 if (cp
[i
] == (unicode_char
)0x203E) /* OVERLINE */
272 s
[cnt
++]= (char)(uc
>> 8);
273 s
[cnt
++]= (char)(uc
& 0x00FF);
280 static char *toupper_func(const struct unicode_info
*u
,
281 const char *cp
, int *ip
)
283 unicode_char
*uc
=c2u(u
, cp
, ip
);
291 for (i
=0; uc
[i
]; i
++)
293 unicode_char c
=unicode_uc(uc
[i
]);
304 static char *tolower_func(const struct unicode_info
*u
,
305 const char *cp
, int *ip
)
307 unicode_char
*uc
=c2u(u
, cp
, ip
);
315 for (i
=0; uc
[i
]; i
++)
317 unicode_char c
=unicode_lc(uc
[i
]);
328 static char *totitle_func(const struct unicode_info
*u
,
329 const char *cp
, int *ip
)
331 unicode_char
*uc
=c2u(u
, cp
, ip
);
339 for (i
=0; uc
[i
]; i
++)
341 unicode_char c
=unicode_tc(uc
[i
]);
352 const struct unicode_info unicode_GB2312
= {
354 UNICODE_MB
| UNICODE_REPLACEABLE
| UNICODE_USASCII
|
355 UNICODE_HEADER_BASE64
| UNICODE_BODY_BASE64
,
366 FILE *fp
=popen("gunzip -cd <Unihan-3.2.0.txt.gz", "r");
375 while (fgets(buf
, sizeof(buf
), fp
))
380 if (sscanf(buf
, "U+%4x kIRG_GSource 0-%4x", &b
, &a
) != 2)
384 printf("0x%04x 0x%04x: ", a
, b
);
393 printf("c2u failure: %d\n", dummy
);
396 if (uc
[0] != b
|| uc
[1])
398 printf("c2u failure: got 0x%04x, expected 0x%04x\n",
399 (unsigned)uc
[0], (unsigned)b
);
406 printf("u2c failure: %d\n", dummy
);
411 if (!s
[0] || !s
[1] || s
[2] ||
412 (c
=(int)(unsigned char)s
[0] * 256 +
413 (unsigned char)s
[1]) != a
)
415 printf("u2c failure: got 0x%04x, expected 0x%04x\n",
420 p
=toupper_func(s
, NULL
);
423 printf("toupper failure\n");
430 p
=tolower_func(s
, NULL
);
433 printf("tolower failure\n");
440 p
=totitle_func(s
, NULL
);
443 printf("totitle failure\n");
461 printf("us-ascii c2u failure\n");
468 printf("us-asccu u2c failure\n");
480 printf("fallback failed\n");
483 printf("fallback: %04x %04x\n", (unsigned)uc
[0],
491 printf("fallback-reverse failed\n");
494 printf("fallback: %02x %02x\n", (int)(unsigned char)s
[0],
495 (int)(unsigned char)s
[1]);
502 uc
=c2u(buf
, &dummyi
);
506 printf("abort failed\n");
510 printf("aborted at index %d\n", dummyi
);
513 static unicode_char testing
[]={0x0040, 0x1000, 0};
521 printf("abort-fallback failed\n");
524 printf("abort-fallback: %02x %02x\n", (int)(unsigned char)s
[0],
525 (int)(unsigned char)s
[1]);
535 printf("abort-abort failed\n");
539 printf("abort-aborted: index %d\n", dummyi
);