Imported Upstream version 0.63.0
[hcoop/debian/courier-authlib.git] / unicode / iso2022jp.c
CommitLineData
8d138742
CE
1/*
2 * ISO-2022-JP <=> Unicode translate functions.
3 * by Norihisa Washitake <nori@washitake.com>
4 * US-ASCII/JIS X 0201/JIS X 0212 support
5 * by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>
6 *
7 * $Id: iso2022jp.c,v 1.12 2004/05/23 14:28:24 mrsam Exp $
8 *
9 * This conversion is highly expensive, so it is recommended
10 * that you do not include iso-2022-jp support unless you need it.
11 */
12
13/*
14 * Debug Option.
15 * If you want to build the iso2022jp test application,
16 * set JIS_DEBUG to a non-zero value and define JIS_BUILD_APP.
17 */
18#define JIS_DEBUG 0
19/* #define JIS_BUILD_APP */
20
21#include "iso2022jp.h"
22
23#if (JIS_DEBUG) > 0
24#ifdef JIS_BUILD_APP
25 #define JIS_OUT fprintf
26 #define JIS_OUT_FH stderr
27#else
28 #include <syslog.h>
29 #define JIS_OUT syslog
30 #define JIS_OUT_FH (LOG_MAIL|LOG_DEBUG)
31#endif
32#endif
33
34
35/*
36 * read_jis_char.
37 * -- from my second kanji conversion library in 2001. --
38 * Arguments:
39 * src: text in iso-2022-jp.
40 * ch: character info of each character.
41 * Returns:
42 * characters to be skipped in original text.
43 * this value is at least 1.
44 */
45
46static size_t read_jis_char(const char* src, struct jischar_t *ch)
47{
48 /*
49 * In most cases, JIS characters are grouped in 0x20
50 * characters. So we switch by value of src[0]/0x20.
51 */
52 switch (src[0] >> 5) {
53 case 0: /* 0x00 to 0x1F */
54 switch (src[0]) {
55 case JIS_CHAR_SI:
56 ch->type = JIS_TYPE_8BITKANA;
57 ch->value = 0;
58 return 1;
59 case JIS_CHAR_SO:
60 ch->type = JIS_TYPE_ASCII;
61 ch->value = 0;
62 return 1;
63 case JIS_CHAR_ESC:
64 ch->value = 0;
65 switch (src[1]) {
66 case '(': /* 94 character set (G0) */
67 switch (src[2]) {
68 case 'B': /* US-ASCII */
69 ch->type = JIS_TYPE_ASCII;
70 return 3;
71 case 'I': /* JIS X 0201 GR */
72 ch->type = JIS_TYPE_7BITKANA;
73 return 3;
74 case 'J': /* JIS X 0201 GL */
75 ch->type = JIS_TYPE_ROMAN;
76 return 3;
77 default:
78 ch->type = JIS_TYPE_ASCII;
79 ch->value = JIS_CHAR_ESC;
80 return 1;
81 }
82 case '$': /* 94/96n character set */
83 switch (src[2]) {
84 case '@': /* JIS C 6226:1978 */
85 ch->type = JIS_TYPE_JISX0208_1978;
86 return 3;
87 case 'B': /* JIS X 0208:1983/1990/1997 */
88 ch->type = JIS_TYPE_JISX0208;
89 return 3;
90 case '(':
91 switch (src[3]) {
92 case '@': /* JIS C 6226:1978 */
93 ch->type = JIS_TYPE_JISX0208_1978;
94 return 4;
95 case 'B': /* JIS X 0208:1983/1990/1997 */
96 ch->type = JIS_TYPE_JISX0208;
97 return 4;
98 case 'D': /* JIS X 0212:1990 */
99 ch->type = JIS_TYPE_JISX0212;
100 return 4;
101 default:
102 ch->type = JIS_TYPE_BINARY;
103 ch->value = JIS_CHAR_ESC;
104 return 1;
105 }
106 default:
107 ch->type = JIS_TYPE_BINARY;
108 ch->value = JIS_CHAR_ESC;
109 return 1;
110 }
111 case 'K': /* NEC KANJI (IN) */
112 ch->type = JIS_TYPE_JISX0208_1978;
113 return 1;
114 case 'H': /* NEC KANJI (OUT) */
115 ch->type = JIS_TYPE_ASCII;
116 return 1;
117 }
118 default:
119 ch->type = JIS_TYPE_BINARY;
120 ch->value = src[0];
121 return 1;
122 }
123 case 1: /* 0x20 to 0x3F */
124 case 2: /* 0x40 to 0x5F */
125 if (ch->type == JIS_TYPE_7BITKANA) {
126 ch->value = src[0] + 0x80;
127 return 1;
128 }
129 /* Other than 7bit kana are passed to next */
130 case 3: /* 0x60 to 0x7F */
131 if (src[0] == 0x7F) {
132 ch->type = JIS_TYPE_BINARY;
133 ch->value = src[0];
134 return 1;
135 }
136 if ((ch->type == JIS_TYPE_JISX0208
137 || ch->type == JIS_TYPE_JISX0208_1978
138 || ch->type == JIS_TYPE_JISX0212) && src[1]) {
139 ch->value = (src[0] * 0x100) + src[1];
140 return 2;
141 }
142 ch->value = src[0];
143 return 1;
144 case 4: /* 0x80 to 0x9F */
145 ch->value = src[0];
146 ch->type = JIS_TYPE_BINARY;
147 return 1;
148 case 5: /* 0xA0 to 0xBF */
149 case 6: /* 0xC0 to 0xDF */
150 if (ch->type == JIS_TYPE_8BITKANA) {
151 if (0xA0 < (unsigned)src[0] && (unsigned)src[0] <= 0xDF) {
152 ch->value = (unsigned char)src[0];
153 return 1;
154 }
155 }
156 ch->type = JIS_TYPE_BINARY;
157 ch->value = (unsigned char)src[0];
158 return 1;
159 case 7: /* 0xE0 to 0xFF */
160 ch->value = (unsigned char)src[0];
161 ch->type = JIS_TYPE_BINARY;
162 return 1;
163 default:
164 ch->value = (unsigned char)src[0];
165 ch->type = JIS_TYPE_BINARY;
166 return 1;
167 }
168}
169
170static unicode_char c2u_conv(int j, int jis_type)
171{
172 unsigned int upper = (j >> 8);
173 unsigned int lower = j & 0xFF;
174 const unicode_char **tbls;
175
176 if (!upper)
177 {
178 switch (jis_type)
179 {
180 /* JIS X 0201 GR */
181 case JIS_TYPE_7BITKANA:
182 case JIS_TYPE_8BITKANA:
183 if (0xA1 <= lower && lower <=0xDF)
184 return (unicode_char)(lower + (0xFF9F - 0xDF));
185 else
186 return (unicode_char)0xFFFD;
187 break;
188
189 /* JIS X 0201 GL */
190 case JIS_TYPE_ROMAN:
191 /* 2 characters replaced by JIS X 0201 */
192 if (lower == 0x5C) /* REVERSE SOLIDUS -> YEN SIGN */
193 return (unicode_char)0x00A5;
194 if (lower == 0x7E) /* TILDE -> OVERLINE */
195 return (unicode_char)0x203E;
196 /* break; */
197 /* US-ASCII or Control characters */
198 case JIS_TYPE_ASCII:
199 case JIS_TYPE_BINARY:
200 if (lower < 0x80)
201 return (unicode_char)lower;
202 else
203 return (unicode_char)0xFFFD;
204 break;
205
206 /* Otherwise return REPLACEMENT CHARACTER. */
207 default:
208 return (unicode_char)0xFFFD;
209 }
210 }
211
212 switch (jis_type)
213 {
214 /* JIS X 0208:1983/1990/1997 */
215 case JIS_TYPE_JISX0208:
216 tbls = jisx0208_to_uni_tbls;
217 break;
218
219 /* JIS C 6226:1978 */
220 case JIS_TYPE_JISX0208_1978:
221 tbls = jisx0208_1978_to_uni_tbls;
222 break;
223
224 /* JIS X 0212:1990 */
225 case JIS_TYPE_JISX0212:
226 tbls = jisx0212_to_uni_tbls;
227 break;
228
229 /* Otherwise return REPLACEMENT CHARACTER. */
230 default:
231 return (unicode_char)0xFFFD;
232 break;
233 }
234
235 if (0x20 < upper && upper < 0x7F
236 && 0x20 < lower && lower < 0x7F)
237 {
238 if (tbls[upper-0x21] != NULL
239 && tbls[upper-0x21][lower-0x21] != (unicode_char)0x003F)
240 {
241 if (tbls[upper-0x21][lower-0x21])
242 return tbls[upper-0x21][lower-0x21];
243 return (unicode_char)0xFFFD;
244 }
245 }
246
247 /* we should think of 8bit-JIS, maybe. */
248 /* but currently returns the REPLACEMENT CHARACTER. */
249 return (unicode_char)0xFFFD;
250}
251
252static unicode_char *c2u(const struct unicode_info *u,
253 const char *jis_str, int *err)
254{
255 size_t i, cnt, w;
256 unicode_char *uc;
257 struct jischar_t jchar;
258
259 if (err)
260 *err = -1;
261
262 /* Count the number of potential unicode characters first. */
263 i = cnt = 0;
264 jchar.type = 0;
265 jchar.value = 0;
266 while (jis_str[i]) {
267 i += read_jis_char(&jis_str[i], &jchar);
268 if (jchar.value)
269 ++cnt;
270 }
271
272 uc = malloc((cnt+1) * sizeof(unicode_char));
273#if JIS_DEBUG>0
274 if (uc)
275 JIS_OUT(JIS_OUT_FH, "c2u: allocated heap; 0x%04X bytes.\n", cnt+1);
276 else
277 JIS_OUT(JIS_OUT_FH, "c2u: heap allocation failed; 0x%04X bytes.\n", cnt+1);
278#endif
279 if (!uc)
280 return (NULL);
281
282 i = cnt = 0;
283 jchar.type = 0;
284 jchar.value = 0;
285 while (jis_str[i]) {
286 w = read_jis_char(&jis_str[i], &jchar);
287 if (jchar.value) {
288 uc[cnt] = c2u_conv(jchar.value, jchar.type);
289#if JIS_DEBUG > 1
290 JIS_OUT(JIS_OUT_FH, "c2u: converted; JIS 0x%04X => U+%04X", jchar.value, uc[cnt]);
291#endif
292 if (uc[cnt] == (unicode_char)0xFFFD && err)
293 {
294 *err = i;
295 free(uc);
296 return NULL;
297 }
298 ++cnt;
299 }
300 i+=w;
301 }
302
303 uc[cnt] = 0;
304#if JIS_DEBUG > 0
305 JIS_OUT(JIS_OUT_FH, "c2u: end of heap; 0x%04X bytes.", cnt+1);
306#endif
307 return (uc);
308}
309
310static void revlookup(unicode_char u, struct jischar_t *ch)
311{
312 unsigned int upper = u >> 8;
313 unsigned int lower = u & 0xff;
314
315 /* ISO-2022-JP(-1) is mapped inside BMP range. */
316 if (u >= (unicode_char)0x10000)
317 {
318 ch->type = JIS_TYPE_BINARY;
319 ch->value = 0x003F;
320 return;
321 }
322
323 /* US-ASCII */
324 if (u < (unicode_char)0x0080)
325 {
326 ch->type = JIS_TYPE_ASCII;
327 ch->value = (unsigned)u;
328 return;
329 }
330
331 /* 2 Characters replaced by JIS X 0201 */
332 if (u == (unicode_char)0x00a5)
333 {
334 ch->type = JIS_TYPE_ROMAN;
335 ch->value = 0x5C;
336 return;
337 }
338 if (u == (unicode_char)0x203E)
339 {
340 ch->type = JIS_TYPE_ROMAN;
341 ch->value = 0x7E;
342 return;
343 }
344
345 /* JIS X 0201 GR */
346 if ((unicode_char)0xFF61 <= u && u <= (unicode_char)0xFF9F)
347 {
348 ch->type = JIS_TYPE_8BITKANA;
349 ch->value = u - (unsigned)0xFF40 + (unsigned)0x80;
350 return;
351 }
352
353 /* JIS X 0208/JIS X 0212 */
354 if (uni_to_jisx0208_tbls[upper] != NULL
355 && uni_to_jisx0208_tbls[upper][lower] != 0x003F)
356 {
357 ch->type = JIS_TYPE_JISX0208;
358 ch->value = uni_to_jisx0208_tbls[upper][lower];
359 return;
360 }
361 if (uni_to_jisx0212_tbls[upper] != NULL
362 && uni_to_jisx0212_tbls[upper][lower] != 0x003F)
363 {
364 ch->type = JIS_TYPE_JISX0212;
365 ch->value = uni_to_jisx0212_tbls[upper][lower];
366 return;
367 }
368
369 /* return 'unknown' character if unknown */
370 ch->type = JIS_TYPE_BINARY;
371 ch->value = 0x003F;
372 return;
373}
374
375#if 0
376static int get_iso2022jp_type(unsigned j)
377{
378 if (0xA0 < j && j < 0xE0)
379 return JIS_TYPE_8BITKANA;
380 if (j > 0xff)
381 return JIS_TYPE_KANJI;
382 return JIS_TYPE_ASCII;
383}
384#endif
385
386static char *u2c(const struct unicode_info *u,
387 const unicode_char *str, int *err)
388{
389 size_t i, cnt;
390 int j;
391 int jtype = JIS_TYPE_ASCII;
392 int jt;
393 char *s;
394 struct jischar_t ch;
395
396 if (err)
397 *err = -1;
398
399 for (i = cnt = 0; str[i]; i++) {
400 revlookup(str[i], &ch);
401 jt = ch.type;
402 j = ch.value;
403 if (jt != jtype) {
404 cnt += ((jt == JIS_TYPE_JISX0212) ? 4 : 3);
405 jtype = jt;
406 }
407 cnt += ((jtype == JIS_TYPE_JISX0208 || jtype == JIS_TYPE_JISX0212) ? 2 : 1);
408 }
409 if (jtype != JIS_TYPE_ASCII && jtype != JIS_TYPE_BINARY)
410 cnt += 3;
411
412 s = malloc(cnt+1);
413#if JIS_DEBUG > 0
414 if (s)
415 JIS_OUT(JIS_OUT_FH, "u2c: allocated heap; 0x%04X bytes.\n", cnt+1);
416 else
417 JIS_OUT(JIS_OUT_FH, "u2c: heap allocation failed; 0x%04X bytes.\n", cnt+1);
418#endif
419 if (!s)
420 return (NULL);
421
422 jtype = JIS_TYPE_ASCII;
423 for (i = cnt = 0; str[i]; i++) {
424 revlookup(str[i], &ch);
425
426 jt = ch.type;
427 j = ch.value;
428 if (jt != jtype) {
429 switch (jt) {
430 case JIS_TYPE_JISX0208:
431 s[cnt++] = JIS_CHAR_ESC;
432 s[cnt++] = '$';
433 s[cnt++] = 'B';
434#if JIS_DEBUG > 2
435 JIS_OUT(JIS_OUT_FH, "u2c: changed map; JIS_TYPE_JISX0208.\n");
436#endif
437 break;
438 case JIS_TYPE_JISX0212:
439 s[cnt++] = JIS_CHAR_ESC;
440 s[cnt++] = '$';
441 s[cnt++] = '(';
442 s[cnt++] = 'D';
443 break;
444 case JIS_TYPE_7BITKANA:
445 case JIS_TYPE_8BITKANA:
446 s[cnt++] = JIS_CHAR_ESC;
447 s[cnt++] = '(';
448 s[cnt++] = 'I';
449#if JIS_DEBUG > 2
450 JIS_OUT(JIS_OUT_FH, "u2c: changed map; JIS_TYPE_8BITKANA.\n");
451#endif
452 break;
453 case JIS_TYPE_ROMAN:
454 s[cnt++] = JIS_CHAR_ESC;
455 s[cnt++] = '(';
456 s[cnt++] = 'J';
457 break;
458 default:
459 s[cnt++] = JIS_CHAR_ESC;
460 s[cnt++] = '(';
461 s[cnt++] = 'B';
462#if JIS_DEBUG > 2
463 JIS_OUT(JIS_OUT_FH, "u2c: changed map; JIS_TYPE_ASCII.\n");
464#endif
465 break;
466 }
467 jtype = jt;
468 }
469 switch (jtype) {
470 case JIS_TYPE_JISX0208:
471 case JIS_TYPE_JISX0212:
472 s[cnt++] = (char)(j >> 8);
473 s[cnt++] = (char)(j & 0xff);
474 break;
475 case JIS_TYPE_7BITKANA:
476 case JIS_TYPE_8BITKANA:
477 s[cnt++] = (char)(j - 0x80);
478 break;
479 default:
480 s[cnt++] = (char)j;
481 break;
482 }
483#if JIS_DEBUG > 1
484 JIS_OUT(JIS_OUT_FH, "u2c: converted; U+%04X => JIS 0x%04X\n", str[i], j);
485#endif
486 if (jtype == JIS_TYPE_BINARY && j == 0x003F)
487 if (err)
488 {
489 *err = i;
490 free(s);
491 return NULL;
492 }
493 }
494 if (jtype != JIS_TYPE_ASCII && jtype != JIS_TYPE_BINARY) {
495 s[cnt++] = JIS_CHAR_ESC;
496 s[cnt++] = '(';
497 s[cnt++] = 'B';
498 }
499 s[cnt] = '\x0';
500
501#if JIS_DEBUG > 0
502 JIS_OUT(JIS_OUT_FH, "u2c: end of heap; 0x%04X bytes.\n", cnt+1);
503#endif
504 return s;
505}
506
507static char *toupper_func(const struct unicode_info *u,
508 const char *cp, int *ip)
509{
510 unicode_char *uc = c2u(u, cp, ip);
511 char *s;
512 size_t i;
513
514 if (!uc)
515 return (NULL);
516
517 for (i=0; uc[i]; i++) {
518 if ((unicode_char)'a' <= uc[i] && uc[i] <= (unicode_char)'z')
519 uc[i] = uc[i] - ((unicode_char)'a' - (unicode_char)'A');
520 }
521
522 s = u2c(u, uc, NULL);
523 free(uc);
524 return (s);
525}
526
527static char *tolower_func(const struct unicode_info *u,
528 const char *cp, int *ip)
529{
530 unicode_char *uc = c2u(u, cp, ip);
531 char *s;
532 size_t i;
533
534 if (!uc)
535 return (NULL);
536
537 for (i=0; uc[i]; i++) {
538 if ((unicode_char)'A' <= uc[i] && uc[i] <= (unicode_char)'Z')
539 uc[i] = uc[i] + ((unicode_char)'a' - (unicode_char)'A');
540 }
541
542 s = u2c(u, uc, NULL);
543 free(uc);
544
545 return (s);
546}
547
548
549static char *totitle_func(const struct unicode_info *u,
550 const char *cp, int *ip)
551{
552 unicode_char *uc = c2u(u, cp, ip);
553 char *s;
554
555 if (!uc)
556 return (NULL);
557
558 /* Uh, sorry, what's "title" char? */
559 /*
560 * for (i=0; uc[i]; i++)
561 * uc[i] = unicode_tc(uc[i]);
562 */
563
564 s = u2c(u, uc, NULL);
565 free(uc);
566 return (s);
567}
568
569extern const struct unicode_info unicode_UTF8;
570
571const struct unicode_info unicode_ISO2022_JP = {
572 "ISO-2022-JP",
573 UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_SISO |
574 UNICODE_HEADER_BASE64,
575 c2u,
576 u2c,
577 toupper_func,
578 tolower_func,
579 totitle_func,
580 &unicode_UTF8
581};
582
583const struct unicode_info unicode_ISO2022_JP_1 = {
584 "ISO-2022-JP-1",
585 UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_SISO |
586 UNICODE_HEADER_BASE64,
587 c2u,
588 u2c,
589 toupper_func,
590 tolower_func,
591 totitle_func,
592 &unicode_UTF8
593};
594
595#if (JIS_DEBUG > 0) && defined(JIS_BUILD_APP)
596int main(int argc, char** argv)
597{
598 FILE* fp;
599 char c;
600 int cnt;
601 char* str;
602 unicode_char* ustr;
603 char* jstr;
604 int i;
605
606 if (argc<2) {
607 JIS_OUT(JIS_OUT_FH, "usage: %s filename(s)\n", argv[0]);
608 exit(1);
609 }
610
611 while (argc > 1) {
612 --argc;
613 JIS_OUT(JIS_OUT_FH, "main: opening file %s.\n", argv[argc]);
614 fp = fopen(argv[argc], "r");
615 cnt=0;
616 while (c = fgetc(fp) != EOF)
617 cnt++;
618
619 str = malloc(cnt+1);
620 fseek(fp, 0, SEEK_SET);
621 fread(str, cnt, 1, fp);
622 str[cnt] = 0;
623
624 ustr = c2u(str, NULL);
625 /* for (i=0; ustr[i]; i++)
626 * putchar(ustr[i]);
627 */
628 jstr = u2c(ustr, NULL);
629 for (i=0; jstr[i]; i++)
630 putchar(jstr[i]);
631
632 free(jstr);
633 free(ustr);
634 free(str);
635 }
636 return 1;
637}
638#endif /* defined(JIS_BUILD_APP) */