2 * ISO-2022-JP <=> Unicode translate functions.
3 * by Norihisa Washitake <nori@washitake.com>
4 * US-ASCII/JIS X 0201/JIS X 0212 support
5 * by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>
7 * $Id: iso2022jp.c,v 1.12 2004/05/23 14:28:24 mrsam Exp $
9 * This conversion is highly expensive, so it is recommended
10 * that you do not include ISO-2022-JP support unless you need it.
15 * If you want to build the iso2022jp test application,
16 * please set the value of _DEBUG to non-zero.
19 /* #define JIS_BUILD_APP */
21 #include "iso2022jp.h"
25 #define JIS_OUT fprintf
26 #define JIS_OUT_FH stderr
29 #define JIS_OUT syslog
30 #define JIS_OUT_FH (LOG_MAIL|LOG_DEBUG)
37 * -- from my second kanji conversion library in 2001. --
39 * src: text in iso-2022-jp.
40 * ch: character info of each character.
42 * characters to be skipped in original text.
43 * this value is at least 1.
46 static size_t read_jis_char(const char* src
, struct jischar_t
*ch
)
49 * In most cases, JIS characters are grouped in 0x20
50 * characters. So we switch by value of src[0]/0x20.
52 switch (src
[0] >> 5) {
53 case 0: /* 0x00 to 0x1F */
56 ch
->type
= JIS_TYPE_8BITKANA
;
60 ch
->type
= JIS_TYPE_ASCII
;
66 case '(': /* 94 character set (G0) */
68 case 'B': /* US-ASCII */
69 ch
->type
= JIS_TYPE_ASCII
;
71 case 'I': /* JIS X 0201 GR */
72 ch
->type
= JIS_TYPE_7BITKANA
;
74 case 'J': /* JIS X 0201 GL */
75 ch
->type
= JIS_TYPE_ROMAN
;
78 ch
->type
= JIS_TYPE_ASCII
;
79 ch
->value
= JIS_CHAR_ESC
;
82 case '$': /* 94/96n character set */
84 case '@': /* JIS C 6226:1978 */
85 ch
->type
= JIS_TYPE_JISX0208_1978
;
87 case 'B': /* JIS X 0208:1983/1990/1997 */
88 ch
->type
= JIS_TYPE_JISX0208
;
92 case '@': /* JIS C 6226:1978 */
93 ch
->type
= JIS_TYPE_JISX0208_1978
;
95 case 'B': /* JIS X 0208:1983/1990/1997 */
96 ch
->type
= JIS_TYPE_JISX0208
;
98 case 'D': /* JIS X 0212:1990 */
99 ch
->type
= JIS_TYPE_JISX0212
;
102 ch
->type
= JIS_TYPE_BINARY
;
103 ch
->value
= JIS_CHAR_ESC
;
107 ch
->type
= JIS_TYPE_BINARY
;
108 ch
->value
= JIS_CHAR_ESC
;
111 case 'K': /* NEC KANJI (IN) */
112 ch
->type
= JIS_TYPE_JISX0208_1978
;
114 case 'H': /* NEC KANJI (OUT) */
115 ch
->type
= JIS_TYPE_ASCII
;
119 ch
->type
= JIS_TYPE_BINARY
;
123 case 1: /* 0x20 to 0x3F */
124 case 2: /* 0x40 to 0x5F */
125 if (ch
->type
== JIS_TYPE_7BITKANA
) {
126 ch
->value
= src
[0] + 0x80;
129 /* Other than 7bit kana are passed to next */
130 case 3: /* 0x60 to 0x7F */
131 if (src
[0] == 0x7F) {
132 ch
->type
= JIS_TYPE_BINARY
;
136 if ((ch
->type
== JIS_TYPE_JISX0208
137 || ch
->type
== JIS_TYPE_JISX0208_1978
138 || ch
->type
== JIS_TYPE_JISX0212
) && src
[1]) {
139 ch
->value
= (src
[0] * 0x100) + src
[1];
144 case 4: /* 0x80 to 0x9F */
146 ch
->type
= JIS_TYPE_BINARY
;
148 case 5: /* 0xA0 to 0xBF */
149 case 6: /* 0xC0 to 0xDF */
150 if (ch
->type
== JIS_TYPE_8BITKANA
) {
151 if (0xA0 < (unsigned)src
[0] && (unsigned)src
[0] <= 0xDF) {
152 ch
->value
= (unsigned char)src
[0];
156 ch
->type
= JIS_TYPE_BINARY
;
157 ch
->value
= (unsigned char)src
[0];
159 case 7: /* 0xE0 to 0xFF */
160 ch
->value
= (unsigned char)src
[0];
161 ch
->type
= JIS_TYPE_BINARY
;
164 ch
->value
= (unsigned char)src
[0];
165 ch
->type
= JIS_TYPE_BINARY
;
170 static unicode_char
c2u_conv(int j
, int jis_type
)
172 unsigned int upper
= (j
>> 8);
173 unsigned int lower
= j
& 0xFF;
174 const unicode_char
**tbls
;
181 case JIS_TYPE_7BITKANA
:
182 case JIS_TYPE_8BITKANA
:
183 if (0xA1 <= lower
&& lower
<=0xDF)
184 return (unicode_char
)(lower
+ (0xFF9F - 0xDF));
186 return (unicode_char
)0xFFFD;
191 /* 2 characters replaced by JIS X 0201 */
192 if (lower
== 0x5C) /* REVERSE SOLIDUS -> YEN SIGN */
193 return (unicode_char
)0x00A5;
194 if (lower
== 0x7E) /* TILDE -> OVERLINE */
195 return (unicode_char
)0x203E;
197 /* US-ASCII or Control characters */
199 case JIS_TYPE_BINARY
:
201 return (unicode_char
)lower
;
203 return (unicode_char
)0xFFFD;
206 /* Otherwise return REPLACEMENT CHARACTER. */
208 return (unicode_char
)0xFFFD;
214 /* JIS X 0208:1983/1990/1997 */
215 case JIS_TYPE_JISX0208
:
216 tbls
= jisx0208_to_uni_tbls
;
219 /* JIS C 6226:1978 */
220 case JIS_TYPE_JISX0208_1978
:
221 tbls
= jisx0208_1978_to_uni_tbls
;
224 /* JIS X 0212:1990 */
225 case JIS_TYPE_JISX0212
:
226 tbls
= jisx0212_to_uni_tbls
;
229 /* Otherwise return REPLACEMENT CHARACTER. */
231 return (unicode_char
)0xFFFD;
235 if (0x20 < upper
&& upper
< 0x7F
236 && 0x20 < lower
&& lower
< 0x7F)
238 if (tbls
[upper
-0x21] != NULL
239 && tbls
[upper
-0x21][lower
-0x21] != (unicode_char
)0x003F)
241 if (tbls
[upper
-0x21][lower
-0x21])
242 return tbls
[upper
-0x21][lower
-0x21];
243 return (unicode_char
)0xFFFD;
247 /* we should think of 8bit-JIS, maybe. */
248 /* but currently returns the REPLACEMENT CHARACTER. */
249 return (unicode_char
)0xFFFD;
252 static unicode_char
*c2u(const struct unicode_info
*u
,
253 const char *jis_str
, int *err
)
257 struct jischar_t jchar
;
262 /* Count the number of potential unicode characters first. */
267 i
+= read_jis_char(&jis_str
[i
], &jchar
);
272 uc
= malloc((cnt
+1) * sizeof(unicode_char
));
275 JIS_OUT(JIS_OUT_FH
, "c2u: allocated heap; 0x%04X bytes.\n", cnt
+1);
277 JIS_OUT(JIS_OUT_FH
, "c2u: heap allocation failed; 0x%04X bytes.\n", cnt
+1);
286 w
= read_jis_char(&jis_str
[i
], &jchar
);
288 uc
[cnt
] = c2u_conv(jchar
.value
, jchar
.type
);
290 JIS_OUT(JIS_OUT_FH
, "c2u: converted; JIS 0x%04X => U+%04X", jchar
.value
, uc
[cnt
]);
292 if (uc
[cnt
] == (unicode_char
)0xFFFD && err
)
305 JIS_OUT(JIS_OUT_FH
, "c2u: end of heap; 0x%04X bytes.", cnt
+1);
310 static void revlookup(unicode_char u
, struct jischar_t
*ch
)
312 unsigned int upper
= u
>> 8;
313 unsigned int lower
= u
& 0xff;
315 /* ISO-2022-JP(-1) is mapped inside BMP range. */
316 if (u
>= (unicode_char
)0x10000)
318 ch
->type
= JIS_TYPE_BINARY
;
324 if (u
< (unicode_char
)0x0080)
326 ch
->type
= JIS_TYPE_ASCII
;
327 ch
->value
= (unsigned)u
;
331 /* 2 Characters replaced by JIS X 0201 */
332 if (u
== (unicode_char
)0x00a5)
334 ch
->type
= JIS_TYPE_ROMAN
;
338 if (u
== (unicode_char
)0x203E)
340 ch
->type
= JIS_TYPE_ROMAN
;
346 if ((unicode_char
)0xFF61 <= u
&& u
<= (unicode_char
)0xFF9F)
348 ch
->type
= JIS_TYPE_8BITKANA
;
349 ch
->value
= u
- (unsigned)0xFF40 + (unsigned)0x80;
353 /* JIS X 0208/JIS X 0212 */
354 if (uni_to_jisx0208_tbls
[upper
] != NULL
355 && uni_to_jisx0208_tbls
[upper
][lower
] != 0x003F)
357 ch
->type
= JIS_TYPE_JISX0208
;
358 ch
->value
= uni_to_jisx0208_tbls
[upper
][lower
];
361 if (uni_to_jisx0212_tbls
[upper
] != NULL
362 && uni_to_jisx0212_tbls
[upper
][lower
] != 0x003F)
364 ch
->type
= JIS_TYPE_JISX0212
;
365 ch
->value
= uni_to_jisx0212_tbls
[upper
][lower
];
369 /* return 'unknown' character if unknown */
370 ch
->type
= JIS_TYPE_BINARY
;
376 static int get_iso2022jp_type(unsigned j
)
378 if (0xA0 < j
&& j
< 0xE0)
379 return JIS_TYPE_8BITKANA
;
381 return JIS_TYPE_KANJI
;
382 return JIS_TYPE_ASCII
;
386 static char *u2c(const struct unicode_info
*u
,
387 const unicode_char
*str
, int *err
)
391 int jtype
= JIS_TYPE_ASCII
;
399 for (i
= cnt
= 0; str
[i
]; i
++) {
400 revlookup(str
[i
], &ch
);
404 cnt
+= ((jt
== JIS_TYPE_JISX0212
) ? 4 : 3);
407 cnt
+= ((jtype
== JIS_TYPE_JISX0208
|| jtype
== JIS_TYPE_JISX0212
) ? 2 : 1);
409 if (jtype
!= JIS_TYPE_ASCII
&& jtype
!= JIS_TYPE_BINARY
)
415 JIS_OUT(JIS_OUT_FH
, "u2c: allocated heap; 0x%04X bytes.\n", cnt
+1);
417 JIS_OUT(JIS_OUT_FH
, "u2c: heap allocation failed; 0x%04X bytes.\n", cnt
+1);
422 jtype
= JIS_TYPE_ASCII
;
423 for (i
= cnt
= 0; str
[i
]; i
++) {
424 revlookup(str
[i
], &ch
);
430 case JIS_TYPE_JISX0208
:
431 s
[cnt
++] = JIS_CHAR_ESC
;
435 JIS_OUT(JIS_OUT_FH
, "u2c: changed map; JIS_TYPE_JISX0208.\n");
438 case JIS_TYPE_JISX0212
:
439 s
[cnt
++] = JIS_CHAR_ESC
;
444 case JIS_TYPE_7BITKANA
:
445 case JIS_TYPE_8BITKANA
:
446 s
[cnt
++] = JIS_CHAR_ESC
;
450 JIS_OUT(JIS_OUT_FH
, "u2c: changed map; JIS_TYPE_8BITKANA.\n");
454 s
[cnt
++] = JIS_CHAR_ESC
;
459 s
[cnt
++] = JIS_CHAR_ESC
;
463 JIS_OUT(JIS_OUT_FH
, "u2c: changed map; JIS_TYPE_ASCII.\n");
470 case JIS_TYPE_JISX0208
:
471 case JIS_TYPE_JISX0212
:
472 s
[cnt
++] = (char)(j
>> 8);
473 s
[cnt
++] = (char)(j
& 0xff);
475 case JIS_TYPE_7BITKANA
:
476 case JIS_TYPE_8BITKANA
:
477 s
[cnt
++] = (char)(j
- 0x80);
484 JIS_OUT(JIS_OUT_FH
, "u2c: converted; U+%04X => JIS 0x%04X\n", str
[i
], j
);
486 if (jtype
== JIS_TYPE_BINARY
&& j
== 0x003F)
494 if (jtype
!= JIS_TYPE_ASCII
&& jtype
!= JIS_TYPE_BINARY
) {
495 s
[cnt
++] = JIS_CHAR_ESC
;
502 JIS_OUT(JIS_OUT_FH
, "u2c: end of heap; 0x%04X bytes.\n", cnt
+1);
507 static char *toupper_func(const struct unicode_info
*u
,
508 const char *cp
, int *ip
)
510 unicode_char
*uc
= c2u(u
, cp
, ip
);
517 for (i
=0; uc
[i
]; i
++) {
518 if ((unicode_char
)'a' <= uc
[i
] && uc
[i
] <= (unicode_char
)'z')
519 uc
[i
] = uc
[i
] - ((unicode_char
)'a' - (unicode_char
)'A');
522 s
= u2c(u
, uc
, NULL
);
527 static char *tolower_func(const struct unicode_info
*u
,
528 const char *cp
, int *ip
)
530 unicode_char
*uc
= c2u(u
, cp
, ip
);
537 for (i
=0; uc
[i
]; i
++) {
538 if ((unicode_char
)'A' <= uc
[i
] && uc
[i
] <= (unicode_char
)'Z')
539 uc
[i
] = uc
[i
] + ((unicode_char
)'a' - (unicode_char
)'A');
542 s
= u2c(u
, uc
, NULL
);
549 static char *totitle_func(const struct unicode_info
*u
,
550 const char *cp
, int *ip
)
552 unicode_char
*uc
= c2u(u
, cp
, ip
);
558 /* Uh, sorry, what's "title" char? */
560 * for (i=0; uc[i]; i++)
561 * uc[i] = unicode_tc(uc[i]);
564 s
= u2c(u
, uc
, NULL
);
569 extern const struct unicode_info unicode_UTF8
;
571 const struct unicode_info unicode_ISO2022_JP
= {
573 UNICODE_MB
| UNICODE_REPLACEABLE
| UNICODE_SISO
|
574 UNICODE_HEADER_BASE64
,
583 const struct unicode_info unicode_ISO2022_JP_1
= {
585 UNICODE_MB
| UNICODE_REPLACEABLE
| UNICODE_SISO
|
586 UNICODE_HEADER_BASE64
,
595 #if (JIS_DEBUG > 0) && defined(JIS_BUILD_APP)
596 int main(int argc
, char** argv
)
607 JIS_OUT(JIS_OUT_FH
, "usage: %s filename(s)\n", argv
[0]);
613 JIS_OUT(JIS_OUT_FH
, "main: opening file %s.\n", argv
[argc
]);
614 fp
= fopen(argv
[argc
], "r");
616 while (c
= fgetc(fp
) != EOF
)
620 fseek(fp
, 0, SEEK_SET
);
621 fread(str
, cnt
, 1, fp
);
624 ustr
= c2u(str
, NULL
);
625 /* for (i=0; ustr[i]; i++)
628 jstr
= u2c(ustr
, NULL
);
629 for (i
=0; jstr
[i
]; i
++)
638 #endif /* defined(JIS_BUILD_APP) */