2 * ISO-2022-KR, EUC-KR & CP949 <=> Unicode translate functions.
3 * by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>
11 #define EUCKR_CP949_EXTENSION 1
14 * ISO-2022-KR (RFC1557) Converters
22 static size_t read_char(const char* src
, struct kschar_t
*ch
)
29 ch
->state
= KS_STATE_ASCII
;
34 ch
->state
= KS_STATE_KSX1001
;
38 /* Announcer sequence */
39 if (src
[1] == '$' && src
[2] == ')' && src
[3] == 'C') {
46 ch
->state
= KS_STATE_BINARY
;
47 ch
->value
= KS_CHAR_ESC
;
52 /* Control Characters */
53 if ((unsigned char)src
[0] < 0x20)
55 /* state will not be changed. */
56 ch
->value
= (unsigned int)src
[0];
59 if ((ch
->state
== KS_STATE_ASCII
|| ch
->state
== KS_STATE_BINARY
)
60 && (unsigned char)src
[0] < 0x80)
62 ch
->state
= KS_STATE_ASCII
;
63 ch
->value
= (unsigned int)src
[0];
67 else if (ch
->state
== KS_STATE_KSX1001
68 && 0x21 <= src
[0] && src
[0] <= 0x7E
69 && 0x21 <= src
[1] && src
[1] <= 0x7E)
71 hi
= (unsigned int)src
[0];
72 lo
= (unsigned int)src
[1];
73 if (cp949_to_uni_tbls
[hi
-1] != NULL
74 && cp949_to_uni_tbls
[hi
-1][lo
+0x3F] != 0xFFFD)
76 ch
->value
= hi
* 256 + lo
;
87 ch
->state
= KS_STATE_BINARY
;
93 static unicode_char
c2u_iso2022kr_convchar(unsigned int c
, int state
)
95 unsigned int hi
= (c
>> 8);
96 unsigned int lo
= c
& 0x00FF;
98 /* Control characters */
99 if (c
< (unsigned int)0x0020)
100 return (unicode_char
)c
;
102 else if (state
== KS_STATE_ASCII
&& c
< (unsigned int)0x0080)
103 return (unicode_char
)c
;
105 else if (state
== KS_STATE_KSX1001
&& c
!= 0x003F
106 && cp949_to_uni_tbls
[hi
-1] != NULL
107 && cp949_to_uni_tbls
[hi
-1][lo
+0x3F] != 0xFFFD)
108 return cp949_to_uni_tbls
[hi
-1][lo
+0x3F];
111 return (unicode_char
)0xFFFD;
114 static unicode_char
*c2u_iso2022kr(const struct unicode_info
*u
,
115 const char *ks_str
, int *err
)
124 /* Count the number of potential unicode characters first. */
126 ch
.state
= KS_STATE_ASCII
;
129 i
+= read_char(ks_str
+i
, &ch
);
134 uc
= malloc((cnt
+1) * sizeof(unicode_char
));
139 ch
.state
= KS_STATE_ASCII
;
142 w
= read_char(ks_str
+i
, &ch
);
144 uc
[cnt
] = c2u_iso2022kr_convchar(ch
.value
, ch
.state
);
145 if (uc
[cnt
] == (unicode_char
)0xFFFD && err
) {
159 static void revlookup(unicode_char u
, struct kschar_t
*ch
)
161 unsigned int hi
= u
>> 8;
162 unsigned int lo
= u
& 0x00ff;
164 unsigned char c1
, c2
;
166 /* ISO-2022-KR is mapped inside BMP range. */
167 if (u
>= (unicode_char
)0x10000)
169 ch
->state
= KS_STATE_BINARY
;
175 if (u
< (unicode_char
)0x0080)
177 ch
->state
= KS_STATE_ASCII
;
178 ch
->value
= (unsigned int)u
;
182 /* For compatibility: 2 Characters replaced by KS X 1003 */
183 if (u
== (unicode_char
)0x20A9) /* WON SIGN */
185 ch
->state
= KS_STATE_ASCII
;
189 if (u
== (unicode_char
)0x203E) /* OVERLINE */
191 ch
->state
= KS_STATE_ASCII
;
197 if (uni_to_ksx1001_tbls
[hi
] != NULL
198 && (k
= uni_to_ksx1001_tbls
[hi
][lo
]) != 0x003F)
202 if (c1
>= (unsigned char)0xA1 && c2
>= (unsigned char)0xA1)
206 ch
->state
= KS_STATE_KSX1001
;
207 ch
->value
= c1
*256 + c2
;
212 ch
->state
= KS_STATE_BINARY
;
218 /* Otherwise, return 'unknown' characters */
219 ch
->state
= KS_STATE_BINARY
;
224 static char *u2c_iso2022kr(const struct unicode_info
*u
,
225 const unicode_char
*str
, int *err
)
229 int kstate
= KS_STATE_ASCII
;
238 /* Count the number of potential octets first. */
239 ch
.state
= KS_STATE_ASCII
;
241 kstate
= KS_STATE_ASCII
;
243 for (i
= cnt
= 0; str
[i
]; i
++) {
244 revlookup(str
[i
], &ch
);
253 cnt
+= ((kstate
== KS_STATE_KSX1001
)? 2: 1);
254 if (kstate
== KS_STATE_KSX1001
)
257 if (kstate
!= KS_STATE_ASCII
&& kstate
!= KS_STATE_BINARY
)
269 s
[cnt
++] = KS_CHAR_ESC
;
274 ch
.state
= KS_STATE_ASCII
;
276 kstate
= KS_STATE_ASCII
;
277 for (i
= 0; str
[i
]; i
++)
279 revlookup(str
[i
], &ch
);
286 case KS_STATE_KSX1001
:
287 s
[cnt
++] = KS_CHAR_SO
;
290 s
[cnt
++] = KS_CHAR_SI
;
297 case KS_STATE_KSX1001
:
298 s
[cnt
++] = (char)(k
>> 8);
299 s
[cnt
++] = (char)(k
& 0x00FF);
305 if (kstate
== KS_STATE_BINARY
&& k
== 0x003F)
313 if (kstate
!= KS_STATE_ASCII
&& kstate
!= KS_STATE_BINARY
)
315 s
[cnt
++] = KS_CHAR_SI
;
324 * EUC-KR / CP949 (UHC) Converters
327 static unicode_char
*c2u_euckr_doconv(const struct unicode_info
*u
,
328 const char *euckr_str
, int *err
,
333 unsigned char hi
=0, lo
=0;
340 len
= strlen(euckr_str
);
341 uc
= (unicode_char
*)malloc((len
+1) * sizeof(unicode_char
) *2);
347 /* 2 Characters replaced by KS X 1003 */
348 if ((compat
& EUCKR_CP949_EXTENSION
)
349 && euckr_str
[i
] == 0x5C) /* WON SIGN */
351 uc
[pos
++] = (unicode_char
)0x20A9;
354 else if ((compat
& EUCKR_CP949_EXTENSION
)
355 && euckr_str
[i
] == 0x7E) /* OVERLINE */
357 uc
[pos
++] = (unicode_char
)0x203E;
360 /* US-ASCII or KS X 1003 */
361 else if((unsigned char)euckr_str
[i
] < 0x80)
363 uc
[pos
++] = (unicode_char
)(euckr_str
[i
]);
367 else if ((unsigned char)euckr_str
[i
] >= 0xa1
368 && (unsigned char)euckr_str
[i
+1] >= 0xa1)
370 hi
= (unsigned char)euckr_str
[i
];
371 lo
= (unsigned char)euckr_str
[i
+1];
373 if (cp949_to_uni_tbls
[hi
-0x81] == NULL
)
374 c
= (unicode_char
)0xFFFD;
376 c
= cp949_to_uni_tbls
[hi
-0x81][lo
-0x41];
379 if (c
== (unicode_char
)0xFFFD && err
)
388 /* CP949 extension */
389 else if ((0x81 <= (unsigned)euckr_str
[i
]
390 && (unsigned)euckr_str
[i
] <= 0xFE)
391 && ((0x41 <= (unsigned)euckr_str
[i
+1]
392 && (unsigned)euckr_str
[i
+1] <= 0x5A)
393 || (0x61 <= (unsigned)euckr_str
[i
+1]
394 && (unsigned)euckr_str
[i
+1] <= 0x7A)
395 || (0x81 <= (unsigned)euckr_str
[i
+1]
396 && (unsigned)euckr_str
[i
+1] <= 0xFE)))
398 hi
= (unsigned char)euckr_str
[i
];
399 lo
= (unsigned char)euckr_str
[i
+1];
401 if (!(compat
& EUCKR_CP949_EXTENSION
))
403 else if (cp949_to_uni_tbls
[hi
-0x81] != NULL
)
404 c
= cp949_to_uni_tbls
[hi
-0x81][lo
-0x41];
409 if (c
== 0xFFFD && err
)
424 uc
[pos
++] = (unicode_char
)0xFFFD;
433 static unicode_char
*c2u_euckr(const struct unicode_info
*u
,
434 const char *euckr_str
, int *err
)
436 return c2u_euckr_doconv(u
, euckr_str
, err
, 0);
439 static unicode_char
*c2u_cp949(const struct unicode_info
*u
,
440 const char *euckr_str
, int *err
)
442 return c2u_euckr_doconv(u
, euckr_str
, err
, EUCKR_CP949_EXTENSION
);
446 static char *u2c_euckr_doconv(const struct unicode_info
*u
,
447 const unicode_char
*str
, int *err
,
459 s
= malloc((len
+1)*2);
464 for(i
=0; str
[i
]; i
++)
467 unsigned char hi
=0, lo
=0;
469 unsigned char str_i_high
=str
[i
] >> 8;
471 /* EUC-KR is mapped inside BMP range. */
472 if (str
[i
] >= (unicode_char
)0x10000)
483 else if (str
[i
] < (unicode_char
)0x0080)
484 s
[pos
++] = (char)str
[i
];
485 /* For compatibility: 2 characters replaced by KS X 1003 */
486 else if (str
[i
] == (unicode_char
)0x20A9) /* WON SIGN */
488 else if (str
[i
] == (unicode_char
)0x203E) /* OVERLINE */
491 else if (uni_to_ksx1001_tbls
[str_i_high
] != NULL
)
493 ksx_char
= uni_to_ksx1001_tbls
[str_i_high
][str
[i
] & 0xff];
495 lo
= ksx_char
& 0xff;
508 if (ksx_char
== 0x003F && err
)
515 /* CP949 Extension */
516 else if (uni_to_cp949_tbls
[str_i_high
] != NULL
)
519 if (!(compat
& EUCKR_CP949_EXTENSION
))
522 ksx_char
= uni_to_cp949_tbls
[str_i_high
][str
[i
] & 0xff];
524 lo
= ksx_char
& 0xff;
537 if (ksx_char
== 0x003F && err
)
559 static char *u2c_euckr(const struct unicode_info
*u
,
560 const unicode_char
*str
, int *err
)
562 return u2c_euckr_doconv(u
, str
, err
, 0);
565 static char *u2c_cp949(const struct unicode_info
*u
,
566 const unicode_char
*str
, int *err
)
568 return u2c_euckr_doconv(u
, str
, err
, EUCKR_CP949_EXTENSION
);
572 static char *toupper_func(const struct unicode_info
*u
,
573 const char *cp
, int *ip
)
575 unicode_char
*uc
= (*u
->c2u
)(u
, cp
, ip
);
582 for (i
=0; uc
[i
] && i
<10000; i
++) {
583 if ((unicode_char
)'a' <= uc
[i
] && uc
[i
] <= (unicode_char
)'z')
584 uc
[i
] = uc
[i
] - ((unicode_char
)'a' - (unicode_char
)'A');
587 s
= (*u
->u2c
)(u
, uc
, NULL
);
592 static char *tolower_func(const struct unicode_info
*u
,
593 const char *cp
, int *ip
)
595 unicode_char
*uc
= (*u
->c2u
)(u
, cp
, ip
);
602 for (i
=0; uc
[i
]; i
++) {
603 if ((unicode_char
)'A' <= uc
[i
] && uc
[i
] <= (unicode_char
)'Z')
604 uc
[i
] = uc
[i
] + ((unicode_char
)'a' - (unicode_char
)'A');
607 s
= (*u
->u2c
)(u
, uc
, NULL
);
614 static char *totitle_func(const struct unicode_info
*u
,
615 const char *cp
, int *ip
)
617 unicode_char
*uc
= (*u
->c2u
)(u
, cp
, ip
);
623 /* Uh, sorry, what's "title" char? */
625 * for (i=0; uc[i]; i++)
626 * uc[i] = unicode_tc(uc[i]);
629 s
= (*u
->u2c
)(u
, uc
, NULL
);
634 extern const struct unicode_info unicode_UTF8
;
636 const struct unicode_info unicode_ISO2022_KR
= {
638 UNICODE_MB
| UNICODE_REPLACEABLE
| UNICODE_SISO
|
639 UNICODE_HEADER_BASE64
,
648 const struct unicode_info unicode_EUC_KR
= {
650 UNICODE_MB
| UNICODE_REPLACEABLE
| UNICODE_USASCII
|
651 UNICODE_HEADER_BASE64
| UNICODE_BODY_BASE64
,
660 const struct unicode_info unicode_CP949
= {
662 UNICODE_MB
| UNICODE_REPLACEABLE
|
663 UNICODE_HEADER_BASE64
| UNICODE_BODY_BASE64
,