+++ /dev/null
-/*
- * ISO-2022-KR, EUC-KR & CP949 <=> Unicode translate functions.
- * by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>
- */
-
-#include <stdio.h>
-#include <string.h>
-#include "unicode.h"
-#include "ksx1001.h"
-
-#define EUCKR_CP949_EXTENSION 1 /* bit tested in the `compat' argument of the *_euckr_doconv() helpers; the CP949 wrappers pass it, the EUC-KR wrappers pass 0 */
-
-/*
- * ISO-2022-KR (RFC1557) Converters
- */
-
-struct kschar_t { /* one decoded/encoded ISO-2022-KR unit together with the shift state it belongs to */
- int state; /* one of KS_STATE_ASCII, KS_STATE_KSX1001 or KS_STATE_BINARY */
- unsigned int value; /* 0 = nothing produced; single octet; hi*256+lo for a two-octet character; or 0x003F for "unknown" */
-};
-
-/*
- * Decode one unit of ISO-2022-KR input starting at src.
- *
- * ch->state carries the shift state between calls and is updated in place.
- * ch->value receives the decoded unit:
- *   - 0 when the octets were a pure control sequence (SI, SO or the
- *     ESC $ ) C announcer) that yields no character;
- *   - the octet itself for control and US-ASCII characters;
- *   - hi*256+lo for a valid two-octet KS X 1001 character;
- *   - 0x003F ('?') for anything that could not be decoded.
- *
- * Returns the number of octets consumed (1, 2 or 4).  src must point at a
- * NUL-terminated string; look-ahead never reads past the terminator because
- * every multi-octet test short-circuits on a non-matching octet.
- */
-static size_t read_char(const char* src, struct kschar_t *ch)
-{
-    unsigned int hi, lo;
-
-    switch (src[0]) {
-    case KS_CHAR_SI:
-        /* Shift-in: back to single-octet characters. */
-        ch->state = KS_STATE_ASCII;
-        ch->value = 0;
-        return 1;
-    case KS_CHAR_SO:
-        /* Shift-out: two-octet KS X 1001 characters follow. */
-        ch->state = KS_STATE_KSX1001;
-        ch->value = 0;
-        return 1;
-    case KS_CHAR_ESC:
-        /* Announcer sequence ESC $ ) C: consumed, state untouched. */
-        if (src[1] == '$' && src[2] == ')' && src[3] == 'C') {
-            ch->value = 0;
-            return 4;
-        }
-        /* A lone ESC is passed through as a raw octet. */
-        ch->state = KS_STATE_BINARY;
-        ch->value = KS_CHAR_ESC;
-        return 1;
-    default:
-        break;
-    }
-
-    /*
-     * Remaining control characters (CR, LF, TAB, ...) are valid in every
-     * shift state and must not disturb it.  Returning here is essential:
-     * without it a control character met in KS X 1001 state would fall
-     * through to the final branch and be replaced by '?'.
-     */
-    if ((unsigned char)src[0] < 0x20) {
-        ch->value = (unsigned int)src[0];
-        return 1;
-    }
-
-    /* US-ASCII */
-    if ((ch->state == KS_STATE_ASCII || ch->state == KS_STATE_BINARY)
-        && (unsigned char)src[0] < 0x80) {
-        ch->state = KS_STATE_ASCII;
-        ch->value = (unsigned int)src[0];
-        return 1;
-    }
-
-    /* KS X 1001: both octets must be in the 0x21..0x7E (GL) range. */
-    if (ch->state == KS_STATE_KSX1001
-        && 0x21 <= src[0] && src[0] <= 0x7E
-        && 0x21 <= src[1] && src[1] <= 0x7E) {
-        hi = (unsigned int)src[0];
-        lo = (unsigned int)src[1];
-        /*
-         * The table is indexed as in the EUC-KR decoder: the lead octet is
-         * offset from 0x81 and the trail octet from 0x41, so the 7-bit
-         * values map to hi-1 and lo+0x3F.
-         */
-        if (cp949_to_uni_tbls[hi-1] != NULL
-            && cp949_to_uni_tbls[hi-1][lo+0x3F] != 0xFFFD)
-            ch->value = hi * 256 + lo;
-        else
-            ch->value = 0x003F;     /* well-formed pair, but unassigned */
-        return 2;
-    }
-
-    /* Anything else is undecodable in the current state. */
-    ch->state = KS_STATE_BINARY;
-    ch->value = 0x003F;
-    return 1;
-}
-
-/*
- * Map one unit produced by read_char() to a Unicode code point.
- *
- * c     - the unit value (single octet, or hi*256+lo for a two-octet
- *         character)
- * state - the shift state that was in effect when the unit was read
- *
- * Returns the Unicode character, or 0xFFFD when the unit has no mapping.
- */
-static unicode_char c2u_iso2022kr_convchar(unsigned int c, int state)
-{
-    const unsigned int hi = (c >> 8);
-    const unsigned int lo = c & 0x00FF;
-
-    /* Control characters pass through in every state. */
-    if (c < (unsigned int)0x0020)
-        return (unicode_char)c;
-
-    /* US-ASCII */
-    if (state == KS_STATE_ASCII && c < (unsigned int)0x0080)
-        return (unicode_char)c;
-
-    /*
-     * KS X 1001.  Both octets are range-checked before they are used as
-     * table indexes: a single-octet value such as the 0x003F placeholder has
-     * hi == 0, and hi-1 would otherwise wrap around to a huge index.
-     */
-    if (state == KS_STATE_KSX1001
-        && 0x21 <= hi && hi <= 0x7E
-        && 0x21 <= lo && lo <= 0x7E
-        && cp949_to_uni_tbls[hi-1] != NULL)
-        return cp949_to_uni_tbls[hi-1][lo+0x3F];  /* may itself be 0xFFFD */
-
-    /* Unknown */
-    return (unicode_char)0xFFFD;
-}
-
-/*
- * Convert a NUL-terminated ISO-2022-KR string to a freshly allocated,
- * 0-terminated array of Unicode characters.
- *
- * u      - unused here
- * ks_str - input octets
- * err    - optional; set to -1 on entry and, when a unit maps to 0xFFFD,
- *          to the octet offset of that unit
- *
- * Returns the array (caller frees), or NULL on allocation failure or when
- * err is non-NULL and an unmappable unit was found.  When err is NULL,
- * unmappable units are stored as 0xFFFD.
- */
-static unicode_char *c2u_iso2022kr(const struct unicode_info *u,
-        const char *ks_str, int *err)
-{
-    struct kschar_t cur;
-    unicode_char *out;
-    size_t offset;
-    size_t n_out = 0;
-
-    if (err)
-        *err = -1;
-
-    /* Pass 1: find out how many characters the input decodes to. */
-    cur.state = KS_STATE_ASCII;
-    cur.value = 0;
-    for (offset = 0; ks_str[offset]; ) {
-        offset += read_char(ks_str + offset, &cur);
-        if (cur.value)
-            n_out++;
-    }
-
-    out = malloc((n_out + 1) * sizeof(unicode_char));
-    if (!out)
-        return NULL;
-
-    /* Pass 2: decode again from the initial state, this time storing. */
-    cur.state = KS_STATE_ASCII;
-    cur.value = 0;
-    n_out = 0;
-    offset = 0;
-    while (ks_str[offset]) {
-        const size_t consumed = read_char(ks_str + offset, &cur);
-
-        if (cur.value) {
-            out[n_out] = c2u_iso2022kr_convchar(cur.value, cur.state);
-            if (err && out[n_out] == (unicode_char)0xFFFD) {
-                /* Report the offset at which the bad unit starts. */
-                *err = offset;
-                free(out);
-                return NULL;
-            }
-            n_out++;
-        }
-        offset += consumed;
-    }
-    out[n_out] = 0;
-
-    return out;
-}
-
-/*
- * Find the ISO-2022-KR representation of one Unicode character.
- *
- * On return ch->state names the shift state the character must be written
- * in and ch->value holds its octet(s): a single octet for KS_STATE_ASCII,
- * hi*256+lo (7-bit octets) for KS_STATE_KSX1001, or 0x003F together with
- * KS_STATE_BINARY when the character cannot be represented.
- */
-static void revlookup(unicode_char u, struct kschar_t *ch)
-{
-    unsigned int code;
-    unsigned char lead, trail;
-
-    /* Start from "unknown"; every successful path below overrides it. */
-    ch->state = KS_STATE_BINARY;
-    ch->value = 0x003F;
-
-    /* ISO-2022-KR is mapped inside BMP range. */
-    if (u >= (unicode_char)0x10000)
-        return;
-
-    /* US-ASCII */
-    if (u < (unicode_char)0x0080) {
-        ch->state = KS_STATE_ASCII;
-        ch->value = (unsigned int)u;
-        return;
-    }
-
-    /* For compatibility: the two characters KS X 1003 puts at 0x5C / 0x7E */
-    if (u == (unicode_char)0x20A9 || u == (unicode_char)0x203E) {
-        ch->state = KS_STATE_ASCII;
-        ch->value = (u == (unicode_char)0x20A9)
-            ? 0x5C      /* WON SIGN */
-            : 0x7E;     /* OVERLINE */
-        return;
-    }
-
-    /* KS X 1001: table is indexed by the high, then the low octet of u. */
-    if (uni_to_ksx1001_tbls[u >> 8] == NULL)
-        return;
-    code = uni_to_ksx1001_tbls[u >> 8][u & 0x00ff];
-    if (code == 0x003F)
-        return;
-
-    lead = (code >> 8);
-    trail = (code & 0x00FF);
-    /* Only codes with both octets >= 0xA1 have a 7-bit form. */
-    if (lead < (unsigned char)0xA1 || trail < (unsigned char)0xA1)
-        return;
-
-    lead -= 0x80;
-    trail -= 0x80;
-    ch->state = KS_STATE_KSX1001;
-    ch->value = lead*256 + trail;
-}
-
-/*
- * Convert a 0-terminated Unicode string to a freshly allocated,
- * NUL-terminated ISO-2022-KR string.
- *
- * u   - unused here
- * str - input characters
- * err - optional; set to -1 on entry and, when a character cannot be
- *       represented, to the index of that character
- *
- * The ESC $ ) C announcer is emitted first if any character needs
- * KS X 1001; SO/SI are inserted on every change of shift state and a final
- * SI is added when the string ends in KS X 1001 state.
- *
- * Returns the string (caller frees), or NULL on allocation failure or when
- * err is non-NULL and an unrepresentable character was found.  When err is
- * NULL such characters are written as '?'.
- */
-static char *u2c_iso2022kr(const struct unicode_info *u,
-        const unicode_char *str, int *err)
-{
-    struct kschar_t looked;
-    size_t idx;
-    size_t nbytes = 0;
-    int shift = KS_STATE_ASCII;
-    int uses_ksx1001 = 0;
-    char *out;
-
-    if (err)
-        *err = -1;
-
-    /* Pass 1: compute the number of output octets. */
-    for (idx = 0; str[idx]; idx++) {
-        revlookup(str[idx], &looked);
-        if (looked.state != shift) {
-            nbytes++;                   /* SO or SI */
-            shift = looked.state;
-        }
-        if (shift == KS_STATE_KSX1001) {
-            uses_ksx1001 = 1;
-            if (looked.value)
-                nbytes += 2;
-        } else if (looked.value) {
-            nbytes += 1;
-        }
-    }
-    if (shift != KS_STATE_ASCII && shift != KS_STATE_BINARY)
-        nbytes++;                       /* closing SI */
-    if (uses_ksx1001)
-        nbytes += 4;                    /* announcer */
-
-    out = malloc(nbytes + 1);
-    if (!out)
-        return NULL;
-
-    /* Pass 2: emit. */
-    nbytes = 0;
-    if (uses_ksx1001) {
-        out[nbytes++] = KS_CHAR_ESC;
-        out[nbytes++] = '$';
-        out[nbytes++] = ')';
-        out[nbytes++] = 'C';
-    }
-    shift = KS_STATE_ASCII;
-    for (idx = 0; str[idx]; idx++) {
-        int code;
-
-        revlookup(str[idx], &looked);
-        code = looked.value;
-
-        if (looked.state != shift) {
-            shift = looked.state;
-            if (shift == KS_STATE_KSX1001)
-                out[nbytes++] = KS_CHAR_SO;
-            else
-                out[nbytes++] = KS_CHAR_SI;
-        }
-
-        if (shift == KS_STATE_KSX1001) {
-            out[nbytes++] = (char)(code >> 8);
-            out[nbytes++] = (char)(code & 0x00FF);
-        } else {
-            out[nbytes++] = (char)code;
-        }
-
-        /* revlookup() signals "no mapping" as BINARY state plus '?'. */
-        if (err && shift == KS_STATE_BINARY && code == 0x003F) {
-            *err = idx;
-            free(out);
-            return NULL;
-        }
-    }
-    if (shift != KS_STATE_ASCII && shift != KS_STATE_BINARY)
-        out[nbytes++] = KS_CHAR_SI;
-    out[nbytes] = 0;
-
-    return out;
-}
-
-
-/*
- * EUC-KR / CP949 (UHC) Converters
- */
-
-/*
- * Shared decoder for EUC-KR and CP949: convert a NUL-terminated string to a
- * freshly allocated, 0-terminated array of Unicode characters.
- *
- * u         - unused here
- * euckr_str - input octets
- * err       - optional; set to -1 on entry and, on an undecodable sequence,
- *             to the octet offset where that sequence starts
- * compat    - 0 for strict EUC-KR; EUCKR_CP949_EXTENSION additionally
- *             accepts the CP949 extension area and maps 0x5C / 0x7E to
- *             WON SIGN / OVERLINE
- *
- * Returns the array (caller frees), or NULL on allocation failure or when
- * err is non-NULL and an undecodable sequence was found.  When err is NULL
- * undecodable sequences are stored as 0xFFFD.
- */
-static unicode_char *c2u_euckr_doconv(const struct unicode_info *u,
-        const char *euckr_str, int *err,
-        int compat)
-{
-    /*
-     * Octets are examined as unsigned char throughout.  Converting a plain
-     * (possibly signed) char straight to unsigned int sign-extends octets
-     * >= 0x80, which made every 0x81..0xFE range test fail.
-     */
-    const unsigned char *in = (const unsigned char *)euckr_str;
-    const int cp949 = (compat & EUCKR_CP949_EXTENSION) != 0;
-    unicode_char *uc;
-    size_t len;
-    size_t i = 0;
-    size_t pos = 0;
-
-    if (err)
-        *err = -1;
-
-    len = strlen(euckr_str);
-    /* Every loop iteration consumes >= 1 octet and stores exactly 1 char. */
-    uc = malloc((len + 1) * sizeof(unicode_char));
-    if (!uc)
-        return NULL;
-
-    while (i < len) {
-        const unsigned char hi = in[i];
-        const unsigned char lo = in[i + 1];   /* at worst the terminator */
-        unicode_char c;
-        size_t width;
-
-        if (hi < 0x80) {
-            /* US-ASCII, or the two KS X 1003 replacements in CP949 mode */
-            if (cp949 && hi == 0x5C)
-                c = (unicode_char)0x20A9;       /* WON SIGN */
-            else if (cp949 && hi == 0x7E)
-                c = (unicode_char)0x203E;       /* OVERLINE */
-            else
-                c = (unicode_char)hi;
-            width = 1;
-        } else if (0xA1 <= hi && hi <= 0xFE && 0xA1 <= lo && lo <= 0xFE) {
-            /* KS X 1001 */
-            if (cp949_to_uni_tbls[hi-0x81] == NULL)
-                c = (unicode_char)0xFFFD;
-            else
-                c = cp949_to_uni_tbls[hi-0x81][lo-0x41];
-            width = 2;
-        } else if (0x81 <= hi && hi <= 0xFE
-                   && ((0x41 <= lo && lo <= 0x5A)
-                       || (0x61 <= lo && lo <= 0x7A)
-                       || (0x81 <= lo && lo <= 0xFE))) {
-            /* CP949 extension: well-formed, but only valid in CP949 mode */
-            if (cp949 && cp949_to_uni_tbls[hi-0x81] != NULL)
-                c = cp949_to_uni_tbls[hi-0x81][lo-0x41];
-            else
-                c = (unicode_char)0xFFFD;
-            width = 2;
-        } else {
-            /* Not a recognisable sequence: skip a single octet. */
-            c = (unicode_char)0xFFFD;
-            width = 1;
-        }
-
-        /* Fail only on a real decoding error, and only if the caller asked. */
-        if (c == (unicode_char)0xFFFD && err) {
-            *err = (int)i;
-            free(uc);
-            return NULL;
-        }
-
-        uc[pos++] = c;
-        i += width;
-    }
-    uc[pos] = 0;
-
-    return uc;
-}
-
-/* EUC-KR decoder: the shared converter run with no compatibility flags. */
-static unicode_char *c2u_euckr(const struct unicode_info *u,
-        const char *euckr_str, int *err)
-{
-    const int strict = 0;
-
-    return c2u_euckr_doconv(u, euckr_str, err, strict);
-}
-
-/* CP949 decoder: the shared converter with the CP949 extension enabled. */
-static unicode_char *c2u_cp949(const struct unicode_info *u,
-        const char *euckr_str, int *err)
-{
-    const int flags = EUCKR_CP949_EXTENSION;
-
-    return c2u_euckr_doconv(u, euckr_str, err, flags);
-}
-
-
-/*
- * Shared encoder for EUC-KR and CP949: convert a 0-terminated Unicode
- * string to a freshly allocated, NUL-terminated octet string.
- *
- * u      - unused here
- * str    - input characters
- * err    - optional; set to -1 on entry and, when a character cannot be
- *          encoded, to the index of that character
- * compat - 0 for strict EUC-KR; EUCKR_CP949_EXTENSION additionally allows
- *          codes from the CP949 extension area
- *
- * Returns the string (caller frees), or NULL on allocation failure or when
- * err is non-NULL and an unencodable character was found.  When err is NULL
- * such characters are written as '?'.
- */
-static char *u2c_euckr_doconv(const struct unicode_info *u,
-        const unicode_char *str, int *err,
-        int compat)
-{
-    const int cp949 = (compat & EUCKR_CP949_EXTENSION) != 0;
-    size_t len = 0;
-    size_t i;
-    size_t pos = 0;
-    char *s;
-
-    if (err)
-        *err = -1;
-
-    while (str[len])
-        len++;
-    /* Each character yields at most two octets. */
-    s = malloc((len + 1) * 2);
-    if (!s)
-        return NULL;
-
-    for (i = 0; str[i]; i++) {
-        unsigned int code = 0;      /* two-octet code; 0 = no mapping */
-        unsigned char hi, lo;
-
-        /* Single-octet cases first. */
-        if (str[i] < (unicode_char)0x0080) {
-            s[pos++] = (char)str[i];            /* US-ASCII */
-            continue;
-        }
-        /* For compatibility: 2 characters replaced by KS X 1003 */
-        if (str[i] == (unicode_char)0x20A9) {   /* WON SIGN */
-            s[pos++] = 0x5C;
-            continue;
-        }
-        if (str[i] == (unicode_char)0x203E) {   /* OVERLINE */
-            s[pos++] = 0x7E;
-            continue;
-        }
-
-        /* EUC-KR is mapped inside BMP range; anything above has no code. */
-        if (str[i] < (unicode_char)0x10000) {
-            const unsigned int page = (unsigned int)(str[i] >> 8);
-            const unsigned int cell = (unsigned int)(str[i] & 0xff);
-
-            if (uni_to_ksx1001_tbls[page] != NULL)
-                code = uni_to_ksx1001_tbls[page][cell];
-            else if (cp949 && uni_to_cp949_tbls[page] != NULL)
-                code = uni_to_cp949_tbls[page][cell];
-        }
-
-        hi = (code >> 8) & 0xff;
-        lo = code & 0xff;
-
-        /*
-         * Strict EUC-KR may only emit octet pairs in the 0xA1.. range, the
-         * same test revlookup() applies and the only pairs the strict
-         * decoder accepts.  Anything else is treated as unmapped.
-         */
-        if (!cp949 && (hi < 0xA1 || lo < 0xA1))
-            hi = 0;
-
-        if (hi) {
-            s[pos++] = (char)hi;
-            s[pos++] = (char)lo;
-        } else if (err) {
-            *err = (int)i;
-            free(s);
-            return NULL;
-        } else {
-            s[pos++] = '?';
-        }
-    }
-    s[pos] = 0;
-
-    return s;
-}
-
-/* EUC-KR encoder: the shared converter run with no compatibility flags. */
-static char *u2c_euckr(const struct unicode_info *u,
-        const unicode_char *str, int *err)
-{
-    const int strict = 0;
-
-    return u2c_euckr_doconv(u, str, err, strict);
-}
-
-/* CP949 encoder: the shared converter with the CP949 extension enabled. */
-static char *u2c_cp949(const struct unicode_info *u,
-        const unicode_char *str, int *err)
-{
-    const int flags = EUCKR_CP949_EXTENSION;
-
-    return u2c_euckr_doconv(u, str, err, flags);
-}
-
-
-/*
- * Upper-case the ASCII letters a-z of a string in the character set
- * described by u.
- *
- * The string is decoded with u->c2u (which receives ip as its error
- * argument), letters are folded, and the result is re-encoded with u->u2c
- * using a NULL error argument.
- *
- * Returns a newly allocated string (caller frees), or NULL if either
- * conversion returns NULL.
- */
-static char *toupper_func(const struct unicode_info *u,
-        const char *cp, int *ip)
-{
-    unicode_char *uc = (*u->c2u)(u, cp, ip);
-    char *s;
-    size_t i;
-
-    if (!uc)
-        return (NULL);
-
-    /*
-     * Walk the whole string, exactly as tolower_func() does.  A fixed cap on
-     * the index would leave the tail of long strings untouched.
-     */
-    for (i=0; uc[i]; i++) {
-        if ((unicode_char)'a' <= uc[i] && uc[i] <= (unicode_char)'z')
-            uc[i] = uc[i] - ((unicode_char)'a' - (unicode_char)'A');
-    }
-
-    s = (*u->u2c)(u, uc, NULL);
-    free(uc);
-    return (s);
-}
-
-/*
- * Lower-case the ASCII letters A-Z of a string in the character set
- * described by u: decode with u->c2u (passing ip through), fold, then
- * re-encode with u->u2c using a NULL error argument.
- *
- * Returns a newly allocated string (caller frees), or NULL if either
- * conversion returns NULL.
- */
-static char *tolower_func(const struct unicode_info *u,
-        const char *cp, int *ip)
-{
-    unicode_char *wide = (*u->c2u)(u, cp, ip);
-    unicode_char *p;
-    char *result;
-
-    if (wide == NULL)
-        return (NULL);
-
-    for (p = wide; *p; p++) {
-        if (*p < (unicode_char)'A' || *p > (unicode_char)'Z')
-            continue;
-        *p = *p + ((unicode_char)'a' - (unicode_char)'A');
-    }
-
-    result = (*u->u2c)(u, wide, NULL);
-    free(wide);
-
-    return (result);
-}
-
-
-/*
- * "Title case" conversion.  No case mapping is applied: the string is only
- * decoded with u->c2u (passing ip through) and re-encoded with u->u2c using
- * a NULL error argument.
- *
- * Returns a newly allocated string (caller frees), or NULL if either
- * conversion returns NULL.
- */
-static char *totitle_func(const struct unicode_info *u,
-        const char *cp, int *ip)
-{
-    unicode_char *wide;
-    char *result;
-
-    wide = (*u->c2u)(u, cp, ip);
-    if (wide == NULL)
-        return (NULL);
-
-    /* Title-casing is intentionally left unimplemented for these sets. */
-
-    result = (*u->u2c)(u, wide, NULL);
-    free(wide);
-    return (result);
-}
-
-extern const struct unicode_info unicode_UTF8;
-
-const struct unicode_info unicode_ISO2022_KR = { /* descriptor wiring the ISO-2022-KR converters together */
- "ISO-2022-KR", /* presumably the character set name -- confirm against struct unicode_info */
- UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_SISO |
- UNICODE_HEADER_BASE64, /* flag bits; their meaning is defined outside this file */
- c2u_iso2022kr, /* decoder: ISO-2022-KR octets -> Unicode */
- u2c_iso2022kr, /* encoder: Unicode -> ISO-2022-KR octets */
- toupper_func, /* ASCII-only upper-casing via a decode/encode round trip */
- tolower_func, /* ASCII-only lower-casing via a decode/encode round trip */
- totitle_func, /* round trip only; applies no case mapping */
- &unicode_UTF8 /* NOTE(review): role of this pointer is defined by struct unicode_info -- confirm */
-};
-
-const struct unicode_info unicode_EUC_KR = { /* descriptor wiring the strict EUC-KR converters together */
- "EUC-KR", /* presumably the character set name -- confirm against struct unicode_info */
- UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_USASCII |
- UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64, /* flag bits; their meaning is defined outside this file */
- c2u_euckr, /* decoder: calls c2u_euckr_doconv() with compat == 0 */
- u2c_euckr, /* encoder: calls u2c_euckr_doconv() with compat == 0 */
- toupper_func, /* ASCII-only upper-casing via a decode/encode round trip */
- tolower_func, /* ASCII-only lower-casing via a decode/encode round trip */
- totitle_func, /* round trip only; applies no case mapping */
- &unicode_UTF8 /* NOTE(review): role of this pointer is defined by struct unicode_info -- confirm */
-};
-
-const struct unicode_info unicode_CP949 = { /* descriptor wiring the CP949 converters together */
- "CP949", /* presumably the character set name -- confirm against struct unicode_info */
- UNICODE_MB | UNICODE_REPLACEABLE |
- UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64, /* flag bits; unlike the EUC-KR entry, UNICODE_USASCII is not set here */
- c2u_cp949, /* decoder: calls c2u_euckr_doconv() with EUCKR_CP949_EXTENSION */
- u2c_cp949, /* encoder: calls u2c_euckr_doconv() with EUCKR_CP949_EXTENSION */
- toupper_func, /* ASCII-only upper-casing via a decode/encode round trip */
- tolower_func, /* ASCII-only lower-casing via a decode/encode round trip */
- totitle_func, /* round trip only; applies no case mapping */
- &unicode_UTF8 /* NOTE(review): role of this pointer is defined by struct unicode_info -- confirm */
-};
-