Imported Upstream version 0.63.0
[hcoop/debian/courier-authlib.git] / unicode / ksx1001.c
CommitLineData
8d138742
CE
1/*
2 * ISO-2022-KR, EUC-KR & CP949 <=> Unicode translate functions.
3 * by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>
4 */
5
6#include <stdio.h>
7#include <string.h>
8#include "unicode.h"
9#include "ksx1001.h"
10
11#define EUCKR_CP949_EXTENSION 1
12
13/*
14 * ISO-2022-KR (RFC1557) Converters
15 */
16
/*
 * One unit of ISO-2022-KR text as seen by the converters below: the shift
 * state it belongs to and its code value.
 */
struct kschar_t {
	int		state;	/* one of the KS_STATE_* constants */
	unsigned int	value;	/* code value; readers store 0 when the
				 * consumed octets carry no character */
};
21
22static size_t read_char(const char* src, struct kschar_t *ch)
23{
24unsigned int hi, lo;
25
26 switch (src[0]) {
27 case KS_CHAR_SI:
28 /* Shift-in */
29 ch->state = KS_STATE_ASCII;
30 ch->value = 0;
31 return 1;
32 case KS_CHAR_SO:
33 /* Shift-out */
34 ch->state = KS_STATE_KSX1001;
35 ch->value = 0;
36 return 1;
37 case KS_CHAR_ESC:
38 /* Announcer sequence */
39 if (src[1] == '$' && src[2] == ')' && src[3] == 'C') {
40 ch->value = 0;
41 return 4;
42 }
43 /* ESC character */
44 else
45 {
46 ch->state = KS_STATE_BINARY;
47 ch->value = KS_CHAR_ESC;
48 return 1;
49 }
50 }
51
52 /* Control Characters */
53 if ((unsigned char)src[0] < 0x20)
54 {
55 /* state will not be changed. */
56 ch->value = (unsigned int)src[0];
57 }
58 /* US-ASCII */
59 if ((ch->state == KS_STATE_ASCII || ch->state == KS_STATE_BINARY)
60 && (unsigned char)src[0] < 0x80)
61 {
62 ch->state = KS_STATE_ASCII;
63 ch->value = (unsigned int)src[0];
64 return 1;
65 }
66 /* KS X 1001 */
67 else if (ch->state == KS_STATE_KSX1001
68 && 0x21 <= src[0] && src[0] <= 0x7E
69 && 0x21 <= src[1] && src[1] <= 0x7E)
70 {
71 hi = (unsigned int)src[0];
72 lo = (unsigned int)src[1];
73 if (cp949_to_uni_tbls[hi-1] != NULL
74 && cp949_to_uni_tbls[hi-1][lo+0x3F] != 0xFFFD)
75 {
76 ch->value = hi * 256 + lo;
77 return 2;
78 }
79 else
80 {
81 ch->value = 0x003F;
82 return 2;
83 }
84 }
85 else
86 {
87 ch->state = KS_STATE_BINARY;
88 ch->value = 0x003F;
89 return 1;
90 }
91}
92
93static unicode_char c2u_iso2022kr_convchar(unsigned int c, int state)
94{
95 unsigned int hi = (c >> 8);
96 unsigned int lo = c & 0x00FF;
97
98 /* Control characters */
99 if (c < (unsigned int)0x0020)
100 return (unicode_char)c;
101 /* US-ASCII */
102 else if (state == KS_STATE_ASCII && c < (unsigned int)0x0080)
103 return (unicode_char)c;
104 /* KS X 1001 */
105 else if (state == KS_STATE_KSX1001 && c != 0x003F
106 && cp949_to_uni_tbls[hi-1] != NULL
107 && cp949_to_uni_tbls[hi-1][lo+0x3F] != 0xFFFD)
108 return cp949_to_uni_tbls[hi-1][lo+0x3F];
109 /* Uniknown */
110 else
111 return (unicode_char)0xFFFD;
112}
113
114static unicode_char *c2u_iso2022kr(const struct unicode_info *u,
115 const char *ks_str, int *err)
116{
117size_t i, cnt, w;
118unicode_char *uc;
119struct kschar_t ch;
120
121 if (err)
122 *err = -1;
123
124 /* Count the number of potential unicode characters first. */
125 i = cnt = 0;
126 ch.state = KS_STATE_ASCII;
127 ch.value = 0;
128 while (ks_str[i]) {
129 i += read_char(ks_str+i, &ch);
130 if (ch.value)
131 ++cnt;
132 }
133
134 uc = malloc((cnt+1) * sizeof(unicode_char));
135 if (!uc)
136 return NULL;
137
138 i = cnt = 0;
139 ch.state = KS_STATE_ASCII;
140 ch.value = 0;
141 while (ks_str[i]) {
142 w = read_char(ks_str+i, &ch);
143 if (ch.value) {
144 uc[cnt] = c2u_iso2022kr_convchar(ch.value, ch.state);
145 if (uc[cnt] == (unicode_char)0xFFFD && err) {
146 *err = i;
147 free(uc);
148 return NULL;
149 }
150 ++cnt;
151 }
152 i+=w;
153 }
154 uc[cnt] = 0;
155
156 return uc;
157}
158
159static void revlookup(unicode_char u, struct kschar_t *ch)
160{
161unsigned int hi = u >> 8;
162unsigned int lo = u & 0x00ff;
163unsigned int k;
164unsigned char c1, c2;
165
166 /* ISO-2022-KR is mapped inside BMP range. */
167 if (u >= (unicode_char)0x10000)
168 {
169 ch->state = KS_STATE_BINARY;
170 ch->value = 0x003F;
171 return;
172 }
173
174 /* US-ASCII */
175 if (u < (unicode_char)0x0080)
176 {
177 ch->state = KS_STATE_ASCII;
178 ch->value = (unsigned int)u;
179 return;
180 }
181
182 /* For compatibility: 2 Characters replaced by KS X 1003 */
183 if (u == (unicode_char)0x20A9) /* WON SIGN */
184 {
185 ch->state = KS_STATE_ASCII;
186 ch->value = 0x5C;
187 return;
188 }
189 if (u == (unicode_char)0x203E) /* OVERLINE */
190 {
191 ch->state = KS_STATE_ASCII;
192 ch->value = 0x7E;
193 return;
194 }
195
196 /* KS X 1001 */
197 if (uni_to_ksx1001_tbls[hi] != NULL
198 && (k = uni_to_ksx1001_tbls[hi][lo]) != 0x003F)
199 {
200 c1 = (k >> 8);
201 c2 = (k & 0x00FF);
202 if (c1 >= (unsigned char)0xA1 && c2 >= (unsigned char)0xA1)
203 {
204 c1 -= 0x80;
205 c2 -= 0x80;
206 ch->state = KS_STATE_KSX1001;
207 ch->value = c1*256 + c2;
208 return;
209 }
210 else
211 {
212 ch->state = KS_STATE_BINARY;
213 ch->value = 0x003F;
214 return;
215 }
216 }
217
218 /* Otherwise, return 'unknown' characters */
219 ch->state = KS_STATE_BINARY;
220 ch->value = 0x003F;
221 return;
222}
223
224static char *u2c_iso2022kr(const struct unicode_info *u,
225 const unicode_char *str, int *err)
226{
227size_t i, cnt;
228int k;
229int kstate = KS_STATE_ASCII;
230int ks;
231int has_ksx1001=0;
232char *s;
233struct kschar_t ch;
234
235 if (err)
236 *err = -1;
237
238 /* Count the number of potential octets first. */
239 ch.state = KS_STATE_ASCII;
240 ch.value = 0;
241 kstate = KS_STATE_ASCII;
242 has_ksx1001 = 0;
243 for (i = cnt = 0; str[i]; i++) {
244 revlookup(str[i], &ch);
245 ks = ch.state;
246 k = ch.value;
247 if (ks != kstate)
248 {
249 cnt++;
250 kstate = ks;
251 }
252 if (k)
253 cnt += ((kstate == KS_STATE_KSX1001)? 2: 1);
254 if (kstate == KS_STATE_KSX1001)
255 has_ksx1001 = 1;
256 }
257 if (kstate != KS_STATE_ASCII && kstate != KS_STATE_BINARY)
258 cnt++;
259 if (has_ksx1001)
260 cnt+=4;
261
262 s = malloc(cnt+1);
263 if (!s)
264 return NULL;
265
266 cnt = 0;
267 if (has_ksx1001)
268 {
269 s[cnt++] = KS_CHAR_ESC;
270 s[cnt++] = '$';
271 s[cnt++] = ')';
272 s[cnt++] = 'C';
273 }
274 ch.state = KS_STATE_ASCII;
275 ch.value = 0;
276 kstate = KS_STATE_ASCII;
277 for (i = 0; str[i]; i++)
278 {
279 revlookup(str[i], &ch);
280 ks = ch.state;
281 k = ch.value;
282 if (ks != kstate)
283 {
284 switch (ks)
285 {
286 case KS_STATE_KSX1001:
287 s[cnt++] = KS_CHAR_SO;
288 break;
289 default:
290 s[cnt++] = KS_CHAR_SI;
291 break;
292 }
293 kstate = ks;
294 }
295 switch (kstate)
296 {
297 case KS_STATE_KSX1001:
298 s[cnt++] = (char)(k >> 8);
299 s[cnt++] = (char)(k & 0x00FF);
300 break;
301 default:
302 s[cnt++] = (char)k;
303 }
304
305 if (kstate == KS_STATE_BINARY && k == 0x003F)
306 if (err)
307 {
308 *err = i;
309 free(s);
310 return NULL;
311 }
312 }
313 if (kstate != KS_STATE_ASCII && kstate != KS_STATE_BINARY)
314 {
315 s[cnt++] = KS_CHAR_SI;
316 }
317 s[cnt] = 0;
318
319 return s;
320}
321
322
323/*
324 * EUC-KR / CP949 (UHC) Converters
325 */
326
327static unicode_char *c2u_euckr_doconv(const struct unicode_info *u,
328 const char *euckr_str, int *err,
329 int compat)
330{
331 unicode_char *uc=0;
332 unicode_char c;
333 unsigned char hi=0, lo=0;
334 int len=0;
335 int i=0;
336 int pos=0;
337
338 if(err) *err = -1;
339
340 len = strlen(euckr_str);
341 uc = (unicode_char*)malloc((len+1) * sizeof(unicode_char) *2);
342
343 if (!uc)
344 return NULL;
345
346 for(i=0; i<len;) {
347 /* 2 Characters replaced by KS X 1003 */
348 if ((compat & EUCKR_CP949_EXTENSION)
349 && euckr_str[i] == 0x5C) /* WON SIGN */
350 {
351 uc[pos++] = (unicode_char)0x20A9;
352 i++;
353 }
354 else if ((compat & EUCKR_CP949_EXTENSION)
355 && euckr_str[i] == 0x7E) /* OVERLINE */
356 {
357 uc[pos++] = (unicode_char)0x203E;
358 i++;
359 }
360 /* US-ASCII or KS X 1003 */
361 else if((unsigned char)euckr_str[i] < 0x80)
362 {
363 uc[pos++] = (unicode_char)(euckr_str[i]);
364 i++;
365 }
366 /* KS X 1001 */
367 else if ((unsigned char)euckr_str[i] >= 0xa1
368 && (unsigned char)euckr_str[i+1] >= 0xa1)
369 {
370 hi = (unsigned char)euckr_str[i];
371 lo = (unsigned char)euckr_str[i+1];
372
373 if (cp949_to_uni_tbls[hi-0x81] == NULL)
374 c = (unicode_char)0xFFFD;
375 else
376 c = cp949_to_uni_tbls[hi-0x81][lo-0x41];
377
378 uc[pos++] = c;
379 if (c == (unicode_char)0xFFFD && err)
380 {
381 *err = i;
382 free(uc);
383 return NULL;
384 }
385
386 i+=2;
387 }
388 /* CP949 extension */
389 else if ((0x81 <= (unsigned)euckr_str[i]
390 && (unsigned)euckr_str[i] <= 0xFE)
391 && ((0x41 <= (unsigned)euckr_str[i+1]
392 && (unsigned)euckr_str[i+1] <= 0x5A)
393 || (0x61 <= (unsigned)euckr_str[i+1]
394 && (unsigned)euckr_str[i+1] <= 0x7A)
395 || (0x81 <= (unsigned)euckr_str[i+1]
396 && (unsigned)euckr_str[i+1] <= 0xFE)))
397 {
398 hi = (unsigned char)euckr_str[i];
399 lo = (unsigned char)euckr_str[i+1];
400
401 if (!(compat & EUCKR_CP949_EXTENSION))
402 c = 0xFFFD;
403 else if (cp949_to_uni_tbls[hi-0x81] != NULL)
404 c = cp949_to_uni_tbls[hi-0x81][lo-0x41];
405 else
406 c = 0xFFFD;
407
408 uc[pos++] = c;
409 if (c == 0xFFFD && err)
410 *err = i;
411 free(uc);
412 return NULL;
413 i+=2;
414 }
415 /* Not found */
416 else if (err)
417 {
418 *err = i;
419 free(uc);
420 return NULL;
421 }
422 else
423 {
424 uc[pos++] = (unicode_char)0xFFFD;
425 i++;
426 }
427 }
428 uc[pos++] = 0;
429
430 return uc;
431}
432
433static unicode_char *c2u_euckr(const struct unicode_info *u,
434 const char *euckr_str, int *err)
435{
436 return c2u_euckr_doconv(u, euckr_str, err, 0);
437}
438
439static unicode_char *c2u_cp949(const struct unicode_info *u,
440 const char *euckr_str, int *err)
441{
442 return c2u_euckr_doconv(u, euckr_str, err, EUCKR_CP949_EXTENSION);
443}
444
445
446static char *u2c_euckr_doconv(const struct unicode_info *u,
447 const unicode_char *str, int *err,
448 int compat)
449{
450 int i=0;
451 int pos=0;
452 int len=0;
453 char* s;
454
455 if(err) *err = -1;
456
457 while(str[len])
458 len++;
459 s = malloc((len+1)*2);
460
461 if (!s)
462 return NULL;
463
464 for(i=0; str[i]; i++)
465 {
466 int ksx_char = 0;
467 unsigned char hi=0, lo=0;
468
469 unsigned char str_i_high=str[i] >> 8;
470
471 /* EUC-KR is mapped inside BMP range. */
472 if (str[i] >= (unicode_char)0x10000)
473 {
474 if (err)
475 {
476 *err = i;
477 free(s);
478 return NULL;
479 }
480 s[pos++] = '?';
481 }
482 /* US-ASCII */
483 else if (str[i] < (unicode_char)0x0080)
484 s[pos++] = (char)str[i];
485 /* For compatibility: 2 characters replaced by KS X 1003 */
486 else if (str[i] == (unicode_char)0x20A9) /* WON SIGN */
487 s[pos++] = 0x5C;
488 else if (str[i] == (unicode_char)0x203E) /* OVERLINE */
489 s[pos++] = 0x7E;
490 /* KS X 1001 */
491 else if (uni_to_ksx1001_tbls[str_i_high] != NULL)
492 {
493 ksx_char = uni_to_ksx1001_tbls[str_i_high][str[i] & 0xff];
494 hi = ksx_char >> 8;
495 lo = ksx_char & 0xff;
496
497 if (hi)
498 {
499 s[pos++] = hi;
500 s[pos++] = lo;
501 }
502 else
503 {
504 ksx_char = 0x003F;
505 s[pos++] = '?';
506 }
507
508 if (ksx_char == 0x003F && err)
509 {
510 *err = i;
511 free(s);
512 return NULL;
513 }
514 }
515 /* CP949 Extension */
516 else if (uni_to_cp949_tbls[str_i_high] != NULL)
517 {
518
519 if (!(compat & EUCKR_CP949_EXTENSION))
520 ksx_char = 0x003F;
521 else
522 ksx_char = uni_to_cp949_tbls[str_i_high][str[i] & 0xff];
523 hi = ksx_char >> 8;
524 lo = ksx_char & 0xff;
525
526 if (hi)
527 {
528 s[pos++] = hi;
529 s[pos++] = lo;
530 }
531 else
532 {
533 ksx_char = 0x003F;
534 s[pos++] = '?';
535 }
536
537 if (ksx_char == 0x003F && err)
538 {
539 *err = i;
540 free(s);
541 return NULL;
542 }
543 }
544 /* Not found */
545 else if (err)
546 {
547 *err = i;
548 free(s);
549 return NULL;
550 }
551 else
552 s[pos++] = '?';
553 }
554 s[pos] = 0;
555
556 return s;
557}
558
559static char *u2c_euckr(const struct unicode_info *u,
560 const unicode_char *str, int *err)
561{
562 return u2c_euckr_doconv(u, str, err, 0);
563}
564
565static char *u2c_cp949(const struct unicode_info *u,
566 const unicode_char *str, int *err)
567{
568 return u2c_euckr_doconv(u, str, err, EUCKR_CP949_EXTENSION);
569}
570
571
572static char *toupper_func(const struct unicode_info *u,
573 const char *cp, int *ip)
574{
575 unicode_char *uc = (*u->c2u)(u, cp, ip);
576 char *s;
577 size_t i;
578
579 if (!uc)
580 return (NULL);
581
582 for (i=0; uc[i] && i<10000; i++) {
583 if ((unicode_char)'a' <= uc[i] && uc[i] <= (unicode_char)'z')
584 uc[i] = uc[i] - ((unicode_char)'a' - (unicode_char)'A');
585 }
586
587 s = (*u->u2c)(u, uc, NULL);
588 free(uc);
589 return (s);
590}
591
592static char *tolower_func(const struct unicode_info *u,
593 const char *cp, int *ip)
594{
595 unicode_char *uc = (*u->c2u)(u, cp, ip);
596 char *s;
597 size_t i;
598
599 if (!uc)
600 return (NULL);
601
602 for (i=0; uc[i]; i++) {
603 if ((unicode_char)'A' <= uc[i] && uc[i] <= (unicode_char)'Z')
604 uc[i] = uc[i] + ((unicode_char)'a' - (unicode_char)'A');
605 }
606
607 s = (*u->u2c)(u, uc, NULL);
608 free(uc);
609
610 return (s);
611}
612
613
614static char *totitle_func(const struct unicode_info *u,
615 const char *cp, int *ip)
616{
617 unicode_char *uc = (*u->c2u)(u, cp, ip);
618 char *s;
619
620 if (!uc)
621 return (NULL);
622
623 /* Uh, sorry, what's "title" char? */
624 /*
625 * for (i=0; uc[i]; i++)
626 * uc[i] = unicode_tc(uc[i]);
627 */
628
629 s = (*u->u2c)(u, uc, NULL);
630 free(uc);
631 return (s);
632}
633
634extern const struct unicode_info unicode_UTF8;
635
636const struct unicode_info unicode_ISO2022_KR = {
637 "ISO-2022-KR",
638 UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_SISO |
639 UNICODE_HEADER_BASE64,
640 c2u_iso2022kr,
641 u2c_iso2022kr,
642 toupper_func,
643 tolower_func,
644 totitle_func,
645 &unicode_UTF8
646};
647
648const struct unicode_info unicode_EUC_KR = {
649 "EUC-KR",
650 UNICODE_MB | UNICODE_REPLACEABLE | UNICODE_USASCII |
651 UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64,
652 c2u_euckr,
653 u2c_euckr,
654 toupper_func,
655 tolower_func,
656 totitle_func,
657 &unicode_UTF8
658};
659
660const struct unicode_info unicode_CP949 = {
661 "CP949",
662 UNICODE_MB | UNICODE_REPLACEABLE |
663 UNICODE_HEADER_BASE64 | UNICODE_BODY_BASE64,
664 c2u_cp949,
665 u2c_cp949,
666 toupper_func,
667 tolower_func,
668 totitle_func,
669 &unicode_UTF8
670};
671