Commit | Line | Data |
---|---|---|
8d138742 CE |
1 | #! /usr/bin/perl |
2 | ||
3 | # USAGE: perl ksx1001.pl > ksx1001.h | |
4 | ||
# Requires CP949.TXT, found on:
# http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT
7 | ||
8 | require "cjkcompat.pl"; | |
9 | ||
# Name of the input mapping file and the number of table entries written
# per output line.  Both stay package globals; $perline is read again by
# the table-emitting code later in the script.
$cp949 = 'CP949.TXT';
$perline = 8;

# Read the mapping file into %k2u (multi-byte code => Unicode code point).
# The file is opened read-only with an explicit mode and a lexical handle,
# and lines are read into a named variable instead of the global $_.
die "${cp949}: File not found.\n" if (!(-f $cp949));
open(my $set, '<', $cp949) or die "${cp949}: $!\n";

while (my $line = <$set>) {
    chomp $line;

    # Strip comments; a useful line is then "0xKKKK 0xUUUU" and nothing else.
    $line =~ s/\#.*//;

    next unless $line =~ /^0x([0-9A-F]{2,4})\s+0x([0-9A-F]{4})\s*$/;

    my ($code, $ucode) = (hex($1), hex($2));

    # Only codes above 0x8000 are recorded; anything smaller is skipped.
    if ($code > 0x8000) {
        # A code listed twice keeps its last mapping, with a warning.
        print STDERR "Warning: duplicated: $code->$k2u{$code},$ucode\n"
            if defined $k2u{$code};
        $k2u{$code} = $ucode;
    }
}

close $set;
33 | ||
34 | # make reversal map. | |
35 | ||
# Build the reverse maps (Unicode code point => multi-byte code) from %k2u.
#
# A code whose lead byte and trail byte are both >= 0xA1 is stored in
# %u2k_ksx1001; every other code is stored in %u2k_cp949.
#
# Keys are visited in ascending numeric order so that the outcome, including
# which code wins when two codes share one Unicode value, does not depend on
# hash ordering.  The duplicate warning looks at the two maps filled here
# (a lookup in %u2k would never match, since nothing assigns to it).
foreach my $kcode (sort { $a <=> $b } keys %k2u) {
    my $ucode = $k2u{$kcode};
    my $lead  = $kcode >> 8;
    my $trail = $kcode & 0x00FF;

    # Code already recorded for this Unicode value, if any.
    my $prev = defined $u2k_ksx1001{$ucode} ? $u2k_ksx1001{$ucode}
             :                                 $u2k_cp949{$ucode};
    print STDERR "Warning: duplicated: $prev,$kcode<-$ucode\n"
        if defined $prev;

    if ($lead >= 0xA1 && $trail >= 0xA1) {
        $u2k_ksx1001{$ucode} = $kcode;
    } else {
        $u2k_cp949{$ucode} = $kcode;
    }
}
47 | ||
48 | # Add maps for CJK compatibility ideographs of Unicode. | |
49 | ####&add_cjkcompat(%compat_ksx1001); | |
# Run add_cjkcompat over each compatibility table, in the same order as
# before (the tables are presumably supplied by cjkcompat.pl -- confirm).
foreach my $compat_table (\%compat_big5, \%compat_ibm32,
                          \%compat_jisx0213, \%compat_cns11643) {
    add_cjkcompat(%{$compat_table});
}
54 | ||
# add_cjkcompat(%compat)
#
# Takes a flat key/value list.  For every pair, if the value already has an
# entry in %u2k_ksx1001 and/or %u2k_cp949, the key is given that same entry
# in the corresponding map.  Pairs whose value is in neither map are ignored.
# NOTE(review): presumably keys are compatibility ideographs and values are
# their unified counterparts -- confirm against cjkcompat.pl.
sub add_cjkcompat {
    my %base_of = @_;
    while (my ($compat_char, $base_char) = each %base_of) {
        for my $map (\%u2k_ksx1001, \%u2k_cp949) {
            next unless defined $map->{$base_char};
            $map->{$compat_char} = $map->{$base_char};
        }
    }
}
66 | ||
67 | ||
# Emit the fixed preamble of the generated C header.
#
# The text is static, so a non-interpolating heredoc is used.  In an
# interpolating one the RCS keyword below would be read as the (empty) Perl
# variable $Id and the generated header would lose the "$Id" prefix.  With
# no interpolation the "@" in the address needs no backslash either.
print <<'EOF';
#ifndef _KSX1001_HDR_
#define _KSX1001_HDR_
/*
 * KS X 1001 and CP949 (UHC) support
 * by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>
 * $Id: ksx1001.pl,v 1.1 2004/02/03 02:00:00 mrsam Exp $
 *
 */

#include "unicode.h"

#define KS_CHAR_SO 0x0E
#define KS_CHAR_SI 0x0F
#define KS_CHAR_ESC 0x1B

/* ISOREG #1/#3: US-ASCII (identical to ISO 646 IRV) */
#define KS_STATE_ASCII 0x0
/* ISOREG #149: KS X 1001:1992 Wansung */
#define KS_STATE_KSX1001 0x4
/* Unknown state */
#define KS_STATE_BINARY 0xF

EOF
92 | ||
# Emit the CP949 -> Unicode tables.
#
# One C array is written for each lead byte 0x81..0xFE that has at least one
# mapping in %k2u.  Each array has 190 cells, one per trail byte 0x41..0xFE,
# and unmapped cells are written as 0xFFFD.  cp949_to_uni_tbls[] then lists,
# per lead byte, either that array's name or NULL.
#
# The 0xFFFD filler is computed while formatting, so %k2u itself is not
# modified, and all loop state is lexical.
print "/* map: CP949 to Unicode */\n";
my @cp949_tbl_names;
foreach my $lead (0x81 .. 0xFE) {
    my @trails = (0x41 .. 0xFE);

    # A lead byte with no mapping at all gets no array, only a NULL slot.
    unless (grep { $k2u{$lead*256 + $_} } @trails) {
        push @cp949_tbl_names, "NULL";
        next;
    }

    my @cells = map { sprintf("0x%04X", $k2u{$lead*256 + $_} || 0xFFFD) }
                @trails;

    # Group the cells into rows of $perline entries each.
    my @rows;
    push @rows, join(", ", splice(@cells, 0, $perline)) while @cells;

    printf "static const unicode_char cp949_to_uni_tbl_%02x[] = {", $lead;
    # Rows are separated by ", " followed by a newline, so every row but the
    # last ends in a comma and one trailing space.
    print "\n ", join(", \n ", @rows), "\n};\n";
    push @cp949_tbl_names, sprintf("cp949_to_uni_tbl_%02x", $lead);
}

print "const unicode_char * cp949_to_uni_tbls[] = {\n";
print " ", join(",\n ", @cp949_tbl_names);

print "\n};\n";
print "\n\n";
127 | ||
128 | ||
# Emit the Unicode -> KS X 1001 tables.
#
# One C array is written for each high byte 0x00..0xFF of the Unicode value
# that has at least one entry in %u2k_ksx1001.  Each array has 256 cells, one
# per low byte, and cells without an entry are written as 0x003F.
# uni_to_ksx1001_tbls[] then lists, per high byte, the array name or NULL.
#
# The 0x003F filler is computed while formatting, so %u2k_ksx1001 itself is
# not modified, and all loop state is lexical.
print "/* map: Unicode to KS X 1001 */\n";
my @ksx1001_tbl_names;
foreach my $high (0x00 .. 0xFF) {
    my @lows = (0x00 .. 0xFF);

    # A high byte with no entry at all gets no array, only a NULL slot.
    unless (grep { $u2k_ksx1001{$high*256 + $_} } @lows) {
        push @ksx1001_tbl_names, "NULL";
        next;
    }

    my @cells = map { sprintf("0x%04X", $u2k_ksx1001{$high*256 + $_} || 0x003F) }
                @lows;

    # Group the cells into rows of $perline entries each.
    my @rows;
    push @rows, join(", ", splice(@cells, 0, $perline)) while @cells;

    printf "static const unicode_char uni_to_ksx1001_tbl_%02x[] = {", $high;
    # Rows are separated by ", " followed by a newline, so every row but the
    # last ends in a comma and one trailing space.
    print "\n ", join(", \n ", @rows), "\n};\n";
    push @ksx1001_tbl_names, sprintf("uni_to_ksx1001_tbl_%02x", $high);
}

print "const unicode_char * uni_to_ksx1001_tbls[] = {\n";
print " ", join(",\n ", @ksx1001_tbl_names);

print "\n};\n";
print "\n\n";
164 | ||
# Emit the Unicode -> CP949 extension tables, then close the include guard.
#
# One C array is written for each high byte 0x00..0xFF of the Unicode value
# that has at least one entry in %u2k_cp949.  Each array has 256 cells, one
# per low byte, and cells without an entry are written as 0x003F.
# uni_to_cp949_tbls[] then lists, per high byte, the array name or NULL.
#
# The 0x003F filler is computed while formatting, so %u2k_cp949 itself is
# not modified, and all loop state is lexical.
print "/* map: Unicode to CP949 extension */\n";
my @cp949ext_tbl_names;
foreach my $high (0x00 .. 0xFF) {
    my @lows = (0x00 .. 0xFF);

    # A high byte with no entry at all gets no array, only a NULL slot.
    unless (grep { $u2k_cp949{$high*256 + $_} } @lows) {
        push @cp949ext_tbl_names, "NULL";
        next;
    }

    my @cells = map { sprintf("0x%04X", $u2k_cp949{$high*256 + $_} || 0x003F) }
                @lows;

    # Group the cells into rows of $perline entries each.
    my @rows;
    push @rows, join(", ", splice(@cells, 0, $perline)) while @cells;

    printf "static const unicode_char uni_to_cp949_tbl_%02x[] = {", $high;
    # Rows are separated by ", " followed by a newline, so every row but the
    # last ends in a comma and one trailing space.
    print "\n ", join(", \n ", @rows), "\n};\n";
    push @cp949ext_tbl_names, sprintf("uni_to_cp949_tbl_%02x", $high);
}

print "const unicode_char * uni_to_cp949_tbls[] = {\n";
print " ", join(",\n ", @cp949ext_tbl_names);

print "\n};\n";
print "\n\n";


# Close the include guard opened at the top of the generated header.
print "#endif /* _KSX1001_HDR_ */\n";