Commit | Line | Data |
---|---|---|
8d138742 CE |
1 | #!/usr/bin/perl |
2 | ||
3 | require "cjkcompat.pl"; | |
4 | ||
# Number of buckets in the reverse (Unicode -> GB 2312) hash table; add()
# and add_cjkcompat() reduce every code point modulo this value.
my $revhash=1050;

# Collect the hanzi mappings from the kIRG_GSource field of the Unihan
# database.  Only values of the form "0-XXXX" are used, where XXXX is a
# four-digit hexadecimal code.
my $unihan = "Unihan-3.2.0.txt.gz";

# List-form pipe open: no shell is involved, and a failure to start
# gunzip is reported here.
open (my $set, "-|", "gunzip", "-cd", $unihan)
	or die "gunzip $unihan: $!\n";

while (my $line = <$set>)
{
	chomp $line;
	$line =~ s/\#.*//;

	next unless $line =~ /^U\+([0-9A-Fa-f]{4})\s+kIRG_GSource\s+0\-([0-9A-Fa-f]{4})/;

	# hex() converts the captured digits directly; no string eval needed.
	my $unicode = hex($1);

	# Set the high bit of both bytes; add() only accepts bytes in the
	# 0xA1..0xFE range.
	my $code = hex($2) | 0x8080;

	add($code >> 8, $code & 0xFF, $unicode);
}

# close() on a piped handle waits for gunzip, so a missing or corrupt
# input file is detected here rather than producing empty tables.
close ($set)
	or die "gunzip $unihan: " . ($! ? "$!" : "exit status $?") . "\n";
32 | ||
# Unihan-3.2 carries no data for the non-hanzi part of GB 2312-80, so the
# mappings for those rows are listed by hand below.
# cf. ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/GB/GB2312.TXT

# Row 0xA1: one Unicode code point per cell, in cell order 0xA1..0xFE.
# The trailing comment gives the cell byte and the character name.
my @gb2312_row_a1 = (
	0x3000,	# A1 IDEOGRAPHIC SPACE
	0x3001,	# A2 IDEOGRAPHIC COMMA
	0x3002,	# A3 IDEOGRAPHIC FULL STOP
	0x30FB,	# A4 KATAKANA MIDDLE DOT
	0x02C9,	# A5 MODIFIER LETTER MACRON (Mandarin Chinese first tone)
	0x02C7,	# A6 CARON (Mandarin Chinese third tone)
	0x00A8,	# A7 DIAERESIS
	0x3003,	# A8 DITTO MARK
	0x3005,	# A9 IDEOGRAPHIC ITERATION MARK
	0x2015,	# AA HORIZONTAL BAR
	0xFF5E,	# AB FULLWIDTH TILDE
	0x2016,	# AC DOUBLE VERTICAL LINE
	0x2026,	# AD HORIZONTAL ELLIPSIS
	0x2018,	# AE LEFT SINGLE QUOTATION MARK
	0x2019,	# AF RIGHT SINGLE QUOTATION MARK
	0x201C,	# B0 LEFT DOUBLE QUOTATION MARK
	0x201D,	# B1 RIGHT DOUBLE QUOTATION MARK
	0x3014,	# B2 LEFT TORTOISE SHELL BRACKET
	0x3015,	# B3 RIGHT TORTOISE SHELL BRACKET
	0x3008,	# B4 LEFT ANGLE BRACKET
	0x3009,	# B5 RIGHT ANGLE BRACKET
	0x300A,	# B6 LEFT DOUBLE ANGLE BRACKET
	0x300B,	# B7 RIGHT DOUBLE ANGLE BRACKET
	0x300C,	# B8 LEFT CORNER BRACKET
	0x300D,	# B9 RIGHT CORNER BRACKET
	0x300E,	# BA LEFT WHITE CORNER BRACKET
	0x300F,	# BB RIGHT WHITE CORNER BRACKET
	0x3016,	# BC LEFT WHITE LENTICULAR BRACKET
	0x3017,	# BD RIGHT WHITE LENTICULAR BRACKET
	0x3010,	# BE LEFT BLACK LENTICULAR BRACKET
	0x3011,	# BF RIGHT BLACK LENTICULAR BRACKET
	0x00B1,	# C0 PLUS-MINUS SIGN
	0x00D7,	# C1 MULTIPLICATION SIGN
	0x00F7,	# C2 DIVISION SIGN
	0x2236,	# C3 RATIO
	0x2227,	# C4 LOGICAL AND
	0x2228,	# C5 LOGICAL OR
	0x2211,	# C6 N-ARY SUMMATION
	0x220F,	# C7 N-ARY PRODUCT
	0x222A,	# C8 UNION
	0x2229,	# C9 INTERSECTION
	0x2208,	# CA ELEMENT OF
	0x2237,	# CB PROPORTION
	0x221A,	# CC SQUARE ROOT
	0x22A5,	# CD UP TACK
	0x2225,	# CE PARALLEL TO
	0x2220,	# CF ANGLE
	0x2312,	# D0 ARC
	0x2299,	# D1 CIRCLED DOT OPERATOR
	0x222B,	# D2 INTEGRAL
	0x222E,	# D3 CONTOUR INTEGRAL
	0x2261,	# D4 IDENTICAL TO
	0x224C,	# D5 ALL EQUAL TO
	0x2248,	# D6 ALMOST EQUAL TO
	0x223D,	# D7 REVERSED TILDE
	0x221D,	# D8 PROPORTIONAL TO
	0x2260,	# D9 NOT EQUAL TO
	0x226E,	# DA NOT LESS-THAN
	0x226F,	# DB NOT GREATER-THAN
	0x2264,	# DC LESS-THAN OR EQUAL TO
	0x2265,	# DD GREATER-THAN OR EQUAL TO
	0x221E,	# DE INFINITY
	0x2235,	# DF BECAUSE
	0x2234,	# E0 THEREFORE
	0x2642,	# E1 MALE SIGN
	0x2640,	# E2 FEMALE SIGN
	0x00B0,	# E3 DEGREE SIGN
	0x2032,	# E4 PRIME
	0x2033,	# E5 DOUBLE PRIME
	0x2103,	# E6 DEGREE CELSIUS
	0xFF04,	# E7 FULLWIDTH DOLLAR SIGN
	0x00A4,	# E8 CURRENCY SIGN
	0xFFE0,	# E9 FULLWIDTH CENT SIGN
	0xFFE1,	# EA FULLWIDTH POUND SIGN
	0x2030,	# EB PER MILLE SIGN
	0x00A7,	# EC SECTION SIGN
	0x2116,	# ED NUMERO SIGN
	0x2606,	# EE WHITE STAR
	0x2605,	# EF BLACK STAR
	0x25CB,	# F0 WHITE CIRCLE
	0x25CF,	# F1 BLACK CIRCLE
	0x25CE,	# F2 BULLSEYE
	0x25C7,	# F3 WHITE DIAMOND
	0x25C6,	# F4 BLACK DIAMOND
	0x25A1,	# F5 WHITE SQUARE
	0x25A0,	# F6 BLACK SQUARE
	0x25B3,	# F7 WHITE UP-POINTING TRIANGLE
	0x25B2,	# F8 BLACK UP-POINTING TRIANGLE
	0x203B,	# F9 REFERENCE MARK
	0x2192,	# FA RIGHTWARDS ARROW
	0x2190,	# FB LEFTWARDS ARROW
	0x2191,	# FC UPWARDS ARROW
	0x2193,	# FD DOWNWARDS ARROW
	0x3013,	# FE GETA MARK
);

foreach my $offset (0 .. $#gb2312_row_a1) {
	add(0xA1, 0xA1 + $offset, $gb2312_row_a1[$offset]);
}
# Row 0xA2: numbered-list symbols.  Each entry is
# [first cell, last cell, Unicode code point of the first cell]; the cells
# of one run map to consecutive code points.
foreach my $run (
	[0xB1, 0xC4, 0x2488],	# DIGIT/NUMBER FULL STOP
	[0xC5, 0xD8, 0x2474],	# PARENTHESIZED DIGIT/NUMBER
	[0xD9, 0xE2, 0x2460],	# CIRCLED DIGIT/NUMBER
	[0xE5, 0xEE, 0x3220],	# PARENTHESIZED IDEOGRAPH
	[0xF1, 0xFC, 0x2160],	# ROMAN NUMERAL
) {
	my ($first, $last, $base) = @$run;

	foreach my $cell ($first .. $last) {
		add(0xA2, $cell, $base + $cell - $first);
	}
}
# Row 0xA3: fullwidth forms of BASIC LATIN.  Cells 0xA4 and 0xFE are left
# out of the run and mapped individually afterwards.
my $fullwidth_delta = 0xFF01 - 0xA1;

foreach my $cell (0xA1 .. 0xA3, 0xA5 .. 0xFD) {
	add(0xA3, $cell, $cell + $fullwidth_delta);
}
add(0xA3, 0xA4, 0xFFE5);	# FULLWIDTH YEN SIGN
add(0xA3, 0xFE, 0xFFE3);	# FULLWIDTH MACRON
# Rows 0xA4 (HIRAGANA) and 0xA5 (KATAKANA).  Each entry is
# [row, last cell, Unicode code point of cell 0xA1].
foreach my $kana (
	[0xA4, 0xF3, 0x3041],	# HIRAGANA
	[0xA5, 0xF6, 0x30A1],	# KATAKANA
) {
	my ($row, $last, $base) = @$kana;

	foreach my $cell (0xA1 .. $last) {
		add($row, $cell, $base + $cell - 0xA1);
	}
}
# Row 0xA6: Greek letters.  Each entry is
# [first cell, last cell, Unicode code point of the first cell].
# The runs are split after RHO because the mapping skips U+03A2 and
# U+03C2.
foreach my $run (
	[0xA1, 0xB1, 0x0391],	# GREEK CAPITAL LETTER ALPHA .. RHO
	[0xB2, 0xB8, 0x03A3],	# GREEK CAPITAL LETTER SIGMA .. OMEGA
	[0xC1, 0xD1, 0x03B1],	# GREEK SMALL LETTER ALPHA .. RHO
	[0xD2, 0xD8, 0x03C3],	# GREEK SMALL LETTER SIGMA .. OMEGA
) {
	my ($first, $last, $base) = @$run;

	foreach my $cell ($first .. $last) {
		add(0xA6, $cell, $base + $cell - $first);
	}
}
# Row 0xA7: Cyrillic letters.  Each entry is
# [first cell, last cell, Unicode code point of the first cell].
# IO sits between IE and ZHE in this row but has a code point outside the
# contiguous Unicode run, hence the single-cell entries.
foreach my $run (
	[0xA1, 0xA6, 0x0410],	# CYRILLIC CAPITAL LETTER A .. IE
	[0xA7, 0xA7, 0x0401],	# CYRILLIC CAPITAL LETTER IO
	[0xA8, 0xC1, 0x0416],	# CYRILLIC CAPITAL LETTER ZHE .. YA
	[0xD1, 0xD6, 0x0430],	# CYRILLIC SMALL LETTER A .. IE
	[0xD7, 0xD7, 0x0451],	# CYRILLIC SMALL LETTER IO
	[0xD8, 0xF1, 0x0436],	# CYRILLIC SMALL LETTER ZHE .. YA
) {
	my ($first, $last, $base) = @$run;

	foreach my $cell ($first .. $last) {
		add(0xA7, $cell, $base + $cell - $first);
	}
}
# Row 0xA8, cells 0xA1..0xBA: accented Latin letters, one Unicode code
# point per cell in cell order.  The trailing comment gives the cell byte
# and the character name.
my @gb2312_row_a8_latin = (
	0x0101,	# A1 LATIN SMALL LETTER A WITH MACRON
	0x00E1,	# A2 LATIN SMALL LETTER A WITH ACUTE
	0x01CE,	# A3 LATIN SMALL LETTER A WITH CARON
	0x00E0,	# A4 LATIN SMALL LETTER A WITH GRAVE
	0x0113,	# A5 LATIN SMALL LETTER E WITH MACRON
	0x00E9,	# A6 LATIN SMALL LETTER E WITH ACUTE
	0x011B,	# A7 LATIN SMALL LETTER E WITH CARON
	0x00E8,	# A8 LATIN SMALL LETTER E WITH GRAVE
	0x012B,	# A9 LATIN SMALL LETTER I WITH MACRON
	0x00ED,	# AA LATIN SMALL LETTER I WITH ACUTE
	0x01D0,	# AB LATIN SMALL LETTER I WITH CARON
	0x00EC,	# AC LATIN SMALL LETTER I WITH GRAVE
	0x014D,	# AD LATIN SMALL LETTER O WITH MACRON
	0x00F3,	# AE LATIN SMALL LETTER O WITH ACUTE
	0x01D2,	# AF LATIN SMALL LETTER O WITH CARON
	0x00F2,	# B0 LATIN SMALL LETTER O WITH GRAVE
	0x016B,	# B1 LATIN SMALL LETTER U WITH MACRON
	0x00FA,	# B2 LATIN SMALL LETTER U WITH ACUTE
	0x01D4,	# B3 LATIN SMALL LETTER U WITH CARON
	0x00F9,	# B4 LATIN SMALL LETTER U WITH GRAVE
	0x01D6,	# B5 LATIN SMALL LETTER U WITH DIAERESIS AND MACRON
	0x01D8,	# B6 LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE
	0x01DA,	# B7 LATIN SMALL LETTER U WITH DIAERESIS AND CARON
	0x01DC,	# B8 LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE
	0x00FC,	# B9 LATIN SMALL LETTER U WITH DIAERESIS
	0x00EA,	# BA LATIN SMALL LETTER E WITH CIRCUMFLEX
);

foreach my $offset (0 .. $#gb2312_row_a8_latin) {
	add(0xA8, 0xA1 + $offset, $gb2312_row_a8_latin[$offset]);
}
# BOPOMOFO: row 0xA8, cells 0xC5..0xE9, consecutive from U+3105.
add(0xA8, $_, 0x3105 + ($_ - 0xC5)) for 0xC5 .. 0xE9;

# BOX DRAWINGS: row 0xA9, cells 0xA4..0xEF, consecutive from U+2500.
add(0xA9, $_, 0x2500 + ($_ - 0xA4)) for 0xA4 .. 0xEF;
313 | ||
# add(CODEH, CODEL, UNICODE)
#
# Record one mapping between a two-byte GB 2312 code and a Unicode code
# point in the three tables the generator works from:
#
#   $fwd{CODEH}{CODEL}  = UNICODE        (forward table, per lead byte)
#   @{$rev[bucket]}     gets "UNICODE CODE" appended, where
#                       bucket = UNICODE % $revhash
#   $revmap{UNICODE}    = CODE           (read by add_cjkcompat)
#
# CODE is CODEH*256+CODEL.  Dies, before touching any table, if CODEH is
# outside 0xA1..0xF7 or CODEL is outside 0xA1..0xFE.
sub add {
	my ($codeh,$codel,$unicode) = @_;

	die sprintf("add: lead byte 0x%02X outside 0xA1..0xF7 (U+%04X)\n",
		    $codeh, $unicode)
	    if $codeh < 0xA1 || $codeh > 0xF7;
	die sprintf("add: trail byte 0x%02X outside 0xA1..0xFE (U+%04X)\n",
		    $codel, $unicode)
	    if $codel < 0xA1 || $codel > 0xFE;

	my $code = $codeh*256+$codel;
	my $unicodehash= int($unicode % $revhash);

	# Nested hash and bucket array are autovivified on first use.
	$fwd{$codeh}{$codel}=$unicode;

	push @{ $rev[$unicodehash] }, "$unicode $code";

	$revmap{$unicode} = $code;
}
345 | ||
# Add maps for the CJK compatibility ideographs of Unicode, one source
# table at a time (the %compat_* hashes are presumably defined by
# cjkcompat.pl -- they are not set anywhere in this script).
foreach my $table (\%compat_ksx1001, \%compat_big5, \%compat_ibm32,
		   \%compat_jisx0213, \%compat_cns11643) {
	add_cjkcompat(%$table);
}
352 | ||
# add_cjkcompat(KEY => VALUE, ...)
#
# Takes a flattened hash.  For every pair whose VALUE already has an entry
# in %revmap (i.e. was registered through add()), append "KEY CODE" to the
# reverse bucket KEY % $revhash, where CODE is $revmap{VALUE}.  Pairs whose
# VALUE is unknown to %revmap are skipped.  The forward table is not
# touched.
#
# Keys are visited in ascending numeric order so that the generated table
# is identical from run to run; plain "keys" order is not stable.
sub add_cjkcompat {
	my (%compat) = @_;

	foreach my $compat_char (sort { $a <=> $b } keys %compat) {
		my $code = $revmap{$compat{$compat_char}};

		next unless defined $code;

		# The bucket array is autovivified on first use.
		push @{ $rev[int($compat_char % $revhash)] }, "$compat_char $code";
	}
}
368 | ||
369 | ||
370 | ||
# Emit the comment header and the #include line of the generated C file.
# The heredoc is single-quoted so that the $Id$ keyword is printed as is.
print <<'EOT';

/*
** Copyright 2000-2001 Double Precision, Inc.
** See COPYING for distribution information.
**
** $Id: gb2312.pl,v 1.4 2004/02/08 04:59:15 mrsam Exp $
** Non-hanzi support by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>
*/

#include "unicode.h"
EOT
382 | ||
# Emit one 94-entry C array per lead byte present in %fwd.  Cells without
# a mapping are printed as 0, and a line break precedes every cell whose
# byte value is a multiple of 16.
foreach my $row (sort keys %fwd)
{
	my $cells = $fwd{$row};
	my $text = "";

	foreach my $cell (0xA1 .. 0xFE)
	{
		$text .= "\n" unless $cell % 16;
		$text .= sprintf("%d", $cells->{$cell});
		$text .= "," if $cell < 0xFE;
	}

	printf ("static const unicode_char gb2312_%02x[94]={%s};\n",
		$row, $text);
}
399 | ||
# Emit the bucket count, then gb2312_revtable_uc: the Unicode half of
# every "UNICODE CODE" entry, bucket after bucket.  While doing so, record
# in @revindex the position at which each bucket starts.
print "static const unsigned gb2312_revhash_size=$revhash;\n",
      "static const unicode_char gb2312_revtable_uc[]={\n";

my $index=0;

foreach my $bucket (0 .. $revhash-1)
{
	$revindex[$bucket]=$index;

	# An unused bucket has no array yet; treat it as empty.
	my @entries = @{ $rev[$bucket] || [] };

	foreach my $entry (@entries)
	{
		print "," if $index > 0;
		print "\n" if $index && ($index % 16) == 0;

		# Keep everything before the first space.
		my ($ucs) = split(/ /, $entry, 2);

		print $ucs;
		++$index;
	}
}
425 | ||
# Close the previous array and emit gb2312_revtable_octets: the GB 2312
# half of every "UNICODE CODE" entry, in the same order as the Unicode
# halves.  $maxl ends up holding the largest last-index seen in any bucket.
print "};\nstatic const unsigned gb2312_revtable_octets[]={\n";

$maxl=0;
$index=0;

foreach my $bucket (0 .. $revhash-1)
{
	# An unused bucket has no array yet; treat it as empty.
	my @entries = @{ $rev[$bucket] || [] };

	$maxl=$#entries if $#entries > $maxl;

	foreach my $entry (@entries)
	{
		print "," if $index > 0;
		print "\n" if $index && ($index % 16) == 0;

		# Keep everything after the last space.
		print substr($entry, rindex($entry, " ") + 1);
		++$index;
	}
}
449 | ||
# Close the previous array and emit gb2312_revtable_index: the start
# offset of each bucket, 16 values per line.
print "};\nstatic const unsigned gb2312_revtable_index[]={\n";

print join(",",
	   map { ($_ && ($_ % 16) == 0 ? "\n" : "") . $revindex[$_] }
	   0 .. $revhash-1);

print "};\n";