Imported Upstream version 0.63.0
[hcoop/debian/courier-authlib.git] / unicode / gb2312.pl
1 #!/usr/bin/perl
2
3 require "cjkcompat.pl";
4
# Number of buckets in the Unicode -> GB 2312 reverse lookup table.  add()
# and add_cjkcompat() hash every code point modulo this value, and the value
# itself is written to the generated C source as gb2312_revhash_size.
my $revhash=1050;

# Load the hanzi mappings from the Unihan database: every entry with a
# four-hex-digit code point whose kIRG_GSource field starts with "0-"
# contributes one mapping.
my $unihan="Unihan-3.2.0.txt.gz";

open (my $set, "-|", "gunzip -cd <$unihan") || die "$unihan: $!\n";
while (my $line=<$set>) {
    chomp $line;
    $line =~ s/\#.*//;    # Drop comments.

    # $1 is the Unicode code point, $2 the four-digit source code.  Only
    # hexadecimal digits are accepted, so both captures can be handed to
    # hex() directly; nothing read from the file is ever evaluated as Perl.
    next unless $line =~ /^U\+([0-9A-Fa-f]{4})\s+kIRG_GSource\s+0\-([0-9A-Fa-f]{4})/;

    my $unicode=hex($1);
    my $code=hex($2) | 0x8080;    # Set the high bit of both bytes.

    # add() dies if either byte falls outside 0xA1-0xF7 / 0xA1-0xFE.
    add($code >> 8, $code & 0xFF, $unicode);
}

# Closing a pipe reports the child's exit status: a missing or corrupt
# input file must abort the run instead of yielding a table without hanzi.
close ($set)
    || die ($! ? "$unihan: error closing gunzip pipe: $!\n"
               : "$unihan: gunzip exited with status $?\n");
32
# Unihan 3.2 carries no mappings for the non-hanzi characters of
# GB 2312-80, so those mappings are added by hand below.
# cf. ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/GB/GB2312.TXT
36
# Row 0xA1: punctuation, mathematical operators and miscellaneous symbols.
# The list holds (trail byte => Unicode code point) pairs and is consumed
# front to back, so the mappings are registered in the order written here.
{
    my @row_a1=(
        0xA1 => 0x3000,    # IDEOGRAPHIC SPACE
        0xA2 => 0x3001,    # IDEOGRAPHIC COMMA
        0xA3 => 0x3002,    # IDEOGRAPHIC FULL STOP
        0xA4 => 0x30FB,    # KATAKANA MIDDLE DOT
        0xA5 => 0x02C9,    # MODIFIER LETTER MACRON (Mandarin Chinese first tone)
        0xA6 => 0x02C7,    # CARON (Mandarin Chinese third tone)
        0xA7 => 0x00A8,    # DIAERESIS
        0xA8 => 0x3003,    # DITTO MARK
        0xA9 => 0x3005,    # IDEOGRAPHIC ITERATION MARK
        0xAA => 0x2015,    # HORIZONTAL BAR
        0xAB => 0xFF5E,    # FULLWIDTH TILDE
        0xAC => 0x2016,    # DOUBLE VERTICAL LINE
        0xAD => 0x2026,    # HORIZONTAL ELLIPSIS
        0xAE => 0x2018,    # LEFT SINGLE QUOTATION MARK
        0xAF => 0x2019,    # RIGHT SINGLE QUOTATION MARK
        0xB0 => 0x201C,    # LEFT DOUBLE QUOTATION MARK
        0xB1 => 0x201D,    # RIGHT DOUBLE QUOTATION MARK
        0xB2 => 0x3014,    # LEFT TORTOISE SHELL BRACKET
        0xB3 => 0x3015,    # RIGHT TORTOISE SHELL BRACKET
        0xB4 => 0x3008,    # LEFT ANGLE BRACKET
        0xB5 => 0x3009,    # RIGHT ANGLE BRACKET
        0xB6 => 0x300A,    # LEFT DOUBLE ANGLE BRACKET
        0xB7 => 0x300B,    # RIGHT DOUBLE ANGLE BRACKET
        0xB8 => 0x300C,    # LEFT CORNER BRACKET
        0xB9 => 0x300D,    # RIGHT CORNER BRACKET
        0xBA => 0x300E,    # LEFT WHITE CORNER BRACKET
        0xBB => 0x300F,    # RIGHT WHITE CORNER BRACKET
        0xBC => 0x3016,    # LEFT WHITE LENTICULAR BRACKET
        0xBD => 0x3017,    # RIGHT WHITE LENTICULAR BRACKET
        0xBE => 0x3010,    # LEFT BLACK LENTICULAR BRACKET
        0xBF => 0x3011,    # RIGHT BLACK LENTICULAR BRACKET
        0xC0 => 0x00B1,    # PLUS-MINUS SIGN
        0xC1 => 0x00D7,    # MULTIPLICATION SIGN
        0xC2 => 0x00F7,    # DIVISION SIGN
        0xC3 => 0x2236,    # RATIO
        0xC4 => 0x2227,    # LOGICAL AND
        0xC5 => 0x2228,    # LOGICAL OR
        0xC6 => 0x2211,    # N-ARY SUMMATION
        0xC7 => 0x220F,    # N-ARY PRODUCT
        0xC8 => 0x222A,    # UNION
        0xC9 => 0x2229,    # INTERSECTION
        0xCA => 0x2208,    # ELEMENT OF
        0xCB => 0x2237,    # PROPORTION
        0xCC => 0x221A,    # SQUARE ROOT
        0xCD => 0x22A5,    # UP TACK
        0xCE => 0x2225,    # PARALLEL TO
        0xCF => 0x2220,    # ANGLE
        0xD0 => 0x2312,    # ARC
        0xD1 => 0x2299,    # CIRCLED DOT OPERATOR
        0xD2 => 0x222B,    # INTEGRAL
        0xD3 => 0x222E,    # CONTOUR INTEGRAL
        0xD4 => 0x2261,    # IDENTICAL TO
        0xD5 => 0x224C,    # ALL EQUAL TO
        0xD6 => 0x2248,    # ALMOST EQUAL TO
        0xD7 => 0x223D,    # REVERSED TILDE
        0xD8 => 0x221D,    # PROPORTIONAL TO
        0xD9 => 0x2260,    # NOT EQUAL TO
        0xDA => 0x226E,    # NOT LESS-THAN
        0xDB => 0x226F,    # NOT GREATER-THAN
        0xDC => 0x2264,    # LESS-THAN OR EQUAL TO
        0xDD => 0x2265,    # GREATER-THAN OR EQUAL TO
        0xDE => 0x221E,    # INFINITY
        0xDF => 0x2235,    # BECAUSE
        0xE0 => 0x2234,    # THEREFORE
        0xE1 => 0x2642,    # MALE SIGN
        0xE2 => 0x2640,    # FEMALE SIGN
        0xE3 => 0x00B0,    # DEGREE SIGN
        0xE4 => 0x2032,    # PRIME
        0xE5 => 0x2033,    # DOUBLE PRIME
        0xE6 => 0x2103,    # DEGREE CELSIUS
        0xE7 => 0xFF04,    # FULLWIDTH DOLLAR SIGN
        0xE8 => 0x00A4,    # CURRENCY SIGN
        0xE9 => 0xFFE0,    # FULLWIDTH CENT SIGN
        0xEA => 0xFFE1,    # FULLWIDTH POUND SIGN
        0xEB => 0x2030,    # PER MILLE SIGN
        0xEC => 0x00A7,    # SECTION SIGN
        0xED => 0x2116,    # NUMERO SIGN
        0xEE => 0x2606,    # WHITE STAR
        0xEF => 0x2605,    # BLACK STAR
        0xF0 => 0x25CB,    # WHITE CIRCLE
        0xF1 => 0x25CF,    # BLACK CIRCLE
        0xF2 => 0x25CE,    # BULLSEYE
        0xF3 => 0x25C7,    # WHITE DIAMOND
        0xF4 => 0x25C6,    # BLACK DIAMOND
        0xF5 => 0x25A1,    # WHITE SQUARE
        0xF6 => 0x25A0,    # BLACK SQUARE
        0xF7 => 0x25B3,    # WHITE UP-POINTING TRIANGLE
        0xF8 => 0x25B2,    # BLACK UP-POINTING TRIANGLE
        0xF9 => 0x203B,    # REFERENCE MARK
        0xFA => 0x2192,    # RIGHTWARDS ARROW
        0xFB => 0x2190,    # LEFTWARDS ARROW
        0xFC => 0x2191,    # UPWARDS ARROW
        0xFD => 0x2193,    # DOWNWARDS ARROW
        0xFE => 0x3013,    # GETA MARK
    );

    while (my ($trail, $ucs)=splice(@row_a1, 0, 2)) {
        add(0xA1, $trail, $ucs);
    }
}
# Rows 0xA2-0xA5: each entry describes a run of consecutive trail bytes
# that maps onto a run of consecutive Unicode code points.  Runs are
# registered in the order listed; a run whose first and last trail byte
# coincide is a single character.
{
    my @runs=(
        # lead, first trail, last trail, Unicode of first trail
        [0xA2, 0xB1, 0xC4, 0x2488],    # DIGIT/NUMBER FULL STOP
        [0xA2, 0xC5, 0xD8, 0x2474],    # PARENTHESIZED DIGIT/NUMBER
        [0xA2, 0xD9, 0xE2, 0x2460],    # CIRCLED DIGIT/NUMBER
        [0xA2, 0xE5, 0xEE, 0x3220],    # PARENTHESIZED IDEOGRAPH
        [0xA2, 0xF1, 0xFC, 0x2160],    # ROMAN NUMERAL
        [0xA3, 0xA1, 0xA3, 0xFF01],    # Fullwidth forms of BASIC LATIN ...
        [0xA3, 0xA5, 0xFD, 0xFF05],    # ... continued after 0xA3A4
        [0xA3, 0xA4, 0xA4, 0xFFE5],    # FULLWIDTH YEN SIGN
        [0xA3, 0xFE, 0xFE, 0xFFE3],    # FULLWIDTH MACRON
        [0xA4, 0xA1, 0xF3, 0x3041],    # HIRAGANA
        [0xA5, 0xA1, 0xF6, 0x30A1],    # KATAKANA
    );

    foreach my $run (@runs) {
        my ($lead, $first, $last, $ucs)=@$run;

        foreach my $trail ($first..$last) {
            add($lead, $trail, $ucs+$trail-$first);
        }
    }
}
# Row 0xA6: Greek.  Both alphabets are contiguous in GB 2312 but not in
# Unicode, where one code point is skipped between RHO and SIGMA
# (U+03A2 for capitals, U+03C2 for small letters), hence two runs each.
{
    my @runs=(
        # first trail, last trail, Unicode of first trail
        [0xA1, 0xB1, 0x0391],    # GREEK CAPITAL LETTER ALPHA .. RHO
        [0xB2, 0xB8, 0x03A3],    # GREEK CAPITAL LETTER SIGMA .. OMEGA
        [0xC1, 0xD1, 0x03B1],    # GREEK SMALL LETTER ALPHA .. RHO
        [0xD2, 0xD8, 0x03C3],    # GREEK SMALL LETTER SIGMA .. OMEGA
    );

    foreach my $run (@runs) {
        my ($first, $last, $ucs)=@$run;

        foreach my $trail ($first..$last) {
            add(0xA6, $trail, $ucs+$trail-$first);
        }
    }
}
# Row 0xA7: Cyrillic.  GB 2312 places IO directly after IE, whereas its
# Unicode code point (U+0401 / U+0451) lies outside the main alphabet run,
# so each alphabet is registered as run, single letter, run.
{
    my @runs=(
        # first trail, last trail, Unicode of first trail
        [0xA1, 0xA6, 0x0410],    # CYRILLIC CAPITAL LETTER A .. IE
        [0xA7, 0xA7, 0x0401],    # CYRILLIC CAPITAL LETTER IO
        [0xA8, 0xC1, 0x0416],    # CYRILLIC CAPITAL LETTER ZHE .. YA
        [0xD1, 0xD6, 0x0430],    # CYRILLIC SMALL LETTER A .. IE
        [0xD7, 0xD7, 0x0451],    # CYRILLIC SMALL LETTER IO
        [0xD8, 0xF1, 0x0436],    # CYRILLIC SMALL LETTER ZHE .. YA
    );

    foreach my $run (@runs) {
        my ($first, $last, $ucs)=@$run;

        foreach my $trail ($first..$last) {
            add(0xA7, $trail, $ucs+$trail-$first);
        }
    }
}
# Row 0xA8, first part: accented Latin small letters, given as
# (trail byte => Unicode code point) pairs and registered in list order.
{
    my @latin=(
        0xA1 => 0x0101,    # LATIN SMALL LETTER A WITH MACRON
        0xA2 => 0x00E1,    # LATIN SMALL LETTER A WITH ACUTE
        0xA3 => 0x01CE,    # LATIN SMALL LETTER A WITH CARON
        0xA4 => 0x00E0,    # LATIN SMALL LETTER A WITH GRAVE
        0xA5 => 0x0113,    # LATIN SMALL LETTER E WITH MACRON
        0xA6 => 0x00E9,    # LATIN SMALL LETTER E WITH ACUTE
        0xA7 => 0x011B,    # LATIN SMALL LETTER E WITH CARON
        0xA8 => 0x00E8,    # LATIN SMALL LETTER E WITH GRAVE
        0xA9 => 0x012B,    # LATIN SMALL LETTER I WITH MACRON
        0xAA => 0x00ED,    # LATIN SMALL LETTER I WITH ACUTE
        0xAB => 0x01D0,    # LATIN SMALL LETTER I WITH CARON
        0xAC => 0x00EC,    # LATIN SMALL LETTER I WITH GRAVE
        0xAD => 0x014D,    # LATIN SMALL LETTER O WITH MACRON
        0xAE => 0x00F3,    # LATIN SMALL LETTER O WITH ACUTE
        0xAF => 0x01D2,    # LATIN SMALL LETTER O WITH CARON
        0xB0 => 0x00F2,    # LATIN SMALL LETTER O WITH GRAVE
        0xB1 => 0x016B,    # LATIN SMALL LETTER U WITH MACRON
        0xB2 => 0x00FA,    # LATIN SMALL LETTER U WITH ACUTE
        0xB3 => 0x01D4,    # LATIN SMALL LETTER U WITH CARON
        0xB4 => 0x00F9,    # LATIN SMALL LETTER U WITH GRAVE
        0xB5 => 0x01D6,    # LATIN SMALL LETTER U WITH DIAERESIS AND MACRON
        0xB6 => 0x01D8,    # LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE
        0xB7 => 0x01DA,    # LATIN SMALL LETTER U WITH DIAERESIS AND CARON
        0xB8 => 0x01DC,    # LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE
        0xB9 => 0x00FC,    # LATIN SMALL LETTER U WITH DIAERESIS
        0xBA => 0x00EA,    # LATIN SMALL LETTER E WITH CIRCUMFLEX
    );

    while (my ($trail, $ucs)=splice(@latin, 0, 2)) {
        add(0xA8, $trail, $ucs);
    }
}

# Row 0xA8, second part, and row 0xA9: runs of consecutive trail bytes
# mapping onto consecutive Unicode code points.
{
    my @runs=(
        # lead, first trail, last trail, Unicode of first trail
        [0xA8, 0xC5, 0xE9, 0x3105],    # BOPOMOFO
        [0xA9, 0xA4, 0xEF, 0x2500],    # BOX DRAWINGS
    );

    foreach my $run (@runs) {
        my ($lead, $first, $last, $ucs)=@$run;

        foreach my $trail ($first..$last) {
            add($lead, $trail, $ucs+$trail-$first);
        }
    }
}
313
# add($codeh, $codel, $unicode)
#
# Register one mapping between a two-byte GB 2312 code and a Unicode code
# point.
#
#   $codeh   - lead byte, must lie in 0xA1-0xF7
#   $codel   - trail byte, must lie in 0xA1-0xFE
#   $unicode - Unicode code point
#
# Three tables are updated:
#   %fwd     $fwd{$codeh}{$codel} = $unicode
#   @rev     "$unicode $code" is appended to bucket $unicode % $revhash
#   %revmap  $revmap{$unicode} = $code; a later call for the same code
#            point replaces the earlier entry
# where $code is $codeh*256+$codel.
#
# Dies with a diagnostic, before touching any table, when either byte is
# out of range.  Returns $code.
sub add {
    # Lexical copies: the arguments must not shadow or disturb the package
    # variables of the same names used elsewhere in the script.
    my ($codeh, $codel, $unicode) = @_;

    die sprintf("add: lead byte 0x%02X out of range 0xA1-0xF7\n", $codeh)
        if $codeh < 0xA1 || $codeh > 0xF7;
    die sprintf("add: trail byte 0x%02X out of range 0xA1-0xFE\n", $codel)
        if $codel < 0xA1 || $codel > 0xFE;

    my $code = $codeh*256+$codel;

    # Missing inner hashes and bucket arrays are created on first use.
    $fwd{$codeh}{$codel} = $unicode;
    push @{ $rev[$unicode % $revhash] }, "$unicode $code";

    $revmap{$unicode} = $code;
}
345
# Register the Unicode CJK compatibility ideographs: each compatibility
# table is handed to add_cjkcompat() in turn, in this fixed order.
foreach my $compat_table (\%compat_ksx1001, \%compat_big5, \%compat_ibm32,
                          \%compat_jisx0213, \%compat_cns11643) {
    add_cjkcompat(%$compat_table);
}
352
# add_cjkcompat(%compat)
#
# Extend the reverse table with aliases.  Each key of %compat is a code
# point that should convert to the same GB 2312 code as the code point
# stored as its value.  For every pair whose value already has an entry in
# %revmap, "$key $code" is appended to bucket $key % $revhash of @rev;
# pairs whose value is not in %revmap are skipped.  %fwd and %revmap are
# not modified.
#
# Keys are visited in sorted order so that the generated table does not
# depend on Perl's hash ordering and is identical from run to run.
# NOTE(review): the keys are used as numbers by the modulus below, so
# %compat is presumably keyed by numeric code points - confirm against
# cjkcompat.pl.
sub add_cjkcompat {
    my (%compat) = @_;

    foreach my $compat_ucs (sort keys %compat) {
        my $code = $revmap{ $compat{$compat_ucs} };

        next unless defined $code;

        # A missing bucket array is created on first use.
        push @{ $rev[$compat_ucs % $revhash] }, "$compat_ucs $code";
    }
}
368
369
370
# File header of the generated C source.  The here-document is quoted so
# that the $Id$ keyword is emitted literally.
print <<'EOT';

/*
** Copyright 2000-2001 Double Precision, Inc.
** See COPYING for distribution information.
**
** $Id: gb2312.pl,v 1.4 2004/02/08 04:59:15 mrsam Exp $
** Non-hanzi support by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>
*/

#include "unicode.h"
EOT

# Forward tables: one 94-entry array per lead byte that has mappings,
# indexed by trail byte 0xA1-0xFE.  Trail bytes without a mapping are
# printed as 0.  The keys of %fwd are lead bytes, which add() restricts to
# 0xA1-0xF7, so the default string sort already yields numeric order.
foreach my $lead (sort keys %fwd) {
    my $row = $fwd{$lead};
    my $initializer = "";

    foreach my $trail (0xA1..0xFE) {
        $initializer .= "\n" if ($trail % 16) == 0;
        $initializer .= sprintf("%d", $row->{$trail});
        $initializer .= "," if $trail < 0xFE;
    }

    printf ("static const unicode_char gb2312_%02x[94]={", $lead);
    print $initializer, "};\n";
}
399
# Reverse (Unicode -> GB 2312) tables.  The buckets of @rev are flattened,
# in bucket order, into two parallel arrays:
#   gb2312_revtable_uc      the Unicode code point of every entry
#   gb2312_revtable_octets  the matching two-byte GB 2312 code
# gb2312_revtable_index holds, for each of the $revhash buckets, the
# offset of its first entry in those arrays.
{
    my (@revtable_uc, @revtable_octets, @revindex);

    foreach my $bucket (0..$revhash-1) {
        $revindex[$bucket] = scalar(@revtable_uc);

        # Buckets that never received an entry are undefined.
        foreach my $entry (@{ $rev[$bucket] || [] }) {
            # Entries have the form "<unicode> <code>".
            my ($uc) = $entry =~ /^([^ ]*)/;
            my ($octets) = $entry =~ /([^ ]*)$/;

            push @revtable_uc, $uc;
            push @revtable_octets, $octets;
        }
    }

    # Print a comma separated C initializer list, breaking the line after
    # every 16th value.
    my $print_list = sub {
        my $n = 0;

        foreach my $value (@_) {
            print "," if $n > 0;
            print "\n" if $n && ($n % 16) == 0;
            print $value;
            ++$n;
        }
    };

    print "static const unsigned gb2312_revhash_size=$revhash;\n",
          "static const unicode_char gb2312_revtable_uc[]={\n";
    $print_list->(@revtable_uc);

    print "};\nstatic const unsigned gb2312_revtable_octets[]={\n";
    $print_list->(@revtable_octets);

    print "};\nstatic const unsigned gb2312_revtable_index[]={\n";
    $print_list->(@revindex);

    print "};\n";
}