Imported Upstream version 0.63.0
[hcoop/debian/courier-authlib.git] / unicode / big5.pl
1 #!/usr/bin/perl
2
3 # USAGE: perl big5.pl > big5.h
4
5 # Requires:
6 # - Unihan-3.2.0.txt.gz
7 # - big5-iso.txt, found on
8 # http://www.info.gov.hk/digital21/chi/hkscs/download/big5-iso.txt
9
10 require "cjkcompat.pl";
11
# Number of buckets in the Unicode -> Big5 reverse lookup hash.  The same
# value is written to the generated header as big5_revhash_size.
my $revhash=1501;
$unihan = "Unihan-3.2.0.txt.gz";
$hkscs = "big5-iso.txt";

# Get hanzi maps
#
# Every "U+XXXX kBigFive YYYY" record of the Unihan database is turned
# into one add(lead byte, trail byte, unicode) call.
open (SET, "gunzip -cd <${unihan} |") || die "${unihan}: $!\n";
while (<SET>)
{
    chomp;
    s/\#.*//;

    next unless /^U\+([0-9A-F]{4,5})\s+kBigFive\s+(....)/i;

    my ($unicode_hex, $code_hex) = ($1, $2);

    # The Big5 field is matched loosely above on purpose: a malformed
    # value must stop the build, not be skipped silently.
    die "${unihan}: line $.: malformed kBigFive value \"$code_hex\"\n"
        unless $code_hex =~ /^[0-9A-F]{4}$/i;

    # hex() replaces the former string eval of the input fields.
    my $unicode = hex($unicode_hex);
    my $code = hex($code_hex);

    # 0xA2CD is registered by hand further down (HANGZHOU NUMERAL TWENTY).
    next if $code == 0xA2CD;

    # Four hex digits were validated above, so 0 <= $code <= 0xFFFF.
    &add(int($code/256), $code % 256, $unicode);
}

# open() on a command pipe succeeds even when gunzip itself fails (for
# example because the file is missing); the failure only shows up in the
# status returned by close().
close(SET)
    || die "gunzip ${unihan}: " . ($! ? "$!" : "exit status " . ($? >> 8)) . "\n";
40
# Get HKSCS extension maps
#
# Only lines with exactly four whitespace-separated fields are used.
# Field 0 is read as the Big5 code and field 3 as the Unicode value, both
# in hexadecimal.  Each accepted line is registered with the HKSCS flag set.
open (SET, "<", $hkscs) || die "${hkscs}: $!\n";

# Start from zero so that an empty or unparsable file is reported as
# "0 characters" by the sanity check below.
$count = 0;

while (<SET>)
{
    chomp;
    s/\#.*//;

    my @fields = split(/\s+/, $_);
    next unless scalar(@fields) == 4;

    my ($unicode, $code)=(hex($fields[3]), hex($fields[0]));
    next unless $code;
    die "${hkscs}: line $.: no Unicode value for Big5 code $fields[0]\n"
        unless $unicode;

    # PUA
    ####next if 0xE000 <= $unicode && $unicode <= 0xF8FF;

    die "${hkscs}: line $.: Big5 code $fields[0] is out of range\n"
        if $code < 0 || $code > 65535;

    &add(int($code/256), $code % 256, $unicode, 1);
    $count++;
}
close SET;

# Guard against a changed input file: the hand-written compatibility maps
# below were prepared for a table with exactly this many entries.
if ($count != 4818) {
    die "$count characters are found. HKSCS extension table has been updated. Check ${hkscs}.";
}
71
72 # Unihan-3.2.0 does not make mention of Big5 non-hanzi.
73 # So manually add a converting map...
74 #
75 # Note:
76 # non-HKSCS Map is based on:
77 # http://wakaba-web.hp.infoseek.co.jp/table/big5-eten.txt
78
# Hand-written non-hanzi mappings for Big5 lead bytes 0xA1 and 0xA2.
# Every call passes (lead byte, trail byte, Unicode code point); the
# trailing comment is the Unicode character name.
#   add() records the forward and the reverse mapping and dies when the
#         Unicode value has already been registered.
#   dup() records the forward mapping only and dies unless the Unicode
#         value already has a reverse mapping.
79 &add(0xA1,0x40,0x3000); # IDEOGRAPHIC SPACE
80 &add(0xA1,0x41,0xFF0C); # FULLWIDTH COMMA
81 &add(0xA1,0x42,0x3001); # IDEOGRAPHIC COMMA
82 &add(0xA1,0x43,0x3002); # IDEOGRAPHIC FULL STOP
83 &add(0xA1,0x44,0xFF0E); # FULLWIDTH FULL STOP
84 &add(0xA1,0x45,0x2027); # HYPHENATION POINT
85 &add(0xA1,0x46,0xFF1B); # FULLWIDTH SEMICOLON
86 &add(0xA1,0x47,0xFF1A); # FULLWIDTH COLON
87 &add(0xA1,0x48,0xFF1F); # FULLWIDTH QUESTION MARK
88 &add(0xA1,0x49,0xFF01); # FULLWIDTH EXCLAMATION MARK
89 &add(0xA1,0x4A,0xFE30); # PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
90 &add(0xA1,0x4B,0x2026); # HORIZONTAL ELLIPSIS
91 &add(0xA1,0x4C,0x2025); # TWO DOT LEADER
92 &add(0xA1,0x4D,0xFE50); # SMALL COMMA
93 &add(0xA1,0x4E,0xFE51); # SMALL IDEOGRAPHIC COMMA
94 &add(0xA1,0x4F,0xFE52); # SMALL FULL STOP
95 &add(0xA1,0x50,0x00B7); # MIDDLE DOT
96 &add(0xA1,0x51,0xFE54); # SMALL SEMICOLON
97 &add(0xA1,0x52,0xFE55); # SMALL COLON
98 &add(0xA1,0x53,0xFE56); # SMALL QUESTION MARK
99 &add(0xA1,0x54,0xFE57); # SMALL EXCLAMATION MARK
100 &add(0xA1,0x55,0xFF5C); # FULLWIDTH VERTICAL LINE
101 &add(0xA1,0x56,0x2013); # EN DASH
102 &add(0xA1,0x57,0xFE31); # PRESENTATION FORM FOR VERTICAL EM DASH
103 &add(0xA1,0x58,0x2014); # EM DASH
104 &add(0xA1,0x59,0xFE33); # PRESENTATION FORM FOR VERTICAL LOW LINE
105 &add(0xA1,0x5A,0x2574); # BOX DRAWINGS LIGHT LEFT
106 &add(0xA1,0x5B,0xFE34); # PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
107 &add(0xA1,0x5C,0xFE4F); # WAVY LOW LINE
108 &add(0xA1,0x5D,0xFF08); # FULLWIDTH LEFT PARENTHESIS
109 &add(0xA1,0x5E,0xFF09); # FULLWIDTH RIGHT PARENTHESIS
110 &add(0xA1,0x5F,0xFE35); # PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
111 &add(0xA1,0x60,0xFE36); # PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
112 &add(0xA1,0x61,0xFF5B); # FULLWIDTH LEFT CURLY BRACKET
113 &add(0xA1,0x62,0xFF5D); # FULLWIDTH RIGHT CURLY BRACKET
114 &add(0xA1,0x63,0xFE37); # PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
115 &add(0xA1,0x64,0xFE38); # PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
116 &add(0xA1,0x65,0x3014); # LEFT TORTOISE SHELL BRACKET
117 &add(0xA1,0x66,0x3015); # RIGHT TORTOISE SHELL BRACKET
118 &add(0xA1,0x67,0xFE39); # PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET
119 &add(0xA1,0x68,0xFE3A); # PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET
120 &add(0xA1,0x69,0x3010); # LEFT BLACK LENTICULAR BRACKET
121 &add(0xA1,0x6A,0x3011); # RIGHT BLACK LENTICULAR BRACKET
122 &add(0xA1,0x6B,0xFE3B); # PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET
123 &add(0xA1,0x6C,0xFE3C); # PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET
124 &add(0xA1,0x6D,0x300A); # LEFT DOUBLE ANGLE BRACKET
125 &add(0xA1,0x6E,0x300B); # RIGHT DOUBLE ANGLE BRACKET
126 &add(0xA1,0x6F,0xFE3D); # PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET
127 &add(0xA1,0x70,0xFE3E); # PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET
128 &add(0xA1,0x71,0x3008); # LEFT ANGLE BRACKET
129 &add(0xA1,0x72,0x3009); # RIGHT ANGLE BRACKET
130 &add(0xA1,0x73,0xFE3F); # PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
131 &add(0xA1,0x74,0xFE40); # PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET
132 &add(0xA1,0x75,0x300C); # LEFT CORNER BRACKET
133 &add(0xA1,0x76,0x300D); # RIGHT CORNER BRACKET
134 &add(0xA1,0x77,0xFE41); # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
135 &add(0xA1,0x78,0xFE42); # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
136 &add(0xA1,0x79,0x300E); # LEFT WHITE CORNER BRACKET
137 &add(0xA1,0x7A,0x300F); # RIGHT WHITE CORNER BRACKET
138 &add(0xA1,0x7B,0xFE43); # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
139 &add(0xA1,0x7C,0xFE44); # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
140 &add(0xA1,0x7D,0xFE59); # SMALL LEFT PARENTHESIS
141 &add(0xA1,0x7E,0xFE5A); # SMALL RIGHT PARENTHESIS
142 &add(0xA1,0xA1,0xFE5B); # SMALL LEFT CURLY BRACKET
143 &add(0xA1,0xA2,0xFE5C); # SMALL RIGHT CURLY BRACKET
144 &add(0xA1,0xA3,0xFE5D); # SMALL LEFT TORTOISE SHELL BRACKET
145 &add(0xA1,0xA4,0xFE5E); # SMALL RIGHT TORTOISE SHELL BRACKET
146 &add(0xA1,0xA5,0x2018); # LEFT SINGLE QUOTATION MARK
147 &add(0xA1,0xA6,0x2019); # RIGHT SINGLE QUOTATION MARK
148 &add(0xA1,0xA7,0x201C); # LEFT DOUBLE QUOTATION MARK
149 &add(0xA1,0xA8,0x201D); # RIGHT DOUBLE QUOTATION MARK
150 &add(0xA1,0xA9,0x301D); # REVERSED DOUBLE PRIME QUOTATION MARK
151 &add(0xA1,0xAA,0x301E); # DOUBLE PRIME QUOTATION MARK
152 &add(0xA1,0xAB,0x2035); # REVERSED PRIME
153 &add(0xA1,0xAC,0x2032); # PRIME
154 &add(0xA1,0xAD,0xFF03); # FULLWIDTH NUMBER SIGN
155 &add(0xA1,0xAE,0xFF06); # FULLWIDTH AMPERSAND
156 &add(0xA1,0xAF,0xFF0A); # FULLWIDTH ASTERISK
157 &add(0xA1,0xB0,0x203B); # REFERENCE MARK
158 &add(0xA1,0xB1,0x00A7); # SECTION SIGN
159 &add(0xA1,0xB2,0x3003); # DITTO MARK
160 &add(0xA1,0xB3,0x25CB); # WHITE CIRCLE
161 &add(0xA1,0xB4,0x25CF); # BLACK CIRCLE
162 &add(0xA1,0xB5,0x25B3); # WHITE UP-POINTING TRIANGLE
163 &add(0xA1,0xB6,0x25B2); # BLACK UP-POINTING TRIANGLE
164 &add(0xA1,0xB7,0x25CE); # BULLSEYE
165 &add(0xA1,0xB8,0x2606); # WHITE STAR
166 &add(0xA1,0xB9,0x2605); # BLACK STAR
167 &add(0xA1,0xBA,0x25C7); # WHITE DIAMOND
168 &add(0xA1,0xBB,0x25C6); # BLACK DIAMOND
169 &add(0xA1,0xBC,0x25A1); # WHITE SQUARE
170 &add(0xA1,0xBD,0x25A0); # BLACK SQUARE
171 &add(0xA1,0xBE,0x25BD); # WHITE DOWN-POINTING TRIANGLE
172 &add(0xA1,0xBF,0x25BC); # BLACK DOWN-POINTING TRIANGLE
173 &add(0xA1,0xC0,0x32A3); # CIRCLED IDEOGRAPH CORRECT
174 &add(0xA1,0xC1,0x2105); # CARE OF
175 &add(0xA1,0xC2,0x00AF); # MACRON
176 &add(0xA1,0xC3,0xFFE3); # FULLWIDTH MACRON
177 &add(0xA1,0xC4,0xFF3F); # FULLWIDTH LOW LINE
178 &add(0xA1,0xC5,0x02CD); # MODIFIER LETTER LOW MACRON
179 &add(0xA1,0xC6,0xFE49); # DASHED OVERLINE
180 &add(0xA1,0xC7,0xFE4A); # CENTRELINE OVERLINE
181 &add(0xA1,0xC8,0xFE4D); # DASHED LOW LINE
182 &add(0xA1,0xC9,0xFE4E); # CENTRELINE LOW LINE
183 &add(0xA1,0xCA,0xFE4B); # WAVY OVERLINE
184 &add(0xA1,0xCB,0xFE4C); # DOUBLE WAVY OVERLINE
185 &add(0xA1,0xCC,0xFE5F); # SMALL NUMBER SIGN
186 &add(0xA1,0xCD,0xFE60); # SMALL AMPERSAND
187 &add(0xA1,0xCE,0xFE61); # SMALL ASTERISK
188 &add(0xA1,0xCF,0xFF0B); # FULLWIDTH PLUS SIGN
189 &add(0xA1,0xD0,0xFF0D); # FULLWIDTH HYPHEN-MINUS
190 &add(0xA1,0xD1,0x00D7); # MULTIPLICATION SIGN
191 &add(0xA1,0xD2,0x00F7); # DIVISION SIGN
192 &add(0xA1,0xD3,0x00B1); # PLUS-MINUS SIGN
193 &add(0xA1,0xD4,0x221A); # SQUARE ROOT
194 &add(0xA1,0xD5,0xFF1C); # FULLWIDTH LESS-THAN SIGN
195 &add(0xA1,0xD6,0xFF1E); # FULLWIDTH GREATER-THAN SIGN
196 &add(0xA1,0xD7,0xFF1D); # FULLWIDTH EQUALS SIGN
197 &add(0xA1,0xD8,0x2266); # LESS-THAN OVER EQUAL TO
198 &add(0xA1,0xD9,0x2267); # GREATER-THAN OVER EQUAL TO
199 &add(0xA1,0xDA,0x2260); # NOT EQUAL TO
200 &add(0xA1,0xDB,0x221E); # INFINITY
201 &add(0xA1,0xDC,0x2252); # APPROXIMATELY EQUAL TO OR THE IMAGE OF
202 &add(0xA1,0xDD,0x2261); # IDENTICAL TO
203 &add(0xA1,0xDE,0xFE62); # SMALL PLUS SIGN
204 &add(0xA1,0xDF,0xFE63); # SMALL HYPHEN-MINUS
205 &add(0xA1,0xE0,0xFE64); # SMALL LESS-THAN SIGN
206 &add(0xA1,0xE1,0xFE65); # SMALL GREATER-THAN SIGN
207 &add(0xA1,0xE2,0xFE66); # SMALL EQUALS SIGN
208 &add(0xA1,0xE3,0xFF5E); # FULLWIDTH TILDE
209 &add(0xA1,0xE4,0x2229); # INTERSECTION
210 &add(0xA1,0xE5,0x222A); # UNION
211 &add(0xA1,0xE6,0x22A5); # UP TACK
212 &add(0xA1,0xE7,0x2220); # ANGLE
213 &add(0xA1,0xE8,0x221F); # RIGHT ANGLE
214 &add(0xA1,0xE9,0x22BF); # RIGHT TRIANGLE
215 &add(0xA1,0xEA,0x33D2); # SQUARE LOG
216 &add(0xA1,0xEB,0x33D1); # SQUARE LN
217 &add(0xA1,0xEC,0x222B); # INTEGRAL
218 &add(0xA1,0xED,0x222E); # CONTOUR INTEGRAL
219 &add(0xA1,0xEE,0x2235); # BECAUSE
220 &add(0xA1,0xEF,0x2234); # THEREFORE
221 &add(0xA1,0xF0,0x2640); # FEMALE SIGN
222 &add(0xA1,0xF1,0x2642); # MALE SIGN
223 &add(0xA1,0xF2,0x2295); # CIRCLED PLUS
224 &add(0xA1,0xF3,0x2299); # CIRCLED DOT OPERATOR
225 &add(0xA1,0xF4,0x2191); # UPWARDS ARROW
226 &add(0xA1,0xF5,0x2193); # DOWNWARDS ARROW
227 &add(0xA1,0xF6,0x2190); # LEFTWARDS ARROW
228 &add(0xA1,0xF7,0x2192); # RIGHTWARDS ARROW
229 &add(0xA1,0xF8,0x2196); # NORTH WEST ARROW
230 &add(0xA1,0xF9,0x2197); # NORTH EAST ARROW
231 &add(0xA1,0xFA,0x2199); # SOUTH WEST ARROW
232 &add(0xA1,0xFB,0x2198); # SOUTH EAST ARROW
233 &add(0xA1,0xFC,0x2225); # PARALLEL TO
234 &add(0xA1,0xFD,0x2223); # DIVIDES
235 &add(0xA1,0xFE,0xFF0F); # FULLWIDTH SOLIDUS
236 &add(0xA2,0x40,0xFF3C); # FULLWIDTH REVERSE SOLIDUS
237 &add(0xA2,0x41,0x2215); # DIVISION SLASH
238 &add(0xA2,0x42,0xFE68); # SMALL REVERSE SOLIDUS
239 &add(0xA2,0x43,0xFF04); # FULLWIDTH DOLLAR SIGN
240 &add(0xA2,0x44,0xFFE5); # FULLWIDTH YEN SIGN
241 &add(0xA2,0x45,0x3012); # POSTAL MARK
242 &add(0xA2,0x46,0xFFE0); # FULLWIDTH CENT SIGN
243 &add(0xA2,0x47,0xFFE1); # FULLWIDTH POUND SIGN
244 &add(0xA2,0x48,0xFF05); # FULLWIDTH PERCENT SIGN
245 &add(0xA2,0x49,0xFF20); # FULLWIDTH COMMERCIAL AT
246 &add(0xA2,0x4A,0x2103); # DEGREE CELSIUS
247 &add(0xA2,0x4B,0x2109); # DEGREE FAHRENHEIT
248 &add(0xA2,0x4C,0xFE69); # SMALL DOLLAR SIGN
249 &add(0xA2,0x4D,0xFE6A); # SMALL PERCENT SIGN
250 &add(0xA2,0x4E,0xFE6B); # SMALL COMMERCIAL AT
251 &add(0xA2,0x4F,0x33D5); # SQUARE MIL
252 &add(0xA2,0x50,0x339C); # SQUARE MM
253 &add(0xA2,0x51,0x339D); # SQUARE CM
254 &add(0xA2,0x52,0x339E); # SQUARE KM
255 &add(0xA2,0x53,0x33CE); # SQUARE KM CAPITAL
256 &add(0xA2,0x54,0x33A1); # SQUARE M SQUARED
257 &add(0xA2,0x55,0x338E); # SQUARE MG
258 &add(0xA2,0x56,0x338F); # SQUARE KG
259 &add(0xA2,0x57,0x33C4); # SQUARE CC
260 &add(0xA2,0x58,0x00B0); # DEGREE SIGN
261 # 0xA259-0xA261: <CJK>
262 &add(0xA2,0x62,0x2581); # LOWER ONE EIGHTH BLOCK
263 &add(0xA2,0x63,0x2582); # LOWER ONE QUARTER BLOCK
264 &add(0xA2,0x64,0x2583); # LOWER THREE EIGHTHS BLOCK
265 &add(0xA2,0x65,0x2584); # LOWER HALF BLOCK
266 &add(0xA2,0x66,0x2585); # LOWER FIVE EIGHTHS BLOCK
267 &add(0xA2,0x67,0x2586); # LOWER THREE QUARTERS BLOCK
268 &add(0xA2,0x68,0x2587); # LOWER SEVEN EIGHTHS BLOCK
269 &add(0xA2,0x69,0x2588); # FULL BLOCK
270 &add(0xA2,0x6A,0x258F); # LEFT ONE EIGHTH BLOCK
271 &add(0xA2,0x6B,0x258E); # LEFT ONE QUARTER BLOCK
272 &add(0xA2,0x6C,0x258D); # LEFT THREE EIGHTHS BLOCK
273 &add(0xA2,0x6D,0x258C); # LEFT HALF BLOCK
274 &add(0xA2,0x6E,0x258B); # LEFT FIVE EIGHTHS BLOCK
275 &add(0xA2,0x6F,0x258A); # LEFT THREE QUARTERS BLOCK
276 &add(0xA2,0x70,0x2589); # LEFT SEVEN EIGHTHS BLOCK
277 &add(0xA2,0x71,0x253C); # BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
278 &add(0xA2,0x72,0x2534); # BOX DRAWINGS LIGHT UP AND HORIZONTAL
279 &add(0xA2,0x73,0x252C); # BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
280 &add(0xA2,0x74,0x2524); # BOX DRAWINGS LIGHT VERTICAL AND LEFT
281 &add(0xA2,0x75,0x251C); # BOX DRAWINGS LIGHT VERTICAL AND RIGHT
282 &add(0xA2,0x76,0x2594); # UPPER ONE EIGHTH BLOCK
283 &add(0xA2,0x77,0x2500); # BOX DRAWINGS LIGHT HORIZONTAL
284 &add(0xA2,0x78,0x2502); # BOX DRAWINGS LIGHT VERTICAL
285 &add(0xA2,0x79,0x2595); # RIGHT ONE EIGHTH BLOCK
286 &add(0xA2,0x7A,0x250C); # BOX DRAWINGS LIGHT DOWN AND RIGHT
287 &add(0xA2,0x7B,0x2510); # BOX DRAWINGS LIGHT DOWN AND LEFT
288 &add(0xA2,0x7C,0x2514); # BOX DRAWINGS LIGHT UP AND RIGHT
289 &add(0xA2,0x7D,0x2518); # BOX DRAWINGS LIGHT UP AND LEFT
290
291 # 0xA27E-0xA2A7:
292 # Duplicated maps with HKSCS:2001 0xF9E9-0xF9EB,0xF9F9-0xF9FD
# dup() is used for these eight code points: it only succeeds when the
# Unicode value was registered earlier, and it leaves that earlier reverse
# mapping in place.
293 &dup(0xA2,0x7E,0x256D); # BOX DRAWINGS LIGHT ARC DOWN AND RIGHT
294 &dup(0xA2,0xA1,0x256E); # BOX DRAWINGS LIGHT ARC DOWN AND LEFT
295 &dup(0xA2,0xA2,0x2570); # BOX DRAWINGS LIGHT ARC UP AND RIGHT
296 &dup(0xA2,0xA3,0x256F); # BOX DRAWINGS LIGHT ARC UP AND LEFT
297 &dup(0xA2,0xA4,0x2550); # BOX DRAWINGS DOUBLE HORIZONTAL
298 &dup(0xA2,0xA5,0x255E); # BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE
299 &dup(0xA2,0xA6,0x256A); # BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE
300 &dup(0xA2,0xA7,0x2561); # BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE
301
302 &add(0xA2,0xA8,0x25E2); # BLACK LOWER RIGHT TRIANGLE
303 &add(0xA2,0xA9,0x25E3); # BLACK LOWER LEFT TRIANGLE
304 &add(0xA2,0xAA,0x25E5); # BLACK UPPER RIGHT TRIANGLE
305 &add(0xA2,0xAB,0x25E4); # BLACK UPPER LEFT TRIANGLE
306 &add(0xA2,0xAC,0x2571); # BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT
307 &add(0xA2,0xAD,0x2572); # BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT
308 &add(0xA2,0xAE,0x2573); # BOX DRAWINGS LIGHT DIAGONAL CROSS
# Runs of consecutive Big5 trail bytes that map onto consecutive Unicode
# code points.  Each entry is
#   [ lead byte, first trail byte, last trail byte, first Unicode value ]
# and is expanded into one add() call per trail byte, in table order.
#
# 0xA2CC-0xA2CE:
# unified to CJK by original Big5 (and causes duplicated mapping) but they
# might be mapped to same block as preceding code points (HANGZHOU NUMERAL).
foreach my $run (
    [0xA2, 0xAF, 0xB8, 0xFF10],    # FULLWIDTH DIGIT ZERO..NINE
    [0xA2, 0xB9, 0xC2, 0x2160],    # ROMAN NUMERAL ONE..TEN
    [0xA2, 0xC3, 0xCB, 0x3021],    # HANGZHOU NUMERAL ONE..NINE
    [0xA2, 0xCC, 0xCE, 0x3038],    # HANGZHOU NUMERAL TEN, TWENTY, THIRTY
    [0xA2, 0xCF, 0xE8, 0xFF21],    # FULLWIDTH LATIN CAPITAL LETTER A..Z
    [0xA2, 0xE9, 0xFE, 0xFF41],    # FULLWIDTH LATIN SMALL LETTER A..V
    [0xA3, 0x40, 0x43, 0xFF57],    # FULLWIDTH LATIN SMALL LETTER W..Z
) {
    my ($lead, $first, $last, $base) = @$run;

    foreach my $trail ($first .. $last) {
        &add($lead, $trail, $base + $trail - $first);
    }
}
# Hand-written mappings for Big5 lead byte 0xA3: Greek letters, Bopomofo
# letters, tone marks and the Euro sign.  Arguments are (lead byte, trail
# byte, Unicode code point); the trailing comment is the Unicode name.
338 &add(0xA3,0x44,0x0391); # GREEK CAPITAL LETTER ALPHA
339 &add(0xA3,0x45,0x0392); # GREEK CAPITAL LETTER BETA
340 &add(0xA3,0x46,0x0393); # GREEK CAPITAL LETTER GAMMA
341 &add(0xA3,0x47,0x0394); # GREEK CAPITAL LETTER DELTA
342 &add(0xA3,0x48,0x0395); # GREEK CAPITAL LETTER EPSILON
343 &add(0xA3,0x49,0x0396); # GREEK CAPITAL LETTER ZETA
344 &add(0xA3,0x4A,0x0397); # GREEK CAPITAL LETTER ETA
345 &add(0xA3,0x4B,0x0398); # GREEK CAPITAL LETTER THETA
346 &add(0xA3,0x4C,0x0399); # GREEK CAPITAL LETTER IOTA
347 &add(0xA3,0x4D,0x039A); # GREEK CAPITAL LETTER KAPPA
348 &add(0xA3,0x4E,0x039B); # GREEK CAPITAL LETTER LAMDA
349 &add(0xA3,0x4F,0x039C); # GREEK CAPITAL LETTER MU
350 &add(0xA3,0x50,0x039D); # GREEK CAPITAL LETTER NU
351 &add(0xA3,0x51,0x039E); # GREEK CAPITAL LETTER XI
352 &add(0xA3,0x52,0x039F); # GREEK CAPITAL LETTER OMICRON
353 &add(0xA3,0x53,0x03A0); # GREEK CAPITAL LETTER PI
354 &add(0xA3,0x54,0x03A1); # GREEK CAPITAL LETTER RHO
355 &add(0xA3,0x55,0x03A3); # GREEK CAPITAL LETTER SIGMA
356 &add(0xA3,0x56,0x03A4); # GREEK CAPITAL LETTER TAU
357 &add(0xA3,0x57,0x03A5); # GREEK CAPITAL LETTER UPSILON
358 &add(0xA3,0x58,0x03A6); # GREEK CAPITAL LETTER PHI
359 &add(0xA3,0x59,0x03A7); # GREEK CAPITAL LETTER CHI
360 &add(0xA3,0x5A,0x03A8); # GREEK CAPITAL LETTER PSI
361 &add(0xA3,0x5B,0x03A9); # GREEK CAPITAL LETTER OMEGA
362 &add(0xA3,0x5C,0x03B1); # GREEK SMALL LETTER ALPHA
363 &add(0xA3,0x5D,0x03B2); # GREEK SMALL LETTER BETA
364 &add(0xA3,0x5E,0x03B3); # GREEK SMALL LETTER GAMMA
365 &add(0xA3,0x5F,0x03B4); # GREEK SMALL LETTER DELTA
366 &add(0xA3,0x60,0x03B5); # GREEK SMALL LETTER EPSILON
367 &add(0xA3,0x61,0x03B6); # GREEK SMALL LETTER ZETA
368 &add(0xA3,0x62,0x03B7); # GREEK SMALL LETTER ETA
369 &add(0xA3,0x63,0x03B8); # GREEK SMALL LETTER THETA
370 &add(0xA3,0x64,0x03B9); # GREEK SMALL LETTER IOTA
371 &add(0xA3,0x65,0x03BA); # GREEK SMALL LETTER KAPPA
372 &add(0xA3,0x66,0x03BB); # GREEK SMALL LETTER LAMDA
373 &add(0xA3,0x67,0x03BC); # GREEK SMALL LETTER MU
374 &add(0xA3,0x68,0x03BD); # GREEK SMALL LETTER NU
375 &add(0xA3,0x69,0x03BE); # GREEK SMALL LETTER XI
376 &add(0xA3,0x6A,0x03BF); # GREEK SMALL LETTER OMICRON
377 &add(0xA3,0x6B,0x03C0); # GREEK SMALL LETTER PI
378 &add(0xA3,0x6C,0x03C1); # GREEK SMALL LETTER RHO
379 &add(0xA3,0x6D,0x03C3); # GREEK SMALL LETTER SIGMA
380 &add(0xA3,0x6E,0x03C4); # GREEK SMALL LETTER TAU
381 &add(0xA3,0x6F,0x03C5); # GREEK SMALL LETTER UPSILON
382 &add(0xA3,0x70,0x03C6); # GREEK SMALL LETTER PHI
383 &add(0xA3,0x71,0x03C7); # GREEK SMALL LETTER CHI
384 &add(0xA3,0x72,0x03C8); # GREEK SMALL LETTER PSI
385 &add(0xA3,0x73,0x03C9); # GREEK SMALL LETTER OMEGA
386 # BOPOMOFO LETTER
# The first loop maps trail bytes 0x74-0x7E to U+3105-U+310F.
387 foreach ((0x74..0x7E)) {
388 &add(0xA3,$_,0x3105+$_-0x74);
389 }
# Trail bytes 0x7F-0xA0 are rejected by add(), so the run continues at
# 0xA1: 0xA1-0xBA map to U+3110-U+3129.
390 foreach ((0xA1..0xBA)) {
391 &add(0xA3,$_,0x3110+$_-0xA1);
392 }
393 &add(0xA3,0xBB,0x02D9); # DOT ABOVE
394 &add(0xA3,0xBC,0x02C9); # MODIFIER LETTER MACRON
395 &add(0xA3,0xBD,0x02CA); # MODIFIER LETTER ACUTE ACCENT
396 &add(0xA3,0xBE,0x02C7); # CARON
397 &add(0xA3,0xBF,0x02CB); # MODIFIER LETTER GRAVE ACCENT
398
399 # 0xA3E1:
400 # found in some vendor codepages for Big5 (e.g. CP950).
401 &add(0xA3,0xE1,0x20AC); # EURO SIGN
402
403 # 0xC6BF-0xC6D7:
404 # Some implementations for Big5 (non-HKSCS) remove these code points
405 # and Big5-HKSCS:2001 assigns other CJK characters.
406 # So this range will not be mapped for Big5-ETen.
407
408 # 0xC8A5-0xC8CC: ETen input codes; not mapped.
409
410
411 # Following characters are not included in the HKSCS. However,
412 # the code points are reserved as compatibility points for backward
413 # compatibility.
414 # cf. http://www.info.gov.hk/digital21/chi/hkscs/download/big5cmp.txt
# Each call maps a compatibility code point (first two arguments: lead
# byte, trail byte) to whatever Unicode value is currently stored in
# %fwd for the code point named by the $fwd{lead}{trail} lookup.  The
# final argument 1 selects the HKSCS lead byte range (0x88-0xFE) inside
# dup().  Only the forward mapping is written.
# NOTE(review): a $fwd{..}{..} lookup that was not filled in by the input
# files or the tables above yields undef, and dup() then dies on its
# "unless $unicode" check.
415 &dup(0x8E,0x69,$fwd{0xBA}{0xE6},1);
416 &dup(0x8E,0x6F,$fwd{0xED}{0xCA},1);
417 &dup(0x8E,0x7E,$fwd{0xA2}{0x61},1);
418 &dup(0x8E,0xAB,$fwd{0xBA}{0xFC},1);
419 &dup(0x8E,0xB4,$fwd{0xBF}{0xA6},1);
420 &dup(0x8E,0xCD,$fwd{0xAA}{0xCC},1);
421 &dup(0x8E,0xD0,$fwd{0xBF}{0xAE},1);
422 &dup(0x8F,0x57,$fwd{0xB5}{0xD7},1);
423 &dup(0x8F,0x69,$fwd{0xE3}{0xC8},1);
424 &dup(0x8F,0x6E,$fwd{0xDB}{0x79},1);
425 &dup(0x8F,0xCB,$fwd{0xBF}{0xCC},1);
426 &dup(0x8F,0xCC,$fwd{0xA0}{0xD4},1);
427 &dup(0x8F,0xFE,$fwd{0xB0}{0x5F},1);
428 &dup(0x90,0x6D,$fwd{0xB3}{0xA3},1);
429 &dup(0x90,0x7A,$fwd{0xF9}{0xD7},1);
430 &dup(0x90,0xDC,$fwd{0xC0}{0x52},1);
431 &dup(0x90,0xF1,$fwd{0xC5}{0x54},1);
432 &dup(0x91,0xBF,$fwd{0xF1}{0xE3},1);
433 &dup(0x92,0x44,$fwd{0x92}{0x42},1);
434 &dup(0x92,0xAF,$fwd{0xA2}{0x59},1);
435 &dup(0x92,0xB0,$fwd{0xA2}{0x5A},1);
436 &dup(0x92,0xB1,$fwd{0xA2}{0x5C},1);
437 &dup(0x92,0xB2,$fwd{0xA2}{0x5B},1);
438 &dup(0x92,0xC8,$fwd{0xA0}{0x5F},1);
439 &dup(0x92,0xD1,$fwd{0xE6}{0xAB},1);
440 &dup(0x94,0x47,$fwd{0xD2}{0x56},1);
441 &dup(0x94,0xCA,$fwd{0xE6}{0xD0},1);
442 &dup(0x95,0xD9,$fwd{0xCA}{0x52},1);
443 &dup(0x96,0x44,$fwd{0x9C}{0xE4},1);
444 &dup(0x96,0xED,$fwd{0x96}{0xEE},1);
445 &dup(0x96,0xFC,$fwd{0xE9}{0x59},1);
446 &dup(0x9B,0x76,$fwd{0xEF}{0xF9},1);
447 &dup(0x9B,0x78,$fwd{0xC5}{0xF7},1);
448 &dup(0x9B,0x7B,$fwd{0xF5}{0xE8},1);
449 &dup(0x9B,0xC6,$fwd{0xE8}{0xCD},1);
450 &dup(0x9B,0xDE,$fwd{0xD0}{0xC0},1);
451 &dup(0x9B,0xEC,$fwd{0xFD}{0x64},1);
452 &dup(0x9B,0xF6,$fwd{0xBF}{0x47},1);
453 &dup(0x9C,0x42,$fwd{0xEB}{0xC9},1);
454 &dup(0x9C,0x53,$fwd{0xCD}{0xE7},1);
455 &dup(0x9C,0x62,$fwd{0xC0}{0xE7},1);
456 &dup(0x9C,0x68,$fwd{0xDC}{0x52},1);
457 &dup(0x9C,0x6B,$fwd{0xF8}{0x6D},1);
458 &dup(0x9C,0x77,$fwd{0xDB}{0x5D},1);
459 &dup(0x9C,0xBC,$fwd{0xC9}{0x5C},1);
460 &dup(0x9C,0xBD,$fwd{0xAF}{0xB0},1);
461 &dup(0x9C,0xD0,$fwd{0xD4}{0xD1},1);
462 &dup(0x9D,0x57,$fwd{0xE0}{0x7C},1);
463 &dup(0x9D,0x5A,$fwd{0xB5}{0xAE},1);
464 &dup(0x9D,0xC4,$fwd{0xA9}{0xE4},1);
465 &dup(0x9E,0xA9,$fwd{0xAB}{0xEC},1);
466 &dup(0x9E,0xEF,$fwd{0xDE}{0xCD},1);
467 &dup(0x9E,0xFD,$fwd{0xC9}{0xFC},1);
468 &dup(0x9F,0x60,$fwd{0xF9}{0xC4},1);
469 &dup(0x9F,0x66,$fwd{0x91}{0xBE},1);
470 &dup(0x9F,0xCB,$fwd{0xB9}{0xB0},1);
471 &dup(0x9F,0xD8,$fwd{0x93}{0x61},1);
472 &dup(0xA0,0x63,$fwd{0x8F}{0xB6},1);
473 &dup(0xA0,0x77,$fwd{0xA9}{0xF0},1);
474 &dup(0xA0,0xD5,$fwd{0x94}{0x7A},1);
475 &dup(0xA0,0xDF,$fwd{0xDE}{0x72},1);
476 &dup(0xA0,0xE4,$fwd{0x94}{0x55},1);
477 &dup(0xFA,0x5F,$fwd{0xAD}{0xC5},1);
478 &dup(0xFA,0x66,$fwd{0xB0}{0xB0},1);
479 &dup(0xFA,0xBD,$fwd{0xA5}{0x5D},1);
480 &dup(0xFA,0xC5,$fwd{0xA2}{0xCD},1);
481 &dup(0xFA,0xD5,$fwd{0xAD}{0xEB},1);
482 &dup(0xFB,0x48,$fwd{0x9D}{0xEF},1);
483 &dup(0xFB,0xB8,$fwd{0xB4}{0x40},1);
484 &dup(0xFB,0xF3,$fwd{0xC9}{0xDB},1);
485 &dup(0xFB,0xF9,$fwd{0x9D}{0xFB},1);
486 &dup(0xFC,0x4F,$fwd{0xD8}{0xF4},1);
487 &dup(0xFC,0x6C,$fwd{0xA0}{0xDC},1);
488 &dup(0xFC,0xB9,$fwd{0xBC}{0xB5},1);
489 &dup(0xFC,0xE2,$fwd{0xB4}{0xB8},1);
490 &dup(0xFC,0xF1,$fwd{0xA7}{0xFB},1);
491 &dup(0xFD,0xB7,$fwd{0xCB}{0x58},1);
492 &dup(0xFD,0xB8,$fwd{0xB4}{0xFC},1);
493 &dup(0xFD,0xBB,$fwd{0xB4}{0xE4},1);
494 &dup(0xFD,0xF1,$fwd{0xB5}{0x4E},1);
495 &dup(0xFE,0x52,$fwd{0x99}{0x75},1);
496 &dup(0xFE,0x6F,$fwd{0xB7}{0xEC},1);
497 &dup(0xFE,0xAA,$fwd{0xA2}{0x60},1);
498 &dup(0xFE,0xDD,$fwd{0xCF}{0xF1},1);
499
500
# add(CODEH, CODEL, UNICODE [, ISHKSCS])
#
# Register a one-to-one mapping between the Big5 code point with lead byte
# CODEH and trail byte CODEL and the Unicode value UNICODE.
#
#   %fwd{CODEH}{CODEL}  receives UNICODE (forward table),
#   @rev[UNICODE % $revhash] receives the string "UNICODE CODE"
#                       (reverse hash bucket),
#   %revmap{UNICODE}    receives CODE (used for duplicate detection).
#
# ISHKSCS, when true, widens the accepted lead byte range from 0xA1-0xF9
# to 0x88-0xFE.  The trail byte must be in 0x40-0x7F or 0xA1-0xFE.
#
# Dies when a byte is out of range, when UNICODE is zero or undefined,
# when UNICODE already has a reverse mapping (use dup() for that case),
# or when the Big5 code point already has a forward mapping.
sub add {
    # Lexical copies: the arguments must not leak into package globals.
    my ($codeh,$codel,$unicode,$ishkscs) = @_;

    my $code = $codeh*256+$codel;

    if ($ishkscs) {
        die sprintf("0x%04X: lead byte out of range for HKSCS\n",$code)
            if $codeh < 0x88 || $codeh > 0xFE;
    } else {
        die sprintf("0x%04X: lead byte out of range\n",$code)
            if $codeh < 161 || $codeh > 249;
    }
    die sprintf("0x%04X: trail byte out of range\n",$code)
        if $codel < 64 || ($codel >= 128 && $codel < 161) || $codel >= 255;

    die sprintf("0x%04X: no Unicode value given\n",$code) unless $unicode;

    die sprintf("0x%04X->U+%04X is duplicated with 0x%04X. use dup()\n",$code,$unicode,$revmap{$unicode})
        if defined $revmap{$unicode};

    die sprintf("0x%04X->U+%04X is already mapped to U+%04X\n",$code,$unicode,$fwd{$codeh}{$codel})
        if defined $fwd{$codeh} && defined $fwd{$codeh}{$codel};

    # The inner hash and the bucket array are created on first use.
    $fwd{$codeh}{$codel}=$unicode;

    push @{ $rev[int($unicode % $revhash)] }, "$unicode $code";

    $revmap{$unicode} = $code;
}
544
# dup(CODEH, CODEL, UNICODE [, ISHKSCS])
#
# Register an additional Big5 code point (lead byte CODEH, trail byte
# CODEL) for a Unicode value that already has a reverse mapping.  Only
# the forward table %fwd is updated; @rev and %revmap are left alone, so
# the reverse conversion keeps returning the code point registered first.
#
# ISHKSCS, when true, widens the accepted lead byte range from 0xA1-0xF9
# to 0x88-0xFE.  The trail byte must be in 0x40-0x7F or 0xA1-0xFE.
#
# Dies when a byte is out of range, when UNICODE is zero or undefined, or
# when UNICODE has no reverse mapping yet (use add() for that case).
sub dup {
    # Lexical copies: the arguments must not leak into package globals.
    my ($codeh,$codel,$unicode,$ishkscs) = @_;

    my $code = $codeh*256+$codel;

    if ($ishkscs) {
        die sprintf("0x%04X: lead byte out of range for HKSCS\n",$code)
            if $codeh < 0x88 || $codeh > 0xFE;
    } else {
        die sprintf("0x%04X: lead byte out of range\n",$code)
            if $codeh < 161 || $codeh > 249;
    }
    die sprintf("0x%04X: trail byte out of range\n",$code)
        if $codel < 64 || ($codel >= 128 && $codel < 161) || $codel >= 255;

    # Callers often pass a $fwd{..}{..} lookup here; an unmapped source
    # code point arrives as undef.
    die sprintf("0x%04X: no Unicode value given (is the source code point mapped?)\n",$code)
        unless $unicode;

    die sprintf("0x%04X->U+%04X is not duplicated. use add()\n",$code,$unicode)
        if ! defined $revmap{$unicode};

    $fwd{$codeh}{$codel}=$unicode;
}
572
573
# Add maps for CJK compatibility ideographs of Unicode.
#
# The tables are processed in this order: KS X 1001, IBM32, JIS X 0213.
# %compat_big5 and %compat_cns11643 are deliberately left out:
####&add_cjkcompat(%compat_big5);
####&add_cjkcompat(%compat_cns11643);
foreach my $compat_table (\%compat_ksx1001, \%compat_ibm32, \%compat_jisx0213) {
    &add_cjkcompat(%$compat_table);
}
580
# add_cjkcompat(%compat)
#
# %compat is a flat key/value list.  For every pair whose VALUE already
# has a reverse mapping in %revmap, the KEY is added to the reverse hash
# @rev with the same Big5 code, i.e. the string "KEY CODE" is appended to
# bucket KEY % $revhash.  Pairs whose value is not mapped are ignored.
# %fwd and %revmap are not modified.
# NOTE(review): keys are presumably compatibility ideographs and values
# their unified ideographs, both as numbers -- confirm against cjkcompat.pl.
sub add_cjkcompat {
    my (%compat) = @_;

    # Walk the keys in numeric order.  Plain hash order varies from run
    # to run, which would reorder entries inside a bucket and make the
    # generated header differ between otherwise identical builds.
    foreach my $compat_uc (sort { $a <=> $b } keys %compat) {
        my $code = $revmap{$compat{$compat_uc}};
        next unless defined $code;

        push @{ $rev[int($compat_uc % $revhash)] }, "$compat_uc $code";
    }
}
596
597
# Emit the fixed file header of the generated C source.
print '
/*
** Copyright 2000 Double Precision, Inc.
** See COPYING for distribution information.
**
** $Id: big5.pl,v 1.4 2004/02/08 04:59:14 mrsam Exp $
** Non-hanzi and ETen / HKSCS extension support
** by Hatuka*nezumi - IKEDA Soji <nezumi@jca.apc.org>.
*/

#include "unicode.h"
';

# Emit the forward (Big5 -> Unicode) tables: for every lead byte present
# in %fwd, one 64-entry array for trail bytes 0x40-0x7F and one 94-entry
# array for trail bytes 0xA1-0xFE.  Unmapped cells are written as 0.
for my $lead (sort keys %fwd)
{
    # Low half: the line break falls before every 16th trail byte, which
    # includes the very first one (64).
    printf ("static const unicode_char big5_%02x_lo[64]={", $lead);
    for my $trail (64 .. 127)
    {
        print "\n" if ($trail % 16) == 0;
        printf ("%d", $fwd{$lead}{$trail});
        print "," if $trail < 127;
    }
    print "};\n";

    # High half: here the header line itself ends with the line break.
    printf ("static const unicode_char big5_%02x_hi[94]={\n", $lead);
    for my $trail (161 .. 254)
    {
        print "\n" if ($trail % 16) == 0;
        printf ("%d", $fwd{$lead}{$trail});
        print "," if $trail < 254;
    }
    print "};\n";
}
636
# Emit the reverse (Unicode -> Big5) hash tables.
#
# The buckets of @rev are flattened, in bucket order, into two parallel
# arrays: big5_revtable_uc holds the Unicode values and
# big5_revtable_octets the matching Big5 codes.  big5_revtable_index[i]
# is the offset of bucket i's first entry in those arrays.

my @revtable_uc;
my @revtable_octets;

for ($i=0; $i<$revhash; $i++)
{
    $revindex[$i] = scalar(@revtable_uc);

    # A bucket that never received an entry is undefined, not empty.
    my $bucket = $rev[$i];
    next unless defined $bucket;

    foreach my $entry (@$bucket)
    {
        # Entries are stored as "UNICODE CODE".
        my ($uc, $octets) = split(/ /, $entry, 2);

        push @revtable_uc, $uc;
        push @revtable_octets, $octets;
    }
}

# Print a comma-separated list with a line break after every 16 values.
my $print_values = sub {
    my $n = 0;

    foreach my $value (@_)
    {
        print "," if $n > 0;
        print "\n" if $n && ($n % 16) == 0;
        print $value;
        ++$n;
    }
};

print "static const unsigned big5_revhash_size=$revhash;\nstatic const unicode_char big5_revtable_uc[]={\n";
&$print_values(@revtable_uc);

print "};\nstatic const unsigned big5_revtable_octets[]={\n";
&$print_values(@revtable_octets);

print "};\nstatic const unsigned big5_revtable_index[]={\n";
&$print_values(@revindex[0 .. $revhash-1]);

print "};\n";