32-way branching in intmap.scm, not 16-way
[bpt/guile.git] / libguile / unidata_to_charset.pl
1 #!/usr/bin/perl
2 # unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt
3 #
4 # Copyright (C) 2009, 2010 Free Software Foundation, Inc.
5 #
6 # This library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 3 of the License, or (at your option) any later version.
10 #
11 # This library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
15 #
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with this library; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
# $in is the Unicode character database that every charset is computed
# from; $out receives the generated C tables.
open my $in, '<', 'UnicodeData.txt'
    or die "Can't open UnicodeData.txt: $!";
open my $out, '>', 'srfi-14.i.c'
    or die "Can't open srfi-14.i.c: $!";
22
# lower_case: following Java's specification, a character is lowercase
# when all of the following hold:
#   * it lies outside the range [U+2000,U+2FFF];
#   * the Unicode attribute table gives no lowercase mapping for it;
#   * either the table gives it an uppercase mapping, or its name
#     contains the words "SMALL LETTER" or "SMALL LIGATURE".
#
# Returns 1 if the character is lowercase, 0 otherwise.

sub lower_case {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Nothing in [U+2000,U+2FFF] counts as lowercase.
    return 0 if $codepoint >= 0x2000 && $codepoint <= 0x2FFF;

    # A character that has a lowercase mapping is not itself lowercase.
    return 0 if defined($lowercase) && $lowercase ne "";

    # An uppercase mapping is sufficient on its own ...
    return 1 if defined($uppercase) && $uppercase ne "";

    # ... otherwise fall back on the character's name.
    return $name =~ /(SMALL LETTER|SMALL LIGATURE)/ ? 1 : 0;
}
45
# upper_case: following Java's specification, a character is uppercase
# when all of the following hold:
#   * it lies outside the range [U+2000,U+2FFF];
#   * the Unicode attribute table gives no uppercase mapping for it
#     (this is what excludes titlecase characters);
#   * either the table gives it a lowercase mapping, or its name
#     contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
#
# Returns 1 if the character is uppercase, 0 otherwise.

sub upper_case {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    # Nothing in [U+2000,U+2FFF] counts as uppercase.
    return 0 if $codepoint >= 0x2000 && $codepoint <= 0x2FFF;

    # A character that has an uppercase mapping is not itself uppercase.
    return 0 if defined($uppercase) && $uppercase ne "";

    # A lowercase mapping is sufficient on its own ...
    return 1 if defined($lowercase) && $lowercase ne "";

    # ... otherwise fall back on the character's name.
    return $name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/ ? 1 : 0;
}
68
# title_case: a character is titlecase exactly when the character
# attribute database puts it in category Lt.
#
# Returns 1 if the character is titlecase, 0 otherwise.

sub title_case {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return (defined($category) && $category eq "Lt") ? 1 : 0;
}
80
# letter: a letter is any character whose category in the Unicode
# character database is one of the letter categories Lu, Ll, Lt, Lm
# or Lo.
#
# Returns 1 if the character is a letter, 0 otherwise.

sub letter {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    return 0 unless defined($category);

    for my $letter_category ("Lu", "Ll", "Lt", "Lm", "Lo") {
        return 1 if $category eq $letter_category;
    }
    return 0;
}
96
# digit: a character is a digit when the character attribute database
# puts it in category Nd.  Within Latin-1 and ASCII that is just
# 0123456789; Unicode has further digits in other code blocks, such as
# the Gujarati and Tibetan digits.
#
# Returns 1 if the character is a digit, 0 otherwise.

sub digit {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return (defined($category) && $category eq "Nd") ? 1 : 0;
}
110
# hex_digit: the only hex digits are 0123456789abcdefABCDEF, so the
# decision is made on the code point alone.
#
# Returns 1 if the character is a hex digit, 0 otherwise.

sub hex_digit {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    return 1 if $codepoint >= 0x30 && $codepoint <= 0x39;    # 0-9
    return 1 if $codepoint >= 0x41 && $codepoint <= 0x46;    # A-F
    return 1 if $codepoint >= 0x61 && $codepoint <= 0x66;    # a-f
    return 0;
}
123
# letter_plus_digit: the union of char-set:letter and char-set:digit.
# The character's attributes are handed unchanged to letter() and
# digit().
#
# Returns 1 if either predicate accepts the character, 0 otherwise.

sub letter_plus_digit {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    my @attributes = ($codepoint, $name, $category, $uppercase, $lowercase);

    return 1 if letter(@attributes);
    return 1 if digit(@attributes);
    return 0;
}
135
# graphic: characters that would 'use ink' when printed.  The test is
# made on the category code: anything containing L, M, N, P or S
# (the letter, mark, number, punctuation and symbol categories).
#
# Returns 1 if the character is graphic, 0 otherwise.

sub graphic {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return $category =~ (/L|M|N|P|S/) ? 1 : 0;
}
145
# whitespace: a whitespace character is either
#   * a character in one of the space, line or paragraph separator
#     categories (Zs, Zl or Zp) of the Unicode character database, or
#   * one of these control characters:
#       U+0009 Horizontal tabulation (\t control-I)
#       U+000A Line feed             (\n control-J)
#       U+000B Vertical tabulation   (\v control-K)
#       U+000C Form feed             (\f control-L)
#       U+000D Carriage return       (\r control-M)
#
# Returns 1 if the character is whitespace, 0 otherwise.

sub whitespace {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    return 1 if $category =~ (/Zs|Zl|Zp/);

    for my $control (0x9, 0xA, 0xB, 0xC, 0xD) {
        return 1 if $codepoint == $control;
    }
    return 0;
}
168
# printing: a printing character is one that would occupy space when
# printed, i.e. a graphic character or a space character.
# char-set:printing is the union of char-set:whitespace and
# char-set:graphic, so the character's attributes are handed unchanged
# to whitespace() and graphic().
#
# Returns 1 if either predicate accepts the character, 0 otherwise.

sub printing {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    my @attributes = ($codepoint, $name, $category, $uppercase, $lowercase);

    return 1 if whitespace(@attributes);
    return 1 if graphic(@attributes);
    return 0;
}
182
# iso_control: the ISO control characters are the Unicode/Latin-1
# characters in the ranges [U+0000,U+001F] and [U+007F,U+009F].
#
# Returns 1 if the character is an ISO control character, 0 otherwise.

sub iso_control {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    return 1 if $codepoint >= 0x00 && $codepoint <= 0x1F;
    return 1 if $codepoint >= 0x7F && $codepoint <= 0x9F;
    return 0;
}
195
# punctuation: a punctuation character is any character that has one
# of the punctuation categories in the Unicode character database (Pc,
# Pd, Ps, Pe, Pi, Pf, or Po).
#
# Note that SRFI-14 gives conflicting requirements!  It claims that
# only the Unicode punctuation is necessary, but it also explicitly
# calls out the soft hyphen character (U+00AD) as punctuation.  Current
# versions of Unicode consider U+00AD to be a formatting character, not
# punctuation.
#
# Returns 1 if the category code contains "P", 0 otherwise.

sub punctuation {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return $category =~ (/P/) ? 1 : 0;
}
214
# symbol: a symbol is any character that has one of the symbol
# categories in the Unicode character database (Sm, Sc, Sk, or So).
#
# Returns 1 if the category code contains "S", 0 otherwise.

sub symbol {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return $category =~ (/S/) ? 1 : 0;
}
226
# blank: blank characters are horizontal whitespace.  A blank character
# is either
#   * a character with the space separator category (Zs) in the
#     Unicode character database, or
#   * U+0009 Horizontal tabulation (\t control-I).
#
# Returns 1 if the character is blank, 0 otherwise.

sub blank {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    return 1 if $category =~ (/Zs/);
    return $codepoint == 0x9 ? 1 : 0;
}
240
# ascii: every code point up to and including U+007F.
#
# Returns 1 if the character is ASCII, 0 otherwise.

sub ascii {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return $codepoint <= 0x7F ? 1 : 0;
}
250
# empty: the empty charset.  No character belongs to it, so the
# arguments are accepted (for a uniform predicate signature) and
# ignored.
#
# Always returns 0.

sub empty {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;

    return 0;
}
256
# designated: all characters except for the surrogates, which are
# recognised by their category code containing "Cs".
#
# Returns 0 for a surrogate, 1 for anything else.

sub designated {
    my ($codepoint, $name, $category, $uppercase, $lowercase) = @_;
    return $category =~ (/Cs/) ? 0 : 1;
}
266
267
# compute NAME
#
# Generate the two C structures necessary to describe one charset and
# append them to the output file.
#
#   NAME  The name of one of the predicate subs in this file, for
#         example "lower_case".  The predicate is called as
#         NAME(codepoint, name, category, uppercase, lowercase) for the
#         records of UnicodeData.txt and must return 1 for characters
#         that belong to the charset.
#
# The file-level input handle $in is rewound and read in full; the
# structures cs_NAME_ranges and cs_NAME are written to the file-level
# output handle $out.  NAME is also printed to standard output as a
# progress indicator.
#
# Dies if $in cannot be rewound, or if a record that opens a character
# range is not followed by the record that closes it.
sub compute {
    my ($f) = @_;

    # Look the predicate up by name once.  Taking a reference with
    # \&{...} is the one by-name lookup that "use strict" still permits,
    # unlike calling &$f directly.
    my $predicate = \&{$f};

    # Completed ranges, in file order: $rstart[$i] .. $rend[$i].
    my @rstart;
    my @rend;

    # The range currently being grown; -1 means "no range open yet".
    my $start = -1;
    my $end   = -1;

    seek($in, 0, 0) or die "Can't seek to beginning of file: $!";

    print "$f\n";

    # A named lexical is used for the record so that the caller's $_
    # is not overwritten.
    while (my $record = <$in>) {
        # Parse the 14 column, semicolon-delimited UnicodeData.txt
        # file
        chomp $record;

        # A blank line would otherwise be parsed as code point 0.
        next if $record eq "";

        my @fields = split(/;/, $record);

        # The codepoint: an integer
        my $codepoint = hex($fields[0]);

        # If this is a character range, the last character in this
        # range
        my $codepoint_end = $codepoint;

        # The name of the character
        my $name = $fields[1];

        # A two-character category code, such as Ll (lower-case
        # letter)
        my $category = $fields[2];

        # The codepoint of the uppercase version of this char
        my $uppercase = $fields[12];

        # The codepoint of the lowercase version of this char
        my $lowercase = $fields[13];

        my $pass = $predicate->($codepoint, $name, $category,
                                $uppercase, $lowercase);
        next unless $pass == 1;

        # Some pairs of lines in UnicodeData.txt delimit ranges of
        # characters.  The second line of the pair carries the last
        # code point of the range.
        if ($name =~ /First/) {
            my $closing_record = <$in>;

            # Test the read itself: $! is only meaningful right after
            # a failed call, and end-of-file does not set it at all.
            defined($closing_record)
                or die "Range starting at $fields[0] has no closing record";

            $codepoint_end = hex((split(/;/, $closing_record))[0]);
        }

        # Compute ranges of characters [start:end] that meet the
        # criteria.  Store the ranges.
        if ($start == -1) {
            # First matching character: open the first range.
            $start = $codepoint;
            $end   = $codepoint_end;
        } elsif ($end + 1 == $codepoint) {
            # Directly adjacent to the open range: extend it.
            $end = $codepoint_end;
        } else {
            # A gap: store the open range and start a new one.
            push @rstart, $start;
            push @rend,   $end;
            $start = $codepoint;
            $end   = $codepoint_end;
        }
    }

    # The range still open when the input ends has not been stored yet.
    if ($start != -1) {
        push @rstart, $start;
        push @rend,   $end;
    }

    my $len = scalar @rstart;

    # Print the C struct that contains the range list.  Every entry
    # but the last is followed by a comma; an empty charset gets an
    # empty initializer.
    print {$out} "scm_t_char_range cs_" . $f . "_ranges[] = {\n";
    for my $i (0 .. $len - 1) {
        my $separator = $i < $len - 1 ? "," : "";
        printf {$out} " {0x%04x, 0x%04x}%s\n",
            $rstart[$i], $rend[$i], $separator;
    }
    print {$out} "};\n\n";

    # Print the C struct that contains the range list length and
    # pointer to the range list.
    print {$out} "scm_t_char_set cs_${f} = {\n";
    print {$out} " $len,\n";
    print {$out} " cs_" . $f . "_ranges\n";
    print {$out} "};\n\n";
}
366
# Write a bit of a header
print {$out} "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n";
print {$out} "/* This file is #include'd by srfi-14.c. */\n\n";
print {$out} "/* This file was generated from\n";
print {$out} " http://unicode.org/Public/UNIDATA/UnicodeData.txt\n";
print {$out} " with the unidata_to_charset.pl script. */\n\n";

# Write the C structs for each SRFI-14 charset, in this order.  The
# loop variable is a named lexical rather than $_ so that compute()
# can never end up assigning to an alias of one of these constants.
for my $charset (qw(
    lower_case
    upper_case
    title_case
    letter
    digit
    hex_digit
    letter_plus_digit
    graphic
    whitespace
    printing
    iso_control
    punctuation
    symbol
    blank
    ascii
    empty
    designated
)) {
    compute($charset);
}

close $in;

# Output is buffered, so a write error may only be reported when the
# handle is closed; an unchecked close would leave a truncated
# srfi-14.i.c behind without any diagnostic.
close($out) or die "Can't close srfi-14.i.c: $!";

# Hand the generated file to indent(1) for formatting.  exec only
# returns if the program could not be started; that is reported but
# otherwise tolerated, since the unformatted file is still complete.
exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!";
397
398 # And we're done.
399
400
401
402
403
404