Commit | Line | Data |
---|---|---|
d0434ddf MG |
1 | #!/usr/bin/perl |
2 | # unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt | |
3 | # | |
dcc69bab MG |
4 | # Copyright (C) 2009, 2010 Free Software Foundation, Inc. |
5 | # | |
d0434ddf MG |
6 | # This library is free software; you can redistribute it and/or |
7 | # modify it under the terms of the GNU Lesser General Public | |
8 | # License as published by the Free Software Foundation; either | |
9 | # version 3 of the License, or (at your option) any later version. | |
dcc69bab | 10 | # |
d0434ddf MG |
11 | # This library is distributed in the hope that it will be useful, |
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | # Lesser General Public License for more details. | |
dcc69bab | 15 | # |
d0434ddf MG |
16 | # You should have received a copy of the GNU Lesser General Public |
17 | # License along with this library; if not, write to the Free Software | |
18 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 | ||
20 | open(my $in, "<", "UnicodeData.txt") or die "Can't open UnicodeData.txt: $!"; | |
21 | open(my $out, ">", "srfi-14.i.c") or die "Can't open srfi-14.i.c: $!"; | |
22 | ||
23 | # For Unicode, we follow Java's specification: a character is | |
24 | # lowercase if | |
25 | # * it is not in the range [U+2000,U+2FFF], and | |
26 | # * the Unicode attribute table does not give a lowercase mapping | |
27 | # for it, and | |
28 | # * at least one of the following is true: | |
29 | # o the Unicode attribute table gives a mapping to uppercase | |
30 | # for the character, or | |
31 | # o the name for the character in the Unicode attribute table | |
32 | # contains the words "SMALL LETTER" or "SMALL LIGATURE". | |
33 | ||
34 | sub lower_case { | |
35 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
36 | if (($codepoint < 0x2000 || $codepoint > 0x2FFF) | |
37 | && (!defined($lowercase) || $lowercase eq "") | |
38 | && ((defined($uppercase) && $uppercase ne "") | |
39 | || ($name =~ /(SMALL LETTER|SMALL LIGATURE)/))) { | |
40 | return 1; | |
41 | } else { | |
42 | return 0; | |
43 | } | |
44 | } | |
45 | ||
46 | # For Unicode, we follow Java's specification: a character is | |
47 | # uppercase if | |
48 | # * it is not in the range [U+2000,U+2FFF], and | |
49 | # * the Unicode attribute table does not give an uppercase mapping | |
50 | # for it (this excludes titlecase characters), and | |
51 | # * at least one of the following is true: | |
52 | # o the Unicode attribute table gives a mapping to lowercase | |
53 | # for the character, or | |
54 | # o the name for the character in the Unicode attribute table | |
55 | # contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE". | |
56 | ||
57 | sub upper_case { | |
58 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
59 | if (($codepoint < 0x2000 || $codepoint > 0x2FFF) | |
60 | && (!defined($uppercase) || $uppercase eq "") | |
61 | && ((defined($lowercase) && $lowercase ne "") | |
62 | || ($name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/))) { | |
63 | return 1; | |
64 | } else { | |
65 | return 0; | |
66 | } | |
67 | } | |
68 | ||
69 | # A character is titlecase if it has the category Lt in the character | |
70 | # attribute database. | |
71 | ||
72 | sub title_case { | |
73 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
74 | if (defined($category) && $category eq "Lt") { | |
75 | return 1; | |
76 | } else { | |
77 | return 0; | |
78 | } | |
79 | } | |
80 | ||
81 | # A letter is any character with one of the letter categories (Lu, Ll, | |
82 | # Lt, Lm, Lo) in the Unicode character database. | |
83 | ||
84 | sub letter { | |
85 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
86 | if (defined($category) && ($category eq "Lu" | |
87 | || $category eq "Ll" | |
88 | || $category eq "Lt" | |
89 | || $category eq "Lm" | |
90 | || $category eq "Lo")) { | |
91 | return 1; | |
92 | } else { | |
93 | return 0; | |
94 | } | |
95 | } | |
96 | ||
97 | # A character is a digit if it has the category Nd in the character | |
98 | # attribute database. In Latin-1 and ASCII, the only such characters | |
99 | # are 0123456789. In Unicode, there are other digit characters in | |
100 | # other code blocks, such as Gujarati digits and Tibetan digits. | |
101 | ||
102 | sub digit { | |
103 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
104 | if (defined($category) && $category eq "Nd") { | |
105 | return 1; | |
106 | } else { | |
107 | return 0; | |
108 | } | |
109 | } | |
110 | ||
111 | # The only hex digits are 0123456789abcdefABCDEF. | |
112 | ||
113 | sub hex_digit { | |
114 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
115 | if (($codepoint >= 0x30 && $codepoint <= 0x39) | |
116 | || ($codepoint >= 0x41 && $codepoint <= 0x46) | |
117 | || ($codepoint >= 0x61 && $codepoint <= 0x66)) { | |
118 | return 1; | |
119 | } else { | |
120 | return 0; | |
121 | } | |
122 | } | |
123 | ||
124 | # The union of char-set:letter and char-set:digit. | |
125 | ||
126 | sub letter_plus_digit { | |
127 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
128 | if (letter($codepoint, $name, $category, $uppercase, $lowercase) | |
129 | || digit($codepoint, $name, $category, $uppercase, $lowercase)) { | |
130 | return 1; | |
131 | } else { | |
132 | return 0; | |
133 | } | |
134 | } | |
135 | ||
136 | # Characters that would 'use ink' when printed | |
137 | sub graphic { | |
138 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
139 | if ($category =~ (/L|M|N|P|S/)) { | |
140 | return 1; | |
141 | } else { | |
142 | return 0; | |
143 | } | |
144 | } | |
145 | ||
146 | # A whitespace character is either | |
147 | # * a character with one of the space, line, or paragraph separator | |
148 | # categories (Zs, Zl or Zp) of the Unicode character database. | |
149 | # * U+0009 Horizontal tabulation (\t control-I) | |
150 | # * U+000A Line feed (\n control-J) | |
151 | # * U+000B Vertical tabulation (\v control-K) | |
152 | # * U+000C Form feed (\f control-L) | |
153 | # * U+000D Carriage return (\r control-M) | |
154 | ||
155 | sub whitespace { | |
156 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
157 | if ($category =~ (/Zs|Zl|Zp/) | |
158 | || $codepoint == 0x9 | |
159 | || $codepoint == 0xA | |
160 | || $codepoint == 0xB | |
161 | || $codepoint == 0xC | |
162 | || $codepoint == 0xD) { | |
163 | return 1; | |
164 | } else { | |
165 | return 0; | |
166 | } | |
167 | } | |
168 | ||
169 | # A printing character is one that would occupy space when printed, | |
170 | # i.e., a graphic character or a space character. char-set:printing is | |
171 | # the union of char-set:whitespace and char-set:graphic. | |
172 | ||
173 | sub printing { | |
174 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
175 | if (whitespace($codepoint, $name, $category, $uppercase, $lowercase) | |
176 | || graphic($codepoint, $name, $category, $uppercase, $lowercase)) { | |
177 | return 1; | |
178 | } else { | |
179 | return 0; | |
180 | } | |
181 | } | |
182 | ||
183 | # The ISO control characters are the Unicode/Latin-1 characters in the | |
184 | # ranges [U+0000,U+001F] and [U+007F,U+009F]. | |
185 | ||
186 | sub iso_control { | |
187 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
188 | if (($codepoint >= 0x00 && $codepoint <= 0x1F) | |
189 | || ($codepoint >= 0x7F && $codepoint <= 0x9F)) { | |
190 | return 1; | |
191 | } else { | |
192 | return 0; | |
193 | } | |
194 | } | |
195 | ||
196 | # A punctuation character is any character that has one of the | |
197 | # punctuation categories in the Unicode character database (Pc, Pd, | |
198 | # Ps, Pe, Pi, Pf, or Po.) | |
199 | ||
200 | # Note that srfi-14 gives conflicting requirements!! It claims that | |
201 | # only the Unicode punctuation is necessary, but, explicitly calls out | |
202 | # the soft hyphen character (U+00AD) as punctution. Current versions | |
203 | # of Unicode consider U+00AD to be a formatting character, not | |
204 | # punctuation. | |
205 | ||
206 | sub punctuation { | |
207 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
208 | if ($category =~ (/P/)) { | |
209 | return 1; | |
210 | } else { | |
211 | return 0; | |
212 | } | |
213 | } | |
214 | ||
215 | # A symbol is any character that has one of the symbol categories in | |
216 | # the Unicode character database (Sm, Sc, Sk, or So). | |
217 | ||
218 | sub symbol { | |
219 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
220 | if ($category =~ (/S/)) { | |
221 | return 1; | |
222 | } else { | |
223 | return 0; | |
224 | } | |
225 | } | |
226 | ||
227 | # Blank chars are horizontal whitespace. A blank character is either | |
228 | # * a character with the space separator category (Zs) in the | |
229 | # Unicode character database. | |
230 | # * U+0009 Horizontal tabulation (\t control-I) | |
231 | sub blank { | |
232 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
233 | if ($category =~ (/Zs/) | |
234 | || $codepoint == 0x9) { | |
235 | return 1; | |
236 | } else { | |
237 | return 0; | |
238 | } | |
239 | } | |
240 | ||
241 | # ASCII | |
242 | sub ascii { | |
243 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
244 | if ($codepoint <= 0x7F) { | |
245 | return 1; | |
246 | } else { | |
247 | return 0; | |
248 | } | |
249 | } | |
250 | ||
251 | # Empty | |
252 | sub empty { | |
253 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; | |
254 | return 0; | |
255 | } | |
256 | ||
719bb8cd MG |
257 | # Designated -- All characters except for the surrogates |
258 | sub designated { | |
d0434ddf | 259 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
24d23822 MG |
260 | if ($category =~ (/Cs/)) { |
261 | return 0; | |
262 | } else { | |
263 | return 1; | |
264 | } | |
d0434ddf MG |
265 | } |
266 | ||
267 | ||
268 | # The procedure generates the two C structures necessary to describe a | |
269 | # given category. | |
270 | sub compute { | |
271 | my($f) = @_; | |
272 | my $start = -1; | |
273 | my $end = -1; | |
274 | my $len = 0; | |
275 | my @rstart = (-1); | |
276 | my @rend = (-1); | |
277 | ||
278 | seek($in, 0, 0) or die "Can't seek to beginning of file: $!"; | |
279 | ||
280 | print "$f\n"; | |
281 | ||
282 | while (<$in>) { | |
283 | # Parse the 14 column, semicolon-delimited UnicodeData.txt | |
284 | # file | |
285 | chomp; | |
286 | my(@fields) = split(/;/); | |
287 | ||
288 | # The codepoint: an integer | |
289 | my $codepoint = hex($fields[0]); | |
290 | ||
291 | # If this is a character range, the last character in this | |
292 | # range | |
293 | my $codepoint_end = $codepoint; | |
294 | ||
295 | # The name of the character | |
296 | my $name = $fields[1]; | |
297 | ||
298 | # A two-character category code, such as Ll (lower-case | |
299 | # letter) | |
300 | my $category = $fields[2]; | |
301 | ||
302 | # The codepoint of the uppercase version of this char | |
303 | my $uppercase = $fields[12]; | |
304 | ||
305 | # The codepoint of the lowercase version of this char | |
306 | my $lowercase = $fields[13]; | |
307 | ||
308 | my $pass = &$f($codepoint,$name,$category,$uppercase,$lowercase); | |
309 | if ($pass == 1) { | |
310 | ||
311 | # Some pairs of lines in UnicodeData.txt delimit ranges of | |
312 | # characters. | |
313 | if ($name =~ /First/) { | |
314 | $line = <$in>; | |
315 | die $! if $!; | |
316 | $codepoint_end = hex( (split(/;/, $line))[0] ); | |
317 | } | |
318 | ||
319 | # Compute ranges of characters [start:end] that meet the | |
320 | # criteria. Store the ranges. | |
321 | if ($start == -1) { | |
322 | $start = $codepoint; | |
323 | $end = $codepoint_end; | |
324 | } elsif ($end + 1 == $codepoint) { | |
325 | $end = $codepoint_end; | |
326 | } else { | |
327 | $rstart[$len] = $start; | |
328 | $rend[$len] = $end; | |
329 | $len++; | |
330 | $start = $codepoint; | |
331 | $end = $codepoint_end; | |
332 | } | |
333 | } | |
334 | } | |
335 | ||
336 | # Extra logic to ensure that the last range is included | |
337 | if ($start != -1) { | |
338 | if ($len > 0 && $rstart[@rstart-1] != $start) { | |
339 | $rstart[$len] = $start; | |
340 | $rend[$len] = $end; | |
341 | $len++; | |
342 | } elsif ($len == 0) { | |
6d30df5d MG |
343 | $rstart[0] = $start; |
344 | $rend[0] = $end; | |
345 | $len++; | |
d0434ddf MG |
346 | } |
347 | } | |
348 | ||
349 | # Print the C struct that contains the range list. | |
350 | print $out "scm_t_char_range cs_" . $f . "_ranges[] = {\n"; | |
351 | if ($rstart[0] != -1) { | |
352 | for (my $i=0; $i<@rstart-1; $i++) { | |
353 | printf $out " {0x%04x, 0x%04x},\n", $rstart[$i], $rend[$i]; | |
354 | } | |
355 | printf $out " {0x%04x, 0x%04x}\n", $rstart[@rstart-1], $rend[@rstart-1]; | |
356 | } | |
357 | print $out "};\n\n"; | |
358 | ||
359 | # Print the C struct that contains the range list length and | |
360 | # pointer to the range list. | |
361 | print $out "scm_t_char_set cs_${f} = {\n"; | |
362 | print $out " $len,\n"; | |
363 | print $out " cs_" . $f . "_ranges\n"; | |
364 | print $out "};\n\n"; | |
365 | } | |
366 | ||
367 | # Write a bit of a header | |
368 | print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n"; | |
369 | print $out "/* This file is #include'd by srfi-14.c. */\n\n"; | |
24d23822 | 370 | print $out "/* This file was generated from\n"; |
d0434ddf MG |
371 | print $out " http://unicode.org/Public/UNIDATA/UnicodeData.txt\n"; |
372 | print $out " with the unidata_to_charset.pl script. */\n\n"; | |
373 | ||
374 | # Write the C structs for each SRFI-14 charset | |
375 | compute "lower_case"; | |
376 | compute "upper_case"; | |
377 | compute "title_case"; | |
378 | compute "letter"; | |
379 | compute "digit"; | |
380 | compute "hex_digit"; | |
381 | compute "letter_plus_digit"; | |
382 | compute "graphic"; | |
383 | compute "whitespace"; | |
384 | compute "printing"; | |
385 | compute "iso_control"; | |
386 | compute "punctuation"; | |
387 | compute "symbol"; | |
388 | compute "blank"; | |
389 | compute "ascii"; | |
390 | compute "empty"; | |
719bb8cd | 391 | compute "designated"; |
d0434ddf MG |
392 | |
393 | close $in; | |
394 | close $out; | |
395 | ||
396 | exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!"; | |
397 | ||
398 | # And we're done. | |
399 | ||
400 | ||
401 | ||
402 | ||
403 | ||
404 |