| 1 | #!/usr/bin/perl |
| 2 | # unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt |
| 3 | # |
| 4 | # Copyright (C) 2009, 2010 Free Software Foundation, Inc. |
| 5 | # |
| 6 | # This library is free software; you can redistribute it and/or |
| 7 | # modify it under the terms of the GNU Lesser General Public |
| 8 | # License as published by the Free Software Foundation; either |
| 9 | # version 3 of the License, or (at your option) any later version. |
| 10 | # |
| 11 | # This library is distributed in the hope that it will be useful, |
| 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | # Lesser General Public License for more details. |
| 15 | # |
| 16 | # You should have received a copy of the GNU Lesser General Public |
| 17 | # License along with this library; if not, write to the Free Software |
| 18 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 19 | |
| 20 | open(my $in, "<", "UnicodeData.txt") or die "Can't open UnicodeData.txt: $!"; |
| 21 | open(my $out, ">", "srfi-14.i.c") or die "Can't open srfi-14.i.c: $!"; |
| 22 | |
| 23 | # For Unicode, we follow Java's specification: a character is |
| 24 | # lowercase if |
| 25 | # * it is not in the range [U+2000,U+2FFF], and |
| 26 | # * the Unicode attribute table does not give a lowercase mapping |
| 27 | # for it, and |
| 28 | # * at least one of the following is true: |
| 29 | # o the Unicode attribute table gives a mapping to uppercase |
| 30 | # for the character, or |
| 31 | # o the name for the character in the Unicode attribute table |
| 32 | # contains the words "SMALL LETTER" or "SMALL LIGATURE". |
| 33 | |
| 34 | sub lower_case { |
| 35 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 36 | if (($codepoint < 0x2000 || $codepoint > 0x2FFF) |
| 37 | && (!defined($lowercase) || $lowercase eq "") |
| 38 | && ((defined($uppercase) && $uppercase ne "") |
| 39 | || ($name =~ /(SMALL LETTER|SMALL LIGATURE)/))) { |
| 40 | return 1; |
| 41 | } else { |
| 42 | return 0; |
| 43 | } |
| 44 | } |
| 45 | |
| 46 | # For Unicode, we follow Java's specification: a character is |
| 47 | # uppercase if |
| 48 | # * it is not in the range [U+2000,U+2FFF], and |
| 49 | # * the Unicode attribute table does not give an uppercase mapping |
| 50 | # for it (this excludes titlecase characters), and |
| 51 | # * at least one of the following is true: |
| 52 | # o the Unicode attribute table gives a mapping to lowercase |
| 53 | # for the character, or |
| 54 | # o the name for the character in the Unicode attribute table |
| 55 | # contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE". |
| 56 | |
| 57 | sub upper_case { |
| 58 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 59 | if (($codepoint < 0x2000 || $codepoint > 0x2FFF) |
| 60 | && (!defined($uppercase) || $uppercase eq "") |
| 61 | && ((defined($lowercase) && $lowercase ne "") |
| 62 | || ($name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/))) { |
| 63 | return 1; |
| 64 | } else { |
| 65 | return 0; |
| 66 | } |
| 67 | } |
| 68 | |
| 69 | # A character is titlecase if it has the category Lt in the character |
| 70 | # attribute database. |
| 71 | |
| 72 | sub title_case { |
| 73 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 74 | if (defined($category) && $category eq "Lt") { |
| 75 | return 1; |
| 76 | } else { |
| 77 | return 0; |
| 78 | } |
| 79 | } |
| 80 | |
| 81 | # A letter is any character with one of the letter categories (Lu, Ll, |
| 82 | # Lt, Lm, Lo) in the Unicode character database. |
| 83 | |
| 84 | sub letter { |
| 85 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 86 | if (defined($category) && ($category eq "Lu" |
| 87 | || $category eq "Ll" |
| 88 | || $category eq "Lt" |
| 89 | || $category eq "Lm" |
| 90 | || $category eq "Lo")) { |
| 91 | return 1; |
| 92 | } else { |
| 93 | return 0; |
| 94 | } |
| 95 | } |
| 96 | |
| 97 | # A character is a digit if it has the category Nd in the character |
| 98 | # attribute database. In Latin-1 and ASCII, the only such characters |
| 99 | # are 0123456789. In Unicode, there are other digit characters in |
| 100 | # other code blocks, such as Gujarati digits and Tibetan digits. |
| 101 | |
| 102 | sub digit { |
| 103 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 104 | if (defined($category) && $category eq "Nd") { |
| 105 | return 1; |
| 106 | } else { |
| 107 | return 0; |
| 108 | } |
| 109 | } |
| 110 | |
| 111 | # The only hex digits are 0123456789abcdefABCDEF. |
| 112 | |
| 113 | sub hex_digit { |
| 114 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 115 | if (($codepoint >= 0x30 && $codepoint <= 0x39) |
| 116 | || ($codepoint >= 0x41 && $codepoint <= 0x46) |
| 117 | || ($codepoint >= 0x61 && $codepoint <= 0x66)) { |
| 118 | return 1; |
| 119 | } else { |
| 120 | return 0; |
| 121 | } |
| 122 | } |
| 123 | |
| 124 | # The union of char-set:letter and char-set:digit. |
| 125 | |
| 126 | sub letter_plus_digit { |
| 127 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 128 | if (letter($codepoint, $name, $category, $uppercase, $lowercase) |
| 129 | || digit($codepoint, $name, $category, $uppercase, $lowercase)) { |
| 130 | return 1; |
| 131 | } else { |
| 132 | return 0; |
| 133 | } |
| 134 | } |
| 135 | |
| 136 | # Characters that would 'use ink' when printed |
| 137 | sub graphic { |
| 138 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 139 | if ($category =~ (/L|M|N|P|S/)) { |
| 140 | return 1; |
| 141 | } else { |
| 142 | return 0; |
| 143 | } |
| 144 | } |
| 145 | |
| 146 | # A whitespace character is either |
| 147 | # * a character with one of the space, line, or paragraph separator |
| 148 | # categories (Zs, Zl or Zp) of the Unicode character database. |
| 149 | # * U+0009 Horizontal tabulation (\t control-I) |
| 150 | # * U+000A Line feed (\n control-J) |
| 151 | # * U+000B Vertical tabulation (\v control-K) |
| 152 | # * U+000C Form feed (\f control-L) |
| 153 | # * U+000D Carriage return (\r control-M) |
| 154 | |
| 155 | sub whitespace { |
| 156 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 157 | if ($category =~ (/Zs|Zl|Zp/) |
| 158 | || $codepoint == 0x9 |
| 159 | || $codepoint == 0xA |
| 160 | || $codepoint == 0xB |
| 161 | || $codepoint == 0xC |
| 162 | || $codepoint == 0xD) { |
| 163 | return 1; |
| 164 | } else { |
| 165 | return 0; |
| 166 | } |
| 167 | } |
| 168 | |
| 169 | # A printing character is one that would occupy space when printed, |
| 170 | # i.e., a graphic character or a space character. char-set:printing is |
| 171 | # the union of char-set:whitespace and char-set:graphic. |
| 172 | |
| 173 | sub printing { |
| 174 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 175 | if (whitespace($codepoint, $name, $category, $uppercase, $lowercase) |
| 176 | || graphic($codepoint, $name, $category, $uppercase, $lowercase)) { |
| 177 | return 1; |
| 178 | } else { |
| 179 | return 0; |
| 180 | } |
| 181 | } |
| 182 | |
| 183 | # The ISO control characters are the Unicode/Latin-1 characters in the |
| 184 | # ranges [U+0000,U+001F] and [U+007F,U+009F]. |
| 185 | |
| 186 | sub iso_control { |
| 187 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 188 | if (($codepoint >= 0x00 && $codepoint <= 0x1F) |
| 189 | || ($codepoint >= 0x7F && $codepoint <= 0x9F)) { |
| 190 | return 1; |
| 191 | } else { |
| 192 | return 0; |
| 193 | } |
| 194 | } |
| 195 | |
| 196 | # A punctuation character is any character that has one of the |
| 197 | # punctuation categories in the Unicode character database (Pc, Pd, |
| 198 | # Ps, Pe, Pi, Pf, or Po.) |
| 199 | |
| 200 | # Note that srfi-14 gives conflicting requirements!! It claims that |
| 201 | # only the Unicode punctuation is necessary, but, explicitly calls out |
| 202 | # the soft hyphen character (U+00AD) as punctution. Current versions |
| 203 | # of Unicode consider U+00AD to be a formatting character, not |
| 204 | # punctuation. |
| 205 | |
| 206 | sub punctuation { |
| 207 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 208 | if ($category =~ (/P/)) { |
| 209 | return 1; |
| 210 | } else { |
| 211 | return 0; |
| 212 | } |
| 213 | } |
| 214 | |
| 215 | # A symbol is any character that has one of the symbol categories in |
| 216 | # the Unicode character database (Sm, Sc, Sk, or So). |
| 217 | |
| 218 | sub symbol { |
| 219 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 220 | if ($category =~ (/S/)) { |
| 221 | return 1; |
| 222 | } else { |
| 223 | return 0; |
| 224 | } |
| 225 | } |
| 226 | |
| 227 | # Blank chars are horizontal whitespace. A blank character is either |
| 228 | # * a character with the space separator category (Zs) in the |
| 229 | # Unicode character database. |
| 230 | # * U+0009 Horizontal tabulation (\t control-I) |
| 231 | sub blank { |
| 232 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 233 | if ($category =~ (/Zs/) |
| 234 | || $codepoint == 0x9) { |
| 235 | return 1; |
| 236 | } else { |
| 237 | return 0; |
| 238 | } |
| 239 | } |
| 240 | |
| 241 | # ASCII |
| 242 | sub ascii { |
| 243 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 244 | if ($codepoint <= 0x7F) { |
| 245 | return 1; |
| 246 | } else { |
| 247 | return 0; |
| 248 | } |
| 249 | } |
| 250 | |
| 251 | # Empty |
| 252 | sub empty { |
| 253 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 254 | return 0; |
| 255 | } |
| 256 | |
| 257 | # Designated -- All characters except for the surrogates |
| 258 | sub designated { |
| 259 | my($codepoint, $name, $category, $uppercase, $lowercase)= @_; |
| 260 | if ($category =~ (/Cs/)) { |
| 261 | return 0; |
| 262 | } else { |
| 263 | return 1; |
| 264 | } |
| 265 | } |
| 266 | |
| 267 | |
| 268 | # The procedure generates the two C structures necessary to describe a |
| 269 | # given category. |
| 270 | sub compute { |
| 271 | my($f) = @_; |
| 272 | my $start = -1; |
| 273 | my $end = -1; |
| 274 | my $len = 0; |
| 275 | my @rstart = (-1); |
| 276 | my @rend = (-1); |
| 277 | |
| 278 | seek($in, 0, 0) or die "Can't seek to beginning of file: $!"; |
| 279 | |
| 280 | print "$f\n"; |
| 281 | |
| 282 | while (<$in>) { |
| 283 | # Parse the 14 column, semicolon-delimited UnicodeData.txt |
| 284 | # file |
| 285 | chomp; |
| 286 | my(@fields) = split(/;/); |
| 287 | |
| 288 | # The codepoint: an integer |
| 289 | my $codepoint = hex($fields[0]); |
| 290 | |
| 291 | # If this is a character range, the last character in this |
| 292 | # range |
| 293 | my $codepoint_end = $codepoint; |
| 294 | |
| 295 | # The name of the character |
| 296 | my $name = $fields[1]; |
| 297 | |
| 298 | # A two-character category code, such as Ll (lower-case |
| 299 | # letter) |
| 300 | my $category = $fields[2]; |
| 301 | |
| 302 | # The codepoint of the uppercase version of this char |
| 303 | my $uppercase = $fields[12]; |
| 304 | |
| 305 | # The codepoint of the lowercase version of this char |
| 306 | my $lowercase = $fields[13]; |
| 307 | |
| 308 | my $pass = &$f($codepoint,$name,$category,$uppercase,$lowercase); |
| 309 | if ($pass == 1) { |
| 310 | |
| 311 | # Some pairs of lines in UnicodeData.txt delimit ranges of |
| 312 | # characters. |
| 313 | if ($name =~ /First/) { |
| 314 | $line = <$in>; |
| 315 | die $! if $!; |
| 316 | $codepoint_end = hex( (split(/;/, $line))[0] ); |
| 317 | } |
| 318 | |
| 319 | # Compute ranges of characters [start:end] that meet the |
| 320 | # criteria. Store the ranges. |
| 321 | if ($start == -1) { |
| 322 | $start = $codepoint; |
| 323 | $end = $codepoint_end; |
| 324 | } elsif ($end + 1 == $codepoint) { |
| 325 | $end = $codepoint_end; |
| 326 | } else { |
| 327 | $rstart[$len] = $start; |
| 328 | $rend[$len] = $end; |
| 329 | $len++; |
| 330 | $start = $codepoint; |
| 331 | $end = $codepoint_end; |
| 332 | } |
| 333 | } |
| 334 | } |
| 335 | |
| 336 | # Extra logic to ensure that the last range is included |
| 337 | if ($start != -1) { |
| 338 | if ($len > 0 && $rstart[@rstart-1] != $start) { |
| 339 | $rstart[$len] = $start; |
| 340 | $rend[$len] = $end; |
| 341 | $len++; |
| 342 | } elsif ($len == 0) { |
| 343 | $rstart[0] = $start; |
| 344 | $rend[0] = $end; |
| 345 | $len++; |
| 346 | } |
| 347 | } |
| 348 | |
| 349 | # Print the C struct that contains the range list. |
| 350 | print $out "scm_t_char_range cs_" . $f . "_ranges[] = {\n"; |
| 351 | if ($rstart[0] != -1) { |
| 352 | for (my $i=0; $i<@rstart-1; $i++) { |
| 353 | printf $out " {0x%04x, 0x%04x},\n", $rstart[$i], $rend[$i]; |
| 354 | } |
| 355 | printf $out " {0x%04x, 0x%04x}\n", $rstart[@rstart-1], $rend[@rstart-1]; |
| 356 | } |
| 357 | print $out "};\n\n"; |
| 358 | |
| 359 | # Print the C struct that contains the range list length and |
| 360 | # pointer to the range list. |
| 361 | print $out "scm_t_char_set cs_${f} = {\n"; |
| 362 | print $out " $len,\n"; |
| 363 | print $out " cs_" . $f . "_ranges\n"; |
| 364 | print $out "};\n\n"; |
| 365 | } |
| 366 | |
| 367 | # Write a bit of a header |
| 368 | print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n"; |
| 369 | print $out "/* This file is #include'd by srfi-14.c. */\n\n"; |
| 370 | print $out "/* This file was generated from\n"; |
| 371 | print $out " http://unicode.org/Public/UNIDATA/UnicodeData.txt\n"; |
| 372 | print $out " with the unidata_to_charset.pl script. */\n\n"; |
| 373 | |
| 374 | # Write the C structs for each SRFI-14 charset |
| 375 | compute "lower_case"; |
| 376 | compute "upper_case"; |
| 377 | compute "title_case"; |
| 378 | compute "letter"; |
| 379 | compute "digit"; |
| 380 | compute "hex_digit"; |
| 381 | compute "letter_plus_digit"; |
| 382 | compute "graphic"; |
| 383 | compute "whitespace"; |
| 384 | compute "printing"; |
| 385 | compute "iso_control"; |
| 386 | compute "punctuation"; |
| 387 | compute "symbol"; |
| 388 | compute "blank"; |
| 389 | compute "ascii"; |
| 390 | compute "empty"; |
| 391 | compute "designated"; |
| 392 | |
| 393 | close $in; |
| 394 | close $out; |
| 395 | |
| 396 | exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!"; |
| 397 | |
| 398 | # And we're done. |
| 399 | |
| 400 | |
| 401 | |
| 402 | |
| 403 | |
| 404 | |