libguile/unidata_to_charset.pl

   1 #!/usr/bin/perl
   2 # unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt
   3 #
   4 # Copyright (C) 2009, 2010 Free Software Foundation, Inc.
   5 #
   6 # This library is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU Lesser General Public
   8 # License as published by the Free Software Foundation; either
   9 # version 3 of the License, or (at your option) any later version.
  10 #
  11 # This library is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 # Lesser General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU Lesser General Public
  17 # License along with this library; if not, write to the Free Software
  18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19
  20 open(my $in,  "<",  "UnicodeData.txt")  or die "Can't open UnicodeData.txt: $!";
  21 open(my $out, ">",  "srfi-14.i.c") or die "Can't open srfi-14.i.c: $!";
  22
  23 # For Unicode, we follow Java's specification: a character is
  24 # lowercase if
  25 #    * it is not in the range [U+2000,U+2FFF], and
  26 #    * the Unicode attribute table does not give a lowercase mapping
  27 #      for it, and
  28 #    * at least one of the following is true:
  29 #          o the Unicode attribute table gives a mapping to uppercase
  30 #            for the character, or
  31 #          o the name for the character in the Unicode attribute table
  32 #            contains the words "SMALL LETTER" or "SMALL LIGATURE".
  33
  34 sub lower_case {
  35     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
  36     if (($codepoint < 0x2000 || $codepoint > 0x2FFF)
  37         && (!defined($lowercase) || $lowercase eq "")
  38         && ((defined($uppercase) && $uppercase ne "")
  39             || ($name =~ /(SMALL LETTER|SMALL LIGATURE)/))) {
  40         return 1;
  41     } else {
  42         return 0;
  43     }
  44 }
  45
  46 # For Unicode, we follow Java's specification: a character is
  47 # uppercase if
  48 #    * it is not in the range [U+2000,U+2FFF], and
  49 #    * the Unicode attribute table does not give an uppercase mapping
  50 #      for it (this excludes titlecase characters), and
  51 #    * at least one of the following is true:
  52 #          o the Unicode attribute table gives a mapping to lowercase
  53 #            for the character, or
  54 #          o the name for the character in the Unicode attribute table
  55 #            contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
  56
  57 sub upper_case {
  58     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
  59     if (($codepoint < 0x2000 || $codepoint > 0x2FFF)
  60         && (!defined($uppercase) || $uppercase eq "")
  61         && ((defined($lowercase) && $lowercase ne "")
  62             || ($name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/))) {
  63         return 1;
  64     } else {
  65         return 0;
  66     }
  67 }
  68
  69 # A character is titlecase if it has the category Lt in the character
  70 # attribute database.
  71
  72 sub title_case {
  73     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
  74     if (defined($category) && $category eq "Lt") {
  75         return 1;
  76     } else {
  77         return 0;
  78     }
  79 }
  80
  81 # A letter is any character with one of the letter categories (Lu, Ll,
  82 # Lt, Lm, Lo) in the Unicode character database.
  83
  84 sub letter {
  85     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
  86     if (defined($category) && ($category eq "Lu"
  87                                || $category eq "Ll"
  88                                || $category eq "Lt"
  89                                || $category eq "Lm"
  90                                || $category eq "Lo")) {
  91         return 1;
  92     } else {
  93         return 0;
  94     }
  95 }
  96
  97 # A character is a digit if it has the category Nd in the character
  98 # attribute database. In Latin-1 and ASCII, the only such characters
  99 # are 0123456789. In Unicode, there are other digit characters in
 100 # other code blocks, such as Gujarati digits and Tibetan digits.
 101
 102 sub digit {
 103     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 104     if (defined($category) && $category eq "Nd") {
 105         return 1;
 106     } else {
 107         return 0;
 108     }
 109 }
 110
 111 # The only hex digits are 0123456789abcdefABCDEF.
 112
 113 sub hex_digit {
 114     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 115     if (($codepoint >= 0x30 && $codepoint <= 0x39)
 116         || ($codepoint >= 0x41 && $codepoint <= 0x46)
 117         || ($codepoint >= 0x61 && $codepoint <= 0x66)) {
 118         return 1;
 119     } else {
 120         return 0;
 121     }
 122 }
 123
 124 # The union of char-set:letter and char-set:digit.
 125
 126 sub letter_plus_digit {
 127     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 128     if (letter($codepoint, $name, $category, $uppercase, $lowercase)
 129         || digit($codepoint, $name, $category, $uppercase, $lowercase)) {
 130         return 1;
 131     } else {
 132         return 0;
 133     }
 134 }
 135
 136 # Characters that would 'use ink' when printed
 137 sub graphic {
 138     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 139     if ($category =~ (/L|M|N|P|S/)) {
 140         return 1;
 141     } else {
 142         return 0;
 143     }
 144 }
 145
 146 # A whitespace character is either
 147 #    * a character with one of the space, line, or paragraph separator
 148 #      categories (Zs, Zl or Zp) of the Unicode character database.
 149 #    * U+0009 Horizontal tabulation (\t control-I)
 150 #    * U+000A Line feed (\n control-J)
 151 #    * U+000B Vertical tabulation (\v control-K)
 152 #    * U+000C Form feed (\f control-L)
 153 #    * U+000D Carriage return (\r control-M)
 154
 155 sub whitespace {
 156     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 157     if ($category =~ (/Zs|Zl|Zp/)
 158         || $codepoint == 0x9
 159         || $codepoint == 0xA
 160         || $codepoint == 0xB
 161         || $codepoint == 0xC
 162         || $codepoint == 0xD) {
 163         return 1;
 164     } else {
 165         return 0;
 166     }
 167 }
 168
 169 # A printing character is one that would occupy space when printed,
 170 # i.e., a graphic character or a space character. char-set:printing is
 171 # the union of char-set:whitespace and char-set:graphic.
 172
 173 sub printing {
 174     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 175     if (whitespace($codepoint, $name, $category, $uppercase, $lowercase)
 176         || graphic($codepoint, $name, $category, $uppercase, $lowercase)) {
 177         return 1;
 178     } else {
 179         return 0;
 180     }
 181 }
 182
 183 # The ISO control characters are the Unicode/Latin-1 characters in the
 184 # ranges [U+0000,U+001F] and [U+007F,U+009F].
 185
 186 sub iso_control {
 187     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 188     if (($codepoint >= 0x00 && $codepoint <= 0x1F)
 189         || ($codepoint >= 0x7F && $codepoint <= 0x9F)) {
 190         return 1;
 191     } else {
 192         return 0;
 193     }
 194 }
 195
 196 # A punctuation character is any character that has one of the
 197 # punctuation categories in the Unicode character database (Pc, Pd,
 198 # Ps, Pe, Pi, Pf, or Po.)
 199
 200 # Note that srfi-14 gives conflicting requirements!!  It claims that
 201 # only the Unicode punctuation is necessary, but, explicitly calls out
 202 # the soft hyphen character (U+00AD) as punctution.  Current versions
 203 # of Unicode consider U+00AD to be a formatting character, not
 204 # punctuation.
 205
 206 sub punctuation {
 207     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 208     if ($category =~ (/P/)) {
 209         return 1;
 210     } else {
 211         return 0;
 212     }
 213 }
 214
 215 # A symbol is any character that has one of the symbol categories in
 216 # the Unicode character database (Sm, Sc, Sk, or So).
 217
 218 sub symbol {
 219     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 220     if ($category =~ (/S/)) {
 221         return 1;
 222     } else {
 223         return 0;
 224     }
 225 }
 226
 227 # Blank chars are horizontal whitespace.  A blank character is either
 228 #    * a character with the space separator category (Zs) in the
 229 #      Unicode character database.
 230 #    * U+0009 Horizontal tabulation (\t control-I)
 231 sub blank {
 232     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 233     if ($category =~ (/Zs/)
 234         || $codepoint == 0x9) {
 235         return 1;
 236     } else {
 237         return 0;
 238     }
 239 }
 240
 241 # ASCII
 242 sub ascii {
 243     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 244     if ($codepoint <= 0x7F) {
 245         return 1;
 246     } else {
 247         return 0;
 248     }
 249 }
 250
 251 # Empty
 252 sub empty {
 253     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 254     return 0;
 255 }
 256
 257 # Designated -- All characters except for the surrogates
 258 sub designated {
 259     my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
 260     if ($category =~ (/Cs/)) {
 261         return 0;
 262     } else {
 263         return 1;
 264     }
 265 }
 266
 267
 268 # The procedure generates the two C structures necessary to describe a
 269 # given category.
 270 sub compute {
 271     my($f) = @_;
 272     my $start = -1;
 273     my $end = -1;
 274     my $len = 0;
 275     my @rstart = (-1);
 276     my @rend = (-1);
 277
 278     seek($in, 0, 0) or die "Can't seek to beginning of file: $!";
 279
 280     print "$f\n";
 281
 282     while (<$in>) {
 283         # Parse the 14 column, semicolon-delimited UnicodeData.txt
 284         # file
 285         chomp;
 286         my(@fields) = split(/;/);
 287
 288         # The codepoint: an integer
 289         my $codepoint = hex($fields[0]);
 290
 291         # If this is a character range, the last character in this
 292         # range
 293         my $codepoint_end = $codepoint;
 294
 295         # The name of the character
 296         my $name = $fields[1];
 297
 298         # A two-character category code, such as Ll (lower-case
 299         # letter)
 300         my $category = $fields[2];
 301
 302         # The codepoint of the uppercase version of this char
 303         my $uppercase = $fields[12];
 304
 305         # The codepoint of the lowercase version of this char
 306         my $lowercase = $fields[13];
 307
 308         my $pass = &$f($codepoint,$name,$category,$uppercase,$lowercase);
 309         if ($pass == 1) {
 310
 311             # Some pairs of lines in UnicodeData.txt delimit ranges of
 312             # characters.
 313             if ($name =~ /First/) {
 314                 $line = <$in>;
 315                 die $! if $!;
 316                 $codepoint_end = hex( (split(/;/, $line))[0] );
 317             }
 318
 319             # Compute ranges of characters [start:end] that meet the
 320             # criteria.  Store the ranges.
 321             if ($start == -1) {
 322                 $start = $codepoint;
 323                 $end = $codepoint_end;
 324             } elsif ($end + 1 == $codepoint) {
 325                 $end = $codepoint_end;
 326             } else {
 327                 $rstart[$len] = $start;
 328                 $rend[$len] = $end;
 329                 $len++;
 330                 $start = $codepoint;
 331                 $end = $codepoint_end;
 332             }
 333         }
 334     }
 335
 336     # Extra logic to ensure that the last range is included
 337     if ($start != -1) {
 338         if ($len > 0 && $rstart[@rstart-1] != $start) {
 339             $rstart[$len] = $start;
 340             $rend[$len] = $end;
 341             $len++;
 342         } elsif ($len == 0) {
 343             $rstart[0] = $start;
 344             $rend[0] = $end;
 345             $len++;
 346         }
 347     }
 348
 349     # Print the C struct that contains the range list.
 350     print $out "scm_t_char_range cs_" . $f . "_ranges[] = {\n";
 351     if ($rstart[0] != -1) {
 352         for (my $i=0; $i<@rstart-1; $i++) {
 353             printf $out "  {0x%04x, 0x%04x},\n", $rstart[$i], $rend[$i];
 354         }
 355         printf $out "  {0x%04x, 0x%04x}\n", $rstart[@rstart-1], $rend[@rstart-1];
 356     }
 357     print $out "};\n\n";
 358
 359     # Print the C struct that contains the range list length and
 360     # pointer to the range list.
 361     print $out "scm_t_char_set cs_${f} = {\n";
 362     print $out "  $len,\n";
 363     print $out "  cs_" . $f . "_ranges\n";
 364     print $out "};\n\n";
 365 }
 366
 367 # Write a bit of a header
 368 print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n";
 369 print $out "/* This file is #include'd by srfi-14.c.  */\n\n";
 370 print $out "/* This file was generated from\n";
 371 print $out "   http://unicode.org/Public/UNIDATA/UnicodeData.txt\n";
 372 print $out "   with the unidata_to_charset.pl script.  */\n\n";
 373
 374 # Write the C structs for each SRFI-14 charset
 375 compute "lower_case";
 376 compute "upper_case";
 377 compute "title_case";
 378 compute "letter";
 379 compute "digit";
 380 compute "hex_digit";
 381 compute "letter_plus_digit";
 382 compute "graphic";
 383 compute "whitespace";
 384 compute "printing";
 385 compute "iso_control";
 386 compute "punctuation";
 387 compute "symbol";
 388 compute "blank";
 389 compute "ascii";
 390 compute "empty";
 391 compute "designated";
 392
 393 close $in;
 394 close $out;
 395
 396 exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!";
 397
 398 # And we're done.
 399
 400
 401
 402
 403
 404