[bpt/guile.git] / libguile / unidata_to_charset.pl

#!/usr/bin/perl
# unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt
#
# Copyright (C) 2009, 2010 Free Software Foundation, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

open(my $in,  "<",  "UnicodeData.txt")  or die "Can't open UnicodeData.txt: $!";           
open(my $out, ">",  "srfi-14.i.c") or die "Can't open srfi-14.i.c: $!";

# For Unicode, we follow Java's specification: a character is
# lowercase if
#    * it is not in the range [U+2000,U+2FFF], and
#    * the Unicode attribute table does not give a lowercase mapping
#      for it, and
#    * at least one of the following is true:
#          o the Unicode attribute table gives a mapping to uppercase
#            for the character, or
#          o the name for the character in the Unicode attribute table
#            contains the words "SMALL LETTER" or "SMALL LIGATURE".

sub lower_case {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if (($codepoint < 0x2000 || $codepoint > 0x2FFF)
        && (!defined($lowercase) || $lowercase eq "")
        && ((defined($uppercase) && $uppercase ne "")
            || ($name =~ /(SMALL LETTER|SMALL LIGATURE)/))) {
        return 1;
    } else {
        return 0;
    }
}

# For Unicode, we follow Java's specification: a character is
# uppercase if
#    * it is not in the range [U+2000,U+2FFF], and
#    * the Unicode attribute table does not give an uppercase mapping
#      for it (this excludes titlecase characters), and
#    * at least one of the following is true:
#          o the Unicode attribute table gives a mapping to lowercase
#            for the character, or
#          o the name for the character in the Unicode attribute table
#            contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".

sub upper_case {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if (($codepoint < 0x2000 || $codepoint > 0x2FFF)
        && (!defined($uppercase) || $uppercase eq "")
        && ((defined($lowercase) && $lowercase ne "")
            || ($name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/))) {
        return 1;
    } else {
        return 0;
    }
}

# A character is titlecase if it has the category Lt in the character
# attribute database.

sub title_case {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if (defined($category) && $category eq "Lt") {
        return 1;
    } else {
        return 0;
    }
}

# A letter is any character with one of the letter categories (Lu, Ll,
# Lt, Lm, Lo) in the Unicode character database.

sub letter {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if (defined($category) && ($category eq "Lu"
                               || $category eq "Ll"
                               || $category eq "Lt"
                               || $category eq "Lm"
                               || $category eq "Lo")) {
        return 1;
    } else {
        return 0;
    }
}

# A character is a digit if it has the category Nd in the character
# attribute database. In Latin-1 and ASCII, the only such characters
# are 0123456789. In Unicode, there are other digit characters in
# other code blocks, such as Gujarati digits and Tibetan digits.

sub digit {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if (defined($category) && $category eq "Nd") {
        return 1;
    } else {
        return 0;
    }
}

# The only hex digits are 0123456789abcdefABCDEF. 

sub hex_digit {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if (($codepoint >= 0x30 && $codepoint <= 0x39)
        || ($codepoint >= 0x41 && $codepoint <= 0x46)
        || ($codepoint >= 0x61 && $codepoint <= 0x66)) {
        return 1;
    } else {
        return 0;
    }
}

# The union of char-set:letter and char-set:digit.

sub letter_plus_digit {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if (letter($codepoint, $name, $category, $uppercase, $lowercase)
        || digit($codepoint, $name, $category, $uppercase, $lowercase)) {
        return 1;
    } else {
        return 0;
    }
}

# Characters that would 'use ink' when printed
sub graphic {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if ($category =~ (/L|M|N|P|S/)) {
        return 1;
    } else {
        return 0;
    }
}

# A whitespace character is either
#    * a character with one of the space, line, or paragraph separator
#      categories (Zs, Zl or Zp) of the Unicode character database.
#    * U+0009 Horizontal tabulation (\t control-I)
#    * U+000A Line feed (\n control-J)
#    * U+000B Vertical tabulation (\v control-K)
#    * U+000C Form feed (\f control-L)
#    * U+000D Carriage return (\r control-M)

sub whitespace {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if ($category =~ (/Zs|Zl|Zp/)
        || $codepoint == 0x9
        || $codepoint == 0xA 
        || $codepoint == 0xB 
        || $codepoint == 0xC 
        || $codepoint == 0xD) { 
        return 1;
    } else {
        return 0;
    }
}

# A printing character is one that would occupy space when printed,
# i.e., a graphic character or a space character. char-set:printing is
# the union of char-set:whitespace and char-set:graphic.

sub printing {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if (whitespace($codepoint, $name, $category, $uppercase, $lowercase)
        || graphic($codepoint, $name, $category, $uppercase, $lowercase)) {
        return 1;
    } else {
        return 0;
    }
}

# The ISO control characters are the Unicode/Latin-1 characters in the
# ranges [U+0000,U+001F] and [U+007F,U+009F].

sub iso_control {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if (($codepoint >= 0x00 && $codepoint <= 0x1F)
        || ($codepoint >= 0x7F && $codepoint <= 0x9F)) {
        return 1;
    } else {
        return 0;
    }
}

# A punctuation character is any character that has one of the
# punctuation categories in the Unicode character database (Pc, Pd,
# Ps, Pe, Pi, Pf, or Po.)

# Note that srfi-14 gives conflicting requirements!!  It claims that
# only the Unicode punctuation is necessary, but, explicitly calls out
# the soft hyphen character (U+00AD) as punctution.  Current versions
# of Unicode consider U+00AD to be a formatting character, not
# punctuation.

sub punctuation {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if ($category =~ (/P/)) {
        return 1;
    } else {
        return 0;
    }
}
        
# A symbol is any character that has one of the symbol categories in
# the Unicode character database (Sm, Sc, Sk, or So).

sub symbol {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if ($category =~ (/S/)) {
        return 1;
    } else {
        return 0;
    }
}
        
# Blank chars are horizontal whitespace.  A blank character is either
#    * a character with the space separator category (Zs) in the
#      Unicode character database.
#    * U+0009 Horizontal tabulation (\t control-I) 
sub blank {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if ($category =~ (/Zs/)
        || $codepoint == 0x9) { 
        return 1;
    } else {
        return 0;
    }
}

# ASCII
sub ascii {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if ($codepoint <= 0x7F) {
        return 1;
    } else {
        return 0;
    }
}

# Empty
sub empty {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    return 0;
}

# Designated -- All characters except for the surrogates
sub designated {
    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
    if ($category =~ (/Cs/)) {
        return 0;
    } else {
        return 1;
    }
}


# The procedure generates the two C structures necessary to describe a
# given category.
sub compute {
    my($f) = @_;
    my $start = -1;
    my $end = -1;
    my $len = 0;
    my @rstart = (-1);
    my @rend = (-1);

    seek($in, 0, 0) or die "Can't seek to beginning of file: $!";

    print "$f\n";

    while (<$in>) {
        # Parse the 14 column, semicolon-delimited UnicodeData.txt
        # file
        chomp;
        my(@fields) = split(/;/);

        # The codepoint: an integer
        my $codepoint = hex($fields[0]); 

        # If this is a character range, the last character in this
        # range
        my $codepoint_end = $codepoint;  

        # The name of the character
        my $name = $fields[1];    

        # A two-character category code, such as Ll (lower-case
        # letter)
        my $category = $fields[2];       

        # The codepoint of the uppercase version of this char
        my $uppercase = $fields[12];   

        # The codepoint of the lowercase version of this char
        my $lowercase = $fields[13];    

        my $pass = &$f($codepoint,$name,$category,$uppercase,$lowercase);
        if ($pass == 1) {

            # Some pairs of lines in UnicodeData.txt delimit ranges of
            # characters.
            if ($name =~ /First/) {
                $line = <$in>;
                die $! if $!;
                $codepoint_end = hex( (split(/;/, $line))[0] );
            }                 

            # Compute ranges of characters [start:end] that meet the
            # criteria.  Store the ranges.
            if ($start == -1) {
                $start = $codepoint;
                $end = $codepoint_end;
            } elsif ($end + 1 == $codepoint) {
                $end = $codepoint_end;
            } else {
                $rstart[$len] = $start;
                $rend[$len] = $end;
                $len++;
                $start = $codepoint;
                $end = $codepoint_end;
            }
        }
    }

    # Extra logic to ensure that the last range is included
    if ($start != -1) {
        if ($len > 0 && $rstart[@rstart-1] != $start) {
            $rstart[$len] = $start;
            $rend[$len] = $end;
            $len++;
        } elsif ($len == 0) {
	    $rstart[0] = $start;
	    $rend[0] = $end;
	    $len++;
        }
    }

    # Print the C struct that contains the range list.
    print $out "scm_t_char_range cs_" . $f . "_ranges[] = {\n";
    if ($rstart[0] != -1) {
        for (my $i=0; $i<@rstart-1; $i++) {
            printf $out "  {0x%04x, 0x%04x},\n", $rstart[$i], $rend[$i];
        }
        printf $out "  {0x%04x, 0x%04x}\n", $rstart[@rstart-1], $rend[@rstart-1];
    }
    print $out "};\n\n";

    # Print the C struct that contains the range list length and
    # pointer to the range list.
    print $out "scm_t_char_set cs_${f} = {\n";
    print $out "  $len,\n";
    print $out "  cs_" . $f . "_ranges\n";
    print $out "};\n\n";
}

# Write a bit of a header
print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n";
print $out "/* This file is #include'd by srfi-14.c.  */\n\n";
print $out "/* This file was generated from\n";
print $out "   http://unicode.org/Public/UNIDATA/UnicodeData.txt\n";
print $out "   with the unidata_to_charset.pl script.  */\n\n";

# Write the C structs for each SRFI-14 charset
compute "lower_case";
compute "upper_case";
compute "title_case";
compute "letter";
compute "digit";
compute "hex_digit";
compute "letter_plus_digit";
compute "graphic";
compute "whitespace";
compute "printing";
compute "iso_control";
compute "punctuation";
compute "symbol";
compute "blank";
compute "ascii";
compute "empty";
compute "designated";

close $in;
close $out;

exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!";

# And we're done.
Commit	Line	Data
d0434ddf MG	1	#!/usr/bin/perl
	2	# unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt
	3	#
dcc69bab MG	4	# Copyright (C) 2009, 2010 Free Software Foundation, Inc.
dcc69bab MG	5	#
d0434ddf MG	6	# This library is free software; you can redistribute it and/or
	7	# modify it under the terms of the GNU Lesser General Public
	8	# License as published by the Free Software Foundation; either
	9	# version 3 of the License, or (at your option) any later version.
dcc69bab	10	#
d0434ddf MG	11	# This library is distributed in the hope that it will be useful,
	12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	# Lesser General Public License for more details.
dcc69bab	15	#
d0434ddf MG	16	# You should have received a copy of the GNU Lesser General Public
	17	# License along with this library; if not, write to the Free Software
	18	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	19
	20	open(my $in, "<", "UnicodeData.txt") or die "Can't open UnicodeData.txt: $!";
	21	open(my $out, ">", "srfi-14.i.c") or die "Can't open srfi-14.i.c: $!";
	22
	23	# For Unicode, we follow Java's specification: a character is
	24	# lowercase if
	25	# * it is not in the range [U+2000,U+2FFF], and
	26	# * the Unicode attribute table does not give a lowercase mapping
	27	# for it, and
	28	# * at least one of the following is true:
	29	# o the Unicode attribute table gives a mapping to uppercase
	30	# for the character, or
	31	# o the name for the character in the Unicode attribute table
	32	# contains the words "SMALL LETTER" or "SMALL LIGATURE".
	33
	34	sub lower_case {
	35	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
	36	if (($codepoint < 0x2000 \|\| $codepoint > 0x2FFF)
	37	&& (!defined($lowercase) \|\| $lowercase eq "")
	38	&& ((defined($uppercase) && $uppercase ne "")
	39	\|\| ($name =~ /(SMALL LETTER\|SMALL LIGATURE)/))) {
	40	return 1;
	41	} else {
	42	return 0;
	43	}
	44	}
	45
	46	# For Unicode, we follow Java's specification: a character is
	47	# uppercase if
	48	# * it is not in the range [U+2000,U+2FFF], and
	49	# * the Unicode attribute table does not give an uppercase mapping
	50	# for it (this excludes titlecase characters), and
	51	# * at least one of the following is true:
	52	# o the Unicode attribute table gives a mapping to lowercase
	53	# for the character, or
	54	# o the name for the character in the Unicode attribute table
	55	# contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
	56
	57	sub upper_case {
	58	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
	59	if (($codepoint < 0x2000 \|\| $codepoint > 0x2FFF)
	60	&& (!defined($uppercase) \|\| $uppercase eq "")
	61	&& ((defined($lowercase) && $lowercase ne "")
	62	\|\| ($name =~ /(CAPITAL LETTER\|CAPITAL LIGATURE)/))) {
	63	return 1;
	64	} else {
	65	return 0;
	66	}
	67	}
	68
	69	# A character is titlecase if it has the category Lt in the character
	70	# attribute database.
	71
	72	sub title_case {
	73	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
	74	if (defined($category) && $category eq "Lt") {
	75	return 1;
	76	} else {
	77	return 0;
	78	}
	79	}
80
81	# A letter is any character with one of the letter categories (Lu, Ll,
82	# Lt, Lm, Lo) in the Unicode character database.
83
84	sub letter {
85	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
86	if (defined($category) && ($category eq "Lu"
87	\|\| $category eq "Ll"
88	\|\| $category eq "Lt"
89	\|\| $category eq "Lm"
90	\|\| $category eq "Lo")) {
91	return 1;
92	} else {
93	return 0;
94	}
95	}
96
97	# A character is a digit if it has the category Nd in the character
98	# attribute database. In Latin-1 and ASCII, the only such characters
99	# are 0123456789. In Unicode, there are other digit characters in
100	# other code blocks, such as Gujarati digits and Tibetan digits.
101
102	sub digit {
103	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
104	if (defined($category) && $category eq "Nd") {
105	return 1;
106	} else {
107	return 0;
108	}
109	}
110
111	# The only hex digits are 0123456789abcdefABCDEF.
112
113	sub hex_digit {
114	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
115	if (($codepoint >= 0x30 && $codepoint <= 0x39)
116	\|\| ($codepoint >= 0x41 && $codepoint <= 0x46)
117	\|\| ($codepoint >= 0x61 && $codepoint <= 0x66)) {
118	return 1;
119	} else {
120	return 0;
121	}
122	}
123
124	# The union of char-set:letter and char-set:digit.
125
126	sub letter_plus_digit {
127	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
128	if (letter($codepoint, $name, $category, $uppercase, $lowercase)
129	\|\| digit($codepoint, $name, $category, $uppercase, $lowercase)) {
130	return 1;
131	} else {
132	return 0;
133	}
134	}
135
136	# Characters that would 'use ink' when printed
137	sub graphic {
138	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
139	if ($category =~ (/L\|M\|N\|P\|S/)) {
140	return 1;
141	} else {
142	return 0;
143	}
144	}
145
146	# A whitespace character is either
147	# * a character with one of the space, line, or paragraph separator
148	# categories (Zs, Zl or Zp) of the Unicode character database.
149	# * U+0009 Horizontal tabulation (\t control-I)
150	# * U+000A Line feed (\n control-J)
151	# * U+000B Vertical tabulation (\v control-K)
152	# * U+000C Form feed (\f control-L)
153	# * U+000D Carriage return (\r control-M)
154
155	sub whitespace {
156	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
157	if ($category =~ (/Zs\|Zl\|Zp/)
158	\|\| $codepoint == 0x9
159	\|\| $codepoint == 0xA
160	\|\| $codepoint == 0xB
161	\|\| $codepoint == 0xC
162	\|\| $codepoint == 0xD) {
163	return 1;
164	} else {
165	return 0;
166	}
167	}
168
169	# A printing character is one that would occupy space when printed,
170	# i.e., a graphic character or a space character. char-set:printing is
171	# the union of char-set:whitespace and char-set:graphic.
172
173	sub printing {
174	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
175	if (whitespace($codepoint, $name, $category, $uppercase, $lowercase)
176	\|\| graphic($codepoint, $name, $category, $uppercase, $lowercase)) {
177	return 1;
178	} else {
179	return 0;
180	}
181	}
182
183	# The ISO control characters are the Unicode/Latin-1 characters in the
184	# ranges [U+0000,U+001F] and [U+007F,U+009F].
185
186	sub iso_control {
187	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
188	if (($codepoint >= 0x00 && $codepoint <= 0x1F)
189	\|\| ($codepoint >= 0x7F && $codepoint <= 0x9F)) {
190	return 1;
191	} else {
192	return 0;
193	}
194	}
195
196	# A punctuation character is any character that has one of the
197	# punctuation categories in the Unicode character database (Pc, Pd,
198	# Ps, Pe, Pi, Pf, or Po.)
199
200	# Note that srfi-14 gives conflicting requirements!! It claims that
201	# only the Unicode punctuation is necessary, but, explicitly calls out
202	# the soft hyphen character (U+00AD) as punctution. Current versions
203	# of Unicode consider U+00AD to be a formatting character, not
204	# punctuation.
205
206	sub punctuation {
207	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
208	if ($category =~ (/P/)) {
209	return 1;
210	} else {
211	return 0;
212	}
213	}
214
215	# A symbol is any character that has one of the symbol categories in
216	# the Unicode character database (Sm, Sc, Sk, or So).
217
218	sub symbol {
219	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
220	if ($category =~ (/S/)) {
221	return 1;
222	} else {
223	return 0;
224	}
225	}
226
227	# Blank chars are horizontal whitespace. A blank character is either
228	# * a character with the space separator category (Zs) in the
229	# Unicode character database.
230	# * U+0009 Horizontal tabulation (\t control-I)
231	sub blank {
232	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
233	if ($category =~ (/Zs/)
234	\|\| $codepoint == 0x9) {
235	return 1;
236	} else {
237	return 0;
238	}
239	}
240
241	# ASCII
242	sub ascii {
243	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
244	if ($codepoint <= 0x7F) {
245	return 1;
246	} else {
247	return 0;
248	}
249	}
250
251	# Empty
252	sub empty {
253	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
254	return 0;
255	}
256
719bb8cd MG	257	# Designated -- All characters except for the surrogates
719bb8cd MG	258	sub designated {
d0434ddf	259	my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
24d23822 MG	260	if ($category =~ (/Cs/)) {
	261	return 0;
	262	} else {
	263	return 1;
	264	}
d0434ddf MG	265	}
	266
	267
	268	# The procedure generates the two C structures necessary to describe a
	269	# given category.
	270	sub compute {
	271	my($f) = @_;
	272	my $start = -1;
	273	my $end = -1;
	274	my $len = 0;
	275	my @rstart = (-1);
	276	my @rend = (-1);
	277
	278	seek($in, 0, 0) or die "Can't seek to beginning of file: $!";
	279
	280	print "$f\n";
	281
	282	while (<$in>) {
	283	# Parse the 14 column, semicolon-delimited UnicodeData.txt
	284	# file
	285	chomp;
	286	my(@fields) = split(/;/);
	287
	288	# The codepoint: an integer
	289	my $codepoint = hex($fields[0]);
	290
	291	# If this is a character range, the last character in this
	292	# range
	293	my $codepoint_end = $codepoint;
	294
	295	# The name of the character
	296	my $name = $fields[1];
	297
	298	# A two-character category code, such as Ll (lower-case
	299	# letter)
	300	my $category = $fields[2];
	301
	302	# The codepoint of the uppercase version of this char
	303	my $uppercase = $fields[12];
	304
	305	# The codepoint of the lowercase version of this char
	306	my $lowercase = $fields[13];
	307
	308	my $pass = &$f($codepoint,$name,$category,$uppercase,$lowercase);
	309	if ($pass == 1) {
	310
	311	# Some pairs of lines in UnicodeData.txt delimit ranges of
	312	# characters.
	313	if ($name =~ /First/) {
	314	$line = <$in>;
	315	die $! if $!;
	316	$codepoint_end = hex( (split(/;/, $line))[0] );
	317	}
	318
	319	# Compute ranges of characters [start:end] that meet the
	320	# criteria. Store the ranges.
	321	if ($start == -1) {
	322	$start = $codepoint;
	323	$end = $codepoint_end;
	324	} elsif ($end + 1 == $codepoint) {
	325	$end = $codepoint_end;
	326	} else {
	327	$rstart[$len] = $start;
	328	$rend[$len] = $end;
329	$len++;
330	$start = $codepoint;
331	$end = $codepoint_end;
332	}
333	}
334	}
335
336	# Extra logic to ensure that the last range is included
337	if ($start != -1) {
338	if ($len > 0 && $rstart[@rstart-1] != $start) {
339	$rstart[$len] = $start;
340	$rend[$len] = $end;
341	$len++;
342	} elsif ($len == 0) {
6d30df5d MG	343	$rstart[0] = $start;
	344	$rend[0] = $end;
	345	$len++;
d0434ddf MG	346	}
	347	}
	348
	349	# Print the C struct that contains the range list.
	350	print $out "scm_t_char_range cs_" . $f . "_ranges[] = {\n";
	351	if ($rstart[0] != -1) {
	352	for (my $i=0; $i<@rstart-1; $i++) {
	353	printf $out " {0x%04x, 0x%04x},\n", $rstart[$i], $rend[$i];
	354	}
	355	printf $out " {0x%04x, 0x%04x}\n", $rstart[@rstart-1], $rend[@rstart-1];
	356	}
	357	print $out "};\n\n";
	358
	359	# Print the C struct that contains the range list length and
	360	# pointer to the range list.
	361	print $out "scm_t_char_set cs_${f} = {\n";
	362	print $out " $len,\n";
	363	print $out " cs_" . $f . "_ranges\n";
	364	print $out "};\n\n";
	365	}
	366
	367	# Write a bit of a header
	368	print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n";
	369	print $out "/* This file is #include'd by srfi-14.c. */\n\n";
24d23822	370	print $out "/* This file was generated from\n";
d0434ddf MG	371	print $out " http://unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	372	print $out " with the unidata_to_charset.pl script. */\n\n";
	373
	374	# Write the C structs for each SRFI-14 charset
	375	compute "lower_case";
	376	compute "upper_case";
	377	compute "title_case";
	378	compute "letter";
	379	compute "digit";
	380	compute "hex_digit";
	381	compute "letter_plus_digit";
	382	compute "graphic";
	383	compute "whitespace";
	384	compute "printing";
	385	compute "iso_control";
	386	compute "punctuation";
	387	compute "symbol";
	388	compute "blank";
	389	compute "ascii";
	390	compute "empty";
719bb8cd	391	compute "designated";
d0434ddf MG	392
	393	close $in;
	394	close $out;
	395
	396	exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!";
	397
	398	# And we're done.
	399
	400
	401
	402
	403
	404