Imported Upstream version 0.66.1
[hcoop/debian/courier-authlib.git] / libs / unicode / mklinebreak.pl
CommitLineData
b0322a85
CE
1#! /usr/bin/perl
2#
3# Compile LineBreak.txt into C array declarations.
4#
5# The array's structure is [firstchar, lastchar, class], giving the
6# linebreaking "class" for unicode character range firstchar-lastchar.
7#
8# The ranges are sorted in numerical order.
9#
10# An array gets generated for each block of 4096 unicode characters.
11#
12# Finally, two arrays get declared: a pointer to an array for each 4096
13# unicode character block, and the number of elements in the array.
14#
15# The pointer is NULL for each block of 4096 unicode characters that is not
16# defined in LineBreak.txt
17#
18# By definition, a unicode character that is not listed in the array is
19# class XX.
20
21use strict;
22use warnings;
23use mkcommon;
24
# General_Category of every code point listed in UnicodeData.txt, keyed by
# the numeric code point.  It is consulted further down to resolve the SA
# line breaking class.
my %general_category;

{
    my $unicode_data = "UnicodeData.txt";

    open(my $uc, "<", $unicode_data)
        or die "Cannot open $unicode_data: $!\n";

    while (defined(my $line = <$uc>))
    {
        chomp $line;

        # Fields are semicolon-separated: field 0 is the code point in hex,
        # field 2 is the General_Category.
        my @f = split(/;/, $line);

        # Skip blank or malformed lines instead of recording a bogus entry.
        next unless @f >= 3 && $f[0] =~ /^[0-9A-Fa-f]+$/;

        # hex() gives the same number the former string eval of "0x...."
        # produced, without evaluating file content as Perl code.
        $general_category{hex($f[0])} = $f[2];
    }

    close($uc);
}
41
# Collector for the [firstchar, lastchar, class] ranges.  mkcommon is a
# project-local module; presumably range() records one range and output()
# emits the C array declarations described at the top of this file --
# confirm against mkcommon.pm.
my $obj = mkcommon->new;

{
    my $linebreak_data = "LineBreak.txt";

    open(my $lb, "<", $linebreak_data)
        or die "Cannot open $linebreak_data: $!\n";

    while (defined(my $line = <$lb>))
    {
        chomp $line;

        # Data lines look like "XXXX;CC" or "XXXX..YYYY;CC"; anything else
        # (comments, blank lines) is ignored.
        # NOTE(review): the class capture is exactly two characters wide, so
        # a longer class name in newer data would be truncated -- confirm
        # against the LineBreak.txt version in use.
        next unless $line =~ /^([0-9A-F]+)(\.\.([0-9A-F]+))?\;([^\s][^\s])\s*/;

        my $first = hex($1);

        # A line without ".." describes a single code point.
        my $last  = defined($3) ? hex($3) : $first;
        my $class = $4;

        # XX is the class of every character not listed in the array, so
        # there is nothing to record for it.
        next if $class eq "XX";

        if ($class eq "SA")
        {
            # LB1 rule: SA resolves to CM for general categories Mn and Mc,
            # and to AL otherwise.  This is decided per code point, so the
            # range is recorded one character at a time.
            for my $cp ($first .. $last)
            {
                die "Cannot find general_category for $cp\n"
                    unless exists $general_category{$cp};

                my $gc = $general_category{$cp};

                $obj->range($cp, $cp,
                            $gc eq "Mn" || $gc eq "Mc"
                            ? "UNICODE_LB_CM" : "UNICODE_LB_AL");
            }
        }
        else
        {
            # LB1 rule: AI and SG resolve to AL.
            $class = "AL" if $class eq "AI" || $class eq "SG";

            $obj->range($first, $last, "UNICODE_LB_$class");
        }
    }

    close($lb);
}

$obj->output;