nrelease - fix/improve livecd
[dragonfly.git] / tools / tools / locale / tools / utf8-rollup.pl
1 #!/usr/bin/env perl -wC
2
3 use strict;
4 #use File::Copy;
5 #use XML::Parser;
6 use Tie::IxHash;
7 #use Data::Dumper;
8 use Getopt::Long;
9 #use Digest::SHA qw(sha1_hex);
10 #require "charmaps.pm";
11
12
# Command-line handling.  Both --cldr and --etc are mandatory.  The options
# are parsed first and validated afterwards so that every spelling accepted
# by Getopt::Long works ("--cldr=DIR" as well as "--cldr DIR"), and so that
# a parse failure or a missing option ends in the usage message instead of
# the script running with undefined directories.
my $CLDRDIR = undef;
my $ETCDIR = undef;

my $result = GetOptions (
                "cldr=s"        => \$CLDRDIR,
                "etc=s"         => \$ETCDIR,
            );

if (!$result || !defined $CLDRDIR || !defined $ETCDIR) {
        print "Usage: $0 --cldr=<cldrdir> --etc=<etcdir>\n";
        exit(1);
}
25
# Table driving the output, processed in order by generate_sections.
# Each entry is a pair:
#   [0] the locale name whose "<name>.UTF-8.src" file under $CLDRDIR/posix
#       is read by compress_ctype;
#   [1] a banner listing Unicode blocks, printed between two lines of stars
#       ahead of that locale's data.  Every banner line starts with "*",
#       which the generated header declares as the comment character.
# The banner text is purely descriptive; the ranges follow the Unicode
# block table (or, for the CJK and Hangul Syllables lines, the last code
# point that was assigned when the table was written).
my @SECTIONS = (
        ["en_US",       "* 0x0000 - 0x007F Basic Latin\n" .
                        "* 0x0080 - 0x00FF Latin-1 Supplement\n" .
                        "* 0x0100 - 0x017F Latin Extended-A\n" .
                        "* 0x0180 - 0x024F Latin Extended-B\n" .
                        "* 0x0250 - 0x02AF IPA Extensions\n" .
                        "* 0x1D00 - 0x1D7F Phonetic Extensions\n" .
                        "* 0x1D80 - 0x1DBF Phonetic Extensions Supplement\n" .
                        "* 0x1E00 - 0x1EFF Latin Extended Additional\n" .
                        "* 0x2150 - 0x218F Number Forms (partial - Roman Numerals)\n".
                        "* 0x2C60 - 0x2C7F Latin Extended-C\n" .
                        "* 0xA720 - 0xA7FF Latin Extended-D\n" .
                        "* 0xAB30 - 0xAB6F Latin Extended-E\n" .
                        "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n".
                        "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
        ["el_GR",       "* 0x0370 - 0x03FF Greek (No Coptic!)\n" .
                        "* 0x1F00 - 0x1FFF Greek Extended\n"],
        ["ru_RU",       "* 0x0400 - 0x04FF Cyrillic\n" .
                        "* 0x0500 - 0x052F Cyrillic Supplementary\n" .
                        "* 0x2DE0 - 0x2DFF Cyrillic Extended-A\n" .
                        "* 0xA640 - 0xA69F Cyrillic Extended-B\n"],
        ["hy_AM",       "* 0x0530 - 0x058F Armenian\n" .
                        "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n"],
        ["he_IL",       "* 0x0590 - 0x05FF Hebrew\n" .
                        "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n"],
        ["ar_SA",       "* 0x0600 - 0x06FF Arabic\n" .
                        "* 0x0750 - 0x077F Arabic Supplement\n" .
                        "* 0x08A0 - 0x08FF Arabic Extended-A\n" .
                        "* 0xFB50 - 0xFDFF Arabic Presentation Forms-A (partial)\n" .
                        "* 0xFE70 - 0xFEFF Arabic Presentation Forms-B (partial)\n"],
        ["hi_IN",       "* 0x0900 - 0x097F Devanagari\n" .
                        "* 0xA8E0 - 0xA8FF Devanagari Extended\n"],
        ["bn_IN",       "* 0x0980 - 0x09FF Bengali\n"],
        ["pa_Guru_IN",  "* 0x0A00 - 0x0A7F Gurmukhi\n"],
        ["gu_IN",       "* 0x0A80 - 0x0AFF Gujarati\n"],
        ["or_IN",       "* 0x0B00 - 0x0B7F Oriya\n"],
        ["ta_IN",       "* 0x0B80 - 0x0BFF Tamil\n"],
        ["te_IN",       "* 0x0C00 - 0x0C7F Telugu\n"],
        ["kn_IN",       "* 0x0C80 - 0x0CFF Kannada\n"],
        ["ml_IN",       "* 0x0D00 - 0x0D7F Malayalam\n"],
        ["si_LK",       "* 0x0D80 - 0x0DFF Sinhala\n"],
        ["th_TH",       "* 0x0E00 - 0x0E7F Thai\n"],
        ["lo_LA",       "* 0x0E80 - 0x0EFF Lao\n"],
        ["bo_IN",       "* 0x0F00 - 0x0FFF Tibetan\n"],
        ["my_MM",       "* 0x1000 - 0x109F Myanmar\n" .
                        "* 0xA9E0 - 0xA9FF Myanmar Extended-B\n" .
                        "* 0xAA60 - 0xAA7F Myanmar Extended-A\n"],
        ["ka_GE",       "* 0x10A0 - 0x10FF Georgian\n" .
                        "* 0x2D00 - 0x2D2F Georgian Supplement\n"],
        ["ja_JP",       "* 0x1100 - 0x11FF Hangul Jamo\n" .
                        "* 0x3000 - 0x303F CJK Symbols and Punctuation (partial)\n" .
                        "* 0x3040 - 0x309F Hiragana\n" .
                        "* 0x30A0 - 0x30FF Katakana\n" .
                        "* 0x31F0 - 0x31FF Katakana Phonetic Extensions\n" .
                        "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
                        "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
                        "* 0x3300 - 0x33FF CJK Compatibility\n" .
                        "* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension-A (added)\n" .
                        "* 0x4E00 - 0x9FCC CJK Unified Ideographs (overridden)\n" .
                        "* 0xAC00 - 0xD7A3 Hangul Syllables (partial)\n" .
                        "* 0xD7B0 - 0xD7FF Hangul Jamo Extended-B\n" .
                        "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n" .
                        "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
        ["am_ET",       "* 0x1200 - 0x137F Ethiopic\n" .
                        "* 0x1380 - 0x139F Ethiopic Supplement\n" .
                        "* 0x2D80 - 0x2DDF Ethiopic Extended\n" .
                        "* 0xAB00 - 0xAB2F Ethiopic Extended-A\n"],
        ["chr_US",      "* 0x13A0 - 0x13FF Cherokee\n"],
        ["km_KH",       "* 0x1780 - 0x17FF Khmer\n" .
                        "* 0x19E0 - 0x19FF Khmer Symbols\n"],
        ["shi_Tfng_MA", "* 0x2D30 - 0x2D7F Tifinagh\n"],
        ["ii_CN",       "* 0xA000 - 0xA48F Yi Syllables\n" .
                        "* 0xA490 - 0xA4CF Yi Radicals\n"],
        ["vai_Vaii_LR", "* 0xA500 - 0xA63F Vai\n"],
        ["ko_KR",       "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
                        "* 0xA960 - 0xA97F Hangul Jamo Extended-A\n" .
                        "* 0xAC00 - 0xD7A3 Hangul Syllables (partial)\n" .
                        "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
);
105
#       ["zh_Hans_CN",  "* 0x2E80 - 0x2EFF CJK Radicals Supplement\n" .
#                       "* 0x2F00 - 0x2FDF Kangxi Radicals\n" .
#                       "* 0x3000 - 0x303F CJK Symbols and Punctuation (partial)\n" .
#                       "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
#                       "* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension A\n" .
#                       "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n"],
112
# Shared state used by the subroutines below.
#   %seen / %pending_seen  IDs already emitted; pending entries are folded
#                          into %seen by merge_seen
#   %utf8map               character name => code string, filled by
#                          get_utf8map
#   %utf8aliases           names sharing the code of the preceding entry
my (%seen, %pending_seen);
my (%utf8map, %utf8aliases);

# Output file, hand-maintained input appended to it, and the separator
# line printed around each section banner.
my $outfilename = $ETCDIR . "/common.UTF-8.src";
my $manual_file = $ETCDIR . "/manual-input.UTF-8";
my $stars = "**********************************************************************\n";

# Main sequence: load the charmap, then write header, sections and footer.
get_utf8map ($CLDRDIR . "/posix/UTF-8.cm");
generate_header ();
generate_sections ();
generate_footer ();
125
126 ############################
127
# get_utf8map(FILE)
#
# Reads the charmap FILE and fills two file-level hashes from the lines
# found between "CHARMAP" and "END CHARMAP":
#   %utf8map      name (underscores replaced by spaces) => the rest of the
#                 line with every "\x" removed
#   %utf8aliases  name => previous name, recorded when an entry has the
#                 same value as the entry immediately before it
# Comment lines (leading "#") and empty lines are ignored.  Dies if FILE
# cannot be opened, since nothing useful can be generated without it.
sub get_utf8map {
        my $file = shift;

        open(my $fin, "<", $file)
                or die ("can't read $file: $!\n");
        my @lines = <$fin>;
        close($fin);
        chomp(@lines);

        my $prev_k = undef;
        my $prev_v = "";
        my $incharmap = 0;
        foreach my $line (@lines) {
                $line =~ s/\r//;
                next if ($line =~ /^\#/);
                next if ($line eq "");

                if ($line eq "CHARMAP") {
                        $incharmap = 1;
                        next;
                }

                next if (!$incharmap);
                last if ($line eq "END CHARMAP");

                # Only use the captures when the line really has the
                # "<NAME> value" shape; after a failed match $1 and $2
                # do not describe this line.
                next unless ($line =~ /^<([^\s]+)>\s+(.*)/);
                my ($k, $v) = ($1, $2);
                $k =~ s/_/ /g;          # unicode char string
                $v =~ s/\\x//g;         # UTF-8 char code
                $utf8map{$k} = $v;

                $utf8aliases{$k} = $prev_k if ($prev_v eq $v);

                $prev_v = $v;
                $prev_k = $k;
        }
}
165
# generate_header()
#
# Opens $outfilename for writing on the shared FOUT handle (dying if that
# fails) and writes the fixed preamble: the "do not edit" notice, the
# comment_char and escape_char declarations, and the opening LC_CTYPE line.
sub generate_header {
        my @preamble = (
                '# Warning: Do not edit. This file is automatically generated from the',
                '# tools in /usr/src/tools/tools/locale. The data is obtained from the',
                '# CLDR project, obtained from http://cldr.unicode.org/',
                '# -----------------------------------------------------------------------------',
                '',
                'comment_char *',
                'escape_char /',
                '',
                'LC_CTYPE',
        );

        open(FOUT, ">", "$outfilename")
                or die ("can't write to $outfilename\n");

        # Every preamble line, including the last, ends with a newline.
        print FOUT join("\n", @preamble), "\n";
}
181
# generate_footer()
#
# Writes the closing "END LC_CTYPE" line and closes the shared FOUT handle.
# The result of close is checked because buffered write errors (a full
# disk, for instance) are only reported at that point; ignoring it could
# leave a truncated output file unnoticed.
sub generate_footer {
        print FOUT "\nEND LC_CTYPE\n";
        close (FOUT)
                or die ("can't finish writing $outfilename: $!\n");
}
186
# already_seen(ID)
#
# Returns 1 when ID is recorded in %seen.  Otherwise notes ID in
# %pending_seen (it only becomes "seen" once merge_seen runs) and
# returns 0.
sub already_seen {
        my ($id) = @_;

        return 1 if (defined $seen{$id});

        $pending_seen{$id} = 1;
        return 0;
}
195
# already_seen_RO(ID)
#
# Read-only variant of already_seen: returns 1 when ID is recorded in
# %seen and 0 otherwise, without touching %pending_seen.
sub already_seen_RO {
        my ($id) = @_;

        return (defined $seen{$id}) ? 1 : 0;
}
203
# merge_seen()
#
# Promotes every ID collected in %pending_seen to %seen, then empties
# %pending_seen.
sub merge_seen {
        my @pending = keys %pending_seen;

        # Hash-slice assignment: one "1" per pending key.
        @seen{@pending} = (1) x scalar(@pending);
        %pending_seen = ();
}
210
# initialize_lines(TERRITORY)
#
# Returns the lines to be processed ahead of the territory's own source
# file.  Only "ja_JP" gets any: for each of the classes "graph" and
# "alpha", every CJK_UNIFIED_IDEOGRAPH name from 3400 through 4DB5 and
# from 4E00 through 9FCC, one name per line, followed by a single
# "merge\tnow" line.  All other territories get an empty list.
sub initialize_lines {
        my ($terr) = @_;
        my @result = ();

        return @result if ($terr ne "ja_JP");

        # [first, last] code point of each ideograph run
        my @runs = ([0x3400, 0x4DB5], [0x4E00, 0x9FCC]);

        foreach my $class ("graph", "alpha") {
                foreach my $run (@runs) {
                        my ($first, $last) = @$run;

                        # The first line carries the class name; every
                        # line but the last ends in the ";/" continuation.
                        push @result, sprintf("%s\t<CJK_UNIFIED_IDEOGRAPH-%X>;/\n",
                                $class, $first);
                        foreach my $code ($first + 1 .. $last - 1) {
                                push @result,
                                    sprintf("\t<CJK_UNIFIED_IDEOGRAPH-%X>;/\n", $code);
                        }
                        push @result, sprintf("\t<CJK_UNIFIED_IDEOGRAPH-%X>\n", $last);
                }
        }
        push @result, "merge\tnow\n";

        return @result;
}
238
# _emit_ctype_run(FIRST_NAME, FIRST_ID, LAST_NAME, LAST_ID, ENDING)
#
# Prints one run of consecutive entries to FOUT.  A single entry is printed
# alone, two entries are printed on separate lines, and a longer run is
# printed as "<first>;...;<last>".  ENDING closes the last printed name:
# ">;/\n" when more entries follow, ">\n" when the class ends here.
sub _emit_ctype_run {
        my ($first_name, $first_ID, $last_name, $last_ID, $ending) = @_;

        if ($last_ID == $first_ID) {
                print FOUT "\t<" . $last_name . $ending;
        } elsif ($last_ID == $first_ID + 1) {
                print FOUT "\t<" . $first_name . ">;/\n";
                print FOUT "\t<" . $last_name . $ending;
        } else {
                print FOUT "\t<" . $first_name .
                       ">;...;<" . $last_name . $ending;
        }
}

# compress_ctype(TERRITORY)
#
# Copies the LC_CTYPE body of "$CLDRDIR/posix/TERRITORY.UTF-8.src" to FOUT
# in compressed form, preceded by whatever initialize_lines returns for the
# territory.
#   - A line starting with a lowercase word and a tab selects the current
#     category.  "merge" only triggers merge_seen; "print" lines are
#     dropped entirely.
#   - toupper/tolower lines are copied unless the first name on the line is
#     already in %seen.
#   - For every other category each <NAME> is looked up in %utf8map, names
#     already seen are skipped, and runs whose IDs increase by exactly one
#     are collapsed by _emit_ctype_run.
#   - The category name is printed lazily, just before the first line that
#     is actually emitted for it, so fully-skipped categories leave no
#     trace in the output.
# If the source file is missing or cannot be opened, a message goes to
# STDERR and nothing is written for the territory.
sub compress_ctype {
        my $territory = shift;
        my $term;
        my $active = 0;         # a run (lock..prev) is waiting to be printed
        my $cat_loaded = 0;     # category name still has to be printed
        my $lock_ID;            # first entry of the pending run
        my $prev_ID;            # last entry of the pending run
        my $curr_ID;
        my $lock_name;
        my $prev_name;
        my $curr_name;
        my $key_name;
        my $category = '';

        my @lines = initialize_lines ($territory);

        my $filename = "$CLDRDIR/posix/$territory.UTF-8.src";
        if (! -f $filename) {
                print STDERR "Cannot open $filename\n";
                return;
        }
        # The -f test above does not guarantee readability, so the open
        # itself is checked as well.
        my $fin;
        if (!open($fin, "<", $filename)) {
                print STDERR "Cannot open $filename: $!\n";
                return;
        }
        print "Reading from $filename\n";
        while (<$fin>) {
                if (/^LC_CTYPE/../^END LC_CTYPE/) {
                        if ($_ ne "LC_CTYPE\n" && $_ ne "END LC_CTYPE\n" &&
                                $_ ne "*************\n" && $_ ne "\n") {
                                push @lines, $_;
                        }
                }
        }
        close($fin);
        foreach my $line (@lines) {
                if ($line =~ m/^([a-z]{3,})\t/) {
                        $category = $1;
                        if ($category eq 'merge') {
                                merge_seen ();
                                next;
                        }
                        if ($category ne 'print') {
                                $cat_loaded = 1;
                        }
                }
                next if ($category eq 'print');
                if ($category eq 'toupper' || $category eq 'tolower') {
                        if ($line =~ m/<([-_A-Za-z0-9]+)>,/) {
                                $key_name = $1;
                                $key_name =~ s/_/ /g;
                                if (already_seen_RO (hex($utf8map{$key_name}))) {
                                        next;
                                }
                                if ($cat_loaded) { print FOUT $category; }
                                $cat_loaded = 0;
                                $line =~ s/^[a-z]{3,}\t/\t/;
                                print FOUT $line;
                        }
                        next;
                }
                if ($line =~ m/<([-_A-Za-z0-9]+)>(;.|)$/) {
                        # No ";x" continuation after the name means this is
                        # the last entry of the category.
                        $term = ($2 eq '') ? 1 : 0;
                        $curr_name = $1;
                        $key_name = $1;
                        $key_name =~ s/_/ /g;
                        $curr_ID = hex($utf8map{$key_name});
                        if (already_seen ($curr_ID)) {
                                # A skipped final entry must still close the
                                # pending run; otherwise the run would stay
                                # active and be printed under the next
                                # category.
                                if ($term && $active) {
                                        if ($cat_loaded) { print FOUT $category; }
                                        $cat_loaded = 0;
                                        _emit_ctype_run ($lock_name, $lock_ID,
                                                $prev_name, $prev_ID, ">\n");
                                        $active = 0;
                                }
                                next;
                        }
                        if ($active) {
                                if ($curr_ID == $prev_ID + 1) {
                                        # extends the pending run
                                        $prev_ID = $curr_ID;
                                        $prev_name = $curr_name;
                                } else {
                                        # gap: print the pending run and
                                        # start a new one at this entry
                                        if ($cat_loaded) { print FOUT $category; }
                                        $cat_loaded = 0;
                                        _emit_ctype_run ($lock_name, $lock_ID,
                                                $prev_name, $prev_ID, ">;/\n");
                                        $lock_ID = $curr_ID;
                                        $prev_ID = $curr_ID;
                                        $lock_name = $curr_name;
                                        $prev_name = $curr_name;
                                }
                        } else {
                                $active = 1;
                                $lock_ID = $curr_ID;
                                $prev_ID = $curr_ID;
                                $lock_name = $curr_name;
                                $prev_name = $curr_name;
                        }
                        if ($term) {
                                if ($cat_loaded) { print FOUT $category; }
                                $cat_loaded = 0;
                                _emit_ctype_run ($lock_name, $lock_ID,
                                        $curr_name, $curr_ID, ">\n");
                                $active = 0;
                        }
                } else {
                        # anything that is not a <NAME> entry is copied as is
                        print FOUT $line;
                }
        }
}
353
# generate_sections()
#
# Writes the body of the output file.  For every @SECTIONS entry it prints
# a blank line, the banner framed by two $stars lines, and the compressed
# data for that territory, then calls merge_seen.  Finally the contents of
# $manual_file are appended verbatim.  If the manual file cannot be
# opened, a message goes to STDERR and nothing is appended.
sub generate_sections {
        foreach my $section (@SECTIONS) {
                # element access rather than the one-element slices
                # @$section[0] / @$section[1]
                my ($territory, $banner) = @$section;

                print FOUT "\n";
                print FOUT $stars;
                print FOUT $banner;
                print FOUT $stars;
                compress_ctype ($territory);
                merge_seen ();
        }

        my $fin;
        if (!open($fin, "<", $manual_file)) {
                print STDERR "Cannot open $manual_file: $!\n";
                return;
        }
        print "Reading from $manual_file\n";
        while (my $line = <$fin>) {
                print FOUT $line;
        }
        close($fin);
}