1 #!/usr/local/bin/perl -wC
9 #use Digest::SHA qw(sha1_hex);
10 #require "charmaps.pm";
14 print "Usage: $0 --cldr=<cldrdir> --etc=<etcdir>\n";
21 my $result = GetOptions (
22 "cldr=s" => \$CLDRDIR,
# Each entry is [territory, banner]. generate_sections prints the banner
# text to the output file and hands the territory to compress_ctype, which
# reads "$CLDRDIR/posix/<territory>.UTF-8.src".
27 ["en_US", "* 0x0000 - 0x007F Basic Latin\n" .
28 "* 0x0080 - 0x00FF Latin-1 Supplement\n" .
29 "* 0x0100 - 0x017F Latin Extended-A\n" .
30 "* 0x0180 - 0x024F Latin Extended-B\n" .
31 "* 0x0250 - 0x02AF IPA Extensions\n" .
32 "* 0x1D00 - 0x1D7F Phonetic Extensions\n" .
33 "* 0x1D80 - 0x1DBF Phonetic Extensions Supplement\n" .
34 "* 0x1E00 - 0x1EFF Latin Extended Additional\n" .
35 "* 0x2150 - 0x218F Number Forms (partial - Roman Numerals)\n".
36 "* 0x2C60 - 0x2C7F Latin Extended-C\n" .
37 "* 0xA720 - 0xA7FF Latin Extended-D\n" .
38 "* 0xAB30 - 0xAB6F Latin Extended-E\n" .
39 "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n".
40 "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
41 ["el_GR", "* 0x0370 - 0x03FF Greek (No Coptic!)\n" .
42 "* 0x1F00 - 0x1FFF Greek Extended\n"],
43 ["ru_RU", "* 0x0400 - 0x04FF Cyrillic\n" .
44 "* 0x0500 - 0x052F Cyrillic Supplementary\n" .
45 "* 0x2DE0 - 0x2DFF Cyrillic Extended-A\n" .
46 "* 0xA640 - 0xA69F Cyrillic Extended-B\n"],
47 ["hy_AM", "* 0x0530 - 0x058F Armenian\n" .
48 "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n"],
49 ["he_IL", "* 0x0590 - 0x05FF Hebrew\n" .
50 "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n"],
51 ["ar_SA", "* 0x0600 - 0x06FF Arabic\n" .
52 "* 0x0750 - 0x077F Arabic Supplement\n" .
53 "* 0x08A0 - 0x08FF Arabic Extended-A\n" .
54 "* 0xFB50 - 0xFDFF Arabic Presentation Forms (partial)\n" .
55 "* 0xFE70 - 0xFEFF Arabic Presentation Forms-B (partial)\n"],
56 ["hi_IN", "* 0x0900 - 0x097F Devanagari\n" .
57 "* 0xA8E0 - 0xA8FF Devanagari Extended\n"],
58 ["bn_IN", "* 0x0980 - 0x09FF Bengali\n"],
59 ["pa_Guru_IN", "* 0x0A00 - 0x0A7F Gurmukhi\n"],
60 ["gu_IN", "* 0x0A80 - 0x0AFF Gujarati\n"],
61 ["or_IN", "* 0x0B00 - 0x0B7F Oriya\n"],
62 ["ta_IN", "* 0x0B80 - 0x0BFF Tamil\n"],
63 ["te_IN", "* 0x0C00 - 0x0C7F Telugu\n"],
64 ["kn_IN", "* 0x0C80 - 0x0CFF Kannada\n"],
65 ["ml_IN", "* 0x0D00 - 0x0D7F Malayalam\n"],
66 ["si_LK", "* 0x0D80 - 0x0DFF Sinhala\n"],
67 ["th_TH", "* 0x0E00 - 0x0E7F Thai\n"],
68 ["lo_LA", "* 0x0E80 - 0x0EFF Lao\n"],
69 ["bo_IN", "* 0x0F00 - 0x0FFF Tibetan\n"],
70 ["my_MM", "* 0x1000 - 0x109F Myanmar\n" .
71 "* 0xA9E0 - 0xA9FF Myanmar Extended-B\n" .
72 "* 0xAA60 - 0xAA7F Myanmar Extended-A\n"],
73 ["ka_GE", "* 0x10A0 - 0x10FF Georgian\n" .
74 "* 0x2D00 - 0x2D2F Georgian Supplement\n"],
75 ["ja_JP", "* 0x1100 - 0x11FF Hangul Jamo\n" .
76 "* 0x3000 - 0x303F CJK Symbols and Punctuation (partial)\n" .
77 "* 0x3040 - 0x309F Hiragana\n" .
78 "* 0x30A0 - 0x30FF Katakana\n" .
79 "* 0x31F0 - 0x31FF Katakana Phonetic Extensions\n" .
80 "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
81 "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
82 "* 0x3300 - 0x33FF CJK Compatibility\n" .
83 "* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension-A (added)\n" .
84 "* 0x4E00 - 0x9FCC CJK Unified Ideographs (overridden)\n" .
85 "* 0xAC00 - 0xD7A3 Hangul Syllables (partial)\n" .
86 "* 0xD7B0 - 0xD7FF Hangul Jamo Extended-B\n" .
87 "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n" .
88 "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
89 ["am_ET", "* 0x1200 - 0x137F Ethiopic\n" .
90 "* 0x1380 - 0x139F Ethiopic Supplement\n" .
91 "* 0x2D80 - 0x2DDF Ethiopic Extended\n" .
92 "* 0xAB00 - 0xAB2F Ethiopic Extended-A\n"],
93 ["chr_US", "* 0x13A0 - 0x13FF Cherokee\n"],
94 ["km_KH", "* 0x1780 - 0x17FF Khmer\n" .
95 "* 0x19E0 - 0x19FF Khmer Symbols\n"],
96 ["shi_Tfng_MA", "* 0x2D30 - 0x2D7F Tifinagh\n"],
97 ["ii_CN", "* 0xA000 - 0xA48F Yi Syllables\n" .
98 "* 0xA490 - 0xA4CF Yi Radicals\n"],
99 ["vai_Vaii_LR", "* 0xA500 - 0xA63F Vai\n"],
100 ["ko_KR", "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
101 "* 0xA960 - 0xA97F Hangul Jamo Extended-A\n" .
102 "* 0xAC00 - 0xD7A3 Hangul Syllables (partial)\n" .
103 "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
106 # ["zh_Hans_CN", "* 0x2E80 - 0x2EFF CJK Radicals Supplement\n" .
107 # "* 0x2F00 - 0x2FDF Kangxi Radicals\n" .
108 # "* 0x3000 - 0x303F CJK Symbols and Punctuation (partial)\n" .
109 # "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
110 # "* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension A\n" .
111 # "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n"],
114 my %pending_seen = ();
116 my %utf8aliases = ();
117 my $outfilename = "$ETCDIR/common.UTF-8.src";
118 my $manual_file = "$ETCDIR/manual-input.UTF-8";
119 my $stars = "**********************************************************************\n";
121 get_utf8map("$CLDRDIR/posix/UTF-8.cm");
123 generate_sections ();
126 ############################
139 foreach my $l (@lines) {
141 next if ($l =~ /^\#/);
144 if ($l eq "CHARMAP") {
149 next if (!$incharmap);
150 last if ($l eq "END CHARMAP");
152 $l =~ /^<([^\s]+)>\s+(.*)/;
155 $k =~ s/_/ /g; # unicode char string
156 $v =~ s/\\x//g; # UTF-8 char code
159 $utf8aliases{$k} = $prev_k if ($prev_v eq $v);
166 sub generate_header {
167 open(FOUT, ">", "$outfilename")
168 or die ("can't write to $outfilename\n");
170 # Warning: Do not edit. This file is automatically generated from the
171 # tools in /usr/src/tools/tools/locale. The data is obtained from the
172 # CLDR project, obtained from http://cldr.unicode.org/
173 # -----------------------------------------------------------------------------
182 sub generate_footer {
183 print FOUT "\nEND LC_CTYPE\n";
189 if (defined $seen{$ucode}) {
192 $pending_seen{$ucode} = 1;
196 sub already_seen_RO {
198 if (defined $seen{$ucode}) {
205 foreach my $sn (keys %pending_seen) {
211 sub initialize_lines {
216 my @types = ("graph", "alpha");
217 if ($terr eq "ja_JP") {
218 foreach my $T (@types) {
219 push @result, "$T\t<CJK_UNIFIED_IDEOGRAPH-3400>;/\n";
220 for ($n = hex("3401"); $n <= hex("4DB4"); $n++) {
221 $back2hex=sprintf("%X", $n);
222 push @result, "\t<CJK_UNIFIED_IDEOGRAPH-" .
225 push @result, "\t<CJK_UNIFIED_IDEOGRAPH-4DB5>\n";
226 push @result, "$T\t<CJK_UNIFIED_IDEOGRAPH-4E00>;/\n";
227 for ($n = hex("4E01"); $n <= hex("9FCB"); $n++) {
228 $back2hex=sprintf("%X", $n);
229 push @result, "\t<CJK_UNIFIED_IDEOGRAPH-" .
232 push @result, "\t<CJK_UNIFIED_IDEOGRAPH-9FCC>\n";
234 push @result, "merge\tnow\n";
240 my $territory = shift;
253 my @lines = initialize_lines ($territory);
255 my $filename = "$CLDRDIR/posix/$territory.UTF-8.src";
256 if (! -f $filename) {
257 print STDERR "Cannot open $filename\n";
260 open(FIN, "$filename");
261 print "Reading from $filename\n";
263 if (/^LC_CTYPE/../^END LC_CTYPE/) {
264 if ($_ ne "LC_CTYPE\n" && $_ ne "END LC_CTYPE\n" &&
265 $_ ne "*************\n" && $_ ne "\n") {
271 foreach my $line (@lines) {
272 if ($line =~ m/^([a-z]{3,})\t/) {
274 if ($category eq 'merge') {
278 if ($category ne 'print') {
282 next if ($category eq 'print');
283 if ($category eq 'toupper' || $category eq 'tolower') {
284 if ($line =~ m/<([-_A-Za-z0-9]+)>,/) {
286 $key_name =~ s/_/ /g;
287 if (already_seen_RO (hex($utf8map{$key_name}))) {
290 if ($cat_loaded) { print FOUT $category; }
292 $line =~ s/^[a-z]{3,}\t/\t/;
297 if ($line =~ m/<([-_A-Za-z0-9]+)>(;.|)$/) {
298 $term = ($2 eq '') ? 1 : 0;
301 $key_name =~ s/_/ /g;
302 $curr_ID = hex($utf8map{$key_name});
303 if (already_seen ($curr_ID)) {
307 if ($curr_ID == $prev_ID + 1) {
309 $prev_name = $curr_name;
311 if ($cat_loaded) { print FOUT $category; }
313 if ($prev_ID == $lock_ID) {
314 print FOUT "\t<" . $prev_name . ">;/\n";
315 } elsif ($prev_ID - 1 == $lock_ID) {
316 print FOUT "\t<" . $lock_name . ">;/\n";
317 print FOUT "\t<" . $prev_name . ">;/\n";
319 print FOUT "\t<" . $lock_name .
320 ">;...;<" . $prev_name . ">;/\n";
324 $lock_name = $curr_name;
325 $prev_name = $curr_name;
331 $lock_name = $curr_name;
332 $prev_name = $curr_name;
335 if ($cat_loaded) { print FOUT $category; }
337 if ($curr_ID == $lock_ID) {
338 print FOUT "\t<" . $curr_name . ">\n";
339 } elsif ($curr_ID == $lock_ID + 1) {
340 print FOUT "\t<" . $lock_name . ">;/\n";
341 print FOUT "\t<" . $curr_name . ">\n";
343 print FOUT "\t<" . $lock_name .
344 ">;...;<" . $curr_name . ">\n";
354 sub generate_sections {
355 foreach my $section (@SECTIONS ) {
358 print FOUT @$section[1];
360 compress_ctype (@$section[0]);
364 open(FIN, "$manual_file");
365 print "Reading from $manual_file\n";
370 foreach my $line (@lines) {