cldr2def tool: Fix alternative month generation
[dragonfly.git] / tools / tools / locale / tools / cldr2def.pl
CommitLineData
252345eb
JM
1#!/usr/bin/perl -wC
2
3use strict;
4use XML::Parser;
5use Tie::IxHash;
6use Data::Dumper;
7use Getopt::Long;
8use Digest::SHA qw(sha1_hex);
9require "charmaps.pm";
10
11
# Command line synopsis, printed whenever the arguments are unusable.
my $USAGE = "Usage: $0 --cldr=<cldrdir> --unidata=<unidatadir> --etc=<etcdir> --type=<type> [--lc=<la_CC>]\n";

# Cheap early rejection before any option is parsed.
if ($#ARGV < 2) {
	print $USAGE;
	exit(1);
}

# Encoding of the CLDR source files every other encoding is derived from.
my $DEFENCODING = "UTF-8";
# (language, family, country) taken from --lc; empty means "all locales".
my @filter = ();

my $CLDRDIR = undef;
my $UNIDATADIR = undef;
my $ETCDIR = undef;
my $TYPE = undef;
my $doonly = undef;

my $result = GetOptions (
	"cldr=s"	=> \$CLDRDIR,
	"unidata=s"	=> \$UNIDATADIR,
	"etc=s"		=> \$ETCDIR,
	"type=s"	=> \$TYPE,
	"lc=s"		=> \$doonly
    );

# The three directories and the type are interpolated into paths and
# patterns further down; stop here instead of failing later on undef.
if (!$result || !defined $CLDRDIR || !defined $UNIDATADIR
    || !defined $ETCDIR || !defined $TYPE) {
	print $USAGE;
	exit(1);
}
33
# Tables shared by the subs below.  They are declared before the loader
# calls because the loaders fill them as a side effect.
my (%convertors, %ucd, %values, %hashtable);
my (%languages, %translations, %encodings, %alternativemonths);

# Fills %languages, %translations, %alternativemonths and %encodings.
get_languages();

my (%utf8map, %utf8aliases);
get_unidata($UNIDATADIR);
get_utf8map("$CLDRDIR/posix/$DEFENCODING.cm");
# Walks the encodings collected by get_languages().
get_encodings("$ETCDIR/charmaps");

# Both hashes are tied to Tie::IxHash so that keys() returns entries in
# insertion order.
my %keys;
tie(%keys, "Tie::IxHash");
tie(%hashtable, "Tie::IxHash");
54
# Value written as FILESNAME into the generated Makefile, per --type.
my %FILESNAMES = (
	numericdef	=> "LC_NUMERIC",
	monetdef	=> "LC_MONETARY",
	msgdef		=> "LC_MESSAGES",
	timedef		=> "LC_TIME",
);
61
# Dispatch table for the "<name<source<format" field descriptors handled
# in print_fields().  The "data" slot carries per-call context (country,
# key, language, encoding) while a callback runs and is cleared afterwards.
my %callback = (
	data	=> undef,
	altmon	=> \&callback_altmon,
	cformat	=> \&callback_cformat,
	mdorder	=> \&callback_mdorder,
);
68
# Heading emitted as a "# ..." comment above each field in the generated
# source files.  print_fields() dies on any key that is missing here.
my %DESC = (

	# numericdef
	decimal_point		=> "decimal_point",
	thousands_sep		=> "thousands_sep",
	grouping		=> "grouping",

	# monetdef
	int_curr_symbol		=>
	    "int_curr_symbol (last character always SPACE)",
	currency_symbol		=> "currency_symbol",
	mon_decimal_point	=> "mon_decimal_point",
	mon_thousands_sep	=> "mon_thousands_sep",
	mon_grouping		=> "mon_grouping",
	positive_sign		=> "positive_sign",
	negative_sign		=> "negative_sign",
	int_frac_digits		=> "int_frac_digits",
	frac_digits		=> "frac_digits",
	p_cs_precedes		=> "p_cs_precedes",
	p_sep_by_space		=> "p_sep_by_space",
	n_cs_precedes		=> "n_cs_precedes",
	n_sep_by_space		=> "n_sep_by_space",
	p_sign_posn		=> "p_sign_posn",
	n_sign_posn		=> "n_sign_posn",

	# msgdef
	yesexpr			=> "yesexpr",
	noexpr			=> "noexpr",
	yesstr			=> "yesstr",
	nostr			=> "nostr",

	# timedef
	abmon			=> "Short month names",
	mon			=> "Long month names (as in a date)",
	abday			=> "Short weekday names",
	day			=> "Long weekday names",
	t_fmt			=> "X_fmt",
	d_fmt			=> "x_fmt",
	c_fmt			=> "c_fmt",
	am_pm			=> "AM/PM",
	d_t_fmt			=> "date_fmt",
	altmon			=> "Long month names (without case ending)",
	md_order		=> "md_order",
	t_fmt_ampm		=> "ampm_fmt",
);
114
# Ordered field layout for every supported --type, as a flat key/format
# list so that the order survives until it is stored in the tied %keys.
#
# Format codes, as interpreted by print_fields():
#   "s"   one string; <SYMBOLIC NAME> tokens are converted
#   "as"  ';'-separated list of such strings, one output line each
#   "i"   value copied as is
#   "ai"  value copied as is
#   "<callback<source<format"
#         the value is produced by $callback{callback} from the value of
#         field "source" and then printed using "format"
my %KEYS_FOR_TYPE = (
	numericdef => [
		"decimal_point"	=> "s",
		"thousands_sep"	=> "s",
		"grouping"	=> "ai",
	],
	monetdef => [
		"int_curr_symbol"	=> "s",
		"currency_symbol"	=> "s",
		"mon_decimal_point"	=> "s",
		"mon_thousands_sep"	=> "s",
		"mon_grouping"		=> "ai",
		"positive_sign"		=> "s",
		"negative_sign"		=> "s",
		"int_frac_digits"	=> "i",
		"frac_digits"		=> "i",
		"p_cs_precedes"		=> "i",
		"p_sep_by_space"	=> "i",
		"n_cs_precedes"		=> "i",
		"n_sep_by_space"	=> "i",
		"p_sign_posn"		=> "i",
		"n_sign_posn"		=> "i",
	],
	msgdef => [
		"yesexpr"	=> "s",
		"noexpr"	=> "s",
		"yesstr"	=> "s",
		"nostr"		=> "s",
	],
	timedef => [
		"abmon"		=> "as",
		"mon"		=> "as",
		"abday"		=> "as",
		"day"		=> "as",
		"t_fmt"		=> "s",
		"d_fmt"		=> "s",
		"c_fmt"		=> "<cformat<d_t_fmt<s",
		"am_pm"		=> "as",
		"d_t_fmt"	=> "s",
		"altmon"	=> "<altmon<mon<as",
		"md_order"	=> "<mdorder<d_fmt<s",
		"t_fmt_ampm"	=> "s",
	],
);

# An unknown type used to fall through every branch and exit quietly
# without producing anything; report it instead.
if (!defined $TYPE || !exists $KEYS_FOR_TYPE{$TYPE}) {
	print STDERR "Unknown type '" . (defined $TYPE ? $TYPE : "") .
	    "'; expected one of: " .
	    join(", ", sort keys(%KEYS_FOR_TYPE)) . "\n";
	exit(1);
}

%keys = @{$KEYS_FOR_TYPE{$TYPE}};
get_fields();
print_fields();
make_makefile();
181
ddddc53a
JM
# Callback for the c_fmt field: returns the given format (d_t_fmt in the
# timedef layout) with the first " %Z" and the first " %z" removed.
#
# Returns undef, without touching the value, when no format was read.
sub callback_cformat {
	my $s = shift;
	return undef if (!defined $s);
	$s =~ s/ %Z//;
	$s =~ s/ %z//;
	return $s;
}
188
252345eb
JM
# Callback for the md_order field: reduces a date format to the letters
# "d" and "m" in the order they occur.  Returns undef for an undefined
# format.
sub callback_mdorder {
	my ($fmt) = @_;
	defined $fmt or return undef;
	# Delete every character that is neither 'd' nor 'm'.
	$fmt =~ tr/dm//cd;
	return $fmt;
}
195
# Callback for the altmon field.  When %alternativemonths has an entry
# for the language/country currently stored in $callback{data}, return
# that entry with the whitespace around each ';'-separated name removed;
# otherwise hand back the argument (the "mon" value) unchanged.
sub callback_altmon {
	my $s = shift;
	my $lang = $callback{data}{l};
	my $country = $callback{data}{c};

	return $s if (!defined $alternativemonths{$lang}{$country});

	my @names = map {
		my $name = $_;
		$name =~ s/^\s+//;
		$name =~ s/\s+$//;
		$name;
	} split(";", $alternativemonths{$lang}{$country});

	return join(";", @names);
}
215
216############################
217
# Load UnicodeData.txt from $directory into %ucd:
#   $ucd{code2name}{<code>} = <name>
#   $ucd{name2code}{<name>} = <code>
# where code and name are the first two ';'-separated fields of a line.
# Dies when the file cannot be opened.
sub get_unidata {
	my $directory = shift;
	my $path = "$directory/UnicodeData.txt";

	open(my $fin, "<", $path)
	    or die("Cannot open $path: $!");
	my @lines = <$fin>;
	chomp(@lines);
	close($fin);

	foreach my $l (@lines) {
		my @a = split(/;/, $l);
		# Blank or truncated lines carry no code/name pair.
		next if (@a < 2);

		$ucd{code2name}{"$a[0]"} = $a[1];	# Unicode name
		$ucd{name2code}{"$a[1]"} = $a[0];	# Unicode code
	}
}
234
# Read the CHARMAP section of the charmap file $file.
#
# For every "<NAME> \xHH\xHH..." entry, underscores in NAME become spaces
# and the "\x" prefixes are removed from the code, giving
#   $utf8map{NAME} = "HHHH..."
# When an entry has the same code as the entry directly before it,
#   $utf8aliases{NAME} = <previous NAME>
# is recorded as well.  Dies when the file cannot be opened.
sub get_utf8map {
	my $file = shift;

	open(my $fin, "<", $file)
	    or die("Cannot open $file: $!");
	my @lines = <$fin>;
	close($fin);
	chomp(@lines);

	my $prev_k = undef;
	my $prev_v = "";
	my $incharmap = 0;
	foreach my $l (@lines) {
		$l =~ s/\r//;
		next if ($l =~ /^\#/);
		next if ($l eq "");

		if ($l eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($l eq "END CHARMAP");

		# Skip anything that is not "<NAME> code"; otherwise the
		# captures of an earlier line would be stored again.
		next if ($l !~ /^<([^\s]+)>\s+(.*)/);
		my ($k, $v) = ($1, $2);
		$k =~ s/_/ /g;		# unicode char string
		$v =~ s/\\x//g;		# UTF-8 char code
		$utf8map{$k} = $v;

		$utf8aliases{$k} = $prev_k if ($prev_v eq $v);

		$prev_v = $v;
		$prev_k = $k;
	}
}
272
# Load "$dir/<encoding>.TXT" for every key of %encodings.
#
# Each data line is split on whitespace; a leading "0x"/"0X" is removed
# from the first two columns and the pair is stored upper-cased as
#   $convertors{<encoding>}{<second column>} = <first column>
# An encoding whose file cannot be opened is reported on stdout and
# skipped, which leaves it without an entry in %convertors.
sub get_encodings {
	my $dir = shift;
	foreach my $e (sort(keys(%encodings))) {
		my $fin;
		# Three-argument open: the path is never parsed for a mode.
		if (!open($fin, "<", "$dir/$e.TXT")) {
			print "Cannot open charmap for $e\n";
			next;
		}
		$encodings{$e} = 1;
		my @lines = <$fin>;
		close($fin);
		chomp(@lines);
		foreach my $l (@lines) {
			$l =~ s/\r//;
			next if ($l =~ /^\#/);
			next if ($l eq "");

			my @a = split(" ", $l);
			next if ($#a < 1);
			$a[0] =~ s/^0[xX]//;	# local char code
			$a[1] =~ s/^0[xX]//;	# unicode char code
			$convertors{$e}{uc($a[1])} = uc($a[0]);
		}
	}
}
298
# Copy the tables returned by get_xmldata($ETCDIR) into %languages (L),
# %translations (T), %alternativemonths (AM) and %encodings (E).
#
# When --lc was given, its '_'-separated parts become @filter:
#   la_CC        -> (la, "x", CC)
#   la_Family_CC -> (la, Family, CC)
# Any other number of parts leaves @filter as it was.  The resulting
# filter is dumped to stdout.
sub get_languages {
	my %data = get_xmldata($ETCDIR);
	%languages = %{$data{L}};
	%translations = %{$data{T}};
	%alternativemonths = %{$data{AM}};
	%encodings = %{$data{E}};

	if (defined $doonly) {
		my @parts = split(/_/, $doonly);
		my $count = scalar(@parts);

		if ($count == 2) {
			($filter[0], $filter[1], $filter[2]) =
			    ($parts[0], "x", $parts[1]);
		} elsif ($count == 3) {
			($filter[0], $filter[1], $filter[2]) =
			    ($parts[0], $parts[1], $parts[2]);
		}

		print Dumper(@filter);
	}

	return;
}
322
# Read the source file of every selected language/family/country and
# store the raw value of each field listed in %keys in
# $values{<language>}{<country>}{<field>}.
#
# The source file is searched in this order:
#   $CLDRDIR/posix/<locale>.$DEFENCODING.src
#   $ETCDIR/<locale>.$DEFENCODING.src
#   $CLDRDIR/posix/<fallback>.$DEFENCODING.src  (if a fallback is defined)
# $languages{..}{data}{<country>}{$DEFENCODING} is set to 0 before the
# search and to 1 once a file has been opened; print_fields() skips
# locales that are still 0.
#
# A value starts on the line beginning with the field name and continues
# over following lines for as long as a line ends in '/'.  Underscores
# inside <...> names are turned into spaces.
sub get_fields {
	foreach my $l (sort keys(%languages)) {
		foreach my $f (sort keys(%{$languages{$l}})) {
			foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
				next if ($#filter == 2 && ($filter[0] ne $l
				    || $filter[1] ne $f || $filter[2] ne $c));
				next if (defined $languages{$l}{$f}{definitions}
				    && $languages{$l}{$f}{definitions} !~ /$TYPE/);

				$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
				my $file;
				$file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;

				my $filename = "$CLDRDIR/posix/$file.$DEFENCODING.src";
				$filename = "$ETCDIR/$file.$DEFENCODING.src"
				    if (! -f $filename);
				if (! -f $filename
				    && defined $languages{$l}{$f}{fallback}) {
					$file = $languages{$l}{$f}{fallback};
					$filename = "$CLDRDIR/posix/$file.$DEFENCODING.src";
				}
				$filename = "$CLDRDIR/posix/$file.$DEFENCODING.src"
				    if (! -f $filename);
				if (! -f $filename) {
					print STDERR
					    "Cannot open $file.$DEFENCODING.src or fallback\n";
					next;
				}

				# The file exists but may still be unreadable;
				# leave the locale marked as unread in that case.
				my $fin;
				if (!open($fin, "<", $filename)) {
					print STDERR "Cannot open $filename: $!\n";
					next;
				}
				print "Reading from $filename for ${l}_${f}_${c}\n";
				$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
				my @lines = <$fin>;
				chomp(@lines);
				close($fin);

				foreach my $k (keys(%keys)) {
					# Reset per field so that a continuation
					# running to the end of the file cannot
					# leak into the next field.
					my $continue = 0;
					foreach my $line (@lines) {
						$line =~ s/\r//;
						next if (!$continue && $line !~ /^$k\s/);
						if ($continue) {
							$line =~ s/^\s+//;
						} else {
							$line =~ s/^$k\s+//;
						}

						$values{$l}{$c}{$k} = ""
						    if (!defined $values{$l}{$c}{$k});

						$continue = ($line =~ /\/$/);
						$line =~ s/\/$// if ($continue);

						# Loop on the substitution itself: it
						# stops as soon as nothing inside <...>
						# is left to replace, so an underscore
						# outside of <...> reaches the die
						# below instead of looping forever.
						1 while ($line =~
						    s/\<([^>_]+)_([^>]+)\>/<$1 $2>/);
						die "_ in data - $line" if ($line =~ /_/);
						$values{$l}{$c}{$k} .= $line;

						last if (!$continue);
					}
				}
			}
		}
	}
}
390
# Convert the symbolic character name $s into its byte sequence in
# encoding $e.
#
# For UTF-8 the hex code comes straight from %utf8map.  For every other
# encoding the name is mapped to a Unicode code via %ucd (directly or
# through %utf8aliases), with %translations supplying overrides ("hex",
# "ucc", "unicode"), and that code is looked up in %convertors.
#
# Returns the bytes described by the hex code, for codes of one to four
# bytes.  For any other length a message is printed on stderr and the
# string "length = <n>" is returned.  Dies when no code can be found.
sub decodecldr {
	my $e = shift;
	my $s = shift;

	my $v = undef;

	if ($e eq "UTF-8") {
		#
		# Conversion to UTF-8 can be done from the Unicode name to
		# the UTF-8 character code.
		#
		$v = $utf8map{$s};
		die "Cannot convert $s in $e (charmap)" if (!defined $v);
	} else {
		#
		# Conversion to these encodings can be done from the Unicode
		# name to Unicode code to the encodings code.
		#
		my $ucc = undef;
		$ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s});
		$ucc = $ucd{name2code}{$utf8aliases{$s}}
		    if (!defined $ucc
		    && $utf8aliases{$s}
		    && defined $ucd{name2code}{$utf8aliases{$s}});

		if (!defined $ucc) {
			if (defined $translations{$e}{$s}{hex}) {
				$v = $translations{$e}{$s}{hex};
				$ucc = 0;
			} elsif (defined $translations{$e}{$s}{ucc}) {
				$ucc = $translations{$e}{$s}{ucc};
			}
		}

		die "Cannot convert $s in $e (ucd string)" if (!defined $ucc);
		$v = $convertors{$e}{$ucc} if (!defined $v);

		$v = $translations{$e}{$s}{hex}
		    if (!defined $v && defined $translations{$e}{$s}{hex});

		if (!defined $v && defined $translations{$e}{$s}{unicode}) {
			my $ucn = $translations{$e}{$s}{unicode};
			$ucc = $ucd{name2code}{$ucn}
			    if (defined $ucd{name2code}{$ucn});
			# Only follow the alias when there is one, as done
			# for $s above; an undefined alias is not a hash key.
			$ucc = $ucd{name2code}{$utf8aliases{$ucn}}
			    if (!defined $ucc
			    && defined $utf8aliases{$ucn}
			    && defined $ucd{name2code}{$utf8aliases{$ucn}});
			$v = $convertors{$e}{$ucc};
		}

		die "Cannot convert $s in $e (charmap)" if (!defined $v);
	}

	# One output byte per pair of hex digits.  Four pairs are accepted
	# so that four-byte codes are converted instead of being reported
	# as a length error.
	my $len = length($v);
	if ($len == 2 || $len == 4 || $len == 6 || $len == 8) {
		return pack("C*", map { hex($_) } unpack("(a2)*", $v));
	}

	print STDERR "Cannot convert $e $s\n";
	return "length = " . length($v);
}
453
# Look up $v in the %translations table of encoding $enc.
# Returns the stored entry, or undef when there is none.
sub translate {
	my ($enc, $v) = @_;

	my $entry = $translations{$enc}{$v};
	return defined $entry ? $entry : undef;
}
461
# Replace every <SYMBOLIC NAME> in $text by its byte sequence in $enc.
# Returns (converted text, success flag); $k is only used in messages.
sub _decode_symbols {
	my ($enc, $k, $text) = @_;
	my $ok = 1;
	my $decoded = "";

	# Work from left to right and only ever match against the part
	# that has not been converted yet, so converted bytes are never
	# mistaken for the start or end of another name.
	while ($text =~ /^(.*?)<(.*?)>(.*)/) {
		my ($before, $cm, $after) = ($1, $2, $3);

		my $rv = decodecldr($enc, $cm);
		if (!defined $rv) {
			print STDERR
"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
			$ok = 0;
			# Keep the name as written and move on.
			$rv = "<$cm>";
		}

		$decoded .= $before . $rv;
		$text = $after;
	}

	return ($decoded . $text, $ok);
}

# Write "$TYPE.draft/<locale>.<encoding>.src" for every selected locale
# and every encoding listed for it.
#
# For each field in %keys a "# <description>" heading from %DESC is
# emitted, followed by the value formatted according to the field's
# format code ("i", "ai", "s", "as", ">other_field" or
# "<callback<source<format").  The file is first written with the suffix
# ".new" and then renamed to ".src", or to ".failed" when a symbol could
# not be converted.
#
# The SHA-1 of the generated body is stored in
# $languages{..}{data}{<country>}{<encoding>} and used as key of
# %hashtable, which make_makefile() reads to find identical files.
# Dies when an output file cannot be created or closed, when a field has
# no %DESC entry, or when a format code is unknown.
sub print_fields {
	foreach my $l (sort keys(%languages)) {
		foreach my $f (sort keys(%{$languages{$l}})) {
			foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
				next if ($#filter == 2 && ($filter[0] ne $l
				    || $filter[1] ne $f || $filter[2] ne $c));
				next if (defined $languages{$l}{$f}{definitions}
				    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
				foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
					if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
						print "Skipping ${l}_" .
						    ($f eq "x" ? "" : "${f}_") .
						    "${c} - not read\n";
						next;
					}
					my $file = $l;
					$file .= "_" . $f if ($f ne "x");
					$file .= "_" . $c;
					print "Writing to $file in $enc\n";

					if ($enc ne $DEFENCODING &&
					    !defined $convertors{$enc}) {
						print "Failed! Cannot convert to $enc.\n";
						next;
					}

					my $draft = "$TYPE.draft/$file.$enc";
					open(my $fout, ">", "$draft.new")
					    or die("Cannot create $draft.new: $!");
					my $okay = 1;
					my $output = "";
					print $fout <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
					foreach my $k (keys(%keys)) {
						# Format code; kept apart from
						# the family name in $f.
						my $fmt = $keys{$k};

						die("Unknown $k in \%DESC")
						    if (!defined $DESC{$k});

						$output .= "#\n# $DESC{$k}\n";

						# Replace one row with another
						if ($fmt =~ /^>/) {
							$k = substr($fmt, 1);
							$fmt = $keys{$k};
						}

						# Callback function
						if ($fmt =~ /^\</) {
							$callback{data}{c} = $c;
							$callback{data}{k} = $k;
							$callback{data}{l} = $l;
							$callback{data}{e} = $enc;
							my @a = split(/\</, substr($fmt, 1));
							my $rv =
							    $callback{$a[0]}->($values{$l}{$c}{$a[1]});
							$values{$l}{$c}{$k} = $rv;
							$fmt = $a[2];
							$callback{data} = undef;
						}

						my $v = $values{$l}{$c}{$k};
						$v = "undef" if (!defined $v);

						if ($fmt eq "i" || $fmt eq "ai") {
							$output .= "$v\n";
							next;
						}
						if ($fmt eq "s") {
							$v =~ s/^"//;
							$v =~ s/"$//;
							my ($text, $ok) =
							    _decode_symbols($enc, $k, $v);
							$okay = 0 if (!$ok);
							$output .= "$text\n";
							next;
						}
						if ($fmt eq "as") {
							foreach my $item (split(/;/, $v)) {
								$item =~ s/^"//;
								$item =~ s/"$//;
								my ($text, $ok) =
								    _decode_symbols($enc, $k, $item);
								$okay = 0 if (!$ok);
								$output .= "$text\n";
							}
							next;
						}

						die("$k is '$fmt'");
					}

					$languages{$l}{$f}{data}{$c}{$enc} = sha1_hex($output);
					$hashtable{sha1_hex($output)}{"${l}_${f}_${c}.$enc"} = 1;
					print $fout "$output# EOF\n";
					close($fout)
					    or die("Cannot write $draft.new: $!");

					if ($okay) {
						rename("$draft.new", "$draft.src");
					} else {
						rename("$draft.new", "$draft.failed");
					}
				}
			}
		}
	}
}
611
# Write "$TYPE.draft/Makefile".  Nothing is written when a --lc filter is
# active.
#
# Contents, in order:
#   - a fixed header with LOCALEDIR, FILESNAME and the .src.out rule;
#   - one "SAME+= link:file" line for every file whose generated body has
#     the same hash as another one in %hashtable; the entry of such a
#     file in %languages is undefined so that it is left out of LOCALES;
#   - one "LOCALES+=" line per remaining locale/encoding;
#   - "SAME+=" lines for the nc_link and e_link attributes of a locale;
#   - a fixed trailer.
# Dies when the Makefile cannot be created or closed.
sub make_makefile {
	# From here on @filter is known to be empty.
	return if ($#filter > -1);
	print "Creating Makefile for $TYPE\n";

	my $makefile = "$TYPE.draft/Makefile";
	open(my $fout, ">", $makefile)
	    or die("Cannot create $makefile: $!");

	# The recipe line is written with \t: make requires a tab there.
	print $fout <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale.

LOCALEDIR= /usr/share/locale
FILESNAME= $FILESNAMES{$TYPE}
.SUFFIXES: .src .out

.src.out:
\tgrep -v '^\#' < \${.IMPSRC} > \${.TARGET}

## PLACEHOLDER

EOF

	foreach my $hash (keys(%hashtable)) {
		my @files = sort(keys(%{$hashtable{$hash}}));
		if ($#files > 0) {
			my $link = shift(@files);
			$link =~ s/_x_/_/;	# strip family if none there
			foreach my $file (@files) {
				# "<l>_<f>_<c>.<enc>" as built by print_fields()
				my @a = split(/_/, $file);
				my @b = split(/\./, $a[-1]);
				$file =~ s/_x_/_/;
				print $fout "SAME+=\t\t$link:$file\n";
				undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]});
			}
		}
	}

	foreach my $l (sort keys(%languages)) {
		foreach my $f (sort keys(%{$languages{$l}})) {
			foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
				next if (defined $languages{$l}{$f}{definitions}
				    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
				if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING}
				    && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
					print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") .
					    "${c} - not read\n";
					next;
				}

				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;

				foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
					# Undefined entries were folded into SAME above.
					next if (!defined $languages{$l}{$f}{data}{$c}{$e});
					print $fout "LOCALES+=\t$file.$e\n";
				}

				if (defined $languages{$l}{$f}{nc_link}) {
					foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
						print $fout "SAME+=\t\t$file.$e:$languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n";
					}
				}

				if (defined $languages{$l}{$f}{e_link}) {
					foreach my $el (split(" ", $languages{$l}{$f}{e_link})) {
						my @a = split(/:/, $el);
						print $fout "SAME+=\t\t$file.$a[0]:$file.$a[1]\t# legacy (same charset)\n";
					}
				}
			}
		}
	}

	print $fout <<EOF;

FILES= \${LOCALES:S/\$/.out/}
CLEANFILES= \${FILES}

.for f in \${SAME}
SYMLINKS+= ../\${f:C/:.*\$//}/\${FILESNAME} \${LOCALEDIR}/\${f:C/^.*://}
.endfor

.for f in \${LOCALES}
FILESDIR_\${f}.out= \${LOCALEDIR}/\${f}
.endfor

.include <bsd.prog.mk>
EOF

	close($fout)
	    or die("Cannot write $makefile: $!");
}