tools: Do not hardcode path to perl.
[dragonfly.git] / tools / tools / locale / tools / unicode2src.pl
CommitLineData
344be199 1#!/usr/bin/env perl -wC
252345eb
JM
2
3#
4# $FreeBSD$
5#
6
7use strict;
8use XML::Parser;
9use Tie::IxHash;
10use Data::Dumper;
11use Getopt::Long;
12use Digest::SHA qw(sha1_hex);
13
14
15if ($#ARGV < 2) {
16 print "Usage: $0 --cldr=<cldrdir> --unidata=<unidatadir> --etc=<etcdir> --input=<inputfile> --output=<outputfile>\n";
17 exit(1);
18}
19
20my @filter = ();
21
22my $CLDRDIR = undef;
23my $UNIDATADIR = undef;
24my $ETCDIR = undef;
25my $TYPE = undef;
26my $INPUT = undef;
27my $OUTPUT = undef;
28
29my $result = GetOptions (
30 "cldr=s" => \$CLDRDIR,
31 "unidata=s" => \$UNIDATADIR,
32 "etc=s" => \$ETCDIR,
33 "type=s" => \$TYPE,
34 "input=s" => \$INPUT,
35 "output=s" => \$OUTPUT,
36 );
37
38my %ucd = ();
39my %utf8map = ();
40my %utf8aliases = ();
41get_unidata($UNIDATADIR);
42get_utf8map("$CLDRDIR/posix/UTF-8.cm");
43convert($INPUT, $OUTPUT);
44
45############################
46
47sub get_unidata {
48 my $directory = shift;
49
50 open(FIN, "$directory/UnicodeData.txt")
51 or die("Cannot open $directory/UnicodeData.txt");;
52 my @lines = <FIN>;
53 chomp(@lines);
54 close(FIN);
55
56 foreach my $l (@lines) {
57 my @a = split(/;/, $l);
58
59 $ucd{code2name}{"$a[0]"} = $a[1]; # Unicode name
60 $ucd{name2code}{"$a[1]"} = $a[0]; # Unicode code
61 }
62}
63
64sub get_utf8map {
65 my $file = shift;
66
67 open(FIN, $file);
68 my @lines = <FIN>;
69 close(FIN);
70 chomp(@lines);
71
72 my $prev_k = undef;
73 my $prev_v = "";
74 my $incharmap = 0;
75 foreach my $l (@lines) {
76 $l =~ s/\r//;
77 next if ($l =~ /^\#/);
78 next if ($l eq "");
79
80 if ($l eq "CHARMAP") {
81 $incharmap = 1;
82 next;
83 }
84
85 next if (!$incharmap);
86 last if ($l eq "END CHARMAP");
87
88 $l =~ /^<([^\s]+)>\s+(.*)/;
89 my $k = $1;
90 my $v = $2;
91 $k =~ s/_/ /g; # unicode char string
92 $v =~ s/\\x//g; # UTF-8 char code
93 $utf8map{$k} = $v;
94
95 $utf8aliases{$k} = $prev_k if ($prev_v eq $v);
96
97 $prev_v = $v;
98 $prev_k = $k;
99 }
100}
101
102sub decode_cldr {
103 my $s = shift;
104
105 my $v = $utf8map{$s};
106 $v = $utf8aliases{$s} if (!defined $v);
107 die "Cannot convert $s" if (!defined $v);
108
109 return pack("C", hex($v)) if (length($v) == 2);
110 return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)))
111 if (length($v) == 4);
112 return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)),
113 hex(substr($v, 4, 2))) if (length($v) == 6);
114 print STDERR "Cannot convert $s\n";
115 return "length = " . length($v);
116}
117
118sub convert {
119 my $IN = shift;
120 my $OUT = shift;
121
122 open(FIN, "$IN");
123 open(FOUT, ">$OUT");
124
125# print Dumper(%utf8map);
126
127 my $l;
128 while (defined ($l = <FIN>)) {
129 chomp($l);
130
131 if ($l =~ /^#/) {
132 print FOUT $l, "\n";
133 next;
134 }
135
136 while ($l =~ /^(.*?)<(.*?)>(.*)$/) {
137 $l = $1 . decode_cldr($2) . $3;
138 }
139 print FOUT $l, "\n";
140 }
141
142 close(FOUT);
143 close(FIN);
144}