tools: Do not hardcode path to perl.
[dragonfly.git] / tools / tools / locale / tools / convert_map.pl
CommitLineData
344be199 1i#!/usr/bin/env perl
ce02c398
JM
2#
3# This file and its contents are supplied under the terms of the
4# Common Development and Distribution License ("CDDL"), version 1.0.
5# You may only use this file in accordance with the terms of version
6# 1.0 of the CDDL.
7#
8# A full copy of the text of the CDDL should have accompanied this
9# source. A copy is of the CDDL is also available via the Internet
10# at http://www.illumos.org/license/CDDL.
11#
12
13#
14# Copyright 2010 Nexenta Systems, Inc. All rights reserved.
15# Copyright 2015 John Marino <draco@marino.st>
16#
17
18# This converts MAPPING files to localedef character maps
19# suitable for use with the UTF-8 derived localedef data.
20
#
# Convert a UCS code point into its UTF-8 encoding, rendered the way
# localedef character maps spell bytes: one "\xHH" group (upper-case hex
# digits, literal backslash) per encoded byte.
#
#   $ucs    - numeric code point
#   returns - the escaped byte string, lead byte first
#
# Sequence lengths follow the original UTF-8 scheme, so values above
# 0x1fffff still get five- and six-byte forms.
#
sub ucs_to_utf8
{
	my $ucs = shift;
	my $utf8 = "";

	# Single byte: the value is emitted as is, no lead/continuation split.
	return (sprintf("\\x%02X", $ucs)) if ($ucs <= 0x7f);

	# [ largest code point, continuation bytes, lead byte marker ]
	my @forms = (
		[ 0x7ff,      1, 0xc0 ],
		[ 0xffff,     2, 0xe0 ],
		[ 0x1fffff,   3, 0xf0 ],
		[ 0x03ffffff, 4, 0xf8 ],
	);

	# Anything larger takes the six-byte form, whose lead byte marker is
	# 0xfc (0xf8 is the five-byte marker).
	my ($ntrail, $lead) = (5, 0xfc);
	foreach my $form (@forms) {
		if ($ucs <= $form->[0]) {
			($ntrail, $lead) = ($form->[1], $form->[2]);
			last;
		}
	}

	# Peel off six bits at a time, least significant first, prepending
	# each continuation byte so the result ends up in wire order.
	foreach my $i (1 .. $ntrail) {
		$utf8 = sprintf("\\x%02X", ($ucs & 0x3f) | 0x80) . $utf8;
		$ucs >>= 6;
	}

	# Whatever bits remain belong in the lead byte.
	return (sprintf("\\x%02X", $ucs | $lead) . $utf8);
}
76
# Lookup tables filled in by load_utf8_cm():
#   %unames  - encoded value => character name; when several names share
#              a value they are joined with newlines
#   %uvalues - character name => encoded value
my (%unames, %uvalues);
79
#
# Load a UTF-8 character map into %unames and %uvalues.
#
# This is not a general purpose Character Map parser, but it's good enough
# for the stock one supplied with CLDR.
#
#   $file - path of the character map to read
#
# Comment lines, blank lines and the CHARMAP / END CHARMAP markers are
# skipped.  Every other line is split on whitespace; the first field is
# taken as the character name and the second as its encoded value.
# Dies, naming the file and the system error, if the file cannot be opened.
#
sub load_utf8_cm
{
	my $file = shift;

	# Three-argument open: the file name can never be read as a mode.
	open(my $fh, '<', $file) || die "open $file: $!";

	while (my $line = <$fh>) {
		next if ($line =~ /^#/);
		next if ($line =~ /^\s*$/);
		next if ($line =~ /^\s*CHARMAP\s*$/);
		next if ($line =~ /^\s*END\s*CHARMAP\s*$/);
		chomp($line);

		my ($name, $utf8val) = split(/\s+/, $line);

		# A name with no value column carries no mapping to record.
		next unless (defined($utf8val));

		if (defined($unames{$utf8val})) {
			# Several names for one value: keep them all.
			$unames{$utf8val} .= "\n" . $name;
		} else {
			$unames{$utf8val} = $name;
		}
		$uvalues{$name} = $utf8val;
	}
	close($fh);
}
109
# Code as spelled in the MAPPING file => the UTF-8 escape string(s) it maps
# to; several strings for one code are separated by single spaces.
# Filled in by load_map().
my %map = ();
111
#
# Load a MAPPING file into %map.
#
#   $file - path of the MAPPING file to read
#
# Each accepted line is split on whitespace: the first field is the code
# in the source codeset (kept verbatim as the hash key), the second is the
# hexadecimal UCS value, which is converted with ucs_to_utf8().  When a
# code appears more than once its UTF-8 strings are collected in one
# space-separated value.
# Dies, naming the file and the system error, if the file cannot be opened.
#
sub load_map
{
	my $file = shift;

	# Three-argument open: the file name can never be read as a mode.
	open(my $fh, '<', $file) || die "open $file: $!";

	while (my $line = <$fh>) {
		# Comments and blank lines.
		next if ($line =~ /^#/);
		next if ($line =~ /^\s*$/);
		# Codes written as two values joined by "+".
		next if ($line =~ /^0x..\+0x../);
		# A four-digit code followed by two space-separated values.
		next if ($line =~ /^0x[0-9A-F]{4}\t0x[0-9A-F]{4} 0x[0-9A-F]{4}/);
		# A two-digit code followed directly by a comment.
		next if ($line =~ /^0x[0-9A-F]{2}\s+#/);
		next if ($line =~ /# ... NO MAPPING .../);
		chomp($line);

		my ($val, $ucs) = split(/\s+/, $line);

		# A code with no mapping column must not be treated as U+0000.
		next unless (defined($ucs));

		$ucs =~ s/^\\x[0]*//;
		my $utf8 = ucs_to_utf8(hex($ucs));

		if (defined($map{$val})) {
			$map{$val} .= " " . $utf8;
		} else {
			$map{$val} = $utf8;
		}
	}
	close($fh);
	return;
}
138
#
# Render a hexadecimal code as a string of "\xhh" escapes (lower-case hex
# digits, literal backslash), one per byte, most significant byte first.
#
#   $val    - the code as a hex string, as accepted by hex()
#   returns - the escaped byte string; a value of zero yields "\x00"
#
sub mb_str
{
	my $val = shift;
	my $num = hex($val);
	my @bytes;

	# Zero has no non-zero byte to peel off, so it is spelled out.
	return ("\\x00") if ($num == 0);

	# Take the low byte each time round; unshift keeps wire order.
	for (; $num; $num >>= 8) {
		unshift(@bytes, sprintf("\\x%02x", $num & 0xff));
	}
	return (join("", @bytes));
}
154
# Command line: <mapping-file> <codeset>
my $mf = shift(@ARGV);
my $codeset = shift(@ARGV);
my $max_mb;

# Without a mapping file there is nothing to convert; say so up front
# rather than failing later inside load_map().
die "usage: $0 mapping-file codeset\n" unless (defined($mf));

load_utf8_cm("etc/final-maps/map.UTF-8");
load_map($mf);
161
162
# Codesets whose characters can be wider than one byte, with that maximum
# width in bytes.  Any codeset not listed here is treated as single-byte.
my %mb_cur_max = (
	"SJIS"      => 2,
	"eucCN"     => 2,
	"eucJP"     => 3,
	"eucKR"     => 2,
	"GBK"       => 2,
	"GB2312"    => 2,
	"Big5"      => 2,
	"Big5HKSCS" => 2,
);
$max_mb = exists($mb_cur_max{$codeset}) ? $mb_cur_max{$codeset} : 1;

# Charmap header.
print(
	"<code_set_name> \"$codeset\"\n",
	"<mb_cur_min> 1\n",
	"<mb_cur_max> $max_mb\n"
);
175
#
# Return the run of tabs printed between a character name and its value.
# The count is int((64 - length + 7) / 8), i.e. enough 8-column tab stops
# to reach column 64.  That figure is zero for names of 64 characters or
# more and negative for still longer ones, so it is clamped to one tab:
# the name and the value must always stay separated.
#
sub name_tabs
{
	my $name = shift;
	my $nt = int((64 - length($name) + 7) / 8);

	$nt = 1 if ($nt < 1);
	return ("\t" x $nt);
}

# Emit one line per character name: every code in %map, in sorted order,
# is paired with each name that %unames records for its UTF-8 value(s).
print("CHARMAP\n");
foreach my $val (sort(keys(%map))) {
	foreach my $utf8 (split(/ /, $map{$val})) {
		my $ref = $unames{$utf8};

		# No name recorded for this UTF-8 value: nothing to emit.
		next unless (defined($ref));

		foreach my $name (sort(split(/\n/, $ref))) {
			print $name . name_tabs($name) . mb_str($val) . "\n";
		}
	}
}
print "END CHARMAP\n";