| 1 | #!/usr/bin/env perl |
| 2 | # |
| 3 | # This file and its contents are supplied under the terms of the |
| 4 | # Common Development and Distribution License ("CDDL"), version 1.0. |
| 5 | # You may only use this file in accordance with the terms of version |
| 6 | # 1.0 of the CDDL. |
| 7 | # |
| 8 | # A full copy of the text of the CDDL should have accompanied this |
| 9 | # source. A copy is of the CDDL is also available via the Internet |
| 10 | # at http://www.illumos.org/license/CDDL. |
| 11 | # |
| 12 | |
| 13 | # |
| 14 | # Copyright 2010 Nexenta Systems, Inc. All rights reserved. |
| 15 | # Copyright 2015 John Marino <draco@marino.st> |
| 16 | # |
| 17 | |
| 18 | # This converts MAPPING files to localedef character maps |
| 19 | # suitable for use with the UTF-8 derived localedef data. |
| 20 | |
#
# Convert a UCS code point into the escaped byte sequence of its UTF-8
# encoding, e.g. 0x20AC becomes the literal text "\xE2\x82\xAC".
#
#   $ucs    - numeric code point
#   returns - string of "\xHH" escapes (uppercase hex), one per encoded byte
#
# Besides the one- to four-byte forms, the historical five- and six-byte
# forms are produced for values above 0x1FFFFF.
#
sub ucs_to_utf8
{
    my $ucs = shift;

    # Code points up to 0x7F are a single byte with no marker bits.
    return (sprintf("\\x%02X", $ucs)) if ($ucs <= 0x7f);

    # Pick the number of continuation bytes and the marker bits of the
    # lead byte from the magnitude of the code point.
    my ($ntrail, $lead);
    if ($ucs <= 0x7ff) {
        ($ntrail, $lead) = (1, 0xc0);
    } elsif ($ucs <= 0xffff) {
        ($ntrail, $lead) = (2, 0xe0);
    } elsif ($ucs <= 0x1fffff) {
        ($ntrail, $lead) = (3, 0xf0);
    } elsif ($ucs <= 0x03ffffff) {
        ($ntrail, $lead) = (4, 0xf8);
    } else {
        # Six-byte form: the lead byte marker is 0xFC (0xF8 is the marker
        # of the five-byte form and would make the sequence ambiguous).
        ($ntrail, $lead) = (5, 0xfc);
    }

    # Peel off six payload bits at a time, least significant first, and
    # prepend each continuation byte (10xxxxxx) to the result.
    my $utf8 = "";
    foreach my $i (1 .. $ntrail) {
        $utf8 = sprintf("\\x%02X", ($ucs & 0x3f) | 0x80) . $utf8;
        $ucs >>= 6;
    }

    # Whatever bits remain belong in the lead byte.
    return (sprintf("\\x%02X", $ucs | $lead) . $utf8);
}
| 76 | |
# Lookup tables filled in by load_utf8_cm():
#   %unames  - UTF-8 value => character name(s), newline separated
#   %uvalues - character name => UTF-8 value
my (%unames, %uvalues);
| 79 | |
#
# This is not a general purpose Character Map parser, but it's good enough
# for the stock one supplied with CLDR.
#
# Reads the character map in $file and records every "name value" pair:
#   %unames  - value => name(s); names sharing a value are joined by "\n"
#   %uvalues - name  => value
# Comment lines, blank lines, the CHARMAP / END CHARMAP markers and lines
# without a value column are skipped.  Dies if the file cannot be opened.
#
sub load_utf8_cm
{
    my $file = shift;

    # Three-argument open with a lexical handle: the file name can never be
    # taken for an open mode, and no global bareword handle is left behind.
    open(my $fh, '<', $file) || die "open $file: $!";

    while (my $line = <$fh>) {
        next if ($line =~ /^#/);
        next if ($line =~ /^\s*$/);
        next if ($line =~ /^\s*CHARMAP\s*$/);
        next if ($line =~ /^\s*END\s*CHARMAP\s*$/);
        chomp($line);

        my ($name, $utf8val) = split(/\s+/, $line);

        # A line with a single field has no value to record.
        next unless (defined($utf8val));

        if (defined($unames{$utf8val})) {
            $unames{$utf8val} .= "\n" . $name;
        } else {
            $unames{$utf8val} = $name;
        }
        $uvalues{$name} = $utf8val;
    }
    close($fh);
}
| 109 | |
# Code value from the first column of the mapping file => escaped UTF-8
# byte string(s) produced by ucs_to_utf8(); when one code value occurs on
# several lines the strings are joined by a single space.
my %map;

#
# Read the MAPPING file $file and fill %map.
#
# Each accepted line is split on whitespace: column one is the code value
# in the legacy encoding, column two the UCS code point in hex.  Skipped
# are comment and blank lines, lines whose first column is two codes
# joined by "+", lines giving two code points for a four-digit code,
# two-digit codes followed directly by a comment, lines carrying a
# "NO MAPPING" comment, and lines without a second column.
#
# Dies if the file cannot be opened.
#
sub load_map
{
    my $file = shift;

    # Three-argument open with a lexical handle: the file name can never be
    # taken for an open mode, and no global bareword handle is left behind.
    open(my $fh, '<', $file) || die "open $file: $!";

    while (my $line = <$fh>) {
        next if ($line =~ /^#/);
        next if ($line =~ /^\s*$/);
        next if ($line =~ /^0x..\+0x../);
        next if ($line =~ /^0x[0-9A-F]{4}\t0x[0-9A-F]{4} 0x[0-9A-F]{4}/);
        next if ($line =~ /^0x[0-9A-F]{2}\s+#/);
        next if ($line =~ /# ... NO MAPPING .../);
        chomp($line);

        my ($val, $ucs) = split(/\s+/, $line);

        # Without a second column there is nothing to map; hex() would
        # quietly turn the missing field into code point 0.
        next unless (defined($ucs));

        $ucs =~ s/^\\x[0]*//;
        my $utf8 = ucs_to_utf8(hex($ucs));

        if (defined($map{$val})) {
            $map{$val} .= " " . $utf8;
        } else {
            $map{$val} = $utf8;
        }
    }

    # The handle used to be left open for the rest of the run.
    close($fh);
}
| 138 | |
#
# Turn a hexadecimal code value (e.g. "0x8140") into its escaped byte
# string, most significant byte first (e.g. "\x81\x40").  Leading zero
# bytes are not emitted, except that a value of zero yields "\x00".
#
sub mb_str
{
    my $val = shift;
    my $code = hex($val);
    my @bytes;

    # The body runs at least once, which covers the all-zero value.
    do {
        unshift(@bytes, sprintf("\\x%02x", $code & 0xff));
        $code >>= 8;
    } while ($code);

    return (join("", @bytes));
}
| 154 | |
#
# Main program: <mapping-file> <codeset>
#
# Loads the UTF-8 character map and the mapping file named by the first
# argument, then writes a charmap for the codeset named by the second
# argument to standard output.
#
my $mf = shift(@ARGV);
my $codeset = shift(@ARGV);

load_utf8_cm("etc/final-maps/map.UTF-8");
load_map($mf);

# Value reported as <mb_cur_max> for each multibyte codeset; every
# codeset not listed here is reported as 1.
my %max_mb_for = (
    "SJIS"      => 2,
    "eucCN"     => 2,
    "eucJP"     => 3,
    "eucKR"     => 2,
    "GBK"       => 2,
    "GB2312"    => 2,
    "Big5"      => 2,
    "Big5HKSCS" => 2,
);
my $max_mb = 1;
if (defined($codeset) && exists($max_mb_for{$codeset})) {
    $max_mb = $max_mb_for{$codeset};
}

print("<code_set_name> \"$codeset\"\n");
print("<mb_cur_min> 1\n");
print("<mb_cur_max> $max_mb\n");

print("CHARMAP\n");
foreach my $val (sort(keys(%map))) {
    foreach my $utf8 (split(/ /, $map{$val})) {
        # A UTF-8 value with no entry in %unames produces no output.
        next unless (defined($unames{$utf8}));

        foreach my $name (sort(split(/\n/, $unames{$utf8}))) {
            # Pad with tabs towards column 64.  The count drops to zero
            # for names of 64 characters or more, which used to glue the
            # name to its value, and goes negative from 80 characters on,
            # which used to make the countdown loop run forever.  Always
            # emit at least one tab.
            my $nt = int((64 - length($name) + 7) / 8);
            $nt = 1 if ($nt < 1);

            print($name . ("\t" x $nt) . mb_str($val) . "\n");
        }
    }
}
print "END CHARMAP\n";