From ce02c39812ecae8c8bdc89570d49258d0f38a9e6 Mon Sep 17 00:00:00 2001 From: John Marino Date: Sat, 1 Aug 2015 19:27:04 +0200 Subject: [PATCH] cldr2def: Add LC_CTYPE source file generation support I added the capability to generate LC_CTYPE source files (really this is basically extracting a section from the POSIX files) but there was some logic to figure out how to use the fewest files because some of them are large. I compromised on a scheme that makes two reductions. The first eliminates true duplicates and uses the SAME+= mechanism to create symlinks. However, this still leaves some duplicates because while the output is distinct, the source files are the same (e.g. en_US.ISO8859* uses the same input file as the en_US.UTF-8 locale, but the LC_CTYPE products differ). The script identifies those and replaces them with symlinks. So it looks like a lot of files but really it's only about 12 or so. During the actual LC_CTYPE generation, character maps are needed. I added an Illumos tool to do this, which I had to modify. Unlike Illumos, we will pregenerate the maps that the tool (convert_map.pl) produces. I had to spend hours troubleshooting various "invalid" inputs so this is definitely something that should not be repeated in the build. 
--- tools/tools/locale/Makefile | 7 +- tools/tools/locale/tools/cldr2def.pl | 123 +++++++++++++-- tools/tools/locale/tools/convert_map.pl | 192 ++++++++++++++++++++++++ tools/tools/locale/tools/finalize | 48 +++++- 4 files changed, 355 insertions(+), 15 deletions(-) create mode 100644 tools/tools/locale/tools/convert_map.pl diff --git a/tools/tools/locale/Makefile b/tools/tools/locale/Makefile index a95da266ef..dcc246ec2e 100644 --- a/tools/tools/locale/Makefile +++ b/tools/tools/locale/Makefile @@ -17,7 +17,8 @@ PASSON= CLDRDIR="${CLDRDIR}" UNIDATADIR="${UNIDATADIR}" ETCDIR= ${.CURDIR}/etc -TYPES?= monetdef numericdef msgdef timedef colldef +KNOWN= monetdef numericdef msgdef timedef colldef ctypedef +TYPES?= ${KNOWN} LOCALE_DESTDIR?= /tmp/generated-locales/ .if defined(LC) @@ -26,16 +27,20 @@ LC:= --lc=${LC} all: .for t in ${TYPES} +. if ${KNOWN:M${t}} test -d ${t} || mkdir ${t} make build-${t} +. endif .endfor @echo "" @find . -name *failed install: .for t in ${TYPES} +. if ${KNOWN:M${t}} cd ${t} && make cd ${t} && sudo DESTDIR=${LOCALE_DESTDIR} make install +. 
endif .endfor .for t in ${TYPES} diff --git a/tools/tools/locale/tools/cldr2def.pl b/tools/tools/locale/tools/cldr2def.pl index 5e0533750b..4ee61aa721 100755 --- a/tools/tools/locale/tools/cldr2def.pl +++ b/tools/tools/locale/tools/cldr2def.pl @@ -58,7 +58,8 @@ my %FILESNAMES = ( "timedef" => "LC_TIME", "msgdef" => "LC_MESSAGES", "numericdef" => "LC_NUMERIC", - "colldef" => "LC_COLLATE" + "colldef" => "LC_COLLATE", + "ctypedef" => "LC_CTYPE" ); my %callback = ( @@ -119,6 +120,11 @@ if ($TYPE eq "colldef") { make_makefile(); } +if ($TYPE eq "ctypedef") { + transform_ctypes(); + make_makefile(); +} + if ($TYPE eq "numericdef") { %keys = ( "decimal_point" => "s", @@ -327,6 +333,78 @@ sub get_languages { return; } +sub transform_ctypes { + foreach my $l (sort keys(%languages)) { + foreach my $f (sort keys(%{$languages{$l}})) { + foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) { + next if ($#filter == 2 && ($filter[0] ne $l + || $filter[1] ne $f || $filter[2] ne $c)); + next if (defined $languages{$l}{$f}{definitions} + && $languages{$l}{$f}{definitions} !~ /$TYPE/); + $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0; # unread + my $file; + $file = $l . "_"; + $file .= $f . "_" if ($f ne "x"); + $file .= $c; + my $actfile = $file; + + my $filename = "$CLDRDIR/posix/$file.$DEFENCODING.src"; + $filename = "$ETCDIR/$file.$DEFENCODING.src" + if (! -f $filename); + if (! -f $filename + && defined $languages{$l}{$f}{fallback}) { + $file = $languages{$l}{$f}{fallback}; + $filename = "$CLDRDIR/posix/$file.$DEFENCODING.src"; + } + $filename = "$CLDRDIR/posix/$file.$DEFENCODING.src" + if (! -f $filename); + if (! 
-f $filename) { + print STDERR + "Cannot open $file.$DEFENCODING.src or fallback\n"; + next; + } + open(FIN, "$filename"); + print "Reading from $filename for ${l}_${f}_${c}\n"; + $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1; # read + my @lines; + my $shex; + my $uhex; + while () { + if ((/^comment_char\s/) || (/^escape_char\s/)){ + push @lines, $_; + } + if (/^LC_CTYPE/../^END LC_CTYPE/) { + push @lines, $_; + } + } + close(FIN); + $shex = sha1_hex(join("\n", @lines)); + $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex; + $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1; + open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src"); + print FOUT < -1); print "Creating Makefile for $TYPE\n"; - my $SRCOUT = ($TYPE eq "colldef") ? - "localedef -D -U -w \${.CURDIR}/widths.txt -f " . - "\${.CURDIR}/map.UTF-8 -i \${.IMPSRC} ". - "\${.OBJDIR}/\${.IMPSRC:T:R}" : - "grep -v '^\#' < \${.IMPSRC} > \${.TARGET}"; - my $SRCOUT2 = ($TYPE eq "colldef") ? - "LC_COLLATE" : "out"; + my $SRCOUT; + my $SRCOUT2; + my $MAPLOC; + if ($TYPE eq "colldef") { + $SRCOUT = "localedef -D -U -c -i \${.IMPSRC} \\\n" . + "\t-f \${.CURDIR}/../ctypedef/map.UTF-8 " . + "\${.OBJDIR}/\${.IMPSRC:T:R} || true"; + $SRCOUT2 = "LC_COLLATE"; + } + elsif ($TYPE eq "ctypedef") { + # TODO ! + $SRCOUT = "localedef -D -U -w \${MAPLOC}/widths.txt \\\n" . + "\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:C/^.*\\.//} " . + "\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R}"; + $SRCOUT2 = "LC_CTYPE"; + $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" . + "locale/etc/final-maps\n"; + } + else { + $SRCOUT = "grep -v -E '^\#[ ]?\$\$' < \${.IMPSRC} > \${.TARGET}"; + $SRCOUT2 = "out"; + } open(FOUT, ">$TYPE.draft/Makefile"); print FOUT < +# + +# This converts MAPPING files to localedef character maps +# suitable for use with the UTF-8 derived localedef data. 
#
# ucs_to_utf8($ucs)
#
# Encode one UCS code point as the textual byte sequence used in localedef
# character maps: each byte of the UTF-8 encoding is written as a literal
# backslash, "x" and two upper-case hex digits (e.g. 0x20AC -> \xE2\x82\xAC).
#
#   $ucs     numeric code point (callers pass the result of hex()).
#   returns  the escaped byte string, lead byte first.
#
# The original (pre-RFC 3629) 1..6 byte UTF-8 scheme is implemented, so
# values above 0x10FFFF are still encoded rather than rejected.
#
sub ucs_to_utf8
{
	my $ucs = shift;

	# Number of continuation bytes and lead-byte tag for each range.
	# The 6-byte form is tagged 0xfc (1111110x); 0xf8 is the 5-byte tag.
	my ($ntrail, $lead) =
	    $ucs <= 0x7f       ? (0, 0x00) :
	    $ucs <= 0x7ff      ? (1, 0xc0) :
	    $ucs <= 0xffff     ? (2, 0xe0) :
	    $ucs <= 0x1fffff   ? (3, 0xf0) :
	    $ucs <= 0x03ffffff ? (4, 0xf8) :
	                         (5, 0xfc);

	# Continuation bytes are produced lowest-order first and prepended,
	# so the string ends up in lead-to-tail order.
	my $utf8 = "";
	for (1 .. $ntrail) {
		$utf8 = sprintf("\\x%02X", ($ucs & 0x3f) | 0x80) . $utf8;
		$ucs >>= 6;
	}

	# Whatever bits remain belong to the lead byte.
	return (sprintf("\\x%02X", $ucs | $lead) . $utf8);
}

# UTF-8 byte string -> newline-separated list of symbolic names.
my %unames;
# Symbolic name -> UTF-8 byte string.
my %uvalues;

#
# This is not a general purpose Character Map parser, but it's good enough
# for the stock one supplied with CLDR.
+# +sub load_utf8_cm +{ + my $file = shift; + + open(UTF8, "$file") || die "open"; + + while () { + next if (/^#/); + next if (/^\s*$/); + next if (/^\s*CHARMAP\s*$/); + next if (/^\s*END\s*CHARMAP\s*$/); + chomp; + @words = split /\s+/; + $name = $words[0]; + $utf8val = $words[1]; + + if (defined($unames{$utf8val})) { + $unames{$utf8val} .= "\n" .$name; + } else { + $unames{$utf8val} = $name; + } + $uvalues{$name} = $utf8val; + } + close(UTF8); +} + +my %map; + +sub load_map +{ + my $file = shift; + + open(MAP, "$file") || die "open"; + + while () { + next if (/^#/); + next if (/^\s*$/); + next if (/^0x..\+0x../); + next if (/^0x[0-9A-F]{4}\t0x[0-9A-F]{4} 0x[0-9A-F]{4}/); + next if (/^0x[0-9A-F]{2}\s+#/); + next if (/# ... NO MAPPING .../); + chomp; + @words = split /\s+/; + $utf8 = $words[1]; + $utf8 =~ s/^\\x[0]*//; + $utf8 = ucs_to_utf8(hex($utf8)); + $val = $words[0]; + if (defined ($map{$val})) { + $map{$val} .= " ".$utf8; + } else { + $map{$val} = $utf8; + } + } +} + +sub mb_str +{ + my $val = shift; + my $str = ""; + $val = hex($val); + + if ($val == 0) { + return ("\\x00"); + } + while ($val) { + $str = sprintf("\\x%02x", $val & 0xff).$str; + $val >>= 8; + } + return ($str); +} + +$mf = shift(@ARGV); +$codeset = shift(@ARGV); +my $max_mb; + +load_utf8_cm("etc/final-maps/map.UTF-8"); +load_map($mf); + + + if ($codeset eq "SJIS") { $max_mb = 2 } +elsif ($codeset eq "eucCN") { $max_mb = 2 } +elsif ($codeset eq "eucJP") { $max_mb = 3 } +elsif ($codeset eq "eucKR") { $max_mb = 2 } +elsif ($codeset eq "GBK") { $max_mb = 2 } +elsif ($codeset eq "GB2312") { $max_mb = 2 } +elsif ($codeset eq "Big5") { $max_mb = 2 } +elsif ($codeset eq "Big5HKSCS") { $max_mb = 2 } +else { $max_mb = 1 }; +print(" \"$codeset\"\n"); +print(" 1\n"); +print(" $max_mb\n"); + +print("CHARMAP\n"); +foreach $val (sort (keys (%map))) { + #$utf8 = $map{$val}; + foreach $utf8 (split / /, $map{$val}) { + $ref = $unames{$utf8}; + foreach $name (sort (split /\n/, $ref)) { + print "$name"; + my $nt 
= int((64 - length($name) + 7) / 8); + while ($nt) { + print "\t"; + $nt--; + } + print mb_str($val)."\n"; + } + } +} +print "END CHARMAP\n"; diff --git a/tools/tools/locale/tools/finalize b/tools/tools/locale/tools/finalize index 8e48fe0219..3f07f2478b 100755 --- a/tools/tools/locale/tools/finalize +++ b/tools/tools/locale/tools/finalize @@ -9,19 +9,20 @@ usage () { echo "finalize ' to package standard localization" - echo "type must be one of { monetdef, msgdef, numericdef, timedef, colldef }" + echo "type must be one of { monetdef, msgdef, numericdef, timedef, colldef, ctypedef }" exit 1 } [ $# -ne 1 ] && usage [ $1 = "monetdef" -o $1 = "msgdef" -o $1 = "colldef" -o \ - $1 = "numericdef" -o $1 = "timedef" ] || usage + $1 = "numericdef" -o $1 = "timedef" -o $1 = "ctypedef" ] || usage self=$(realpath $0) base=$(dirname ${self}) old=${base}/../${1}.draft new=${base}/../${1} TEMP=/tmp/${1}.locales +TEMP2=/tmp/${1}.hashes FULLMAP=/tmp/utf8-map FULLEXTRACT=/tmp/extracted-names AWKCMD="/## PLACEHOLDER/ { \ @@ -29,19 +30,56 @@ AWKCMD="/## PLACEHOLDER/ { \ !/## PLACEHOLDER/ { print \$0 }" grep '^LOCALES+' ${old}/Makefile > ${TEMP} + +if [ $1 = "ctypedef" ] +then +keep=$(cat ${TEMP} | awk '/UTF-8/ { print $2 }') +else keep=$(cat ${TEMP} | awk '{ print $2 }') +fi for original in ${keep} do cp ${old}/${original}.src ${new}/ done -if [ $1 = "colldef" ] +if [ $1 = "ctypedef" ] then +keep=$(cat ${TEMP} | awk '{ print $2 ".src" }') +(cd ${old} && md5 -r ${keep} | sort) > ${TEMP2} +linx=$(cat ${TEMP2} | awk '!/UTF-8/ { print $2 }') +for original in ${linx} +do +linkhash=$(fgrep "${original}" ${TEMP2} | awk '{ print $1 }') +utf8file=$(fgrep "${linkhash}" ${TEMP2} | fgrep 'UTF-8' | awk '{ print $2 }') +ln -s ${utf8file} ${new}/${original} +done +rm -f ${TEMP2} /usr/bin/sed -E -e 's/[ ]+/ /g' \ ${CLDRDIR}/posix/UTF-8.cm \ - > ${new}/map.UTF-8 - cp ${base}/../etc/charmaps/widths.txt ${new}/ + > ${base}/../etc/final-maps/map.UTF-8 +CHARMAPS="ARMSCII-8 Big5 Big5HKSCS CP1131 CP1251 \ + 
CP866 GB2312 GBK ISCII-DEV ISO8859-1 \ + ISO8859-13 ISO8859-15 ISO8859-2 ISO8859-4 \ + ISO8859-5 ISO8859-7 ISO8859-9 KOI8-R KOI8-U \ + PT154 SJIS US-ASCII eucCN eucJP eucKR" + +# GB18030 blows up, use pre-generate Illumos version + +for map in ${CHARMAPS} +do +encoding=${map} +if [ ${map} = "Big5HKSCS" ] +then +encoding="Big5" +fi +/usr/local/bin/perl ${base}/convert_map.pl \ + ${base}/../etc/charmaps/${map}.TXT ${encoding} \ + | /usr/bin/sed -E -e 's/ +/ /g' \ + > ${base}/../etc/final-maps/map.${map} + echo map ${map} converted. +done + fi grep -v '^LOCALES+' ${old}/Makefile | awk "${AWKCMD}" > ${new}/Makefile -- 2.41.0