Add localedef(1), a locale definition generator tool
authorJohn Marino <draco@marino.st>
Tue, 28 Jul 2015 16:31:53 +0000 (18:31 +0200)
committerJohn Marino <draco@marino.st>
Tue, 28 Jul 2015 17:55:15 +0000 (19:55 +0200)
The localedef tool can read entire (and unmodified) CLDR posix definition
files, and generate all 6 LC categories: LC_COLLATE, LC_CTYPE, LC_TIME,
LC_NUMERIC, LC_MONETARY and LC_MESSAGES.

The last 4 of those aren't needed.  We already have a tool that generates
msgdef, timedef, moneydef and numericdef.  In the immediate future,
localedef will only be used generate LC_COLLATE files in a new format.
This will render colldef files unreadable, thus colldef will be removed
when this happens.

In the future, localedef will be tasked to generate LC_CTYPE files as
well.  When that happens, the mklocale tool will be retired.

While localedef *can* read pristine POSIX files (which causes 6 files
to be generated), it will given files with only the LC_COLLATE part,
which will also have all the white space removed as well.  Remove just
the spaces can save megabytes.

This tool has a long history with Solaris [1].  The Nexenta developers
modified it to read CLDR files and created the much richer collation
formats.  The libc collation functions have to be modified to read the
new format (called "DragonFly-4.4") and to handle the new data structures.

The result will be that locale-sensitive tools and functions will now
properly sort multibyte and unicode strings.  Our "BSD" sort is not locale
sensitive, so it will probably have to be replaced with GNU sort in order
to leverage our new collation capabilities.

This can't be hooked into the build yet.  It needs the new header for
collate.c to define the data structures.  Until that happens, this is
actually unbuildable.

[1] Linux also has a tool called localdef, but I do know know if it shares
    a common history or if it uses CLDR POSIX files.  It seems to have the
    same purpose though.

17 files changed:
usr.bin/localedef/Makefile [new file with mode: 0644]
usr.bin/localedef/avl.c [new file with mode: 0644]
usr.bin/localedef/avl.h [new file with mode: 0644]
usr.bin/localedef/avl_impl.h [new file with mode: 0644]
usr.bin/localedef/charmap.c [new file with mode: 0644]
usr.bin/localedef/collate.c [new file with mode: 0644]
usr.bin/localedef/ctype.c [new file with mode: 0644]
usr.bin/localedef/localedef.1 [new file with mode: 0644]
usr.bin/localedef/localedef.c [new file with mode: 0644]
usr.bin/localedef/localedef.h [new file with mode: 0644]
usr.bin/localedef/messages.c [new file with mode: 0644]
usr.bin/localedef/monetary.c [new file with mode: 0644]
usr.bin/localedef/numeric.c [new file with mode: 0644]
usr.bin/localedef/parser.y [new file with mode: 0644]
usr.bin/localedef/scanner.c [new file with mode: 0644]
usr.bin/localedef/time.c [new file with mode: 0644]
usr.bin/localedef/wide.c [new file with mode: 0644]

diff --git a/usr.bin/localedef/Makefile b/usr.bin/localedef/Makefile
new file mode 100644 (file)
index 0000000..732a252
--- /dev/null
@@ -0,0 +1,24 @@
+# Adapted from Illumos by John Marino <draco@marino.st>
+
+PROG=  localedef
+SRCS=  charmap.c \
+       collate.c \
+       ctype.c \
+       localedef.c \
+       messages.c \
+       monetary.c \
+       numeric.c \
+       parser.y \
+       scanner.c \
+       time.c \
+       wide.c \
+       avl.c
+
+${SRCS:M*.c}: parser.h
+parser.h: parser.y
+
+CFLAGS+=       -I. -I${.CURDIR}
+CFLAGS+=       -I${.CURDIR}/../../lib/libc/locale
+CFLAGS+=       -I${.CURDIR}/../../lib/libc/stdtime
+
+.include <bsd.prog.mk>
diff --git a/usr.bin/localedef/avl.c b/usr.bin/localedef/avl.c
new file mode 100644 (file)
index 0000000..5c8bfd7
--- /dev/null
@@ -0,0 +1,997 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2015 John Marino <draco@marino.st>
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+/*
+ * AVL - generic AVL tree implementation for kernel use
+ *
+ * A complete description of AVL trees can be found in many CS textbooks.
+ *
+ * Here is a very brief overview. An AVL tree is a binary search tree that is
+ * almost perfectly balanced. By "almost" perfectly balanced, we mean that at
+ * any given node, the left and right subtrees are allowed to differ in height
+ * by at most 1 level.
+ *
+ * This relaxation from a perfectly balanced binary tree allows doing
+ * insertion and deletion relatively efficiently. Searching the tree is
+ * still a fast operation, roughly O(log(N)).
+ *
+ * The key to insertion and deletion is a set of tree manipulations called
+ * rotations, which bring unbalanced subtrees back into the semi-balanced state.
+ *
+ * This implementation of AVL trees has the following peculiarities:
+ *
+ *     - The AVL specific data structures are physically embedded as fields
+ *       in the "using" data structures.  To maintain generality the code
+ *       must constantly translate between "avl_node_t *" and containing
+ *       data structure "void *"s by adding/subtracting the avl_offset.
+ *
+ *     - Since the AVL data is always embedded in other structures, there is
+ *       no locking or memory allocation in the AVL routines. This must be
+ *       provided for by the enclosing data structure's semantics. Typically,
+ *       avl_insert()/_add()/_remove()/avl_insert_here() require some kind of
+ *       exclusive write lock. Other operations require a read lock.
+ *
+ *      - The implementation uses iteration instead of explicit recursion,
+ *       since it is intended to run on limited size kernel stacks. Since
+ *       there is no recursion stack present to move "up" in the tree,
+ *       there is an explicit "parent" link in the avl_node_t.
+ *
+ *      - The left/right children pointers of a node are in an array.
+ *       In the code, variables (instead of constants) are used to represent
+ *       left and right indices.  The implementation is written as if it only
+ *       dealt with left handed manipulations.  By changing the value assigned
+ *       to "left", the code also works for right handed trees.  The
+ *       following variables/terms are frequently used:
+ *
+ *             int left;       // 0 when dealing with left children,
+ *                             // 1 for dealing with right children
+ *
+ *             int left_heavy; // -1 when left subtree is taller at some node,
+ *                             // +1 when right subtree is taller
+ *
+ *             int right;      // will be the opposite of left (0 or 1)
+ *             int right_heavy;// will be the opposite of left_heavy (-1 or 1)
+ *
+ *             int direction;  // 0 for "<" (ie. left child); 1 for ">" (right)
+ *
+ *       Though it is a little more confusing to read the code, the approach
+ *       allows using half as much code (and hence cache footprint) for tree
+ *       manipulations and eliminates many conditional branches.
+ *
+ *     - The avl_index_t is an opaque "cookie" used to find nodes at or
+ *       adjacent to where a new value would be inserted in the tree. The value
+ *       is a modified "avl_node_t *".  The bottom bit (normally 0 for a
+ *       pointer) is set to indicate if that the new node has a value greater
+ *       than the value of the indicated "avl_node_t *".
+ *
+ * Note - in addition to userland (e.g. libavl and libutil) and the kernel
+ * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
+ * which each have their own compilation environments and subsequent
+ * requirements. Each of these environments must be considered when adding
+ * dependencies from avl.c.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include "avl.h"
+
+
+/*
+ * Small arrays to translate between balance (or diff) values and child indices.
+ *
+ * Code that deals with binary tree data structures will randomly use
+ * left and right children when examining a tree.  C "if()" statements
+ * which evaluate randomly suffer from very poor hardware branch prediction.
+ * In this code we avoid some of the branch mispredictions by using the
+ * following translation arrays. They replace random branches with an
+ * additional memory reference. Since the translation arrays are both very
+ * small the data should remain efficiently in cache.
+ */
+static const int  avl_child2balance[2] = {-1, 1};
+static const int  avl_balance2child[]  = {0, 0, 1};
+
+
+/*
+ * Walk from one node to the previous valued node (ie. an infix walk
+ * towards the left). At any given node we do one of 2 things:
+ *
+ * - If there is a left child, go to it, then to it's rightmost descendant.
+ *
+ * - otherwise we return through parent nodes until we've come from a right
+ *   child.
+ *
+ * Return Value:
+ * NULL - if at the end of the nodes
+ * otherwise next node
+ */
+void *
+avl_walk(avl_tree_t *tree, void        *oldnode, int left)
+{
+       size_t off = tree->avl_offset;
+       avl_node_t *node = AVL_DATA2NODE(oldnode, off);
+       int right = 1 - left;
+       int was_child;
+
+
+       /*
+        * nowhere to walk to if tree is empty
+        */
+       if (node == NULL)
+               return (NULL);
+
+       /*
+        * Visit the previous valued node. There are two possibilities:
+        *
+        * If this node has a left child, go down one left, then all
+        * the way right.
+        */
+       if (node->avl_child[left] != NULL) {
+               for (node = node->avl_child[left];
+                   node->avl_child[right] != NULL;
+                   node = node->avl_child[right])
+                       ;
+       /*
+        * Otherwise, return thru left children as far as we can.
+        */
+       } else {
+               for (;;) {
+                       was_child = AVL_XCHILD(node);
+                       node = AVL_XPARENT(node);
+                       if (node == NULL)
+                               return (NULL);
+                       if (was_child == right)
+                               break;
+               }
+       }
+
+       return (AVL_NODE2DATA(node, off));
+}
+
+/*
+ * Return the lowest valued node in a tree or NULL.
+ * (leftmost child from root of tree)
+ */
+void *
+avl_first(avl_tree_t *tree)
+{
+       avl_node_t *node;
+       avl_node_t *prev = NULL;
+       size_t off = tree->avl_offset;
+
+       for (node = tree->avl_root; node != NULL; node = node->avl_child[0])
+               prev = node;
+
+       if (prev != NULL)
+               return (AVL_NODE2DATA(prev, off));
+       return (NULL);
+}
+
+/*
+ * Return the highest valued node in a tree or NULL.
+ * (rightmost child from root of tree)
+ */
+void *
+avl_last(avl_tree_t *tree)
+{
+       avl_node_t *node;
+       avl_node_t *prev = NULL;
+       size_t off = tree->avl_offset;
+
+       for (node = tree->avl_root; node != NULL; node = node->avl_child[1])
+               prev = node;
+
+       if (prev != NULL)
+               return (AVL_NODE2DATA(prev, off));
+       return (NULL);
+}
+
+/*
+ * Access the node immediately before or after an insertion point.
+ *
+ * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child
+ *
+ * Return value:
+ *     NULL: no node in the given direction
+ *     "void *"  of the found tree node
+ */
+void *
+avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
+{
+       int child = AVL_INDEX2CHILD(where);
+       avl_node_t *node = AVL_INDEX2NODE(where);
+       void *data;
+       size_t off = tree->avl_offset;
+
+       if (node == NULL) {
+               return (NULL);
+       }
+       data = AVL_NODE2DATA(node, off);
+       if (child != direction)
+               return (data);
+
+       return (avl_walk(tree, data, direction));
+}
+
+
+/*
+ * Search for the node which contains "value".  The algorithm is a
+ * simple binary tree search.
+ *
+ * return value:
+ *     NULL: the value is not in the AVL tree
+ *             *where (if not NULL)  is set to indicate the insertion point
+ *     "void *"  of the found tree node
+ */
+void *
+avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
+{
+       avl_node_t *node;
+       avl_node_t *prev = NULL;
+       int child = 0;
+       int diff;
+       size_t off = tree->avl_offset;
+
+       for (node = tree->avl_root; node != NULL;
+           node = node->avl_child[child]) {
+
+               prev = node;
+
+               diff = tree->avl_compar(value, AVL_NODE2DATA(node, off));
+               if (diff == 0) {
+#ifdef DEBUG
+                       if (where != NULL)
+                               *where = 0;
+#endif
+                       return (AVL_NODE2DATA(node, off));
+               }
+               child = avl_balance2child[1 + diff];
+
+       }
+
+       if (where != NULL)
+               *where = AVL_MKINDEX(prev, child);
+
+       return (NULL);
+}
+
+
+/*
+ * Perform a rotation to restore balance at the subtree given by depth.
+ *
+ * This routine is used by both insertion and deletion. The return value
+ * indicates:
+ *      0 : subtree did not change height
+ *     !0 : subtree was reduced in height
+ *
+ * The code is written as if handling left rotations, right rotations are
+ * symmetric and handled by swapping values of variables right/left[_heavy]
+ *
+ * On input balance is the "new" balance at "node". This value is either
+ * -2 or +2.
+ */
+static int
+avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance)
+{
+       int left = !(balance < 0);      /* when balance = -2, left will be 0 */
+       int right = 1 - left;
+       int left_heavy = balance >> 1;
+       int right_heavy = -left_heavy;
+       avl_node_t *parent = AVL_XPARENT(node);
+       avl_node_t *child = node->avl_child[left];
+       avl_node_t *cright;
+       avl_node_t *gchild;
+       avl_node_t *gright;
+       avl_node_t *gleft;
+       int which_child = AVL_XCHILD(node);
+       int child_bal = AVL_XBALANCE(child);
+
+       /* BEGIN CSTYLED */
+       /*
+        * case 1 : node is overly left heavy, the left child is balanced or
+        * also left heavy. This requires the following rotation.
+        *
+        *                   (node bal:-2)
+        *                    /           \
+        *                   /             \
+        *              (child bal:0 or -1)
+        *              /    \
+        *             /      \
+        *                     cright
+        *
+        * becomes:
+        *
+        *              (child bal:1 or 0)
+        *              /        \
+        *             /          \
+        *                        (node bal:-1 or 0)
+        *                         /     \
+        *                        /       \
+        *                     cright
+        *
+        * we detect this situation by noting that child's balance is not
+        * right_heavy.
+        */
+       /* END CSTYLED */
+       if (child_bal != right_heavy) {
+
+               /*
+                * compute new balance of nodes
+                *
+                * If child used to be left heavy (now balanced) we reduced
+                * the height of this sub-tree -- used in "return...;" below
+                */
+               child_bal += right_heavy; /* adjust towards right */
+
+               /*
+                * move "cright" to be node's left child
+                */
+               cright = child->avl_child[right];
+               node->avl_child[left] = cright;
+               if (cright != NULL) {
+                       AVL_SETPARENT(cright, node);
+                       AVL_SETCHILD(cright, left);
+               }
+
+               /*
+                * move node to be child's right child
+                */
+               child->avl_child[right] = node;
+               AVL_SETBALANCE(node, -child_bal);
+               AVL_SETCHILD(node, right);
+               AVL_SETPARENT(node, child);
+
+               /*
+                * update the pointer into this subtree
+                */
+               AVL_SETBALANCE(child, child_bal);
+               AVL_SETCHILD(child, which_child);
+               AVL_SETPARENT(child, parent);
+               if (parent != NULL)
+                       parent->avl_child[which_child] = child;
+               else
+                       tree->avl_root = child;
+
+               return (child_bal == 0);
+       }
+
+       /* BEGIN CSTYLED */
+       /*
+        * case 2 : When node is left heavy, but child is right heavy we use
+        * a different rotation.
+        *
+        *                   (node b:-2)
+        *                    /   \
+        *                   /     \
+        *                  /       \
+        *             (child b:+1)
+        *              /     \
+        *             /       \
+        *                   (gchild b: != 0)
+        *                     /  \
+        *                    /    \
+        *                 gleft   gright
+        *
+        * becomes:
+        *
+        *              (gchild b:0)
+        *              /       \
+        *             /         \
+        *            /           \
+        *        (child b:?)   (node b:?)
+        *         /  \          /   \
+        *        /    \        /     \
+        *            gleft   gright
+        *
+        * computing the new balances is more complicated. As an example:
+        *       if gchild was right_heavy, then child is now left heavy
+        *              else it is balanced
+        */
+       /* END CSTYLED */
+       gchild = child->avl_child[right];
+       gleft = gchild->avl_child[left];
+       gright = gchild->avl_child[right];
+
+       /*
+        * move gright to left child of node and
+        *
+        * move gleft to right child of node
+        */
+       node->avl_child[left] = gright;
+       if (gright != NULL) {
+               AVL_SETPARENT(gright, node);
+               AVL_SETCHILD(gright, left);
+       }
+
+       child->avl_child[right] = gleft;
+       if (gleft != NULL) {
+               AVL_SETPARENT(gleft, child);
+               AVL_SETCHILD(gleft, right);
+       }
+
+       /*
+        * move child to left child of gchild and
+        *
+        * move node to right child of gchild and
+        *
+        * fixup parent of all this to point to gchild
+        */
+       balance = AVL_XBALANCE(gchild);
+       gchild->avl_child[left] = child;
+       AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0));
+       AVL_SETPARENT(child, gchild);
+       AVL_SETCHILD(child, left);
+
+       gchild->avl_child[right] = node;
+       AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0));
+       AVL_SETPARENT(node, gchild);
+       AVL_SETCHILD(node, right);
+
+       AVL_SETBALANCE(gchild, 0);
+       AVL_SETPARENT(gchild, parent);
+       AVL_SETCHILD(gchild, which_child);
+       if (parent != NULL)
+               parent->avl_child[which_child] = gchild;
+       else
+               tree->avl_root = gchild;
+
+       return (1);     /* the new tree is always shorter */
+}
+
+
+/*
+ * Insert a new node into an AVL tree at the specified (from avl_find()) place.
+ *
+ * Newly inserted nodes are always leaf nodes in the tree, since avl_find()
+ * searches out to the leaf positions.  The avl_index_t indicates the node
+ * which will be the parent of the new node.
+ *
+ * After the node is inserted, a single rotation further up the tree may
+ * be necessary to maintain an acceptable AVL balance.
+ */
+void
+avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where)
+{
+       avl_node_t *node;
+       avl_node_t *parent = AVL_INDEX2NODE(where);
+       int old_balance;
+       int new_balance;
+       int which_child = AVL_INDEX2CHILD(where);
+       size_t off = tree->avl_offset;
+
+       node = AVL_DATA2NODE(new_data, off);
+
+       /*
+        * First, add the node to the tree at the indicated position.
+        */
+       ++tree->avl_numnodes;
+
+       node->avl_child[0] = NULL;
+       node->avl_child[1] = NULL;
+
+       AVL_SETCHILD(node, which_child);
+       AVL_SETBALANCE(node, 0);
+       AVL_SETPARENT(node, parent);
+       if (parent != NULL) {
+               parent->avl_child[which_child] = node;
+       } else {
+               tree->avl_root = node;
+       }
+       /*
+        * Now, back up the tree modifying the balance of all nodes above the
+        * insertion point. If we get to a highly unbalanced ancestor, we
+        * need to do a rotation.  If we back out of the tree we are done.
+        * If we brought any subtree into perfect balance (0), we are also done.
+        */
+       for (;;) {
+               node = parent;
+               if (node == NULL)
+                       return;
+
+               /*
+                * Compute the new balance
+                */
+               old_balance = AVL_XBALANCE(node);
+               new_balance = old_balance + avl_child2balance[which_child];
+
+               /*
+                * If we introduced equal balance, then we are done immediately
+                */
+               if (new_balance == 0) {
+                       AVL_SETBALANCE(node, 0);
+                       return;
+               }
+
+               /*
+                * If both old and new are not zero we went
+                * from -1 to -2 balance, do a rotation.
+                */
+               if (old_balance != 0)
+                       break;
+
+               AVL_SETBALANCE(node, new_balance);
+               parent = AVL_XPARENT(node);
+               which_child = AVL_XCHILD(node);
+       }
+
+       /*
+        * perform a rotation to fix the tree and return
+        */
+       (void) avl_rotation(tree, node, new_balance);
+}
+
+/*
+ * Insert "new_data" in "tree" in the given "direction" either after or
+ * before (AVL_AFTER, AVL_BEFORE) the data "here".
+ *
+ * Insertions can only be done at empty leaf points in the tree, therefore
+ * if the given child of the node is already present we move to either
+ * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since
+ * every other node in the tree is a leaf, this always works.
+ *
+ * To help developers using this interface, we assert that the new node
+ * is correctly ordered at every step of the way in DEBUG kernels.
+ */
+void
+avl_insert_here(
+       avl_tree_t *tree,
+       void *new_data,
+       void *here,
+       int direction)
+{
+       avl_node_t *node;
+       int child = direction;  /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */
+#ifdef DEBUG
+       int diff;
+#endif
+
+       /*
+        * If corresponding child of node is not NULL, go to the neighboring
+        * node and reverse the insertion direction.
+        */
+       node = AVL_DATA2NODE(here, tree->avl_offset);
+
+#ifdef DEBUG
+       diff = tree->avl_compar(new_data, here);
+#endif
+
+       if (node->avl_child[child] != NULL) {
+               node = node->avl_child[child];
+               child = 1 - child;
+               while (node->avl_child[child] != NULL) {
+#ifdef DEBUG
+                       diff = tree->avl_compar(new_data,
+                           AVL_NODE2DATA(node, tree->avl_offset));
+#endif
+                       node = node->avl_child[child];
+               }
+#ifdef DEBUG
+               diff = tree->avl_compar(new_data,
+                   AVL_NODE2DATA(node, tree->avl_offset));
+#endif
+       }
+
+       avl_insert(tree, new_data, AVL_MKINDEX(node, child));
+}
+
+/*
+ * Add a new node to an AVL tree.
+ */
+void
+avl_add(avl_tree_t *tree, void *new_node)
+{
+       avl_index_t where;
+
+       /*
+        * This is unfortunate.  We want to call panic() here, even for
+        * non-DEBUG kernels.  In userland, however, we can't depend on anything
+        * in libc or else the rtld build process gets confused.  So, all we can
+        * do in userland is resort to a normal ASSERT().
+        */
+       if (avl_find(tree, new_node, &where) != NULL)
+       avl_insert(tree, new_node, where);
+}
+
+/*
+ * Delete a node from the AVL tree.  Deletion is similar to insertion, but
+ * with 2 complications.
+ *
+ * First, we may be deleting an interior node. Consider the following subtree:
+ *
+ *     d           c            c
+ *    / \         / \          / \
+ *   b   e       b   e        b   e
+ *  / \                / \          /
+ * a   c       a            a
+ *
+ * When we are deleting node (d), we find and bring up an adjacent valued leaf
+ * node, say (c), to take the interior node's place. In the code this is
+ * handled by temporarily swapping (d) and (c) in the tree and then using
+ * common code to delete (d) from the leaf position.
+ *
+ * Secondly, an interior deletion from a deep tree may require more than one
+ * rotation to fix the balance. This is handled by moving up the tree through
+ * parents and applying rotations as needed. The return value from
+ * avl_rotation() is used to detect when a subtree did not change overall
+ * height due to a rotation.
+ */
+void
+avl_remove(avl_tree_t *tree, void *data)
+{
+       avl_node_t *delete;
+       avl_node_t *parent;
+       avl_node_t *node;
+       avl_node_t tmp;
+       int old_balance;
+       int new_balance;
+       int left;
+       int right;
+       int which_child;
+       size_t off = tree->avl_offset;
+
+       delete = AVL_DATA2NODE(data, off);
+
+       /*
+        * Deletion is easiest with a node that has at most 1 child.
+        * We swap a node with 2 children with a sequentially valued
+        * neighbor node. That node will have at most 1 child. Note this
+        * has no effect on the ordering of the remaining nodes.
+        *
+        * As an optimization, we choose the greater neighbor if the tree
+        * is right heavy, otherwise the left neighbor. This reduces the
+        * number of rotations needed.
+        */
+       if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) {
+
+               /*
+                * choose node to swap from whichever side is taller
+                */
+               old_balance = AVL_XBALANCE(delete);
+               left = avl_balance2child[old_balance + 1];
+               right = 1 - left;
+
+               /*
+                * get to the previous value'd node
+                * (down 1 left, as far as possible right)
+                */
+               for (node = delete->avl_child[left];
+                   node->avl_child[right] != NULL;
+                   node = node->avl_child[right])
+                       ;
+
+               /*
+                * create a temp placeholder for 'node'
+                * move 'node' to delete's spot in the tree
+                */
+               tmp = *node;
+
+               *node = *delete;
+               if (node->avl_child[left] == node)
+                       node->avl_child[left] = &tmp;
+
+               parent = AVL_XPARENT(node);
+               if (parent != NULL)
+                       parent->avl_child[AVL_XCHILD(node)] = node;
+               else
+                       tree->avl_root = node;
+               AVL_SETPARENT(node->avl_child[left], node);
+               AVL_SETPARENT(node->avl_child[right], node);
+
+               /*
+                * Put tmp where node used to be (just temporary).
+                * It always has a parent and at most 1 child.
+                */
+               delete = &tmp;
+               parent = AVL_XPARENT(delete);
+               parent->avl_child[AVL_XCHILD(delete)] = delete;
+               which_child = (delete->avl_child[1] != 0);
+               if (delete->avl_child[which_child] != NULL)
+                       AVL_SETPARENT(delete->avl_child[which_child], delete);
+       }
+
+
+       /*
+        * Here we know "delete" is at least partially a leaf node. It can
+        * be easily removed from the tree.
+        */
+       --tree->avl_numnodes;
+       parent = AVL_XPARENT(delete);
+       which_child = AVL_XCHILD(delete);
+       if (delete->avl_child[0] != NULL)
+               node = delete->avl_child[0];
+       else
+               node = delete->avl_child[1];
+
+       /*
+        * Connect parent directly to node (leaving out delete).
+        */
+       if (node != NULL) {
+               AVL_SETPARENT(node, parent);
+               AVL_SETCHILD(node, which_child);
+       }
+       if (parent == NULL) {
+               tree->avl_root = node;
+               return;
+       }
+       parent->avl_child[which_child] = node;
+
+
+       /*
+        * Since the subtree is now shorter, begin adjusting parent balances
+        * and performing any needed rotations.
+        */
+       do {
+
+               /*
+                * Move up the tree and adjust the balance
+                *
+                * Capture the parent and which_child values for the next
+                * iteration before any rotations occur.
+                */
+               node = parent;
+               old_balance = AVL_XBALANCE(node);
+               new_balance = old_balance - avl_child2balance[which_child];
+               parent = AVL_XPARENT(node);
+               which_child = AVL_XCHILD(node);
+
+               /*
+                * If a node was in perfect balance but isn't anymore then
+                * we can stop, since the height didn't change above this point
+                * due to a deletion.
+                */
+               if (old_balance == 0) {
+                       AVL_SETBALANCE(node, new_balance);
+                       break;
+               }
+
+               /*
+                * If the new balance is zero, we don't need to rotate
+                * else
+                * need a rotation to fix the balance.
+                * If the rotation doesn't change the height
+                * of the sub-tree we have finished adjusting.
+                */
+               if (new_balance == 0)
+                       AVL_SETBALANCE(node, new_balance);
+               else if (!avl_rotation(tree, node, new_balance))
+                       break;
+       } while (parent != NULL);
+}
+
+#define        AVL_REINSERT(tree, obj)         \
+       avl_remove((tree), (obj));      \
+       avl_add((tree), (obj))
+
+boolean_t
+avl_update_lt(avl_tree_t *t, void *obj)
+{
+       void *neighbor;
+
+       neighbor = AVL_PREV(t, obj);
+       if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
+               AVL_REINSERT(t, obj);
+               return (B_TRUE);
+       }
+
+       return (B_FALSE);
+}
+
+boolean_t
+avl_update_gt(avl_tree_t *t, void *obj)
+{
+       void *neighbor;
+
+       neighbor = AVL_NEXT(t, obj);
+       if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
+               AVL_REINSERT(t, obj);
+               return (B_TRUE);
+       }
+
+       return (B_FALSE);
+}
+
+boolean_t
+avl_update(avl_tree_t *t, void *obj)
+{
+       void *neighbor;
+
+       neighbor = AVL_PREV(t, obj);
+       if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
+               AVL_REINSERT(t, obj);
+               return (B_TRUE);
+       }
+
+       neighbor = AVL_NEXT(t, obj);
+       if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
+               AVL_REINSERT(t, obj);
+               return (B_TRUE);
+       }
+
+       return (B_FALSE);
+}
+
+void
+avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
+{
+       avl_node_t *temp_node;
+       ulong temp_numnodes;
+
+       temp_node = tree1->avl_root;
+       temp_numnodes = tree1->avl_numnodes;
+       tree1->avl_root = tree2->avl_root;
+       tree1->avl_numnodes = tree2->avl_numnodes;
+       tree2->avl_root = temp_node;
+       tree2->avl_numnodes = temp_numnodes;
+}
+
+/*
+ * initialize a new AVL tree
+ */
+void
+avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *),
+    size_t size, size_t offset)
+{
+       tree->avl_compar = compar;
+       tree->avl_root = NULL;
+       tree->avl_numnodes = 0;
+       tree->avl_size = size;
+       tree->avl_offset = offset;
+}
+
+/*
+ * Delete a tree.
+ */
+/* ARGSUSED */
+void
+avl_destroy(avl_tree_t *tree __unused)
+{
+       /* was asserts */
+}
+
+
+/*
+ * Return the number of nodes in an AVL tree.
+ */
+ulong
+avl_numnodes(avl_tree_t *tree)
+{
+       return (tree->avl_numnodes);
+}
+
+boolean_t
+avl_is_empty(avl_tree_t *tree)
+{
+       return (tree->avl_numnodes == 0);
+}
+
+#define        CHILDBIT        (1L)
+
+/*
+ * Post-order tree walk used to visit all tree nodes and destroy the tree
+ * in post order. This is used for destroying a tree without paying any cost
+ * for rebalancing it.
+ *
+ * example:
+ *
+ *     void *cookie = NULL;
+ *     my_data_t *node;
+ *
+ *     while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
+ *             free(node);
+ *     avl_destroy(tree);
+ *
+ * The cookie is really an avl_node_t to the current node's parent and
+ * an indication of which child you looked at last.
+ *
+ * On input, a cookie value of CHILDBIT indicates the tree is done.
+ */
+void *
+avl_destroy_nodes(avl_tree_t *tree, void **cookie)
+{
+       avl_node_t      *node;
+       avl_node_t      *parent;
+       int             child;
+       void            *first;
+       size_t          off = tree->avl_offset;
+
+       /*
+        * Initial calls go to the first node or it's right descendant.
+        */
+       if (*cookie == NULL) {
+               first = avl_first(tree);
+
+               /*
+                * deal with an empty tree
+                */
+               if (first == NULL) {
+                       *cookie = (void *)CHILDBIT;
+                       return (NULL);
+               }
+
+               node = AVL_DATA2NODE(first, off);
+               parent = AVL_XPARENT(node);
+               goto check_right_side;
+       }
+
+       /*
+        * If there is no parent to return to we are done.
+        */
+       parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT);
+       if (parent == NULL) {
+               if (tree->avl_root != NULL) {
+                       tree->avl_root = NULL;
+                       tree->avl_numnodes = 0;
+               }
+               return (NULL);
+       }
+
+       /*
+        * Remove the child pointer we just visited from the parent and tree.
+        */
+       child = (uintptr_t)(*cookie) & CHILDBIT;
+       parent->avl_child[child] = NULL;
+       --tree->avl_numnodes;
+
+       /*
+        * If we just did a right child or there isn't one, go up to parent.
+        */
+       if (child == 1 || parent->avl_child[1] == NULL) {
+               node = parent;
+               parent = AVL_XPARENT(parent);
+               goto done;
+       }
+
+       /*
+        * Do parent's right child, then leftmost descendent.
+        */
+       node = parent->avl_child[1];
+       while (node->avl_child[0] != NULL) {
+               parent = node;
+               node = node->avl_child[0];
+       }
+
+       /*
+        * If here, we moved to a left child. It may have one
+        * child on the right (when balance == +1).
+        */
+check_right_side:
+       if (node->avl_child[1] != NULL) {
+               parent = node;
+               node = node->avl_child[1];
+       }
+
+done:
+       if (parent == NULL) {
+               *cookie = (void *)CHILDBIT;
+       } else {
+               *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node));
+       }
+
+       return (AVL_NODE2DATA(node, off));
+}
diff --git a/usr.bin/localedef/avl.h b/usr.bin/localedef/avl.h
new file mode 100644 (file)
index 0000000..dd8fc89
--- /dev/null
@@ -0,0 +1,323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2015 John Marino <draco@marino.st>
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef        _AVL_H
+#define        _AVL_H
+
+/*
+ * This is a private header file.  Applications should not directly include
+ * this file.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include "avl_impl.h"
+
+/* LOCAL MODIFICATION */
+typedef enum { B_FALSE, B_TRUE } boolean_t;
+
+
+/*
+ * This is a generic implementation of AVL trees for use in the Solaris kernel.
+ * The interfaces provide an efficient way of implementing an ordered set of
+ * data structures.
+ *
+ * AVL trees provide an alternative to using an ordered linked list. Using AVL
+ * trees will usually be faster, however they requires more storage. An ordered
+ * linked list in general requires 2 pointers in each data structure. The
+ * AVL tree implementation uses 3 pointers. The following chart gives the
+ * approximate performance of operations with the different approaches:
+ *
+ *     Operation        Link List      AVL tree
+ *     ---------        --------       --------
+ *     lookup             O(n)         O(log(n))
+ *
+ *     insert 1 node    constant       constant
+ *
+ *     delete 1 node    constant       between constant and O(log(n))
+ *
+ *     delete all nodes   O(n)         O(n)
+ *
+ *     visit the next
+ *     or prev node     constant       between constant and O(log(n))
+ *
+ *
+ * The data structure nodes are anchored at an "avl_tree_t" (the equivalent
+ * of a list header) and the individual nodes will have a field of
+ * type "avl_node_t" (corresponding to list pointers).
+ *
+ * The type "avl_index_t" is used to indicate a position in the list for
+ * certain calls.
+ *
+ * The usage scenario is generally:
+ *
+ * 1. Create the list/tree with: avl_create()
+ *
+ * followed by any mixture of:
+ *
+ * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert()
+ *
+ * 2b. Visited elements with:
+ *      avl_first() - returns the lowest valued node
+ *      avl_last() - returns the highest valued node
+ *      AVL_NEXT() - given a node go to next higher one
+ *      AVL_PREV() - given a node go to previous lower one
+ *
+ * 2c.  Find the node with the closest value either less than or greater
+ *     than a given value with avl_nearest().
+ *
+ * 2d. Remove individual nodes from the list/tree with avl_remove().
+ *
+ * and finally when the list is being destroyed
+ *
+ * 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes.
+ *    Note that once you use avl_destroy_nodes(), you can no longer
+ *    use any routine except avl_destroy_nodes() and avl_destoy().
+ *
+ * 4. Use avl_destroy() to destroy the AVL tree itself.
+ *
+ * Any locking for multiple thread access is up to the user to provide, just
+ * as is needed for any linked list implementation.
+ */
+
+
+/*
+ * Type used for the root of the AVL tree.
+ */
+typedef struct avl_tree avl_tree_t;
+
+/*
+ * The data nodes in the AVL tree must have a field of this type.
+ */
+typedef struct avl_node avl_node_t;
+
+/*
+ * An opaque type used to locate a position in the tree where a node
+ * would be inserted.
+ */
+typedef uintptr_t avl_index_t;
+
+
+/*
+ * Direction constants used for avl_nearest().
+ */
+#define        AVL_BEFORE      (0)
+#define        AVL_AFTER       (1)
+
+
+/*
+ * Prototypes
+ *
+ * Where not otherwise mentioned, "void *" arguments are a pointer to the
+ * user data structure which must contain a field of type avl_node_t.
+ *
+ * Also assume the user data structures looks like:
+ *     stuct my_type {
+ *             ...
+ *             avl_node_t      my_link;
+ *             ...
+ *     };
+ */
+
+/*
+ * Initialize an AVL tree. Arguments are:
+ *
+ * tree   - the tree to be initialized
+ * compar - function to compare two nodes, it must return exactly: -1, 0, or +1
+ *          -1 for <, 0 for ==, and +1 for >
+ * size   - the value of sizeof(struct my_type)
+ * offset - the value of OFFSETOF(struct my_type, my_link)
+ */
+extern void avl_create(avl_tree_t *tree,
+       int (*compar) (const void *, const void *), size_t size, size_t offset);
+
+
+/*
+ * Find a node with a matching value in the tree. Returns the matching node
+ * found. If not found, it returns NULL and then if "where" is not NULL it sets
+ * "where" for use with avl_insert() or avl_nearest().
+ *
+ * node   - node that has the value being looked for
+ * where  - position for use with avl_nearest() or avl_insert(), may be NULL
+ */
+extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where);
+
+/*
+ * Insert a node into the tree.
+ *
+ * node   - the node to insert
+ * where  - position as returned from avl_find()
+ */
+extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where);
+
+/*
+ * Insert "new_data" in "tree" in the given "direction" either after
+ * or before the data "here".
+ *
+ * This might be useful for avl clients caching recently accessed
+ * data to avoid doing avl_find() again for insertion.
+ *
+ * new_data    - new data to insert
+ * here                - existing node in "tree"
+ * direction   - either AVL_AFTER or AVL_BEFORE the data "here".
+ */
+extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here,
+    int direction);
+
+
+/*
+ * Return the first or last valued node in the tree. Will return NULL
+ * if the tree is empty.
+ *
+ */
+extern void *avl_first(avl_tree_t *tree);
+extern void *avl_last(avl_tree_t *tree);
+
+
+/*
+ * Return the next or previous valued node in the tree.
+ * AVL_NEXT() will return NULL if at the last node.
+ * AVL_PREV() will return NULL if at the first node.
+ *
+ * node   - the node from which the next or previous node is found
+ */
+#define        AVL_NEXT(tree, node)    avl_walk(tree, node, AVL_AFTER)
+#define        AVL_PREV(tree, node)    avl_walk(tree, node, AVL_BEFORE)
+
+
+/*
+ * Find the node with the nearest value either greater or less than
+ * the value from a previous avl_find(). Returns the node or NULL if
+ * there isn't a matching one.
+ *
+ * where     - position as returned from avl_find()
+ * direction - either AVL_BEFORE or AVL_AFTER
+ *
+ * EXAMPLE get the greatest node that is less than a given value:
+ *
+ *     avl_tree_t *tree;
+ *     struct my_data look_for_value = {....};
+ *     struct my_data *node;
+ *     struct my_data *less;
+ *     avl_index_t where;
+ *
+ *     node = avl_find(tree, &look_for_value, &where);
+ *     if (node != NULL)
+ *             less = AVL_PREV(tree, node);
+ *     else
+ *             less = avl_nearest(tree, where, AVL_BEFORE);
+ */
+extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction);
+
+
+/*
+ * Add a single node to the tree.
+ * The node must not be in the tree, and it must not
+ * compare equal to any other node already in the tree.
+ *
+ * node   - the node to add
+ */
+extern void avl_add(avl_tree_t *tree, void *node);
+
+
+/*
+ * Remove a single node from the tree.  The node must be in the tree.
+ *
+ * node   - the node to remove
+ */
+extern void avl_remove(avl_tree_t *tree, void *node);
+
+/*
+ * Reinsert a node only if its order has changed relative to its nearest
+ * neighbors. To optimize performance avl_update_lt() checks only the previous
+ * node and avl_update_gt() checks only the next node. Use avl_update_lt() and
+ * avl_update_gt() only if you know the direction in which the order of the
+ * node may change.
+ */
+extern boolean_t avl_update(avl_tree_t *, void *);
+extern boolean_t avl_update_lt(avl_tree_t *, void *);
+extern boolean_t avl_update_gt(avl_tree_t *, void *);
+
+/*
+ * Swaps the contents of the two trees.
+ */
+extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
+
+/*
+ * Return the number of nodes in the tree
+ */
+extern ulong avl_numnodes(avl_tree_t *tree);
+
+/*
+ * Return B_TRUE if there are zero nodes in the tree, B_FALSE otherwise.
+ */
+extern boolean_t avl_is_empty(avl_tree_t *tree);
+
+/*
+ * Used to destroy any remaining nodes in a tree. The cookie argument should
+ * be initialized to NULL before the first call. Returns a node that has been
+ * removed from the tree and may be free()'d. Returns NULL when the tree is
+ * empty.
+ *
+ * Once you call avl_destroy_nodes(), you can only continuing calling it and
+ * finally avl_destroy(). No other AVL routines will be valid.
+ *
+ * cookie - a "void *" used to save state between calls to avl_destroy_nodes()
+ *
+ * EXAMPLE:
+ *     avl_tree_t *tree;
+ *     struct my_data *node;
+ *     void *cookie;
+ *
+ *     cookie = NULL;
+ *     while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
+ *             free(node);
+ *     avl_destroy(tree);
+ */
+extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie);
+
+
+/*
+ * Final destroy of an AVL tree. Arguments are:
+ *
+ * tree   - the empty tree to destroy
+ */
+extern void avl_destroy(avl_tree_t *tree);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AVL_H */
diff --git a/usr.bin/localedef/avl_impl.h b/usr.bin/localedef/avl_impl.h
new file mode 100644 (file)
index 0000000..d513906
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2015 John Marino <draco@marino.st>
+ * Use is subject to license terms.
+ */
+
+#ifndef        _AVL_IMPL_H
+#define        _AVL_IMPL_H
+
+
+/*
+ * This is a private header file.  Applications should not directly include
+ * this file.
+ */
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * generic AVL tree implementation for kernel use
+ *
+ * There are 5 pieces of information stored for each node in an AVL tree
+ *
+ *     pointer to less than child
+ *     pointer to greater than child
+ *     a pointer to the parent of this node
+ *     an indication  [0/1]  of which child I am of my parent
+ *     a "balance" (-1, 0, +1)  indicating which child tree is taller
+ *
+ * Since they only need 3 bits, the last two fields are packed into the
+ * bottom bits of the parent pointer on 64 bit machines to save on space.
+ */
+
+#ifndef _LP64
+
+struct avl_node {
+       struct avl_node *avl_child[2];  /* left/right children */
+       struct avl_node *avl_parent;    /* this node's parent */
+       unsigned short avl_child_index; /* my index in parent's avl_child[] */
+       short avl_balance;              /* balance value: -1, 0, +1 */
+};
+
+#define        AVL_XPARENT(n)          ((n)->avl_parent)
+#define        AVL_SETPARENT(n, p)     ((n)->avl_parent = (p))
+
+#define        AVL_XCHILD(n)           ((n)->avl_child_index)
+#define        AVL_SETCHILD(n, c)      ((n)->avl_child_index = (unsigned short)(c))
+
+#define        AVL_XBALANCE(n)         ((n)->avl_balance)
+#define        AVL_SETBALANCE(n, b)    ((n)->avl_balance = (short)(b))
+
+#else /* _LP64 */
+
+/*
+ * for 64 bit machines, avl_pcb contains parent pointer, balance and child_index
+ * values packed in the following manner:
+ *
+ * |63                                  3|        2        |1          0 |
+ * |-------------------------------------|-----------------|-------------|
+ * |      avl_parent hi order bits       | avl_child_index | avl_balance |
+ * |                                     |                 |     + 1     |
+ * |-------------------------------------|-----------------|-------------|
+ *
+ */
+struct avl_node {
+       struct avl_node *avl_child[2];  /* left/right children nodes */
+       uintptr_t avl_pcb;              /* parent, child_index, balance */
+};
+
+/*
+ * macros to extract/set fields in avl_pcb
+ *
+ * pointer to the parent of the current node is the high order bits
+ */
+#define        AVL_XPARENT(n)          ((struct avl_node *)((n)->avl_pcb & ~7))
+#define        AVL_SETPARENT(n, p)                                             \
+       ((n)->avl_pcb = (((n)->avl_pcb & 7) | (uintptr_t)(p)))
+
+/*
+ * index of this node in its parent's avl_child[]: bit #2
+ */
+#define        AVL_XCHILD(n)           (((n)->avl_pcb >> 2) & 1)
+#define        AVL_SETCHILD(n, c)                                              \
+       ((n)->avl_pcb = (uintptr_t)(((n)->avl_pcb & ~4) | ((c) << 2)))
+
+/*
+ * balance indication for a node, lowest 2 bits. A valid balance is
+ * -1, 0, or +1, and is encoded by adding 1 to the value to get the
+ * unsigned values of 0, 1, 2.
+ */
+#define        AVL_XBALANCE(n)         ((int)(((n)->avl_pcb & 3) - 1))
+#define        AVL_SETBALANCE(n, b)                                            \
+       ((n)->avl_pcb = (uintptr_t)((((n)->avl_pcb & ~3) | ((b) + 1))))
+
+#endif /* _LP64 */
+
+
+
+/*
+ * switch between a node and data pointer for a given tree
+ * the value of "o" is tree->avl_offset
+ */
+#define        AVL_NODE2DATA(n, o)     ((void *)((uintptr_t)(n) - (o)))
+#define        AVL_DATA2NODE(d, o)     ((struct avl_node *)((uintptr_t)(d) + (o)))
+
+
+
+/*
+ * macros used to create/access an avl_index_t
+ */
+#define        AVL_INDEX2NODE(x)       ((avl_node_t *)((x) & ~1))
+#define        AVL_INDEX2CHILD(x)      ((x) & 1)
+#define        AVL_MKINDEX(n, c)       ((avl_index_t)(n) | (c))
+
+
+/*
+ * The tree structure. The fields avl_root, avl_compar, and avl_offset come
+ * first since they are needed for avl_find().  We want them to fit into
+ * a single 64 byte cache line to make avl_find() as fast as possible.
+ */
+struct avl_tree {
+       struct avl_node *avl_root;      /* root node in tree */
+       int (*avl_compar)(const void *, const void *);
+       size_t avl_offset;              /* offsetof(type, avl_link_t field) */
+       ulong avl_numnodes;             /* number of nodes in the tree */
+       size_t avl_size;                /* sizeof user type struct */
+};
+
+
+/*
+ * This will only by used via AVL_NEXT() or AVL_PREV()
+ */
+extern void *avl_walk(struct avl_tree *, void *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AVL_IMPL_H */
diff --git a/usr.bin/localedef/charmap.c b/usr.bin/localedef/charmap.c
new file mode 100644 (file)
index 0000000..6ab6f88
--- /dev/null
@@ -0,0 +1,346 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * CHARMAP file handling for localedef.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <unistd.h>
+#include <stddef.h>
+#include <unistd.h>
+#include "localedef.h"
+#include "parser.h"
+#include "avl.h"
+
+static avl_tree_t      cmap_sym;
+static avl_tree_t      cmap_wc;
+
+typedef struct charmap {
+       const char *name;
+       wchar_t wc;
+       avl_node_t avl_sym;
+       avl_node_t avl_wc;
+} charmap_t;
+
+
+/*
+ * Array of POSIX specific portable characters.
+ */
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
+
+static const struct {
+       char    *name;
+       int     ch;
+} portable_chars[] = {
+       { "NUL",                '\0' },
+       { "alert",              '\a' },
+       { "backspace",          '\b' },
+       { "tab",                '\t' },
+       { "carriage-return",    '\r' },
+       { "newline",            '\n' },
+       { "vertical-tab",       '\v' },
+       { "form-feed",          '\f' },
+       { "space",              ' ' },
+       { "exclamation-mark",   '!' },
+       { "quotation-mark",     '"' },
+       { "number-sign",        '#' },
+       { "dollar-sign",        '$' },
+       { "percent-sign",       '%' },
+       { "ampersand",          '&' },
+       { "apostrophe",         '\'' },
+       { "left-parenthesis",   '(' },
+       { "right-parenthesis",  '(' },
+       { "asterisk",           '*' },
+       { "plus-sign",          '+' },
+       { "comma",               ','},
+       { "hyphen-minus",       '-' },
+       { "hyphen",             '-' },
+       { "full-stop",          '.' },
+       { "period",             '.' },
+       { "slash",              '/' },
+       { "solidus",            '/' },
+       { "zero",               '0' },
+       { "one",                '1' },
+       { "two",                '2' },
+       { "three",              '3' },
+       { "four",               '4' },
+       { "five",               '5' },
+       { "six",                '6' },
+       { "seven",              '7' },
+       { "eight",              '8' },
+       { "nine",               '9' },
+       { "colon",              ':' },
+       { "semicolon",          ';' },
+       { "less-than-sign",     '<' },
+       { "equals-sign",        '=' },
+       { "greater-than-sign",  '>' },
+       { "question-mark",      '?' },
+       { "commercial-at",      '@' },
+       { "left-square-bracket", '[' },
+       { "backslash",          '\\' },
+       { "reverse-solidus",    '\\' },
+       { "right-square-bracket", ']' },
+       { "circumflex",         '^' },
+       { "circumflex-accent",  '^' },
+       { "low-line",           '_' },
+       { "underscore",         '_' },
+       { "grave-accent",       '`' },
+       { "left-brace",         '{' },
+       { "left-curly-bracket", '{' },
+       { "vertical-line",      '|' },
+       { "right-brace",        '}' },
+       { "right-curly-bracket", '}' },
+       { "tilde",              '~' },
+       { "A", 'A' },
+       { "B", 'B' },
+       { "C", 'C' },
+       { "D", 'D' },
+       { "E", 'E' },
+       { "F", 'F' },
+       { "G", 'G' },
+       { "H", 'H' },
+       { "I", 'I' },
+       { "J", 'J' },
+       { "K", 'K' },
+       { "L", 'L' },
+       { "M", 'M' },
+       { "N", 'N' },
+       { "O", 'O' },
+       { "P", 'P' },
+       { "Q", 'Q' },
+       { "R", 'R' },
+       { "S", 'S' },
+       { "T", 'T' },
+       { "U", 'U' },
+       { "V", 'V' },
+       { "W", 'W' },
+       { "X", 'X' },
+       { "Y", 'Y' },
+       { "Z", 'Z' },
+       { "a", 'a' },
+       { "b", 'b' },
+       { "c", 'c' },
+       { "d", 'd' },
+       { "e", 'e' },
+       { "f", 'f' },
+       { "g", 'g' },
+       { "h", 'h' },
+       { "i", 'i' },
+       { "j", 'j' },
+       { "k", 'k' },
+       { "l", 'l' },
+       { "m", 'm' },
+       { "n", 'n' },
+       { "o", 'o' },
+       { "p", 'p' },
+       { "q", 'q' },
+       { "r", 'r' },
+       { "s", 's' },
+       { "t", 't' },
+       { "u", 'u' },
+       { "v", 'v' },
+       { "w", 'w' },
+       { "x", 'x' },
+       { "y", 'y' },
+       { "z", 'z' },
+       { NULL, 0 }
+};
+
+#pragma GCC diagnostic pop
+
+static int
+cmap_compare_sym(const void *n1, const void *n2)
+{
+       const charmap_t *c1 = n1;
+       const charmap_t *c2 = n2;
+       int rv;
+
+       rv = strcmp(c1->name, c2->name);
+       return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
+}
+
+static int
+cmap_compare_wc(const void *n1, const void *n2)
+{
+       const charmap_t *c1 = n1;
+       const charmap_t *c2 = n2;
+
+       return ((c1->wc < c2->wc) ? -1 : (c1->wc > c2->wc) ? 1 : 0);
+}
+
+void
+init_charmap(void)
+{
+       avl_create(&cmap_sym, cmap_compare_sym, sizeof (charmap_t),
+           offsetof(charmap_t, avl_sym));
+
+       avl_create(&cmap_wc, cmap_compare_wc, sizeof (charmap_t),
+           offsetof(charmap_t, avl_wc));
+}
+
+static void
+add_charmap_impl(char *sym, wchar_t wc, int nodups)
+{
+       charmap_t       srch;
+       charmap_t       *n = NULL;
+       avl_index_t     where;
+
+       srch.wc = wc;
+       srch.name = sym;
+
+       /*
+        * also possibly insert the wide mapping, although note that there
+        * can only be one of these per wide character code.
+        */
+       if ((wc != -1) && ((avl_find(&cmap_wc, &srch, &where)) == NULL)) {
+               if ((n = calloc(1, sizeof (*n))) == NULL) {
+                       errf("out of memory");
+                       return;
+               }
+               n->wc = wc;
+               avl_insert(&cmap_wc, n, where);
+       }
+
+       if (sym) {
+               if (avl_find(&cmap_sym, &srch, &where) != NULL) {
+                       if (nodups) {
+                               errf("duplicate character definition");
+                       }
+                       return;
+               }
+               if ((n == NULL) && ((n = calloc(1, sizeof (*n))) == NULL)) {
+                       errf("out of memory");
+                       return;
+               }
+               n->wc = wc;
+               n->name = sym;
+
+               avl_insert(&cmap_sym, n, where);
+       }
+}
+
+void
+add_charmap(char *sym, int c)
+{
+       add_charmap_impl(sym, c, 1);
+}
+
+void
+add_charmap_undefined(char *sym)
+{
+       charmap_t srch;
+       charmap_t *cm = NULL;
+
+       srch.name = sym;
+       cm = avl_find(&cmap_sym, &srch, NULL);
+
+       if ((undefok == 0) && ((cm == NULL) || (cm->wc == -1))) {
+               warn("undefined symbol <%s>", sym);
+               add_charmap_impl(sym, -1, 0);
+       } else {
+               free(sym);
+       }
+}
+
+void
+add_charmap_range(char *s, char *e, int wc)
+{
+       int     ls, le;
+       int     si;
+       int     sn, en;
+       int     i;
+
+       static const char *digits = "0123456789";
+
+       ls = strlen(s);
+       le = strlen(e);
+
+       if (((si = strcspn(s, digits)) == 0) || (si == ls) ||
+           (strncmp(s, e, si) != 0) ||
+           ((int)strspn(s + si, digits) != (ls - si)) ||
+           ((int)strspn(e + si, digits) != (le - si)) ||
+           ((sn = atoi(s + si)) > ((en = atoi(e + si))))) {
+               errf("malformed charmap range");
+               return;
+       }
+
+       s[si] = 0;
+
+       for (i = sn; i <= en; i++) {
+               char *nn;
+               (void) asprintf(&nn, "%s%0*u", s, ls - si, i);
+               if (nn == NULL) {
+                       errf("out of memory");
+                       return;
+               }
+
+               add_charmap_impl(nn, wc, 1);
+               wc++;
+       }
+       free(s);
+       free(e);
+}
+
+void
+add_charmap_char(char *name, int val)
+{
+       add_charmap_impl(name, val, 0);
+}
+
+/*
+ * POSIX insists that certain entries be present, even when not in the
+ * orginal charmap file.
+ */
+void
+add_charmap_posix(void)
+{
+       int     i;
+
+       for (i = 0; portable_chars[i].name; i++) {
+               add_charmap_char(portable_chars[i].name, portable_chars[i].ch);
+       }
+}
+
+int
+lookup_charmap(const char *sym, wchar_t *wc)
+{
+       charmap_t       srch;
+       charmap_t       *n;
+
+       srch.name = sym;
+       n = avl_find(&cmap_sym, &srch, NULL);
+       if (n && n->wc != -1) {
+               if (wc)
+                       *wc = n->wc;
+               return (0);
+       }
+       return (-1);
+}
+
+int
+check_charmap(wchar_t wc)
+{
+       charmap_t srch;
+
+       srch.wc = wc;
+       return (avl_find(&cmap_wc, &srch, NULL) ? 0 : -1);
+}
diff --git a/usr.bin/localedef/collate.c b/usr.bin/localedef/collate.c
new file mode 100644 (file)
index 0000000..c635e6f
--- /dev/null
@@ -0,0 +1,1283 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * LC_COLLATE database generation routines for localedef.
+ */
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <string.h>
+#include <unistd.h>
+#include <wchar.h>
+#include <limits.h>
+#include "localedef.h"
+#include "parser.h"
+#include "collate.h"
+#include "avl.h"
+
+/*
+ * Design notes.
+ *
+ * It will be extremely helpful to the reader if they have access to
+ * the localedef and locale file format specifications available.
+ * Latest versions of these are available from www.opengroup.org.
+ *
+ * The design for the collation code is a bit complex.  The goal is a
+ * single collation database as described in collate.h (in
+ * libc/port/locale).  However, there are some other tidbits:
+ *
+ * a) The substitution entries are now a directly indexable array.  A
+ * priority elsewhere in the table is taken as an index into the
+ * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
+ * set.  (The bit is cleared and the result is the index into the
+ * table.
+ *
+ * b) We eliminate duplicate entries into the substitution table.
+ * This saves a lot of space.
+ *
+ * c) The priorities for each level are "compressed", so that each
+ * sorting level has consecutively numbered priorities starting at 1.
+ * (O is reserved for the ignore priority.)  This means sort levels
+ * which only have a few distinct priorities can represent the
+ * priority level in fewer bits, which makes the strxfrm output
+ * smaller.
+ *
+ * d) We record the total number of priorities so that strxfrm can
+ * figure out how many bytes to expand a numeric priority into.
+ *
+ * e) For the UNDEFINED pass (the last pass), we record the maximum
+ * number of bits needed to uniquely prioritize these entries, so that
+ * the last pass can also use smaller strxfrm output when possible.
+ *
+ * f) Priorities with the sign bit set are verboten.  This works out
+ * because no active character set needs that bit to carry significant
+ * information once the character is in wide form.
+ *
+ * To process the entire data to make the database, we actually run
+ * multiple passes over the data.
+ *
+ * The first pass, which is done at parse time, identifies elements,
+ * substitutions, and such, and records them in priority order.  As
+ * some priorities can refer to other priorities, using forward
+ * references, we use a table of references indicating whether the
+ * priority's value has been resolved, or whether it is still a
+ * reference.
+ *
+ * The second pass walks over all the items in priority order, noting
+ * that they are used directly, and not just an indirect reference.
+ * This is done by creating a "weight" structure for the item.  The
+ * weights are stashed in an AVL tree sorted by relative "priority".
+ *
+ * The third pass walks over all the weight structures, in priority
+ * order, and assigns a new monotonically increasing (per sort level)
+ * weight value to them.  These are the values that will actually be
+ * written to the file.
+ *
+ * The fourth pass just writes the data out.
+ */
+
+/*
+ * In order to resolve the priorities, we create a table of priorities.
+ * Entries in the table can be in one of three states.
+ *
+ * UNKNOWN is for newly allocated entries, and indicates that nothing
+ * is known about the priority.  (For example, when new entries are created
+ * for collating-symbols, this is the value assigned for them until the
+ * collating symbol's order has been determined.
+ *
+ * RESOLVED is used for an entry where the priority indicates the final
+ * numeric weight.
+ *
+ * REFER is used for entries that reference other entries.  Typically
+ * this is used for forward references.  A collating-symbol can never
+ * have this value.
+ *
+ * The "pass" field is used during final resolution to aid in detection
+ * of referencing loops.  (For example <A> depends on <B>, but <B> has its
+ * priority dependent on <A>.)
+ */
+typedef enum {
+       UNKNOWN,        /* priority is totally unknown */
+       RESOLVED,       /* priority value fully resolved */
+       REFER           /* priority is a reference (index) */
+} res_t;
+
+typedef struct weight {
+       int32_t         pri;
+       int             opt;
+       avl_node_t      avl;
+} weight_t;
+
+typedef struct priority {
+       res_t           res;
+       int32_t         pri;
+       int             pass;
+       int             lineno;
+} collpri_t;
+
+#define        NUM_WT  collinfo.directive_count
+
+/*
+ * These are the abstract collating symbols, which are just a symbolic
+ * way to reference a priority.
+ */
+struct collsym {
+       char            *name;
+       int32_t         ref;
+       avl_node_t      avl;
+};
+
+/*
+ * These are also abstract collating symbols, but we allow them to have
+ * different priorities at different levels.
+ */
+typedef struct collundef {
+       char            *name;
+       int32_t         ref[COLL_WEIGHTS_MAX];
+       avl_node_t      avl;
+} collundef_t;
+
+/*
+ * These are called "chains" in libc.  This records the fact that two
+ * more characters should be treated as a single collating entity when
+ * they appear together.  For example, in Spanish <C><h> gets collated
+ * as a character between <C> and <D>.
+ */
+struct collelem {
+       char            *symbol;
+       wchar_t         *expand;
+       int32_t         ref[COLL_WEIGHTS_MAX];
+       avl_node_t      avl_bysymbol;
+       avl_node_t      avl_byexpand;
+};
+
+/*
+ * Individual characters have a sequence of weights as well.
+ */
+typedef struct collchar {
+       wchar_t         wc;
+       int32_t         ref[COLL_WEIGHTS_MAX];
+       avl_node_t      avl;
+} collchar_t;
+
+/*
+ * Substitution entries.  The key is itself a priority.  Note that
+ * when we create one of these, we *automatically* wind up with a
+ * fully resolved priority for the key, because creation of
+ * substitutions creates a resolved priority at the same time.
+ */
+typedef struct {
+       int32_t         key;
+       int32_t         ref[COLLATE_STR_LEN];
+       avl_node_t      avl;
+       avl_node_t      avl_ref;
+} subst_t;
+
+static avl_tree_t      collsyms;
+static avl_tree_t      collundefs;
+static avl_tree_t      elem_by_symbol;
+static avl_tree_t      elem_by_expand;
+static avl_tree_t      collchars;
+static avl_tree_t      substs[COLL_WEIGHTS_MAX];
+static avl_tree_t      substs_ref[COLL_WEIGHTS_MAX];
+static avl_tree_t      weights[COLL_WEIGHTS_MAX];
+static int32_t         nweight[COLL_WEIGHTS_MAX];
+
+/*
+ * This is state tracking for the ellipsis token.  Note that we start
+ * the initial values so that the ellipsis logic will think we got a
+ * magic starting value of NUL.  It starts at minus one because the
+ * starting point is exclusive -- i.e. the starting point is not
+ * itself handled by the ellipsis code.
+ */
+static int currorder = EOF;
+static int lastorder = EOF;
+static collelem_t *currelem;
+static collchar_t *currchar;
+static collundef_t *currundef;
+static wchar_t ellipsis_start = 0;
+static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
+
+/*
+ * We keep a running tally of weights.
+ */
+static int nextpri = 1;
+static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
+
+/*
+ * This array collects up the weights for each level.
+ */
+static int32_t order_weights[COLL_WEIGHTS_MAX];
+static int curr_weight = 0;
+static int32_t subst_weights[COLLATE_STR_LEN];
+static int curr_subst = 0;
+
+/*
+ * Some initial priority values.
+ */
+static int32_t pri_undefined[COLL_WEIGHTS_MAX];
+static int32_t pri_ignore;
+
+static collate_info_t collinfo;
+
+static collpri_t       *prilist = NULL;
+static int             numpri = 0;
+static int             maxpri = 0;
+
+static void start_order(int);
+
+static int32_t
+new_pri(void)
+{
+       int i;
+
+       if (numpri >= maxpri) {
+               maxpri = maxpri ? maxpri * 2 : 1024;
+               prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
+               if (prilist == NULL) {
+                       fprintf(stderr,"out of memory");
+                       return (-1);
+               }
+               for (i = numpri; i < maxpri; i++) {
+                       prilist[i].res = UNKNOWN;
+                       prilist[i].pri = 0;
+                       prilist[i].pass = 0;
+               }
+       }
+       return (numpri++);
+}
+
+static collpri_t *
+get_pri(int32_t ref)
+{
+       if ((ref < 0) || (ref > numpri)) {
+               INTERR;
+               return (NULL);
+       }
+       return (&prilist[ref]);
+}
+
+static void
+set_pri(int32_t ref, int32_t v, res_t res)
+{
+       collpri_t       *pri;
+
+       pri = get_pri(ref);
+
+       if ((res == REFER) && ((v < 0) || (v >= numpri))) {
+               INTERR;
+       }
+
+       /* Resolve self references */
+       if ((res == REFER) && (ref == v)) {
+               v = nextpri;
+               res = RESOLVED;
+       }
+
+       if (pri->res != UNKNOWN) {
+               warn("repeated item in order list (first on %d)",
+                   pri->lineno);
+               return;
+       }
+       pri->lineno = lineno;
+       pri->pri = v;
+       pri->res = res;
+}
+
+static int32_t
+resolve_pri(int32_t ref)
+{
+       collpri_t       *pri;
+       static int32_t  pass = 0;
+
+       pri = get_pri(ref);
+       pass++;
+       while (pri->res == REFER) {
+               if (pri->pass == pass) {
+                       /* report a line with the circular symbol */
+                       lineno = pri->lineno;
+                       fprintf(stderr,"circular reference in order list");
+                       return (-1);
+               }
+               if ((pri->pri < 0) || (pri->pri >= numpri)) {
+                       INTERR;
+                       return (-1);
+               }
+               pri->pass = pass;
+               pri = &prilist[pri->pri];
+       }
+
+       if (pri->res == UNKNOWN) {
+               return (-1);
+       }
+       if (pri->res != RESOLVED)
+               INTERR;
+
+       return (pri->pri);
+}
+
+static int
+weight_compare(const void *n1, const void *n2)
+{
+       int32_t k1 = ((const weight_t *)n1)->pri;
+       int32_t k2 = ((const weight_t *)n2)->pri;
+
+       return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
+}
+
+static int
+collsym_compare(const void *n1, const void *n2)
+{
+       const collsym_t *c1 = n1;
+       const collsym_t *c2 = n2;
+       int rv;
+
+       rv = strcmp(c1->name, c2->name);
+       return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
+}
+
+static int
+collundef_compare(const void *n1, const void *n2)
+{
+       const collundef_t *c1 = n1;
+       const collundef_t *c2 = n2;
+       int rv;
+
+       rv = strcmp(c1->name, c2->name);
+       return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
+}
+
+static int
+element_compare_symbol(const void *n1, const void *n2)
+{
+       const collelem_t *c1 = n1;
+       const collelem_t *c2 = n2;
+       int rv;
+
+       rv = strcmp(c1->symbol, c2->symbol);
+       return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
+}
+
+static int
+element_compare_expand(const void *n1, const void *n2)
+{
+       const collelem_t *c1 = n1;
+       const collelem_t *c2 = n2;
+       int rv;
+
+       rv = wcscmp(c1->expand, c2->expand);
+       return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
+}
+
+static int
+collchar_compare(const void *n1, const void *n2)
+{
+       wchar_t k1 = ((const collchar_t *)n1)->wc;
+       wchar_t k2 = ((const collchar_t *)n2)->wc;
+
+       return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
+}
+
+static int
+subst_compare(const void *n1, const void *n2)
+{
+       int32_t k1 = ((const subst_t *)n1)->key;
+       int32_t k2 = ((const subst_t *)n2)->key;
+
+       return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+static int
+subst_compare_ref(const void *n1, const void *n2)
+{
+       int32_t *c1 = ((subst_t *)n1)->ref;
+       int32_t *c2 = ((subst_t *)n2)->ref;
+       int rv;
+
+       rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
+       return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
+}
+
+#pragma GCC diagnostic pop
+
+void
+init_collate(void)
+{
+       int i;
+
+       avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
+           offsetof(collsym_t, avl));
+
+       avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
+           offsetof(collundef_t, avl));
+
+       avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
+           offsetof(collelem_t, avl_bysymbol));
+       avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
+           offsetof(collelem_t, avl_byexpand));
+
+       avl_create(&collchars, collchar_compare, sizeof (collchar_t),
+           offsetof(collchar_t, avl));
+
+       for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
+               avl_create(&substs[i], subst_compare, sizeof (subst_t),
+                   offsetof(subst_t, avl));
+               avl_create(&substs_ref[i], subst_compare_ref,
+                   sizeof (subst_t), offsetof(subst_t, avl_ref));
+               avl_create(&weights[i], weight_compare, sizeof (weight_t),
+                   offsetof(weight_t, avl));
+               nweight[i] = 1;
+       }
+
+       (void) memset(&collinfo, 0, sizeof (collinfo));
+
+       /* allocate some initial priorities */
+       pri_ignore = new_pri();
+
+       set_pri(pri_ignore, 0, RESOLVED);
+
+       for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
+               pri_undefined[i] = new_pri();
+
+               /* we will override this later */
+               set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
+       }
+}
+
+void
+define_collsym(char *name)
+{
+       collsym_t       *sym;
+       avl_index_t     where;
+
+       if ((sym = calloc(sizeof (*sym), 1)) == NULL) {
+               fprintf(stderr,"out of memory");
+               return;
+       }
+       sym->name = name;
+       sym->ref = new_pri();
+
+       if (avl_find(&collsyms, sym, &where) != NULL) {
+               /*
+                * This should never happen because we are only called
+                * for undefined symbols.
+                */
+               INTERR;
+               return;
+       }
+       avl_insert(&collsyms, sym, where);
+}
+
+collsym_t *
+lookup_collsym(char *name)
+{
+       collsym_t       srch;
+
+       srch.name = name;
+       return (avl_find(&collsyms, &srch, NULL));
+}
+
+collelem_t *
+lookup_collelem(char *symbol)
+{
+       collelem_t      srch;
+
+       srch.symbol = symbol;
+       return (avl_find(&elem_by_symbol, &srch, NULL));
+}
+
+static collundef_t *
+get_collundef(char *name)
+{
+       collundef_t     srch;
+       collundef_t     *ud;
+       avl_index_t     where;
+       int             i;
+
+       srch.name = name;
+       if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
+               if (((ud = calloc(sizeof (*ud), 1)) == NULL) ||
+                   ((ud->name = strdup(name)) == NULL)) {
+                       fprintf(stderr,"out of memory");
+                       return (NULL);
+               }
+               for (i = 0; i < NUM_WT; i++) {
+                       ud->ref[i] = new_pri();
+               }
+               avl_insert(&collundefs, ud, where);
+       }
+       add_charmap_undefined(name);
+       return (ud);
+}
+
+static collchar_t *
+get_collchar(wchar_t wc, int create)
+{
+       collchar_t      srch;
+       collchar_t      *cc;
+       avl_index_t     where;
+       int             i;
+
+       srch.wc = wc;
+       cc = avl_find(&collchars, &srch, &where);
+       if ((cc == NULL) && create) {
+               if ((cc = calloc(sizeof (*cc), 1)) == NULL) {
+                       fprintf(stderr, "out of memory");
+                       return (NULL);
+               }
+               for (i = 0; i < NUM_WT; i++) {
+                       cc->ref[i] = new_pri();
+               }
+               cc->wc = wc;
+               avl_insert(&collchars, cc, where);
+       }
+       return (cc);
+}
+
+void
+end_order_collsym(collsym_t *sym)
+{
+       start_order(T_COLLSYM);
+       /* update the weight */
+
+       set_pri(sym->ref, nextpri, RESOLVED);
+       nextpri++;
+}
+
+void
+end_order(void)
+{
+       int             i;
+       int32_t         pri;
+       int32_t         ref;
+       collpri_t       *p;
+
+       /* advance the priority/weight */
+       pri = nextpri;
+
+       switch (currorder) {
+       case T_CHAR:
+               for (i = 0; i < NUM_WT; i++) {
+                       if (((ref = order_weights[i]) < 0) ||
+                           ((p = get_pri(ref)) == NULL) ||
+                           (p->pri == -1)) {
+                               /* unspecified weight is a self reference */
+                               set_pri(currchar->ref[i], pri, RESOLVED);
+                       } else {
+                               set_pri(currchar->ref[i], ref, REFER);
+                       }
+                       order_weights[i] = -1;
+               }
+
+               /* leave a cookie trail in case next symbol is ellipsis */
+               ellipsis_start = currchar->wc + 1;
+               currchar = NULL;
+               break;
+
+       case T_ELLIPSIS:
+               /* save off the weights were we can find them */
+               for (i = 0; i < NUM_WT; i++) {
+                       ellipsis_weights[i] = order_weights[i];
+                       order_weights[i] = -1;
+               }
+               break;
+
+       case T_COLLELEM:
+               if (currelem == NULL) {
+                       INTERR;
+               } else {
+                       for (i = 0; i < NUM_WT; i++) {
+
+                               if (((ref = order_weights[i]) < 0) ||
+                                   ((p = get_pri(ref)) == NULL) ||
+                                   (p->pri == -1)) {
+                                       set_pri(currelem->ref[i], pri,
+                                           RESOLVED);
+                               } else {
+                                       set_pri(currelem->ref[i], ref, REFER);
+                               }
+                               order_weights[i] = -1;
+                       }
+               }
+               break;
+
+       case T_UNDEFINED:
+               for (i = 0; i < NUM_WT; i++) {
+                       if (((ref = order_weights[i]) < 0) ||
+                           ((p = get_pri(ref)) == NULL) ||
+                           (p->pri == -1)) {
+                               set_pri(pri_undefined[i], -1, RESOLVED);
+                       } else {
+                               set_pri(pri_undefined[i], ref, REFER);
+                       }
+                       order_weights[i] = -1;
+               }
+               break;
+
+       case T_SYMBOL:
+               for (i = 0; i < NUM_WT; i++) {
+                       if (((ref = order_weights[i]) < 0) ||
+                           ((p = get_pri(ref)) == NULL) ||
+                           (p->pri == -1)) {
+                               set_pri(currundef->ref[i], pri, RESOLVED);
+                       } else {
+                               set_pri(currundef->ref[i], ref, REFER);
+                       }
+                       order_weights[i] = -1;
+               }
+               break;
+
+       default:
+               INTERR;
+       }
+
+       nextpri++;
+}
+
+static void
+start_order(int type)
+{
+       int     i;
+
+       lastorder = currorder;
+       currorder = type;
+
+       /* this is used to protect ELLIPSIS processing */
+       if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
+               fprintf(stderr, "character value expected");
+       }
+
+       for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
+               order_weights[i] = -1;
+       }
+       curr_weight = 0;
+}
+
+void
+start_order_undefined(void)
+{
+       start_order(T_UNDEFINED);
+}
+
+void
+start_order_symbol(char *name)
+{
+       currundef = get_collundef(name);
+       start_order(T_SYMBOL);
+}
+
+void
+start_order_char(wchar_t wc)
+{
+       collchar_t      *cc;
+       int32_t         ref;
+
+       start_order(T_CHAR);
+
+       /*
+        * If we last saw an ellipsis, then we need to close the range.
+        * Handle that here.  Note that we have to be careful because the
+        * items *inside* the range are treated exclusiveley to the items
+        * outside of the range.  The ends of the range can have quite
+        * different weights than the range members.
+        */
+       if (lastorder == T_ELLIPSIS) {
+               int             i;
+
+               if (wc < ellipsis_start) {
+                       fprintf(stderr, "malformed range!");
+                       return;
+               }
+               while (ellipsis_start < wc) {
+                       /*
+                        * pick all of the saved weights for the
+                        * ellipsis.  note that -1 encodes for the
+                        * ellipsis itself, which means to take the
+                        * current relative priority.
+                        */
+                       if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
+                               INTERR;
+                               return;
+                       }
+                       for (i = 0; i < NUM_WT; i++) {
+                               collpri_t *p;
+                               if (((ref = ellipsis_weights[i]) == -1) ||
+                                   ((p = get_pri(ref)) == NULL) ||
+                                   (p->pri == -1)) {
+                                       set_pri(cc->ref[i], nextpri, RESOLVED);
+                               } else {
+                                       set_pri(cc->ref[i], ref, REFER);
+                               }
+                               ellipsis_weights[i] = 0;
+                       }
+                       ellipsis_start++;
+                       nextpri++;
+               }
+       }
+
+       currchar = get_collchar(wc, 1);
+}
+
+void
+start_order_collelem(collelem_t *e)
+{
+       start_order(T_COLLELEM);
+       currelem = e;
+}
+
+void
+start_order_ellipsis(void)
+{
+       int     i;
+
+       start_order(T_ELLIPSIS);
+
+       if (lastorder != T_CHAR) {
+               fprintf(stderr, "illegal starting point for range");
+               return;
+       }
+
+       for (i = 0; i < NUM_WT; i++) {
+               ellipsis_weights[i] = order_weights[i];
+       }
+}
+
+void
+define_collelem(char *name, wchar_t *wcs)
+{
+       collelem_t      *e;
+       avl_index_t     where1;
+       avl_index_t     where2;
+       int             i;
+
+       if (wcslen(wcs) >= COLLATE_STR_LEN) {
+               fprintf(stderr,"expanded collation element too long");
+               return;
+       }
+
+       if ((e = calloc(sizeof (*e), 1)) == NULL) {
+               fprintf(stderr, "out of memory");
+               return;
+       }
+       e->expand = wcs;
+       e->symbol = name;
+
+       /*
+        * This is executed before the order statement, so we don't
+        * know how many priorities we *really* need.  We allocate one
+        * for each possible weight.  Not a big deal, as collating-elements
+        * prove to be quite rare.
+        */
+       for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
+               e->ref[i] = new_pri();
+       }
+
+       /* A character sequence can only reduce to one element. */
+       if ((avl_find(&elem_by_symbol, e, &where1) != NULL) ||
+           (avl_find(&elem_by_expand, e, &where2) != NULL)) {
+               fprintf(stderr, "duplicate collating element definition");
+               return;
+       }
+       avl_insert(&elem_by_symbol, e, where1);
+       avl_insert(&elem_by_expand, e, where2);
+}
+
+void
+add_order_bit(int kw)
+{
+       uint8_t bit = DIRECTIVE_UNDEF;
+
+       switch (kw) {
+       case T_FORWARD:
+               bit = DIRECTIVE_FORWARD;
+               break;
+       case T_BACKWARD:
+               bit = DIRECTIVE_BACKWARD;
+               break;
+       case T_POSITION:
+               bit = DIRECTIVE_POSITION;
+               break;
+       default:
+               INTERR;
+               break;
+       }
+       collinfo.directive[collinfo.directive_count] |= bit;
+}
+
+void
+add_order_directive(void)
+{
+       if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
+               fprintf(stderr,"too many directives (max %d)", COLL_WEIGHTS_MAX);
+       }
+       collinfo.directive_count++;
+}
+
+static void
+add_order_pri(int32_t ref)
+{
+       if (curr_weight >= NUM_WT) {
+               fprintf(stderr,"too many weights (max %d)", NUM_WT);
+               return;
+       }
+       order_weights[curr_weight] = ref;
+       curr_weight++;
+}
+
+void
+add_order_collsym(collsym_t *s)
+{
+       add_order_pri(s->ref);
+}
+
+void
+add_order_char(wchar_t wc)
+{
+       collchar_t *cc;
+
+       if ((cc = get_collchar(wc, 1)) == NULL) {
+               INTERR;
+               return;
+       }
+
+       add_order_pri(cc->ref[curr_weight]);
+}
+
+void
+add_order_collelem(collelem_t *e)
+{
+       add_order_pri(e->ref[curr_weight]);
+}
+
+void
+add_order_ignore(void)
+{
+       add_order_pri(pri_ignore);
+}
+
+void
+add_order_symbol(char *sym)
+{
+       collundef_t *c;
+       if ((c = get_collundef(sym)) == NULL) {
+               INTERR;
+               return;
+       }
+       add_order_pri(c->ref[curr_weight]);
+}
+
+void
+add_order_ellipsis(void)
+{
+       /* special NULL value indicates self reference */
+       add_order_pri(0);
+}
+
+void
+add_order_subst(void)
+{
+       subst_t srch;
+       subst_t *s;
+       avl_index_t where;
+       int i;
+
+       (void) memset(&srch, 0, sizeof (srch));
+       for (i = 0; i < curr_subst; i++) {
+               srch.ref[i] = subst_weights[i];
+               subst_weights[i] = 0;
+       }
+       s = avl_find(&substs_ref[curr_weight], &srch, &where);
+
+       if (s == NULL) {
+               if ((s = calloc(sizeof (*s), 1)) == NULL) {
+                       fprintf(stderr,"out of memory");
+                       return;
+               }
+               s->key = new_pri();
+
+               /*
+                * We use a self reference for our key, but we set a
+                * high bit to indicate that this is a substitution
+                * reference.  This will expedite table lookups later,
+                * and prevent table lookups for situations that don't
+                * require it.  (In short, its a big win, because we
+                * can skip a lot of binary searching.)
+                */
+               set_pri(s->key,
+                   (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
+                   RESOLVED);
+               nextsubst[curr_weight] += 1;
+
+               for (i = 0; i < curr_subst; i++) {
+                       s->ref[i] = srch.ref[i];
+               }
+
+               avl_insert(&substs_ref[curr_weight], s, where);
+
+               if (avl_find(&substs[curr_weight], s, &where) != NULL) {
+                       INTERR;
+                       return;
+               }
+               avl_insert(&substs[curr_weight], s, where);
+       }
+       curr_subst = 0;
+
+
+       /*
+        * We are using the current (unique) priority as a search key
+        * in the substitution table.
+        */
+       add_order_pri(s->key);
+}
+
+static void
+add_subst_pri(int32_t ref)
+{
+       if (curr_subst >= COLLATE_STR_LEN) {
+               fprintf(stderr,"substitution string is too long");
+               return;
+       }
+       subst_weights[curr_subst] = ref;
+       curr_subst++;
+}
+
+void
+add_subst_char(wchar_t wc)
+{
+       collchar_t *cc;
+
+
+       if (((cc = get_collchar(wc, 1)) == NULL) ||
+           (cc->wc != wc)) {
+               INTERR;
+               return;
+       }
+       /* we take the weight for the character at that position */
+       add_subst_pri(cc->ref[curr_weight]);
+}
+
+void
+add_subst_collelem(collelem_t *e)
+{
+       add_subst_pri(e->ref[curr_weight]);
+}
+
+void
+add_subst_collsym(collsym_t *s)
+{
+       add_subst_pri(s->ref);
+}
+
+void
+add_subst_symbol(char *ptr)
+{
+       collundef_t *cu;
+
+       if ((cu = get_collundef(ptr)) != NULL) {
+               add_subst_pri(cu->ref[curr_weight]);
+       }
+}
+
+void
+add_weight(int32_t ref, int pass)
+{
+       weight_t srch;
+       weight_t *w;
+       avl_index_t where;
+
+       srch.pri = resolve_pri(ref);
+
+       /* No translation of ignores */
+       if (srch.pri == 0)
+               return;
+
+       /* Substitution priorities are not weights */
+       if (srch.pri & COLLATE_SUBST_PRIORITY)
+               return;
+
+       if (avl_find(&weights[pass], &srch, &where) != NULL)
+               return;
+
+       if ((w = calloc(sizeof (*w), 1)) == NULL) {
+               fprintf(stderr, "out of memory");
+               return;
+       }
+       w->pri = srch.pri;
+       avl_insert(&weights[pass], w, where);
+}
+
+void
+add_weights(int32_t *refs)
+{
+       int i;
+       for (i = 0; i < NUM_WT; i++) {
+               add_weight(refs[i], i);
+       }
+}
+
+int32_t
+get_weight(int32_t ref, int pass)
+{
+       weight_t        srch;
+       weight_t        *w;
+       int32_t         pri;
+
+       pri = resolve_pri(ref);
+       if (pri & COLLATE_SUBST_PRIORITY) {
+               return (pri);
+       }
+       if (pri <= 0) {
+               return (pri);
+       }
+       srch.pri = pri;
+       if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
+               INTERR;
+               return (-1);
+       }
+       return (w->opt);
+}
+
+wchar_t *
+wsncpy(wchar_t *s1, const wchar_t *s2, size_t n)
+{
+       wchar_t *os1 = s1;
+
+       n++;
+       while (--n > 0 && (*s1++ = *s2++) != 0)
+               continue;
+       if (n > 0)
+               while (--n > 0)
+                       *s1++ = 0;
+       return (os1);
+}
+
+void
+dump_collate(void)
+{
+       FILE                    *f;
+       int                     i, j, n;
+       size_t                  sz;
+       int32_t                 pri;
+       collelem_t              *ce;
+       collchar_t              *cc;
+       subst_t                 *sb;
+       char                    vers[COLLATE_STR_LEN];
+       collate_char_t          chars[UCHAR_MAX + 1];
+       collate_large_t         *large;
+       collate_subst_t         *subst[COLL_WEIGHTS_MAX];
+       collate_chain_t         *chain;
+
+       /*
+        * We have to run throught a preliminary pass to identify all the
+        * weights that we use for each sorting level.
+        */
+       for (i = 0; i < NUM_WT; i++) {
+               add_weight(pri_ignore, i);
+       }
+       for (i = 0; i < NUM_WT; i++) {
+               for (sb = avl_first(&substs[i]); sb;
+                   sb = AVL_NEXT(&substs[i], sb)) {
+                       for (j = 0; sb->ref[j]; j++) {
+                               add_weight(sb->ref[j], i);
+                       }
+               }
+       }
+       for (ce = avl_first(&elem_by_expand);
+           ce != NULL;
+           ce = AVL_NEXT(&elem_by_expand, ce)) {
+               add_weights(ce->ref);
+       }
+       for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
+               add_weights(cc->ref);
+       }
+
+       /*
+        * Now we walk the entire set of weights, removing the gaps
+        * in the weights.  This gives us optimum usage.  The walk
+        * occurs in priority.
+        */
+       for (i = 0; i < NUM_WT; i++) {
+               weight_t *w;
+               for (w = avl_first(&weights[i]); w;
+                   w = AVL_NEXT(&weights[i], w)) {
+                       w->opt = nweight[i];
+                       nweight[i] += 1;
+               }
+       }
+
+       (void) memset(&chars, 0, sizeof (chars));
+       (void) memset(vers, 0, COLLATE_STR_LEN);
+       (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
+
+       /*
+        * We need to make sure we arrange for the UNDEFINED field
+        * to show up.  Also, set the total weight counts.
+        */
+       for (i = 0; i < NUM_WT; i++) {
+               if (resolve_pri(pri_undefined[i]) == -1) {
+                       set_pri(pri_undefined[i], -1, RESOLVED);
+                       /* they collate at the end of everything else */
+                       collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
+               }
+               collinfo.pri_count[i] = nweight[i];
+       }
+
+       collinfo.pri_count[NUM_WT] = max_wide();
+       collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
+       collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
+
+       /*
+        * Ordinary character priorities
+        */
+       for (i = 0; i <= UCHAR_MAX; i++) {
+               if ((cc = get_collchar(i, 0)) != NULL) {
+                       for (j = 0; j < NUM_WT; j++) {
+                               chars[i].pri[j] = get_weight(cc->ref[j], j);
+                       }
+               } else {
+                       for (j = 0; j < NUM_WT; j++) {
+                               chars[i].pri[j] =
+                                   get_weight(pri_undefined[j], j);
+                       }
+                       /*
+                        * Per POSIX, for undefined characters, we
+                        * also have to add a last item, which is the
+                        * character code.
+                        */
+                       chars[i].pri[NUM_WT] = i;
+               }
+       }
+
+       /*
+        * Substitution tables
+        */
+       for (i = 0; i < NUM_WT; i++) {
+               collate_subst_t *st = NULL;
+               n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
+               if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
+                       fprintf(stderr, "out of memory");
+                       return;
+               }
+               n = 0;
+               for (sb = avl_first(&substs[i]); sb;
+                   sb = AVL_NEXT(&substs[i], sb)) {
+                       if ((st[n].key = resolve_pri(sb->key)) < 0) {
+                               /* by definition these resolve! */
+                               INTERR;
+                       }
+                       if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
+                               INTERR;
+                       }
+                       for (j = 0; sb->ref[j]; j++) {
+                               st[n].pri[j] = get_weight(sb->ref[j], i);
+                       }
+                       n++;
+               }
+               if (n != collinfo.subst_count[i])
+                       INTERR;
+               subst[i] = st;
+       }
+
+
+       /*
+        * Chains, i.e. collating elements
+        */
+       collinfo.chain_count = avl_numnodes(&elem_by_expand);
+       chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
+       if (chain == NULL) {
+               fprintf(stderr, "out of memory");
+               return;
+       }
+       for (n = 0, ce = avl_first(&elem_by_expand);
+           ce != NULL;
+           ce = AVL_NEXT(&elem_by_expand, ce), n++) {
+               (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
+               for (i = 0; i < NUM_WT; i++) {
+                       chain[n].pri[i] = get_weight(ce->ref[i], i);
+               }
+       }
+       if (n != collinfo.chain_count)
+               INTERR;
+
+       /*
+        * Large (> UCHAR_MAX) character priorities
+        */
+       large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
+       if (large == NULL) {
+               fprintf(stderr, "out of memory");
+               return;
+       }
+
+       i = 0;
+       for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
+               int     undef = 0;
+               /* we already gathered those */
+               if (cc->wc <= UCHAR_MAX)
+                       continue;
+               for (j = 0; j < NUM_WT; j++) {
+                       if ((pri = get_weight(cc->ref[j], j)) < 0) {
+                               undef = 1;
+                       }
+                       if (undef && (pri >= 0)) {
+                               /* if undefined, then all priorities are */
+                               INTERR;
+                       } else {
+                               large[i].pri.pri[j] = pri;
+                       }
+               }
+               if (!undef) {
+                       large[i].val = cc->wc;
+                       collinfo.large_count = i++;
+               }
+       }
+
+       if ((f = open_category()) == NULL) {
+               return;
+       }
+
+       /* Time to write the entire data set out */
+
+       if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
+           (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
+           (wr_category(&chars, sizeof (chars), f) < 0)) {
+               return;
+       }
+
+       for (i = 0; i < NUM_WT; i++) {
+               sz =  sizeof (collate_subst_t) * collinfo.subst_count[i];
+               if (wr_category(subst[i], sz, f) < 0) {
+                       return;
+               }
+       }
+       sz = sizeof (collate_chain_t) * collinfo.chain_count;
+       if (wr_category(chain, sz, f) < 0) {
+               return;
+       }
+       sz = sizeof (collate_large_t) * collinfo.large_count;
+       if (wr_category(large, sz, f) < 0) {
+               return;
+       }
+
+       close_category(f);
+}
diff --git a/usr.bin/localedef/ctype.c b/usr.bin/localedef/ctype.c
new file mode 100644 (file)
index 0000000..813bf30
--- /dev/null
@@ -0,0 +1,448 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010,2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2012 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2013 DEY Storage Systems, Inc.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * LC_CTYPE database generation routines for localedef.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+#include <wchar.h>
+#include <ctype.h>
+#include <wctype.h>
+#include <unistd.h>
+#include "localedef.h"
+#include "parser.h"
+#include "runefile.h"
+#include "avl.h"
+
+
+#define _ISUPPER       _CTYPE_U
+#define _ISLOWER       _CTYPE_L
+#define        _ISDIGIT        _CTYPE_D
+#define        _ISXDIGIT       _CTYPE_X
+#define        _ISSPACE        _CTYPE_S
+#define        _ISBLANK        _CTYPE_B
+#define        _ISALPHA        _CTYPE_A
+#define        _ISPUNCT        _CTYPE_P
+#define        _ISGRAPH        _CTYPE_G
+#define        _ISPRINT        _CTYPE_R
+#define        _ISCNTRL        _CTYPE_C
+#define        _E1             _CTYPE_Q
+#define        _E2             _CTYPE_I
+#define        _E3             0
+#define        _E4             0
+#define        _E5             _CTYPE_T
+
+static avl_tree_t      ctypes;
+
+static wchar_t         last_ctype;
+
+typedef struct ctype_node {
+       wchar_t wc;
+       int32_t ctype;
+       int32_t toupper;
+       int32_t tolower;
+       avl_node_t avl;
+} ctype_node_t;
+
+typedef struct width_node {
+       wchar_t start;
+       wchar_t end;
+       int8_t width;
+       avl_node_t avl;
+} width_node_t;
+
+static int
+ctype_compare(const void *n1, const void *n2)
+{
+       const ctype_node_t *c1 = n1;
+       const ctype_node_t *c2 = n2;
+
+       return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
+}
+
+void
+init_ctype(void)
+{
+       avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
+           offsetof(ctype_node_t, avl));
+}
+
+
+static void
+add_ctype_impl(ctype_node_t *ctn)
+{
+       switch (last_kw) {
+       case T_ISUPPER:
+               ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
+               break;
+       case T_ISLOWER:
+               ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
+               break;
+       case T_ISALPHA:
+               ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
+               break;
+       case T_ISDIGIT:
+               ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
+               break;
+       case T_ISSPACE:
+               ctn->ctype |= _ISSPACE;
+               break;
+       case T_ISCNTRL:
+               ctn->ctype |= _ISCNTRL;
+               break;
+       case T_ISGRAPH:
+               ctn->ctype |= (_ISGRAPH | _ISPRINT);
+               break;
+       case T_ISPRINT:
+               ctn->ctype |= _ISPRINT;
+               break;
+       case T_ISPUNCT:
+               ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
+               break;
+       case T_ISXDIGIT:
+               ctn->ctype |= (_ISXDIGIT | _ISPRINT);
+               break;
+       case T_ISBLANK:
+               ctn->ctype |= (_ISBLANK | _ISSPACE);
+               break;
+       case T_ISPHONOGRAM:
+               ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
+               break;
+       case T_ISIDEOGRAM:
+               ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
+               break;
+       case T_ISENGLISH:
+               ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
+               break;
+       case T_ISNUMBER:
+               ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
+               break;
+       case T_ISSPECIAL:
+               ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
+               break;
+       case T_ISALNUM:
+               /*
+                * We can't do anything with this.  The character
+                * should already be specified as a digit or alpha.
+                */
+               break;
+       default:
+               errf("not a valid character class");
+       }
+}
+
+static ctype_node_t *
+get_ctype(wchar_t wc)
+{
+       ctype_node_t    srch;
+       ctype_node_t    *ctn;
+       avl_index_t     where;
+
+       srch.wc = wc;
+       if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
+               if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
+                       errf("out of memory");
+                       return (NULL);
+               }
+               ctn->wc = wc;
+
+               avl_insert(&ctypes, ctn, where);
+       }
+       return (ctn);
+}
+
+void
+add_ctype(int val)
+{
+       ctype_node_t    *ctn;
+
+       if ((ctn = get_ctype(val)) == NULL) {
+               INTERR;
+               return;
+       }
+       add_ctype_impl(ctn);
+       last_ctype = ctn->wc;
+}
+
+void
+add_ctype_range(int end)
+{
+       ctype_node_t    *ctn;
+       wchar_t         cur;
+
+       if (end < last_ctype) {
+               errf("malformed character range (%u ... %u))",
+                   last_ctype, end);
+               return;
+       }
+       for (cur = last_ctype + 1; cur <= end; cur++) {
+               if ((ctn = get_ctype(cur)) == NULL) {
+                       INTERR;
+                       return;
+               }
+               add_ctype_impl(ctn);
+       }
+       last_ctype = end;
+
+}
+
+/*
+ * A word about widths: if the width mask is specified, then libc
+ * unconditionally honors it.  Otherwise, it assumes printable
+ * characters have width 1, and non-printable characters have width
+ * -1 (except for NULL which is special with with 0).  Hence, we have
+ * no need to inject defaults here -- the "default" unset value of 0
+ * indicates that libc should use its own logic in wcwidth as described.
+ */
+void
+add_width(int wc, int width)
+{
+       ctype_node_t    *ctn;
+
+       if ((ctn = get_ctype(wc)) == NULL) {
+               INTERR;
+               return;
+       }
+       ctn->ctype &= ~(_CTYPE_SWM);
+       switch (width) {
+       case 0:
+               ctn->ctype |= _CTYPE_SW0;
+               break;
+       case 1:
+               ctn->ctype |= _CTYPE_SW1;
+               break;
+       case 2:
+               ctn->ctype |= _CTYPE_SW2;
+               break;
+       case 3:
+               ctn->ctype |= _CTYPE_SW3;
+               break;
+       }
+}
+
+void
+add_width_range(int start, int end, int width)
+{
+       for (; start <= end; start++) {
+               add_width(start, width);
+       }
+}
+
+void
+add_caseconv(int val, int wc)
+{
+       ctype_node_t    *ctn;
+
+       ctn = get_ctype(val);
+       if (ctn == NULL) {
+               INTERR;
+               return;
+       }
+
+       switch (last_kw) {
+       case T_TOUPPER:
+               ctn->toupper = wc;
+               break;
+       case T_TOLOWER:
+               ctn->tolower = wc;
+               break;
+       default:
+               INTERR;
+               break;
+       }
+}
+
+void
+dump_ctype(void)
+{
+       FILE            *f;
+       _FileRuneLocale rl;
+       ctype_node_t    *ctn, *last_ct, *last_lo, *last_up;
+       _FileRuneEntry  *ct = NULL;
+       _FileRuneEntry  *lo = NULL;
+       _FileRuneEntry  *up = NULL;
+       wchar_t         wc;
+
+       (void) memset(&rl, 0, sizeof (rl));
+       last_ct = NULL;
+       last_lo = NULL;
+       last_up = NULL;
+
+       if ((f = open_category()) == NULL)
+               return;
+
+       (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
+       (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
+
+       /*
+        * Initialize the identity map.
+        */
+       for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
+               rl.maplower[wc] = wc;
+               rl.mapupper[wc] = wc;
+       }
+
+       for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
+               int conflict = 0;
+
+
+               wc = ctn->wc;
+
+               /*
+                * POSIX requires certain portable characters have
+                * certain types.  Add them if they are missing.
+                */
+               if ((wc >= 1) && (wc <= 127)) {
+                       if ((wc >= 'A') && (wc <= 'Z'))
+                               ctn->ctype |= _ISUPPER;
+                       if ((wc >= 'a') && (wc <= 'z'))
+                               ctn->ctype |= _ISLOWER;
+                       if ((wc >= '0') && (wc <= '9'))
+                               ctn->ctype |= _ISDIGIT;
+                       if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
+                               ctn->ctype |= _ISSPACE;
+                       if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
+                               ctn->ctype |= _ISXDIGIT;
+                       if (strchr(" \t", (char)wc))
+                               ctn->ctype |= _ISBLANK;
+
+                       /*
+                        * Technically these settings are only
+                        * required for the C locale.  However, it
+                        * turns out that because of the historical
+                        * version of isprint(), we need them for all
+                        * locales as well.  Note that these are not
+                        * necessarily valid punctation characters in
+                        * the current language, but ispunct() needs
+                        * to return TRUE for them.
+                        */
+                       if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
+                           (char)wc))
+                               ctn->ctype |= _ISPUNCT;
+               }
+
+               /*
+                * POSIX also requires that certain types imply
+                * others.  Add any inferred types here.
+                */
+               if (ctn->ctype & (_ISUPPER |_ISLOWER))
+                       ctn->ctype |= _ISALPHA;
+               if (ctn->ctype & _ISDIGIT)
+                       ctn->ctype |= _ISXDIGIT;
+               if (ctn->ctype & _ISBLANK)
+                       ctn->ctype |= _ISSPACE;
+               if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
+                       ctn->ctype |= _ISGRAPH;
+               if (ctn->ctype & _ISGRAPH)
+                       ctn->ctype |= _ISPRINT;
+
+               /*
+                * Finally, POSIX requires that certain combinations
+                * are invalid.  We don't flag this as a fatal error,
+                * but we will warn about.
+                */
+               if ((ctn->ctype & _ISALPHA) &&
+                   (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
+                       conflict++;
+               if ((ctn->ctype & _ISPUNCT) &
+                   (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
+                       conflict++;
+               if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
+                       conflict++;
+               if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
+                       conflict++;
+               if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
+                       conflict++;
+
+               if (conflict) {
+                       warn("conflicting classes for character 0x%x (%x)",
+                           wc, ctn->ctype);
+               }
+               /*
+                * Handle the lower 256 characters using the simple
+                * optimization.  Note that if we have not defined the
+                * upper/lower case, then we identity map it.
+                */
+               if ((unsigned)wc < _CACHED_RUNES) {
+                       rl.runetype[wc] = ctn->ctype;
+                       if (ctn->tolower)
+                               rl.maplower[wc] = ctn->tolower;
+                       if (ctn->toupper)
+                               rl.mapupper[wc] = ctn->toupper;
+                       continue;
+               }
+
+               if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
+                       ct[rl.runetype_ext_nranges-1].max = wc;
+                       last_ct = ctn;
+               } else {
+                       rl.runetype_ext_nranges++;
+                       ct = realloc(ct,
+                           sizeof (*ct) * rl.runetype_ext_nranges);
+                       ct[rl.runetype_ext_nranges - 1].min = wc;
+                       ct[rl.runetype_ext_nranges - 1].max = wc;
+                       ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
+                       last_ct = ctn;
+               }
+               if (ctn->tolower == 0) {
+                       last_lo = NULL;
+               } else if ((last_lo != NULL) &&
+                   (last_lo->tolower + 1 == ctn->tolower)) {
+                       lo[rl.maplower_ext_nranges-1].max = wc;
+                       last_lo = ctn;
+               } else {
+                       rl.maplower_ext_nranges++;
+                       lo = realloc(lo,
+                           sizeof (*lo) * rl.maplower_ext_nranges);
+                       lo[rl.maplower_ext_nranges - 1].min = wc;
+                       lo[rl.maplower_ext_nranges - 1].max = wc;
+                       lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
+                       last_lo = ctn;
+               }
+
+               if (ctn->toupper == 0) {
+                       last_up = NULL;
+               } else if ((last_up != NULL) &&
+                   (last_up->toupper + 1 == ctn->toupper)) {
+                       up[rl.mapupper_ext_nranges-1].max = wc;
+                       last_up = ctn;
+               } else {
+                       rl.mapupper_ext_nranges++;
+                       up = realloc(up,
+                           sizeof (*up) * rl.mapupper_ext_nranges);
+                       up[rl.mapupper_ext_nranges - 1].min = wc;
+                       up[rl.mapupper_ext_nranges - 1].max = wc;
+                       up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
+                       last_up = ctn;
+               }
+       }
+
+       if ((wr_category(&rl, sizeof (rl), f) < 0) ||
+           (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
+           (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
+           (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
+               return;
+       }
+
+       close_category(f);
+}
diff --git a/usr.bin/localedef/localedef.1 b/usr.bin/localedef/localedef.1
new file mode 100644 (file)
index 0000000..be03f6a
--- /dev/null
@@ -0,0 +1,238 @@
+.\" Copyright (c) 1992, X/Open Company Limited  All Rights Reserved
+.\" Portions Copyright (c) 2003, Sun Microsystems, Inc.  All Rights Reserved
+.\" Portions Copyright 2013 DEY Storage Systems, Inc.
+.\" Sun Microsystems, Inc. gratefully acknowledges The Open Group for
+.\" permission to reproduce portions of its copyrighted documentation.
+.\" Original documentation from The Open Group can be obtained online at
+.\" http://www.opengroup.org/bookstore/.
+.\" The Institute of Electrical and Electronics Engineers and The Open Group,
+.\" have given us permission to reprint portions of their documentation. In
+.\" the following statement, the phrase "this text" refers to portions of the
+.\" system documentation. Portions of this text are reprinted and reproduced
+.\" in electronic form in the Sun OS Reference Manual, from IEEE Std 1003.1,
+.\" 2004 Edition, Standard for Information Technology -- Portable Operating
+.\" System Interface (POSIX), The Open Group Base Specifications Issue 6,
+.\" Copyright (C) 2001-2004 by the Institute of Electrical and Electronics
+.\" Engineers, Inc and The Open Group. In the event of any discrepancy between
+.\" these versions and the original IEEE and The Open Group Standard, the
+.\" original IEEE and The Open Group Standard is the referee document. The
+.\" original Standard can be obtained online at
+.\" http://www.opengroup.org/unix/online.html.
+.\"  This notice shall appear on any product containing this material.
+.\" The contents of this file are subject to the terms of the Common
+.\" Development and Distribution License (the "License").  You may not use
+.\" this file except in compliance with the License.
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or
+.\" http://www.opensolaris.org/os/licensing.  See the License for the specific
+.\" language governing permissions and limitations under the License.
+.\" When distributing Covered Code, include this CDDL HEADER in each file and
+.\" include the License file at usr/src/OPENSOLARIS.LICENSE.  If applicable,
+.\" add the following below this CDDL HEADER, with the fields enclosed by
+.\" brackets "[]" replaced with your own identifying information:
+.\" Portions Copyright [yyyy] [name of copyright owner]
+.Dd July 28, 2015
+.Dt LOCALEDEF 1
+.Os
+.Sh NAME
+.Nm localedef
+.Nd define locale environment
+.Sh SYNOPSIS
+.Nm
+.Op Fl D
+.Op Fl c
+.Op Fl v
+.Op Fl U
+.Op Fl f Ar charmap
+.Op Fl w Ar widthfile
+.Op Fl i Ar sourcefile
+.Op Fl u Ar codeset
+localename
+.Sh DESCRIPTION
+The
+.Nm localedef
+utility converts source definitions for locale categories
+into a format usable by the functions and utilities whose operational behavior
+is determined by the setting of the locale environment variables; see
+.Xr environ 5 .
+.Pp
+The utility reads source definitions for one or more locale categories
+belonging to the same locale from the file named in the \fB-i\fR option (if
+specified) or from standard input.
+.Pp
+Each category source definition is identified by the corresponding environment
+variable name and terminated by an
+.Sy END
+.Em category-name
+statement. The following categories are supported:
+.Bl -tag -width LC_MONETARY
+.It LC_CTYPE
+Defines character classification and case conversion.
+.It LC_COLLATE
+Defines collation rules.
+.It LC_MONETARY
+Defines the format and symbols used in formatting of monetary information.
+.It LC_NUMERIC
+Defines the decimal delimiter, grouping and grouping symbol for non-monetary
+numeric editing.
+.It LC_TIME
+Defines the format and content of date and time information.
+.It LC_MESSAGES
+Defines the format and values of affirmative and negative responses.
+.El
+.Pp
+The following options are supported:
+.Bl -tag -width xx_sourcefile
+.It -D
+DragonFly-style output.  Rather than the default of creating the
+.Sy localename
+directory and creating files like LC_CTYPE, LC_COLLATE, etc, in that directory,
+the output files have the format "<localename>.<category>" and are
+dumped to the current directory.
+.It -c
+Creates permanent output even if warning messages have been issued.
+.It -v
+Emit verbose debugging output on standard output.
+.It -U
+Ignore the presence of character symbols that have no matching character
+definition.  This facilitates the use of a common locale definition file
+to be used across multiple encodings, even when some symbols are not
+present in a given encoding.
+.It -f charmap
+Specifies the pathname of a file containing a mapping of character symbols and
+collating element symbols to actual character encodings. This option must be
+specified if symbolic names (other than collating symbols defined in a
+.Sy collating-symbol
+keyword) are used. If the
+.Sy -f
+option is not present, the default character mapping will be used.
+.It -w widthfile
+The path name of the file containing character screen width definitions.
+If not supplied, then default screen widths will be assumed, which will
+generally not account for East Asian encodings requiring more than a single
+character cell to display, nor for combining or accent marks that occupy
+no additional screen width.
+.It -i sourcefile
+The path name of a file containing the source definitions. If this option is
+not present, source definitions will be read from standard input.
+.It -u codeset
+Specifies the name of a codeset used as the target mapping of character symbols
+and collating element symbols whose encoding values are defined in terms of the
+ISO/IEC 10646-1: 2000 standard position constant values. See NOTES.
+.El
+.Pp
+The following operands are required:
+.Bl -tag -width localename
+.It localename
+Identifies the locale. If the name contains one or more slash characters,
+.Ar localename
+will be interpreted as a path name where the created locale
+definitions will be stored. This capability may be restricted to users with
+appropriate privileges. (As a consequence of specifying one
+.Ar localename ,
+although several categories can be processed in one execution, only categories
+belonging to the same locale can be processed.)
+.El
+.Sh OUTPUT
+.Nm
+creates a directory of files that represents the locale's data, unless instructed
+otherwise by the
+.Sy -D
+(DragonFly output) option. The contants of this directory should generally be
+copied into the appropriate subdirectory of /usr/share/locale in order the
+definitions to be visible to programs linked with libc.
+.Sh ENVIRONMENT
+See
+.Xr Benviron 5
+for definitions of the following environment variables that affect the execution of
+.Nm :
+.Sy LANG ,
+.Sy LC_ALL ,
+.Sy LC_COLLATE ,
+.Sy LC_CTYPE ,
+.Sy LC_MESSAGES ,
+.Sy LC_MONETARY ,
+.Sy LC_MUMERIC ,
+.Sy LC_TIME ,
+and
+.Sy NLSPATH .
+.Sh EXIT STATUS
+The following exit values are returned:
+.Bl -tag -width XX
+.It 0
+No errors occurred and the locales were successfully created.
+.It 1
+Warnings occurred and the locales were successfully created.
+.It 2
+The locale specification exceeded implementation limits or the coded character
+set or sets used were not supported by the implementation, and no locale was
+created.
+.It >3
+Warnings or errors occurred and no output was created.
+.El
+.Pp
+If an error is detected, no permanent output will be created.
+.Sh SEE ALSO
+.Xr locale 1 ,
+.Xr iconv_open 3 ,
+.Xr nl_langinfo 3 ,
+.Xr strftime 3 ,
+.Xr environ 5 .
+.Sh WARNINGS
+If warnings occur, permanent output will be created if the
+.Sy -c
+option was specified. The following conditions will cause warning messages to be issued:
+.Bl -tag -width X
+.It *
+If a symbolic name not found in the
+.Em charmap
+file is used for the descriptions of the
+.Sy LC_CTYPE
+or
+.Sy LC_COLLATE
+categories (for other categories, this will be an error condition).
+.It *
+If optional keywords not supported by the implementation are present in the
+source.
+.El
+.Sh NOTES
+When the
+.Sy -u
+option is used, the
+.Em codeset
+option-argument is interpreted as a name of a codeset to which the
+ISO/IEC 10646-1: 2000 standard position constant values are converted. Both the
+ISO/IEC 10646-1: 2000 standard position constant values and other formats (decimal,
+hexadecimal, or octal) are valid as encoding values within the charmap file. The
+codeset can be any codeset that is supported by the \fBiconv_open\fR(3C) function
+on the system.
+.Pp
+When conflicts occur between the charmap specification of
+.Em codeset ,
+.Em mb_cur_max ,
+or
+.Em mb_cur_min
+and the corresponding value for the codeset represented by the
+.Sy -u
+option-argument
+.Em codeset ,
+the
+.Nm
+utility fails as an error.
+.Pp
+When conflicts occur between the charmap encoding values specified for symbolic
+names of characters of the portable character set and the character encoding
+values defined by the US-ASCII, the result is unspecified.
+.Sh HISTORY
+.Nm
+first appeared in
+.Dx
+4.4. It was ported from Illumos from the point
+.An Garrett D'Amore
+.Aq garrett@nexenta.com
+added multibyte support (October 2010).
+.An John Marino
+.Aq draco@marino.st
+provided the alternations necessary to compile cleanly on
+.Dx
+as well as altered libc to use the new collation (the changes were also based
+on Illumos, but modified to work with xlocale functionality.)
diff --git a/usr.bin/localedef/localedef.c b/usr.bin/localedef/localedef.c
new file mode 100644 (file)
index 0000000..7ba06fa
--- /dev/null
@@ -0,0 +1,331 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 DEY Storage Systems, Inc.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * POSIX localedef.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <limits.h>
+#include <locale.h>
+#include <dirent.h>
+#include "localedef.h"
+#include "parser.h"
+
+#ifndef        TEXT_DOMAIN
+#define        TEXT_DOMAIN     "SYS_TEST"
+#endif
+
+int dragonfly = 0;
+int verbose = 0;
+int undefok = 0;
+int warnok = 0;
+static char *locname = NULL;
+static char locpath[PATH_MAX];
+
+const char *
+category_name(void)
+{
+       switch (get_category()) {
+       case T_CHARMAP:
+               return ("CHARMAP");
+       case T_WIDTH:
+               return ("WIDTH");
+       case T_COLLATE:
+               return ("LC_COLLATE");
+       case T_CTYPE:
+               return ("LC_CTYPE");
+       case T_MESSAGES:
+               return ("LC_MESSAGES");
+       case T_MONETARY:
+               return ("LC_MONETARY");
+       case T_NUMERIC:
+               return ("LC_NUMERIC");
+       case T_TIME:
+               return ("LC_TIME");
+       default:
+               INTERR;
+               return (NULL);
+       }
+}
+
+static char *
+category_file(void)
+{
+       if (dragonfly)
+               (void) snprintf(locpath, sizeof (locpath), "%s.%s",
+                   locname, category_name());
+       else
+               (void) snprintf(locpath, sizeof (locpath), "%s/%s",
+                   locname, category_name());
+       return (locpath);
+}
+
+FILE *
+open_category(void)
+{
+       FILE *file;
+
+       if (verbose) {
+               (void) printf("Writing category %s: ", category_name());
+               (void) fflush(stdout);
+       }
+
+       /* make the parent directory */
+       if (!dragonfly)
+               (void) mkdir(dirname(category_file()), 0755);
+
+       /*
+        * note that we have to regenerate the file name, as dirname
+        * clobbered it.
+        */
+       file = fopen(category_file(), "w");
+       if (file == NULL) {
+               errf(strerror(errno));
+               return (NULL);
+       }
+       return (file);
+}
+
+void
+close_category(FILE *f)
+{
+       if (fchmod(fileno(f), 0644) < 0) {
+               (void) fclose(f);
+               (void) unlink(category_file());
+               errf(strerror(errno));
+       }
+       if (fclose(f) < 0) {
+               (void) unlink(category_file());
+               errf(strerror(errno));
+       }
+       if (verbose) {
+               (void) fprintf(stdout, "done.\n");
+               (void) fflush(stdout);
+       }
+}
+
+/*
+ * This function is used when copying the category from another
+ * locale.  Note that the copy is actually performed using a hard
+ * link for efficiency.
+ */
+void
+copy_category(char *src)
+{
+       char    srcpath[PATH_MAX];
+       int     rv;
+
+       (void) snprintf(srcpath, sizeof (srcpath), "%s/%s",
+           src, category_name());
+       rv = access(srcpath, R_OK);
+       if ((rv != 0) && (strchr(srcpath, '/') == NULL)) {
+               /* Maybe we should try the system locale */
+               (void) snprintf(srcpath, sizeof (srcpath),
+                   "/usr/lib/locale/%s/%s", src, category_name());
+               rv = access(srcpath, R_OK);
+       }
+
+       if (rv != 0) {
+               fprintf(stderr,"source locale data unavailable: %s", src);
+               return;
+       }
+
+       if (verbose > 1) {
+               (void) printf("Copying category %s from %s: ",
+                   category_name(), src);
+               (void) fflush(stdout);
+       }
+
+       /* make the parent directory */
+       if (!dragonfly)
+               (void) mkdir(dirname(category_file()), 0755);
+
+       if (link(srcpath, category_file()) != 0) {
+               fprintf(stderr,"unable to copy locale data: %s",
+                       strerror(errno));
+               return;
+       }
+       if (verbose > 1) {
+               (void) printf("done.\n");
+       }
+}
+
+int
+putl_category(const char *s, FILE *f)
+{
+       if (s && fputs(s, f) == EOF) {
+               (void) fclose(f);
+               (void) unlink(category_file());
+               errf(strerror(errno));
+               return (EOF);
+       }
+       if (fputc('\n', f) == EOF) {
+               (void) fclose(f);
+               (void) unlink(category_file());
+               errf(strerror(errno));
+               return (EOF);
+       }
+       return (0);
+}
+
+int
+wr_category(void *buf, size_t sz, FILE *f)
+{
+       if (!sz) {
+               return (0);
+       }
+       if (fwrite(buf, sz, 1, f) < 1) {
+               (void) fclose(f);
+               (void) unlink(category_file());
+               errf(strerror(errno));
+               return (EOF);
+       }
+       return (0);
+}
+
+int yyparse(void);
+
+static void
+usage(void)
+{
+       (void) fprintf(stderr, "Usage: localedef [options] localename\n");
+       (void) fprintf(stderr, "[options] are:\n");
+       (void) fprintf(stderr, "  -D          : DragonFly-style output\n");
+       (void) fprintf(stderr, "  -c          : ignore warnings\n");
+       (void) fprintf(stderr, "  -v          : verbose output\n");
+       (void) fprintf(stderr, "  -U          : ignore undefined symbols\n");
+       (void) fprintf(stderr, "  -f charmap  : use given charmap file\n");
+       (void) fprintf(stderr, "  -u encoding : assume encoding\n");
+       (void) fprintf(stderr, "  -w widths   : use screen widths file\n");
+       (void) fprintf(stderr, "  -i locsrc   : source file for locale\n");
+       exit(4);
+}
+
+int
+main(int argc, char **argv)
+{
+       int c;
+       char *lfname = NULL;
+       char *cfname = NULL;
+       char *wfname = NULL;
+       DIR *dir;
+
+       init_charmap();
+       init_collate();
+       init_ctype();
+       init_messages();
+       init_monetary();
+       init_numeric();
+       init_time();
+
+       yydebug = 0;
+
+       (void) setlocale(LC_ALL, "");
+
+       while ((c = getopt(argc, argv, "w:i:cf:u:vUD")) != -1) {
+               switch (c) {
+               case 'D':
+                       dragonfly = 1;
+                       break;
+               case 'v':
+                       verbose++;
+                       break;
+               case 'i':
+                       lfname = optarg;
+                       break;
+               case 'u':
+                       set_wide_encoding(optarg);
+                       break;
+               case 'f':
+                       cfname = optarg;
+                       break;
+               case 'U':
+                       undefok++;
+                       break;
+               case 'c':
+                       warnok++;
+                       break;
+               case 'w':
+                       wfname = optarg;
+                       break;
+               case '?':
+                       usage();
+                       break;
+               }
+       }
+
+       if ((argc - 1) != (optind)) {
+               usage();
+       }
+       locname = argv[argc - 1];
+       if (verbose) {
+               (void) printf("Processing locale %s.\n", locname);
+       }
+
+       if (cfname) {
+               if (verbose)
+                       (void) printf("Loading charmap %s.\n", cfname);
+               reset_scanner(cfname);
+               (void) yyparse();
+       }
+
+       if (wfname) {
+               if (verbose)
+                       (void) printf("Loading widths %s.\n", wfname);
+               reset_scanner(wfname);
+               (void) yyparse();
+       }
+
+       if (verbose) {
+               (void) printf("Loading POSIX portable characters.\n");
+       }
+       add_charmap_posix();
+
+       if (lfname) {
+               reset_scanner(lfname);
+       } else {
+               reset_scanner(NULL);
+       }
+
+       /* make the directory for the locale if not already present */
+       if (!dragonfly) {
+               while ((dir = opendir(locname)) == NULL) {
+                       if ((errno != ENOENT) ||
+                           (mkdir(locname, 0755) <  0)) {
+                               errf(strerror(errno));
+                       }
+               }
+               (void) closedir(dir);
+               (void) mkdir(dirname(category_file()), 0755);
+       }
+
+       (void) yyparse();
+       if (verbose) {
+               (void) printf("All done.\n");
+       }
+       return (warnings ? 1 : 0);
+}
diff --git a/usr.bin/localedef/localedef.h b/usr.bin/localedef/localedef.h
new file mode 100644 (file)
index 0000000..a9be35d
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy is of the CDDL is also available via the Internet
+ * at http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 DEY Storage Systmes, Inc.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * POSIX localedef.
+ */
+
+/* Common header files. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <sys/types.h>
+
+extern int com_char;
+extern int esc_char;
+extern int mb_cur_max;
+extern int mb_cur_min;
+extern int last_kw;
+extern int verbose;
+extern int yydebug;
+extern int lineno;
+extern int undefok;    /* mostly ignore undefined symbols */
+extern int warnok;
+extern int warnings;
+
+int yylex(void);
+void yyerror(const char *);
+void errf(const char *, ...);
+void warn(const char *, ...);
+
+int putl_category(const char *, FILE *);
+int wr_category(void *, size_t, FILE *);
+FILE *open_category(void);
+void close_category(FILE *);
+void copy_category(char *);
+const char *category_name(void);
+
+int get_category(void);
+int get_symbol(void);
+int get_escaped(int);
+int get_wide(void);
+void reset_scanner(const char *);
+void scan_to_eol(void);
+void add_wcs(wchar_t);
+void add_tok(int);
+wchar_t *get_wcs(void);
+
+/* charmap.c - CHARMAP handling */
+void init_charmap(void);
+void add_charmap(char *, int);
+void add_charmap_undefined(char *);
+void add_charmap_posix(void);
+void add_charmap_range(char *, char *, int);
+void add_charmap_char(char *name, int val);
+int lookup_charmap(const char *, wchar_t *);
+int check_charmap_undefined(char *);
+int check_charmap(wchar_t);
+
+/* collate.o - LC_COLLATE handling */
+typedef struct collelem collelem_t;
+typedef struct collsym collsym_t;
+void init_collate(void);
+void define_collsym(char *);
+void define_collelem(char *, wchar_t *);
+void add_order_directive(void);
+void add_order_bit(int);
+void dump_collate(void);
+collsym_t *lookup_collsym(char *);
+collelem_t *lookup_collelem(char *);
+void start_order_collelem(collelem_t *);
+void start_order_undefined(void);
+void start_order_symbol(char *);
+void start_order_char(wchar_t);
+void start_order_ellipsis(void);
+void end_order_collsym(collsym_t *);
+void end_order(void);
+void add_weight(int32_t, int);
+void add_weights(int32_t *);
+void add_weight_num(int);
+void add_order_collelem(collelem_t *);
+void add_order_collsym(collsym_t *);
+void add_order_char(wchar_t);
+void add_order_ignore(void);
+void add_order_ellipsis(void);
+void add_order_symbol(char *);
+void add_order_subst(void);
+void add_subst_char(wchar_t);
+void add_subst_collsym(collsym_t *);
+void add_subst_collelem(collelem_t *);
+void add_subst_symbol(char *);
+int32_t get_weight(int32_t, int);
+wchar_t * wsncpy(wchar_t *, const wchar_t *, size_t);
+
+
+/* ctype.c - LC_CTYPE handling */
+void init_ctype(void);
+void add_ctype(int);
+void add_ctype_range(int);
+void add_width(int, int);
+void add_width_range(int, int, int);
+void add_caseconv(int, int);
+void dump_ctype(void);
+
+/* messages.c - LC_MESSAGES handling */
+void init_messages(void);
+void add_message(wchar_t *);
+void dump_messages(void);
+
+/* monetary.c - LC_MONETARY handling */
+void init_monetary(void);
+void add_monetary_str(wchar_t *);
+void add_monetary_num(int);
+void reset_monetary_group(void);
+void add_monetary_group(int);
+void dump_monetary(void);
+
+/* numeric.c - LC_NUMERIC handling */
+void init_numeric(void);
+void add_numeric_str(wchar_t *);
+void reset_numeric_group(void);
+void add_numeric_group(int);
+void dump_numeric(void);
+
+/* time.c - LC_TIME handling */
+void init_time(void);
+void add_time_str(wchar_t *);
+void reset_time_list(void);
+void add_time_list(wchar_t *);
+void check_time_list(void);
+void dump_time(void);
+
+/* wide.c -  Wide character handling. */
+int to_wide(wchar_t *, const char *);
+int to_mbs(char *, wchar_t);
+int to_mb(char *, wchar_t);
+char *to_mb_string(const wchar_t *);
+void set_wide_encoding(const char *);
+void werr(const char *, ...);
+const char *get_wide_encoding(void);
+int max_wide(void);
+
+//#define      _(x)    gettext(x)
+#define        INTERR  fprintf(stderr,"internal fault (%s:%d)", __FILE__, __LINE__)
diff --git a/usr.bin/localedef/messages.c b/usr.bin/localedef/messages.c
new file mode 100644 (file)
index 0000000..05579bb
--- /dev/null
@@ -0,0 +1,105 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * LC_MESSAGES database generation routines for localedef.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <string.h>
+#include <unistd.h>
+#include "localedef.h"
+#include "parser.h"
+#include "lmessages.h"
+
+static struct lc_messages_T msgs;
+
+void
+init_messages(void)
+{
+       (void) memset(&msgs, 0, sizeof (msgs));
+}
+
+void
+add_message(wchar_t *wcs)
+{
+       char *str;
+
+       if ((str = to_mb_string(wcs)) == NULL) {
+               INTERR;
+               return;
+       }
+       free(wcs);
+
+       switch (last_kw) {
+       case T_YESSTR:
+               msgs.yesstr = str;
+               break;
+       case T_NOSTR:
+               msgs.nostr = str;
+               break;
+       case T_YESEXPR:
+               msgs.yesexpr = str;
+               break;
+       case T_NOEXPR:
+               msgs.noexpr = str;
+               break;
+       default:
+               free(str);
+               INTERR;
+               break;
+       }
+}
+
+void
+dump_messages(void)
+{
+       FILE *f;
+       char *ptr;
+
+       if (msgs.yesstr == NULL) {
+               warn("missing field 'yesstr'");
+               msgs.yesstr = "";
+       }
+       if (msgs.nostr == NULL) {
+               warn("missing field 'nostr'");
+               msgs.nostr = "";
+       }
+
+       /*
+        * CLDR likes to add : separated lists for yesstr and nostr.
+        * Legacy Solaris code does not seem to grok this.  Fix it.
+        */
+       if ((ptr = strchr(msgs.yesstr, ':')) != NULL)
+               *ptr = 0;
+       if ((ptr = strchr(msgs.nostr, ':')) != NULL)
+               *ptr = 0;
+
+       if ((f = open_category()) == NULL) {
+               return;
+       }
+
+       if ((putl_category(msgs.yesexpr, f) == EOF) ||
+           (putl_category(msgs.noexpr, f) == EOF) ||
+           (putl_category(msgs.yesstr, f) == EOF) ||
+           (putl_category(msgs.nostr, f) == EOF)) {
+               return;
+       }
+       close_category(f);
+}
diff --git a/usr.bin/localedef/monetary.c b/usr.bin/localedef/monetary.c
new file mode 100644 (file)
index 0000000..35905b3
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * LC_MONETARY database generation routines for localedef.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <string.h>
+#include <unistd.h>
+#include "localedef.h"
+#include "parser.h"
+#include "lmonetary.h"
+
+static struct lc_monetary_T mon;
+
+void
+init_monetary(void)
+{
+       (void) memset(&mon, 0, sizeof (mon));
+}
+
+void
+add_monetary_str(wchar_t *wcs)
+{
+       char *str;
+
+       if ((str = to_mb_string(wcs)) == NULL) {
+               INTERR;
+               return;
+       }
+       free(wcs);
+       switch (last_kw) {
+       case T_INT_CURR_SYMBOL:
+               mon.int_curr_symbol = str;
+               break;
+       case T_CURRENCY_SYMBOL:
+               mon.currency_symbol = str;
+               break;
+       case T_MON_DECIMAL_POINT:
+               mon.mon_decimal_point = str;
+               break;
+       case T_MON_THOUSANDS_SEP:
+               mon.mon_thousands_sep = str;
+               break;
+       case T_POSITIVE_SIGN:
+               mon.positive_sign = str;
+               break;
+       case T_NEGATIVE_SIGN:
+               mon.negative_sign = str;
+               break;
+       default:
+               free(str);
+               INTERR;
+               break;
+       }
+}
+
+void
+add_monetary_num(int n)
+{
+       char *str = NULL;
+
+       (void) asprintf(&str, "%d", n);
+       if (str == NULL) {
+               fprintf(stderr, "out of memory");
+               return;
+       }
+
+       switch (last_kw) {
+       case T_INT_FRAC_DIGITS:
+               mon.int_frac_digits = str;
+               break;
+       case T_FRAC_DIGITS:
+               mon.frac_digits = str;
+               break;
+       case T_P_CS_PRECEDES:
+               mon.p_cs_precedes = str;
+               break;
+       case T_P_SEP_BY_SPACE:
+               mon.p_sep_by_space = str;
+               break;
+       case T_N_CS_PRECEDES:
+               mon.n_cs_precedes = str;
+               break;
+       case T_N_SEP_BY_SPACE:
+               mon.n_sep_by_space = str;
+               break;
+       case T_P_SIGN_POSN:
+               mon.p_sign_posn = str;
+               break;
+       case T_N_SIGN_POSN:
+               mon.n_sign_posn = str;
+               break;
+       case T_INT_P_CS_PRECEDES:
+               mon.int_p_cs_precedes = str;
+               break;
+       case T_INT_N_CS_PRECEDES:
+               mon.int_n_cs_precedes = str;
+               break;
+       case T_INT_P_SEP_BY_SPACE:
+               mon.int_p_sep_by_space = str;
+               break;
+       case T_INT_N_SEP_BY_SPACE:
+               mon.int_n_sep_by_space = str;
+               break;
+       case T_INT_P_SIGN_POSN:
+               mon.int_p_sign_posn = str;
+               break;
+       case T_INT_N_SIGN_POSN:
+               mon.int_n_sign_posn = str;
+               break;
+       case T_MON_GROUPING:
+               mon.mon_grouping = str;
+               break;
+       default:
+               INTERR;
+               break;
+       }
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+void
+reset_monetary_group(void)
+{
+       free((char *)mon.mon_grouping);
+       mon.mon_grouping = NULL;
+}
+
+void
+add_monetary_group(int n)
+{
+       char *s = NULL;
+
+       if (mon.mon_grouping == NULL) {
+               (void) asprintf(&s, "%d", n);
+       } else {
+               (void) asprintf(&s, "%s;%d", mon.mon_grouping, n);
+       }
+       if (s == NULL)
+               fprintf(stderr, "out of memory");
+
+       free((char *)mon.mon_grouping);
+       mon.mon_grouping = s;
+}
+
+#pragma GCC diagnostic pop
+
+void
+dump_monetary(void)
+{
+       FILE *f;
+
+       if ((f = open_category()) == NULL) {
+               return;
+       }
+
+       if ((putl_category(mon.int_curr_symbol, f) == EOF) ||
+           (putl_category(mon.currency_symbol, f) == EOF) ||
+           (putl_category(mon.mon_decimal_point, f) == EOF) ||
+           (putl_category(mon.mon_thousands_sep, f) == EOF) ||
+           (putl_category(mon.mon_grouping, f) == EOF) ||
+           (putl_category(mon.positive_sign, f) == EOF) ||
+           (putl_category(mon.negative_sign, f) == EOF) ||
+           (putl_category(mon.int_frac_digits, f) == EOF) ||
+           (putl_category(mon.frac_digits, f) == EOF) ||
+           (putl_category(mon.p_cs_precedes, f) == EOF) ||
+           (putl_category(mon.p_sep_by_space, f) == EOF) ||
+           (putl_category(mon.n_cs_precedes, f) == EOF) ||
+           (putl_category(mon.n_sep_by_space, f) == EOF) ||
+           (putl_category(mon.p_sign_posn, f) == EOF) ||
+           (putl_category(mon.n_sign_posn, f) == EOF) ||
+           (putl_category(mon.int_p_cs_precedes, f) == EOF) ||
+           (putl_category(mon.int_n_cs_precedes, f) == EOF) ||
+           (putl_category(mon.int_p_sep_by_space, f) == EOF) ||
+           (putl_category(mon.int_n_sep_by_space, f) == EOF) ||
+           (putl_category(mon.int_p_sign_posn, f) == EOF) ||
+           (putl_category(mon.int_n_sign_posn, f) == EOF)) {
+               return;
+       }
+       close_category(f);
+}
diff --git a/usr.bin/localedef/numeric.c b/usr.bin/localedef/numeric.c
new file mode 100644 (file)
index 0000000..de487eb
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * LC_NUMERIC database generation routines for localedef.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <string.h>
+#include <unistd.h>
+#include "localedef.h"
+#include "parser.h"
+#include "lnumeric.h"
+
+static struct lc_numeric_T numeric;
+
+void
+init_numeric(void)
+{
+       (void) memset(&numeric, 0, sizeof (numeric));
+}
+
+void
+add_numeric_str(wchar_t *wcs)
+{
+       char *str;
+
+       if ((str = to_mb_string(wcs)) == NULL) {
+               INTERR;
+               return;
+       }
+       free(wcs);
+
+       switch (last_kw) {
+       case T_DECIMAL_POINT:
+               numeric.decimal_point = str;
+               break;
+       case T_THOUSANDS_SEP:
+               numeric.thousands_sep = str;
+               break;
+       default:
+               free(str);
+               INTERR;
+               break;
+       }
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+void
+reset_numeric_group(void)
+{
+       free((char *)numeric.grouping);
+       numeric.grouping = NULL;
+}
+
+void
+add_numeric_group(int n)
+{
+       char *s;
+
+       if (numeric.grouping == NULL) {
+               (void) asprintf(&s, "%d", n);
+       } else {
+               (void) asprintf(&s, "%s;%d", numeric.grouping, n);
+       }
+       if (s == NULL)
+               fprintf(stderr, "out of memory");
+
+       free((char *)numeric.grouping);
+       numeric.grouping = s;
+}
+
+#pragma GCC diagnostic pop
+
+void
+dump_numeric(void)
+{
+       FILE *f;
+
+       if ((f = open_category()) == NULL) {
+               return;
+       }
+
+       if ((putl_category(numeric.decimal_point, f) == EOF) ||
+           (putl_category(numeric.thousands_sep, f) == EOF) ||
+           (putl_category(numeric.grouping, f) == EOF)) {
+               return;
+       }
+       close_category(f);
+}
diff --git a/usr.bin/localedef/parser.y b/usr.bin/localedef/parser.y
new file mode 100644 (file)
index 0000000..c9223ab
--- /dev/null
@@ -0,0 +1,693 @@
+%{
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 DEY Storage Systems, Inc.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * POSIX localedef grammar.
+ */
+
+#include <wchar.h>
+#include <stdio.h>
+#include <limits.h>
+#include "localedef.h"
+
+%}
+%union {
+       int             num;
+       wchar_t         wc;
+       char            *token;
+       collsym_t       *collsym;
+       collelem_t      *collelem;
+}
+
+%token         T_CODE_SET
+%token         T_MB_CUR_MAX
+%token         T_MB_CUR_MIN
+%token         T_COM_CHAR
+%token         T_ESC_CHAR
+%token         T_LT
+%token         T_GT
+%token         T_NL
+%token         T_SEMI
+%token         T_COMMA
+%token         T_ELLIPSIS
+%token         T_RPAREN
+%token         T_LPAREN
+%token         T_QUOTE
+%token         T_NULL
+%token         T_WS
+%token         T_END
+%token         T_COPY
+%token         T_CHARMAP
+%token         T_WIDTH
+%token         T_CTYPE
+%token         T_ISUPPER
+%token         T_ISLOWER
+%token         T_ISALPHA
+%token         T_ISDIGIT
+%token         T_ISPUNCT
+%token         T_ISXDIGIT
+%token         T_ISSPACE
+%token         T_ISPRINT
+%token         T_ISGRAPH
+%token         T_ISBLANK
+%token         T_ISCNTRL
+%token         T_ISALNUM
+%token         T_ISSPECIAL
+%token         T_ISPHONOGRAM
+%token         T_ISIDEOGRAM
+%token         T_ISENGLISH
+%token         T_ISNUMBER
+%token         T_TOUPPER
+%token         T_TOLOWER
+%token         T_COLLATE
+%token         T_COLLATING_SYMBOL
+%token         T_COLLATING_ELEMENT
+%token         T_ORDER_START
+%token         T_ORDER_END
+%token         T_FORWARD
+%token         T_BACKWARD
+%token         T_POSITION
+%token         T_FROM
+%token         T_UNDEFINED
+%token         T_IGNORE
+%token         T_MESSAGES
+%token         T_YESSTR
+%token         T_NOSTR
+%token         T_YESEXPR
+%token         T_NOEXPR
+%token         T_MONETARY
+%token         T_INT_CURR_SYMBOL
+%token         T_CURRENCY_SYMBOL
+%token         T_MON_DECIMAL_POINT
+%token         T_MON_THOUSANDS_SEP
+%token         T_POSITIVE_SIGN
+%token         T_NEGATIVE_SIGN
+%token         T_MON_GROUPING
+%token         T_INT_FRAC_DIGITS
+%token         T_FRAC_DIGITS
+%token         T_P_CS_PRECEDES
+%token         T_P_SEP_BY_SPACE
+%token         T_N_CS_PRECEDES
+%token         T_N_SEP_BY_SPACE
+%token         T_P_SIGN_POSN
+%token         T_N_SIGN_POSN
+%token         T_INT_P_CS_PRECEDES
+%token         T_INT_N_CS_PRECEDES
+%token         T_INT_P_SEP_BY_SPACE
+%token         T_INT_N_SEP_BY_SPACE
+%token         T_INT_P_SIGN_POSN
+%token         T_INT_N_SIGN_POSN
+%token         T_NUMERIC
+%token         T_DECIMAL_POINT
+%token         T_THOUSANDS_SEP
+%token         T_GROUPING
+%token         T_TIME
+%token         T_ABDAY
+%token         T_DAY
+%token         T_ABMON
+%token         T_MON
+%token         T_ERA
+%token         T_ERA_D_FMT
+%token         T_ERA_T_FMT
+%token         T_ERA_D_T_FMT
+%token         T_ALT_DIGITS
+%token         T_D_T_FMT
+%token         T_D_FMT
+%token         T_T_FMT
+%token         T_AM_PM
+%token         T_T_FMT_AMPM
+%token         T_DATE_FMT
+%token <wc>            T_CHAR
+%token <token>         T_NAME
+%token <num>           T_NUMBER
+%token <token>         T_SYMBOL
+%token <collsym>       T_COLLSYM
+%token <collelem>      T_COLLELEM
+
+%%
+
+localedef      : setting_list categories
+               | categories
+               ;
+
+string         : T_QUOTE charlist T_QUOTE
+               | T_QUOTE T_QUOTE
+               ;
+
+charlist       : charlist T_CHAR
+               {
+                       add_wcs($2);
+               }
+               | T_CHAR
+               {
+                       add_wcs($1);
+               }
+               ;
+
+setting_list   : setting_list setting
+               | setting
+               ;
+
+
+setting                : T_COM_CHAR T_CHAR T_NL
+               {
+                       com_char = $2;
+               }
+               | T_ESC_CHAR T_CHAR T_NL
+               {
+                       esc_char = $2;
+               }
+               | T_MB_CUR_MAX T_NUMBER T_NL
+               {
+                       mb_cur_max = $2;
+               }
+               | T_MB_CUR_MIN T_NUMBER T_NL
+               {
+                       mb_cur_min = $2;
+               }
+               | T_CODE_SET string T_NL
+               {
+                       wchar_t *w = get_wcs();
+                       set_wide_encoding(to_mb_string(w));
+                       free(w);
+               }
+               | T_CODE_SET T_NAME T_NL
+               {
+                       set_wide_encoding($2);
+               }
+               ;
+
+copycat                : T_COPY T_NAME T_NL
+               {
+                       copy_category($2);
+               }
+               | T_COPY string T_NL
+               {
+                       wchar_t *w = get_wcs();
+                       copy_category(to_mb_string(w));
+                       free(w);
+               }
+               ;
+
+categories     : categories category
+               | category
+               ;
+
+
+category       : charmap
+               | messages
+               | monetary
+               | ctype
+               | collate
+               | numeric
+               | time
+               ;
+
+
+charmap                : T_CHARMAP T_NL charmap_list T_END T_CHARMAP T_NL
+               | T_WIDTH T_NL width_list T_END T_WIDTH T_NL
+               ;
+
+
+charmap_list   : charmap_list charmap_entry
+               | charmap_entry
+               ;
+
+
+charmap_entry  : T_SYMBOL T_CHAR
+               {
+                       add_charmap($1, $2);
+                       scan_to_eol();
+               }
+               | T_SYMBOL T_ELLIPSIS T_SYMBOL T_CHAR
+               {
+                       add_charmap_range($1, $3, $4);
+                       scan_to_eol();
+               }
+               | T_NL
+               ;
+
+width_list     : width_list width_entry
+               | width_entry
+               ;
+
+width_entry    : T_CHAR T_NUMBER T_NL
+               {
+                       add_width($1, $2);
+               }
+               | T_SYMBOL T_NUMBER T_NL
+               {
+                       add_charmap_undefined($1);
+               }
+               | T_CHAR T_ELLIPSIS T_CHAR T_NUMBER T_NL
+               {
+                       add_width_range($1, $3, $4);
+               }
+               | T_SYMBOL T_ELLIPSIS T_SYMBOL T_NUMBER T_NL
+               {
+                       add_charmap_undefined($1);
+                       add_charmap_undefined($3);
+               }
+               | T_CHAR T_ELLIPSIS T_SYMBOL T_NUMBER T_NL
+               {
+                       add_width($1, $4);
+                       add_charmap_undefined($3);
+               }
+               | T_SYMBOL T_ELLIPSIS T_CHAR T_NUMBER T_NL
+               {
+                       add_width($3, $4);
+                       add_charmap_undefined($1);
+               }
+               | T_NL
+               ;
+
+ctype          : T_CTYPE T_NL ctype_list T_END T_CTYPE T_NL
+               {
+                       dump_ctype();
+               }
+               | T_CTYPE T_NL copycat  T_END T_CTYPE T_NL
+               ;
+
+ctype_list     : ctype_list ctype_kw
+               | ctype_kw
+               ;
+
+ctype_kw       : T_ISUPPER cc_list T_NL
+               | T_ISLOWER cc_list T_NL
+               | T_ISALPHA cc_list T_NL
+               | T_ISDIGIT cc_list T_NL
+               | T_ISPUNCT cc_list T_NL
+               | T_ISXDIGIT cc_list T_NL
+               | T_ISSPACE cc_list T_NL
+               | T_ISPRINT cc_list T_NL
+               | T_ISGRAPH cc_list T_NL
+               | T_ISBLANK cc_list T_NL
+               | T_ISCNTRL cc_list T_NL
+               | T_ISALNUM cc_list T_NL
+               | T_ISSPECIAL cc_list T_NL
+               | T_ISENGLISH cc_list T_NL
+               | T_ISNUMBER cc_list T_NL
+               | T_ISIDEOGRAM cc_list T_NL
+               | T_ISPHONOGRAM cc_list T_NL
+               | T_TOUPPER conv_list T_NL
+               | T_TOLOWER conv_list T_NL
+               ;
+
+
+cc_list                : cc_list T_SEMI T_CHAR
+               {
+                       add_ctype($3);
+               }
+               | cc_list T_SEMI T_SYMBOL
+               {
+                       add_charmap_undefined($3);
+               }
+               | cc_list T_SEMI T_ELLIPSIS T_SEMI T_CHAR
+               {
+                       /* note that the endpoints *must* be characters */
+                       add_ctype_range($5);
+               }
+               | T_CHAR
+               {
+                       add_ctype($1);
+               }
+               | T_SYMBOL
+               {
+                       add_charmap_undefined($1);
+               }
+               ;
+
+conv_list      : conv_list T_SEMI conv_pair
+               | conv_pair
+               ;
+
+
+conv_pair      : T_LPAREN T_CHAR T_COMMA T_CHAR T_RPAREN
+               {
+                       add_caseconv($2, $4);
+               }
+               | T_LPAREN T_SYMBOL T_COMMA T_CHAR T_RPAREN
+               {
+                       add_charmap_undefined($2);
+               }
+               | T_LPAREN T_SYMBOL T_COMMA T_SYMBOL T_RPAREN
+               {
+                       add_charmap_undefined($2);
+                       add_charmap_undefined($4);
+               }
+               | T_LPAREN T_CHAR T_COMMA T_SYMBOL T_RPAREN
+               {
+                       add_charmap_undefined($4);
+               }
+               ;
+
+collate                : T_COLLATE T_NL coll_order T_END T_COLLATE T_NL
+               {
+                       dump_collate();
+               }
+               | T_COLLATE T_NL coll_optional coll_order T_END T_COLLATE T_NL
+               {
+                       dump_collate();
+               }
+               | T_COLLATE T_NL copycat T_END T_COLLATE T_NL
+               ;
+
+
+coll_optional  : coll_optional coll_symbols
+               | coll_optional coll_elements
+               | coll_symbols
+               | coll_elements
+               ;
+
+
+coll_symbols   : T_COLLATING_SYMBOL T_SYMBOL T_NL
+               {
+                       define_collsym($2);
+               }
+               ;
+
+
+coll_elements  : T_COLLATING_ELEMENT T_SYMBOL T_FROM string T_NL
+               {
+                       define_collelem($2, get_wcs());
+               }
+               ;
+
+coll_order     : T_ORDER_START T_NL order_list T_ORDER_END T_NL
+               {
+                       /* If no order list supplied default to one forward */
+                       add_order_bit(T_FORWARD);
+                       add_order_directive();
+               }
+               | T_ORDER_START order_args T_NL order_list T_ORDER_END T_NL
+               ;
+
+
+order_args     : order_args T_SEMI order_arg
+               {
+                       add_order_directive();
+               }
+               | order_arg
+               {
+                       add_order_directive();
+               }
+               ;
+
+order_arg      : order_arg T_COMMA order_dir
+               | order_dir
+               ;
+
+order_dir      : T_FORWARD
+               {
+                       add_order_bit(T_FORWARD);
+               }
+               | T_BACKWARD
+               {
+                       add_order_bit(T_BACKWARD);
+               }
+               | T_POSITION
+               {
+                       add_order_bit(T_POSITION);
+               }
+               ;
+
+order_list     : order_list order_item
+               | order_item
+               ;
+
+order_item     : T_COLLSYM T_NL
+               {
+                       end_order_collsym($1);
+               }
+               | order_itemkw T_NL
+               {
+                       end_order();
+               }
+               | order_itemkw order_weights T_NL
+               {
+                       end_order();
+               }
+               ;
+
+order_itemkw   : T_CHAR
+               {
+                       start_order_char($1);
+               }
+               | T_ELLIPSIS
+               {
+                       start_order_ellipsis();
+               }
+               | T_COLLELEM
+               {
+                       start_order_collelem($1);
+               }
+               | T_UNDEFINED
+               {
+                       start_order_undefined();
+               }
+               | T_SYMBOL
+               {
+                       start_order_symbol($1);
+               }
+               ;
+
+order_weights  : order_weights T_SEMI order_weight
+               | order_weights T_SEMI
+               | order_weight
+               ;
+
+order_weight   : T_COLLELEM
+               {
+                       add_order_collelem($1);
+               }
+               | T_COLLSYM
+               {
+                       add_order_collsym($1);
+               }
+               | T_CHAR
+               {
+                       add_order_char($1);
+               }
+               | T_ELLIPSIS
+               {
+                       add_order_ellipsis();
+               }
+               | T_IGNORE
+               {
+                       add_order_ignore();
+               }
+               | T_SYMBOL
+               {
+                       add_order_symbol($1);
+               }
+               | T_QUOTE order_str T_QUOTE
+               {
+                       add_order_subst();
+               }
+               ;
+
+order_str      : order_str order_stritem
+               | order_stritem
+               ;
+
+order_stritem  : T_CHAR
+               {
+                       add_subst_char($1);
+               }
+               | T_COLLSYM
+               {
+                       add_subst_collsym($1);
+               }
+               | T_COLLELEM
+               {
+                       add_subst_collelem($1);
+               }
+               | T_SYMBOL
+               {
+                       add_subst_symbol($1);
+               }
+               ;
+
+messages       : T_MESSAGES T_NL messages_list T_END T_MESSAGES T_NL
+               {
+                       dump_messages();
+               }
+               | T_MESSAGES T_NL copycat T_END T_MESSAGES T_NL
+               ;
+
+messages_list  : messages_list messages_item
+               | messages_item
+               ;
+
+messages_kw    : T_YESSTR
+               | T_NOSTR
+               | T_YESEXPR
+               | T_NOEXPR
+               ;
+
+messages_item  : messages_kw string T_NL
+               {
+                       add_message(get_wcs());
+               }
+               ;
+
+monetary       : T_MONETARY T_NL monetary_list T_END T_MONETARY T_NL
+               {
+                       dump_monetary();
+               }
+               | T_MONETARY T_NL copycat T_END T_MONETARY T_NL
+               ;
+
+monetary_list  : monetary_list monetary_kw
+               | monetary_kw
+               ;
+
+monetary_strkw : T_INT_CURR_SYMBOL
+               | T_CURRENCY_SYMBOL
+               | T_MON_DECIMAL_POINT
+               | T_MON_THOUSANDS_SEP
+               | T_POSITIVE_SIGN
+               | T_NEGATIVE_SIGN
+               ;
+
+monetary_numkw : T_INT_FRAC_DIGITS
+               | T_FRAC_DIGITS
+               | T_P_CS_PRECEDES
+               | T_P_SEP_BY_SPACE
+               | T_N_CS_PRECEDES
+               | T_N_SEP_BY_SPACE
+               | T_P_SIGN_POSN
+               | T_N_SIGN_POSN
+               | T_INT_P_CS_PRECEDES
+               | T_INT_N_CS_PRECEDES
+               | T_INT_P_SEP_BY_SPACE
+               | T_INT_N_SEP_BY_SPACE
+               | T_INT_P_SIGN_POSN
+               | T_INT_N_SIGN_POSN
+               ;
+
+monetary_kw    : monetary_strkw string T_NL
+               {
+                       add_monetary_str(get_wcs());
+               }
+               | monetary_numkw T_NUMBER T_NL
+               {
+                       add_monetary_num($2);
+               }
+               | T_MON_GROUPING mon_group_list T_NL
+               ;
+
+mon_group_list : T_NUMBER
+               {
+                       reset_monetary_group();
+                       add_monetary_group($1);
+               }
+               | mon_group_list T_SEMI T_NUMBER
+               {
+                       add_monetary_group($3);
+               }
+               ;
+
+
+numeric                : T_NUMERIC T_NL numeric_list T_END T_NUMERIC T_NL
+               {
+                       dump_numeric();
+               }
+               | T_NUMERIC T_NL copycat T_END T_NUMERIC T_NL
+               ;
+
+
+numeric_list   : numeric_list numeric_item
+               | numeric_item
+               ;
+
+
+numeric_item   : numeric_strkw string T_NL
+               {
+                       add_numeric_str(get_wcs());
+               }
+               | T_GROUPING group_list T_NL
+               ;
+
+numeric_strkw  : T_DECIMAL_POINT
+               | T_THOUSANDS_SEP
+               ;
+
+
+group_list     : T_NUMBER
+               {
+                       reset_numeric_group();
+                       add_numeric_group($1);
+               }
+               | group_list T_SEMI T_NUMBER
+               {
+                       add_numeric_group($3);
+               }
+               ;
+
+
+time           : T_TIME T_NL time_kwlist T_END T_TIME T_NL
+               {
+                       dump_time();
+               }
+               | T_TIME T_NL copycat T_END T_NUMERIC T_NL
+               ;
+
+time_kwlist    : time_kwlist time_kw
+               | time_kw
+               ;
+
+time_kw                : time_strkw string T_NL
+               {
+                       add_time_str(get_wcs());
+               }
+               | time_listkw time_list T_NL
+               {
+                       check_time_list();
+               }
+               ;
+
+time_listkw    : T_ABDAY
+               | T_DAY
+               | T_ABMON
+               | T_MON
+               | T_ERA
+               | T_ALT_DIGITS
+               | T_AM_PM
+               ;
+
+time_strkw     : T_ERA_D_T_FMT
+               | T_ERA_T_FMT
+               | T_ERA_D_FMT
+               | T_D_T_FMT
+               | T_D_FMT
+               | T_T_FMT
+               | T_T_FMT_AMPM
+               | T_DATE_FMT
+               ;
+
+time_list      : time_list T_SEMI string
+               {
+                       add_time_list(get_wcs());
+               }
+               | string
+               {
+                       reset_time_list();
+                       add_time_list(get_wcs());
+               }
+               ;
diff --git a/usr.bin/localedef/scanner.c b/usr.bin/localedef/scanner.c
new file mode 100644 (file)
index 0000000..cc40fd9
--- /dev/null
@@ -0,0 +1,851 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 DEY Storage Systems, Inc.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * This file contains the "scanner", which tokenizes the input files
+ * for localedef for processing by the higher level grammar processor.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <limits.h>
+#include <string.h>
+#include <wchar.h>
+#include <sys/types.h>
+#include <assert.h>
+#include "localedef.h"
+#include "parser.h"
+
+int                    com_char = '#';
+int                    esc_char = '\\';
+int                    mb_cur_min = 1;
+int                    mb_cur_max = 1;
+int                    lineno = 1;
+int                    warnings = 0;
+int                    is_stdin = 1;
+FILE                   *input;
+static int             nextline;
+//static FILE          *input = stdin;
+static const char      *filename = "<stdin>";
+static int             instring = 0;
+static int             escaped = 0;
+
+/*
+ * Token space ... grows on demand.
+ */
+static char *token = NULL;
+static int tokidx;
+static int toksz = 0;
+static int hadtok = 0;
+
+/*
+ * Wide string space ... grows on demand.
+ */
+static wchar_t *widestr = NULL;
+static int wideidx = 0;
+static int widesz = 0;
+
+/*
+ * The last keyword seen.  This is useful to trigger the special lexer rules
+ * for "copy" and also collating symbols and elements.
+ */
+int    last_kw = 0;
+static int     category = T_END;
+
+static struct token {
+       int id;
+       const char *name;
+} keywords[] = {
+       { T_COM_CHAR,           "comment_char" },
+       { T_ESC_CHAR,           "escape_char" },
+       { T_END,                "END" },
+       { T_COPY,               "copy" },
+       { T_MESSAGES,           "LC_MESSAGES" },
+       { T_YESSTR,             "yesstr" },
+       { T_YESEXPR,            "yesexpr" },
+       { T_NOSTR,              "nostr" },
+       { T_NOEXPR,             "noexpr" },
+       { T_MONETARY,           "LC_MONETARY" },
+       { T_INT_CURR_SYMBOL,    "int_curr_symbol" },
+       { T_CURRENCY_SYMBOL,    "currency_symbol" },
+       { T_MON_DECIMAL_POINT,  "mon_decimal_point" },
+       { T_MON_THOUSANDS_SEP,  "mon_thousands_sep" },
+       { T_POSITIVE_SIGN,      "positive_sign" },
+       { T_NEGATIVE_SIGN,      "negative_sign" },
+       { T_MON_GROUPING,       "mon_grouping" },
+       { T_INT_FRAC_DIGITS,    "int_frac_digits" },
+       { T_FRAC_DIGITS,        "frac_digits" },
+       { T_P_CS_PRECEDES,      "p_cs_precedes" },
+       { T_P_SEP_BY_SPACE,     "p_sep_by_space" },
+       { T_N_CS_PRECEDES,      "n_cs_precedes" },
+       { T_N_SEP_BY_SPACE,     "n_sep_by_space" },
+       { T_P_SIGN_POSN,        "p_sign_posn" },
+       { T_N_SIGN_POSN,        "n_sign_posn" },
+       { T_INT_P_CS_PRECEDES,  "int_p_cs_precedes" },
+       { T_INT_N_CS_PRECEDES,  "int_n_cs_precedes" },
+       { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" },
+       { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" },
+       { T_INT_P_SIGN_POSN,    "int_p_sign_posn" },
+       { T_INT_N_SIGN_POSN,    "int_n_sign_posn" },
+       { T_COLLATE,            "LC_COLLATE" },
+       { T_COLLATING_SYMBOL,   "collating-symbol" },
+       { T_COLLATING_ELEMENT,  "collating-element" },
+       { T_FROM,               "from" },
+       { T_ORDER_START,        "order_start" },
+       { T_ORDER_END,          "order_end" },
+       { T_FORWARD,            "forward" },
+       { T_BACKWARD,           "backward" },
+       { T_POSITION,           "position" },
+       { T_IGNORE,             "IGNORE" },
+       { T_UNDEFINED,          "UNDEFINED" },
+       { T_NUMERIC,            "LC_NUMERIC" },
+       { T_DECIMAL_POINT,      "decimal_point" },
+       { T_THOUSANDS_SEP,      "thousands_sep" },
+       { T_GROUPING,           "grouping" },
+       { T_TIME,               "LC_TIME" },
+       { T_ABDAY,              "abday" },
+       { T_DAY,                "day" },
+       { T_ABMON,              "abmon" },
+       { T_MON,                "mon" },
+       { T_D_T_FMT,            "d_t_fmt" },
+       { T_D_FMT,              "d_fmt" },
+       { T_T_FMT,              "t_fmt" },
+       { T_AM_PM,              "am_pm" },
+       { T_T_FMT_AMPM,         "t_fmt_ampm" },
+       { T_ERA,                "era" },
+       { T_ERA_D_FMT,          "era_d_fmt" },
+       { T_ERA_T_FMT,          "era_t_fmt" },
+       { T_ERA_D_T_FMT,        "era_d_t_fmt" },
+       { T_ALT_DIGITS,         "alt_digits" },
+       { T_CTYPE,              "LC_CTYPE" },
+       { T_ISUPPER,            "upper" },
+       { T_ISLOWER,            "lower" },
+       { T_ISALPHA,            "alpha" },
+       { T_ISDIGIT,            "digit" },
+       { T_ISPUNCT,            "punct" },
+       { T_ISXDIGIT,           "xdigit" },
+       { T_ISSPACE,            "space" },
+       { T_ISPRINT,            "print" },
+       { T_ISGRAPH,            "graph" },
+       { T_ISBLANK,            "blank" },
+       { T_ISCNTRL,            "cntrl" },
+       /*
+        * These entries are local additions, and not specified by
+        * TOG.  Note that they are not guaranteed to be accurate for
+        * all locales, and so applications should not depend on them.
+        */
+       { T_ISSPECIAL,          "special" },
+       { T_ISENGLISH,          "english" },
+       { T_ISPHONOGRAM,        "phonogram" },
+       { T_ISIDEOGRAM,         "ideogram" },
+       { T_ISNUMBER,           "number" },
+       /*
+        * We have to support this in the grammar, but it would be a
+        * syntax error to define a character as one of these without
+        * also defining it as an alpha or digit.  We ignore it in our
+        * parsing.
+        */
+       { T_ISALNUM,            "alnum" },
+       { T_TOUPPER,            "toupper" },
+       { T_TOLOWER,            "tolower" },
+
+       /*
+        * These are keywords used in the charmap file.  Note that
+        * Solaris orginally used angle brackets to wrap some of them,
+        * but we removed that to simplify our parser.  The first of these
+        * items are "global items."
+        */
+       { T_CHARMAP,            "CHARMAP" },
+       { T_WIDTH,              "WIDTH" },
+
+       { -1, NULL },
+};
+
+/*
+ * These special words are only used in a charmap file, enclosed in <>.
+ */
+static struct token symwords[] = {
+       { T_COM_CHAR,           "comment_char" },
+       { T_ESC_CHAR,           "escape_char" },
+       { T_CODE_SET,           "code_set_name" },
+       { T_MB_CUR_MAX,         "mb_cur_max" },
+       { T_MB_CUR_MIN,         "mb_cur_min" },
+       { -1, NULL },
+};
+
+static int categories[] = {
+       T_CHARMAP,
+       T_CTYPE,
+       T_COLLATE,
+       T_MESSAGES,
+       T_MONETARY,
+       T_NUMERIC,
+       T_TIME,
+       T_WIDTH,
+       0
+};
+
+void
+reset_scanner(const char *fname)
+{
+       if (fname == NULL) {
+               filename = "<stdin>";
+               is_stdin = 1;
+       } else {
+               if (!is_stdin)
+                       (void) fclose(input);
+               if ((input = fopen(fname, "r")) == NULL) {
+                       perror("fopen");
+                       exit(4);
+               } else {
+                       is_stdin = 0;
+               }
+               filename = fname;
+       }
+       com_char = '#';
+       esc_char = '\\';
+       instring = 0;
+       escaped = 0;
+       lineno = 1;
+       nextline = 1;
+       tokidx = 0;
+       wideidx = 0;
+}
+
+#define        hex(x)  \
+       (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10))
+#define        isodigit(x)     ((x >= '0') && (x <= '7'))
+
+static int
+scanc(void)
+{
+       int     c;
+
+       if (is_stdin)
+               c = getc(stdin);
+       else
+               c = getc(input);
+       lineno = nextline;
+       if (c == '\n') {
+               nextline++;
+       }
+       return (c);
+}
+
+static void
+unscanc(int c)
+{
+       if (c == '\n') {
+               nextline--;
+       }
+       if (ungetc(c, is_stdin ? stdin : input) < 0) {
+               yyerror("ungetc failed");
+       }
+}
+
+static int
+scan_hex_byte(void)
+{
+       int     c1, c2;
+       int     v;
+
+       c1 = scanc();
+       if (!isxdigit(c1)) {
+               yyerror("malformed hex digit");
+               return (0);
+       }
+       c2 = scanc();
+       if (!isxdigit(c2)) {
+               yyerror("malformed hex digit");
+               return (0);
+       }
+       v = ((hex(c1) << 4) | hex(c2));
+       return (v);
+}
+
+static int
+scan_dec_byte(void)
+{
+       int     c1, c2, c3;
+       int     b;
+
+       c1 = scanc();
+       if (!isdigit(c1)) {
+               yyerror("malformed decimal digit");
+               return (0);
+       }
+       b = c1 - '0';
+       c2 = scanc();
+       if (!isdigit(c2)) {
+               yyerror("malformed decimal digit");
+               return (0);
+       }
+       b *= 10;
+       b += (c2 - '0');
+       c3 = scanc();
+       if (!isdigit(c3)) {
+               unscanc(c3);
+       } else {
+               b *= 10;
+               b += (c3 - '0');
+       }
+       return (b);
+}
+
+static int
+scan_oct_byte(void)
+{
+       int c1, c2, c3;
+       int     b;
+
+       b = 0;
+
+       c1 = scanc();
+       if (!isodigit(c1)) {
+               yyerror("malformed octal digit");
+               return (0);
+       }
+       b = c1 - '0';
+       c2 = scanc();
+       if (!isodigit(c2)) {
+               yyerror("malformed octal digit");
+               return (0);
+       }
+       b *= 8;
+       b += (c2 - '0');
+       c3 = scanc();
+       if (!isodigit(c3)) {
+               unscanc(c3);
+       } else {
+               b *= 8;
+               b += (c3 - '0');
+       }
+       return (b);
+}
+
+void
+add_tok(int c)
+{
+       if ((tokidx + 1) >= toksz) {
+               toksz += 64;
+               if ((token = realloc(token, toksz)) == NULL) {
+                       yyerror("out of memory");
+                       tokidx = 0;
+                       toksz = 0;
+                       return;
+               }
+       }
+
+       token[tokidx++] = (char)c;
+       token[tokidx] = 0;
+}
+void
+add_wcs(wchar_t c)
+{
+       if ((wideidx + 1) >= widesz) {
+               widesz += 64;
+               widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
+               if (widestr == NULL) {
+                       yyerror("out of memory");
+                       wideidx = 0;
+                       widesz = 0;
+                       return;
+               }
+       }
+
+       widestr[wideidx++] = c;
+       widestr[wideidx] = 0;
+}
+
+wchar_t *
+get_wcs(void)
+{
+       wchar_t *ws = widestr;
+       wideidx = 0;
+       widestr = NULL;
+       widesz = 0;
+       if (ws == NULL) {
+               if ((ws = wcsdup(L"")) == NULL) {
+                       yyerror("out of memory");
+               }
+       }
+       return (ws);
+}
+
+static int
+get_byte(void)
+{
+       int     c;
+
+       if ((c = scanc()) != esc_char) {
+               unscanc(c);
+               return (EOF);
+       }
+       c = scanc();
+
+       switch (c) {
+       case 'd':
+       case 'D':
+               return (scan_dec_byte());
+       case 'x':
+       case 'X':
+               return (scan_hex_byte());
+       case '0':
+       case '1':
+       case '2':
+       case '3':
+       case '4':
+       case '5':
+       case '6':
+       case '7':
+               /* put the character back so we can get it */
+               unscanc(c);
+               return (scan_oct_byte());
+       default:
+               unscanc(c);
+               unscanc(esc_char);
+               return (EOF);
+       }
+}
+
+int
+get_escaped(int c)
+{
+       switch (c) {
+       case 'n':
+               return ('\n');
+       case 'r':
+               return ('\r');
+       case 't':
+               return ('\t');
+       case 'f':
+               return ('\f');
+       case 'v':
+               return ('\v');
+       case 'b':
+               return ('\b');
+       case 'a':
+               return ('\a');
+       default:
+               return (c);
+       }
+}
+
+int
+get_wide(void)
+{
+       static char mbs[MB_LEN_MAX + 1] = "";
+       static int mbi = 0;
+       int c;
+       wchar_t wc;
+
+       if (mb_cur_max >= (int)sizeof (mbs)) {
+               yyerror("max multibyte character size too big");
+               mbi = 0;
+               return (T_NULL);
+       }
+       for (;;) {
+               if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
+                       /*
+                        * end of the byte sequence reached, but no
+                        * valid wide decoding.  fatal error.
+                        */
+                       mbi = 0;
+                       yyerror("not a valid character encoding");
+                       return (T_NULL);
+               }
+               mbs[mbi++] = c;
+               mbs[mbi] = 0;
+
+               /* does it decode? */
+               if (to_wide(&wc, mbs) >= 0) {
+                       break;
+               }
+       }
+
+       mbi = 0;
+       if ((category != T_CHARMAP) && (category != T_WIDTH)) {
+               if (check_charmap(wc) < 0) {
+                       yyerror("no symbolic name for character");
+                       return (T_NULL);
+               }
+       }
+
+       yylval.wc = wc;
+       return (T_CHAR);
+}
+
+int
+get_symbol(void)
+{
+       int     c;
+
+       while ((c = scanc()) != EOF) {
+               if (escaped) {
+                       escaped = 0;
+                       if (c == '\n')
+                               continue;
+                       add_tok(get_escaped(c));
+                       continue;
+               }
+               if (c == esc_char) {
+                       escaped = 1;
+                       continue;
+               }
+               if (c == '\n') {        /* well that's strange! */
+                       yyerror("unterminated symbolic name");
+                       continue;
+               }
+               if (c == '>') {         /* end of symbol */
+
+                       /*
+                        * This restarts the token from the beginning
+                        * the next time we scan a character.  (This
+                        * token is complete.)
+                        */
+
+                       if (token == NULL) {
+                               yyerror("missing symbolic name");
+                               return (T_NULL);
+                       }
+                       tokidx = 0;
+
+                       /*
+                        * A few symbols are handled as keywords outside
+                        * of the normal categories.
+                        */
+                       if (category == T_END) {
+                               int i;
+                               for (i = 0; symwords[i].name != 0; i++) {
+                                       if (strcmp(token, symwords[i].name) ==
+                                           0) {
+                                               last_kw = symwords[i].id;
+                                               return (last_kw);
+                                       }
+                               }
+                       }
+                       /*
+                        * Contextual rule: Only literal characters are
+                        * permitted in CHARMAP.  Anywhere else the symbolic
+                        * forms are fine.
+                        */
+                       if ((category != T_CHARMAP) &&
+                           (lookup_charmap(token, &yylval.wc)) != -1) {
+                               return (T_CHAR);
+                       }
+                       if ((yylval.collsym = lookup_collsym(token)) != NULL) {
+                               return (T_COLLSYM);
+                       }
+                       if ((yylval.collelem = lookup_collelem(token)) !=
+                           NULL) {
+                               return (T_COLLELEM);
+                       }
+                       /* its an undefined symbol */
+                       yylval.token = strdup(token);
+                       token = NULL;
+                       toksz = 0;
+                       tokidx = 0;
+                       return (T_SYMBOL);
+               }
+               add_tok(c);
+       }
+
+       yyerror("unterminated symbolic name");
+       return (EOF);
+}
+
+int
+get_category(void)
+{
+       return (category);
+}
+
+static int
+consume_token(void)
+{
+       int     len = tokidx;
+       int     i;
+
+       tokidx = 0;
+       if (token == NULL)
+               return (T_NULL);
+
+       /*
+        * this one is special, because we don't want it to alter the
+        * last_kw field.
+        */
+       if (strcmp(token, "...") == 0) {
+               return (T_ELLIPSIS);
+       }
+
+       /* search for reserved words first */
+       for (i = 0; keywords[i].name; i++) {
+               int j;
+               if (strcmp(keywords[i].name, token) != 0) {
+                       continue;
+               }
+
+               last_kw = keywords[i].id;
+
+               /* clear the top level category if we're done with it */
+               if (last_kw == T_END) {
+                       category = T_END;
+               }
+
+               /* set the top level category if we're changing */
+               for (j = 0; categories[j]; j++) {
+                       if (categories[j] != last_kw)
+                               continue;
+                       category = last_kw;
+               }
+
+               return (keywords[i].id);
+       }
+
+       /* maybe its a numeric constant? */
+       if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
+               char *eptr;
+               yylval.num = strtol(token, &eptr, 10);
+               if (*eptr != 0)
+                       yyerror("malformed number");
+               return (T_NUMBER);
+       }
+
+       /*
+        * A single lone character is treated as a character literal.
+        * To avoid duplication of effort, we stick in the charmap.
+        */
+       if (len == 1) {
+               yylval.wc = token[0];
+               return (T_CHAR);
+       }
+
+       /* anything else is treated as a symbolic name */
+       yylval.token = strdup(token);
+       token = NULL;
+       toksz = 0;
+       tokidx = 0;
+       return (T_NAME);
+}
+
+void
+scan_to_eol(void)
+{
+       int     c;
+       while ((c = scanc()) != '\n') {
+               if (c == EOF) {
+                       /* end of file without newline! */
+                       errf("missing newline");
+                       return;
+               }
+       }
+       assert(c == '\n');
+}
+
+int
+yylex(void)
+{
+       int             c;
+
+       while ((c = scanc()) != EOF) {
+
+               /* special handling for quoted string */
+               if (instring) {
+                       if (escaped) {
+                               escaped = 0;
+
+                               /* if newline, just eat and forget it */
+                               if (c == '\n')
+                                       continue;
+
+                               if (strchr("xXd01234567", c)) {
+                                       unscanc(c);
+                                       unscanc(esc_char);
+                                       return (get_wide());
+                               }
+                               yylval.wc = get_escaped(c);
+                               return (T_CHAR);
+                       }
+                       if (c == esc_char) {
+                               escaped = 1;
+                               continue;
+                       }
+                       switch (c) {
+                       case '<':
+                               return (get_symbol());
+                       case '>':
+                               /* oops! should generate syntax error  */
+                               return (T_GT);
+                       case '"':
+                               instring = 0;
+                               return (T_QUOTE);
+                       default:
+                               yylval.wc = c;
+                               return (T_CHAR);
+                       }
+               }
+
+               /* escaped characters first */
+               if (escaped) {
+                       escaped = 0;
+                       if (c == '\n') {
+                               /* eat the newline */
+                               continue;
+                       }
+                       hadtok = 1;
+                       if (tokidx) {
+                               /* an escape mid-token is nonsense */
+                               return (T_NULL);
+                       }
+
+                       /* numeric escapes are treated as wide characters */
+                       if (strchr("xXd01234567", c)) {
+                               unscanc(c);
+                               unscanc(esc_char);
+                               return (get_wide());
+                       }
+
+                       add_tok(get_escaped(c));
+                       continue;
+               }
+
+               /* if it is the escape charter itself note it */
+               if (c == esc_char) {
+                       escaped = 1;
+                       continue;
+               }
+
+               /* remove from the comment char to end of line */
+               if (c == com_char) {
+                       while (c != '\n') {
+                               if ((c = scanc()) == EOF) {
+                                       /* end of file without newline! */
+                                       return (EOF);
+                               }
+                       }
+                       assert(c == '\n');
+                       if (!hadtok) {
+                               /*
+                                * If there were no tokens on this line,
+                                * then just pretend it didn't exist at all.
+                                */
+                               continue;
+                       }
+                       hadtok = 0;
+                       return (T_NL);
+               }
+
+               if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
+                       /*
+                        * These are all token delimiters.  If there
+                        * is a token already in progress, we need to
+                        * process it.
+                        */
+                       unscanc(c);
+                       return (consume_token());
+               }
+
+               switch (c) {
+               case '\n':
+                       if (!hadtok) {
+                               /*
+                                * If the line was completely devoid of tokens,
+                                * then just ignore it.
+                                */
+                               continue;
+                       }
+                       /* we're starting a new line, reset the token state */
+                       hadtok = 0;
+                       return (T_NL);
+               case ',':
+                       hadtok = 1;
+                       return (T_COMMA);
+               case ';':
+                       hadtok = 1;
+                       return (T_SEMI);
+               case '(':
+                       hadtok = 1;
+                       return (T_LPAREN);
+               case ')':
+                       hadtok = 1;
+                       return (T_RPAREN);
+               case '>':
+                       hadtok = 1;
+                       return (T_GT);
+               case '<':
+                       /* symbol start! */
+                       hadtok = 1;
+                       return (get_symbol());
+               case ' ':
+               case '\t':
+                       /* whitespace, just ignore it */
+                       continue;
+               case '"':
+                       hadtok = 1;
+                       instring = 1;
+                       return (T_QUOTE);
+               default:
+                       hadtok = 1;
+                       add_tok(c);
+                       continue;
+               }
+       }
+       return (EOF);
+}
+
+void
+yyerror(const char *msg)
+{
+       (void) fprintf(stderr, "%s: %d: error: %s\n",
+           filename, lineno, msg);
+       exit(4);
+}
+
+void
+errf(const char *fmt, ...)
+{
+       char    *msg;
+
+       va_list va;
+       va_start(va, fmt);
+       (void) vasprintf(&msg, fmt, va);
+       va_end(va);
+
+       (void) fprintf(stderr, "%s: %d: error: %s\n",
+           filename, lineno, msg);
+       free(msg);
+       exit(4);
+}
+
+void
+warn(const char *fmt, ...)
+{
+       char    *msg;
+
+       va_list va;
+       va_start(va, fmt);
+       (void) vasprintf(&msg, fmt, va);
+       va_end(va);
+
+       (void) fprintf(stderr, "%s: %d: warning: %s\n",
+           filename, lineno, msg);
+       free(msg);
+       warnings++;
+       if (!warnok)
+               exit(4);
+}
diff --git a/usr.bin/localedef/time.c b/usr.bin/localedef/time.c
new file mode 100644 (file)
index 0000000..c4db5f2
--- /dev/null
@@ -0,0 +1,264 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * LC_TIME database generation routines for localedef.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <string.h>
+#include <unistd.h>
+#include "localedef.h"
+#include "parser.h"
+#include "timelocal.h"
+
+struct lc_time_T tm;
+
+void
+init_time(void)
+{
+       (void) memset(&tm, 0, sizeof (tm));
+}
+
+void
+add_time_str(wchar_t *wcs)
+{
+       char    *str;
+
+       if ((str = to_mb_string(wcs)) == NULL) {
+               INTERR;
+               return;
+       }
+       free(wcs);
+
+       switch (last_kw) {
+       case T_D_T_FMT:
+               tm.c_fmt = str;
+               break;
+       case T_D_FMT:
+               tm.x_fmt = str;
+               break;
+       case T_T_FMT:
+               tm.X_fmt = str;
+               break;
+       case T_T_FMT_AMPM:
+               tm.ampm_fmt = str;
+               break;
+       case T_DATE_FMT:
+               /*
+                * This one is a Solaris extension, Too bad date just
+                * doesn't use %c, which would be simpler.
+                */
+               tm.date_fmt = str;
+               break;
+       case T_ERA_D_FMT:
+       case T_ERA_T_FMT:
+       case T_ERA_D_T_FMT:
+               /* Silently ignore it. */
+               break;
+       default:
+               free(str);
+               INTERR;
+               break;
+       }
+}
+
+static void
+add_list(const char *ptr[], char *str, int limit)
+{
+       int     i;
+       for (i = 0; i < limit; i++) {
+               if (ptr[i] == NULL) {
+                       ptr[i] = str;
+                       return;
+               }
+       }
+       fprintf(stderr,"too many list elements");
+}
+
+void
+add_time_list(wchar_t *wcs)
+{
+       char *str;
+
+       if ((str = to_mb_string(wcs)) == NULL) {
+               INTERR;
+               return;
+       }
+       free(wcs);
+
+       switch (last_kw) {
+       case T_ABMON:
+               add_list(tm.mon, str, 12);
+               break;
+       case T_MON:
+               add_list(tm.month, str, 12);
+               break;
+       case T_ABDAY:
+               add_list(tm.wday, str, 7);
+               break;
+       case T_DAY:
+               add_list(tm.weekday, str, 7);
+               break;
+       case T_AM_PM:
+               if (tm.am == NULL) {
+                       tm.am = str;
+               } else if (tm.pm == NULL) {
+                       tm.pm = str;
+               } else {
+                       fprintf(stderr,"too many list elements");
+               }
+               break;
+       case T_ALT_DIGITS:
+       case T_ERA:
+               free(str);
+               break;
+       default:
+               free(str);
+               INTERR;
+               break;
+       }
+}
+
+void
+check_time_list(void)
+{
+       switch (last_kw) {
+       case T_ABMON:
+               if (tm.mon[11] != NULL)
+                       return;
+               break;
+       case T_MON:
+               if (tm.month[11] != NULL)
+                       return;
+               break;
+       case T_ABDAY:
+               if (tm.wday[6] != NULL)
+                       return;
+               break;
+       case T_DAY:
+               if (tm.weekday[6] != NULL)
+                       return;
+               break;
+       case T_AM_PM:
+               if (tm.pm != NULL)
+                       return;
+               break;
+       case T_ERA:
+       case T_ALT_DIGITS:
+               return;
+       default:
+               fprintf(stderr,"unknown list");
+               break;
+       }
+
+       fprintf(stderr,"too few items in list (%d)", last_kw);
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+void
+reset_time_list(void)
+{
+       int i;
+       switch (last_kw) {
+       case T_ABMON:
+               for (i = 0; i < 12; i++) {
+                       free((char *)tm.mon[i]);
+                       tm.mon[i] = NULL;
+               }
+               break;
+       case T_MON:
+               for (i = 0; i < 12; i++) {
+                       free((char *)tm.month[i]);
+                       tm.month[i] = NULL;
+               }
+               break;
+       case T_ABDAY:
+               for (i = 0; i < 7; i++) {
+                       free((char *)tm.wday[i]);
+                       tm.wday[i] = NULL;
+               }
+               break;
+       case T_DAY:
+               for (i = 0; i < 7; i++) {
+                       free((char *)tm.weekday[i]);
+                       tm.weekday[i] = NULL;
+               }
+               break;
+       case T_AM_PM:
+               free((char *)tm.am);
+               tm.am = NULL;
+               free((char *)tm.pm);
+               tm.pm = NULL;
+               break;
+       }
+}
+
+#pragma GCC diagnostic pop
+
+void
+dump_time(void)
+{
+       FILE *f;
+       int i;
+
+       if ((f = open_category()) == NULL) {
+               return;
+       }
+
+       for (i = 0; i < 12; i++) {
+               if (putl_category(tm.mon[i], f) == EOF) {
+                       return;
+               }
+       }
+       for (i = 0; i < 12; i++) {
+               if (putl_category(tm.month[i], f) == EOF) {
+                       return;
+               }
+       }
+       for (i = 0; i < 7; i++) {
+               if (putl_category(tm.wday[i], f) == EOF) {
+                       return;
+               }
+       }
+       for (i = 0; i < 7; i++) {
+               if (putl_category(tm.weekday[i], f) == EOF) {
+                       return;
+               }
+       }
+
+       /*
+        * NOTE: If date_fmt is not specified, then we'll default to
+        * using the %c for date.  This is reasonable for most
+        * locales, although for reasons that I don't understand
+        * Solaris historically has had a seperate format for date.
+        */
+       if ((putl_category(tm.X_fmt, f) == EOF) ||
+           (putl_category(tm.x_fmt, f) == EOF) ||
+           (putl_category(tm.c_fmt, f) == EOF) ||
+           (putl_category(tm.am, f) == EOF) ||
+           (putl_category(tm.pm, f) == EOF) ||
+           (putl_category(tm.date_fmt ? tm.date_fmt : tm.c_fmt, f) == EOF) ||
+           (putl_category(tm.ampm_fmt, f) == EOF)) {
+               return;
+       }
+       close_category(f);
+}
diff --git a/usr.bin/localedef/wide.c b/usr.bin/localedef/wide.c
new file mode 100644 (file)
index 0000000..4456474
--- /dev/null
@@ -0,0 +1,652 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
+ * Copyright 2015 John Marino <draco@marino.st>
+ */
+
+/*
+ * The functions in this file convert from the standard multibyte forms
+ * to the wide character forms used internally by libc.  Unfortunately,
+ * this approach means that we need a method for each and every encoding.
+ */
+
+#include <stdlib.h>
+#include <wchar.h>
+#include <string.h>
+#include <sys/types.h>
+#include "localedef.h"
+
+static int towide_none(wchar_t *, const char *, unsigned);
+static int towide_utf8(wchar_t *, const char *, unsigned);
+static int towide_big5(wchar_t *, const char *, unsigned);
+static int towide_gbk(wchar_t *, const char *, unsigned);
+static int towide_gb2312(wchar_t *, const char *, unsigned);
+static int towide_gb18030(wchar_t *, const char *, unsigned);
+static int towide_mskanji(wchar_t *, const char *, unsigned);
+static int towide_euccn(wchar_t *, const char *, unsigned);
+static int towide_eucjp(wchar_t *, const char *, unsigned);
+static int towide_euckr(wchar_t *, const char *, unsigned);
+static int towide_euctw(wchar_t *, const char *, unsigned);
+
+static int tomb_none(char *, wchar_t);
+static int tomb_utf8(char *, wchar_t);
+static int tomb_mbs(char *, wchar_t);
+
+static int (*_towide)(wchar_t *, const char *, unsigned) = towide_none;
+static int (*_tomb)(char *, wchar_t) = tomb_none;
+static const char *_encoding = "NONE";
+static int _nbits = 7;
+
+/*
+ * Table of supported encodings.  We only bother to list the multibyte
+ * encodings here, because single byte locales are handed by "NONE".
+ */
+static struct {
+       const char *name;
+       /* the name that the underlying libc implemenation uses */
+       const char *cname;
+       /* the maximum number of bits required for priorities */
+       int nbits;
+       int (*towide)(wchar_t *, const char *, unsigned);
+       int (*tomb)(char *, wchar_t);
+} mb_encodings[] = {
+       /*
+        * UTF8 values max out at 0x1fffff (although in theory there could
+        * be later extensions, but it won't happen.)  This means we only need
+        * 21 bits to be able to encode the entire range of priorities.
+        */
+       { "UTF-8",      "UTF-8",        21, towide_utf8, tomb_utf8 },
+       { "UTF8",       "UTF-8",        21, towide_utf8, tomb_utf8 },
+       { "utf8",       "UTF-8",        21, towide_utf8, tomb_utf8 },
+       { "utf-8",      "UTF-8",        21, towide_utf8, tomb_utf8 },
+
+       { "EUC-CN",     "EUC-CN",       16, towide_euccn, tomb_mbs },
+       { "eucCN",      "EUC-CN",       16, towide_euccn, tomb_mbs },
+       /*
+        * Becuase the 3-byte form of EUC-JP use the same leading byte,
+        * only 17 bits required to provide unique priorities.  (The low
+        * bit of that first byte is set.)  By setting this value low,
+        * we can get by with only 3 bytes in the strxfrm expansion.
+        */
+       { "EUC-JP",     "EUC-JP",       17, towide_eucjp, tomb_mbs },
+       { "eucJP",      "EUC-JP",       17, towide_eucjp, tomb_mbs },
+
+       { "EUC-KR",     "EUC-KR",       16, towide_euckr, tomb_mbs },
+       { "eucKR",      "EUC-KR",       16, towide_euckr, tomb_mbs },
+       /*
+        * EUC-TW uses 2 bytes most of the time, but 4 bytes if the
+        * high order byte is 0x8E.  However, with 4 byte encodings,
+        * the third byte will be A0-B0.  So we only need to consider
+        * the lower order 24 bits for collation.
+        */
+       { "EUC-TW",     "EUC-TW",       24, towide_euctw, tomb_mbs },
+       { "eucTW",      "EUC-TW",       24, towide_euctw, tomb_mbs },
+
+       { "MS_Kanji",   "MSKanji",      16, towide_mskanji, tomb_mbs },
+       { "MSKanji",    "MSKanji",      16, towide_mskanji, tomb_mbs },
+       { "PCK",        "MSKanji",      16, towide_mskanji, tomb_mbs },
+       { "SJIS",       "MSKanji",      16, towide_mskanji, tomb_mbs },
+       { "Shift_JIS",  "MSKanji",      16, towide_mskanji, tomb_mbs },
+
+       { "BIG5",       "BIG5",         16, towide_big5, tomb_mbs },
+       { "big5",       "BIG5",         16, towide_big5, tomb_mbs },
+       { "Big5",       "BIG5",         16, towide_big5, tomb_mbs },
+
+       { "GBK",        "GBK",          16, towide_gbk, tomb_mbs },
+
+       /*
+        * GB18030 can get away with just 31 bits.  This is because the
+        * high order bit is always set for 4 byte values, and the
+        * at least one of the other bits in that 4 byte value will
+        * be non-zero.
+        */
+       { "GB18030",    "GB18030",      31, towide_gb18030, tomb_mbs },
+
+       /*
+        * This should probably be an aliase for euc-cn, or vice versa.
+        */
+       { "GB2312",     "GB2312",       16, towide_gb2312, tomb_mbs },
+
+       { NULL, NULL, 0, 0, 0 },
+};
+
+static char *
+show_mb(const char *mb)
+{
+       static char buf[64];
+
+       /* ASCII stuff we just print */
+       if (isascii(*mb) && isgraph(*mb)) {
+               buf[0] = *mb;
+               buf[1] = 0;
+               return (buf);
+       }
+       buf[0] = 0;
+       while (*mb != 0) {
+               char scr[8];
+               (void) snprintf(scr, sizeof (scr), "\\x%02x", *mb);
+               (void) strlcat(buf, scr, sizeof (buf));
+               mb++;
+       }
+       return (buf);
+}
+
+static char    *widemsg;
+
+void
+werr(const char *fmt, ...)
+{
+       char    *msg;
+
+       va_list va;
+       va_start(va, fmt);
+       (void) vasprintf(&msg, fmt, va);
+       va_end(va);
+
+       free(widemsg);
+       widemsg = msg;
+}
+
+/*
+ * This is used for 8-bit encodings.
+ */
+int
+towide_none(wchar_t *c, const char *mb, unsigned n __unused)
+{
+       if (mb_cur_max != 1) {
+               werr("invalid or unsupported multibyte locale");
+               return (-1);
+       }
+       *c = (uint8_t)*mb;
+       return (1);
+}
+
+int
+tomb_none(char *mb, wchar_t wc)
+{
+       if (mb_cur_max != 1) {
+               werr("invalid or unsupported multibyte locale");
+               return (-1);
+       }
+       *(uint8_t *)mb = (wc & 0xff);
+       mb[1] = 0;
+       return (1);
+}
+
+/*
+ * UTF-8 stores wide characters in UTF-32 form.
+ */
+int
+towide_utf8(wchar_t *wc, const char *mb, unsigned n)
+{
+       wchar_t c;
+       int     nb;
+       int     lv;     /* lowest legal value */
+       int     i;
+       const uint8_t *s = (const uint8_t *)mb;
+
+       c = *s;
+
+       if ((c & 0x80) == 0) {
+               /* 7-bit ASCII */
+               *wc = c;
+               return (1);
+       } else if ((c & 0xe0) == 0xc0) {
+               /* u80-u7ff - two bytes encoded */
+               nb = 2;
+               lv = 0x80;
+               c &= ~0xe0;
+       } else if ((c & 0xf0) == 0xe0) {
+               /* u800-uffff - three bytes encoded */
+               nb = 3;
+               lv = 0x800;
+               c &= ~0xf0;
+       } else if ((c & 0xf8) == 0xf0) {
+               /* u1000-u1fffff - four bytes encoded */
+               nb = 4;
+               lv = 0x1000;
+               c &= ~0xf8;
+       } else {
+               /* 5 and 6 byte encodings are not legal unicode */
+               werr("utf8 encoding too large (%s)", show_mb(mb));
+               return (-1);
+       }
+       if (nb > (int)n) {
+               werr("incomplete utf8 sequence (%s)", show_mb(mb));
+               return (-1);
+       }
+
+       for (i = 1; i < nb; i++) {
+               if (((s[i]) & 0xc0) != 0x80) {
+                       werr("illegal utf8 byte (%x)", s[i]);
+                       return (-1);
+               }
+               c <<= 6;
+               c |= (s[i] & 0x3f);
+       }
+
+       if (c < lv) {
+               werr("illegal redundant utf8 encoding (%s)", show_mb(mb));
+               return (-1);
+       }
+       *wc = c;
+       return (nb);
+}
+
+int
+tomb_utf8(char *mb, wchar_t wc)
+{
+       uint8_t *s = (uint8_t *)mb;
+       uint8_t msk;
+       int cnt;
+       int i;
+
+       if (wc <= 0x7f) {
+               s[0] = wc & 0x7f;
+               s[1] = 0;
+               return (1);
+       }
+       if (wc <= 0x7ff) {
+               cnt = 2;
+               msk = 0xc0;
+       } else if (wc <= 0xffff) {
+               cnt = 3;
+               msk = 0xe0;
+       } else if (wc <= 0x1fffff) {
+               cnt = 4;
+               msk = 0xf0;
+       } else {
+               werr("illegal uf8 char (%x)", wc);
+               return (-1);
+       }
+       for (i = cnt - 1; i; i--) {
+               s[i] = (wc & 0x3f) | 0x80;
+               wc >>= 6;
+       }
+       s[0] = (msk) | wc;
+       s[cnt] = 0;
+       return (cnt);
+}
+
+/*
+ * Several encodings share a simplistic dual byte encoding.  In these
+ * forms, they all indicate that a two byte sequence is to be used if
+ * the first byte has its high bit set.  They all store this simple
+ * encoding as a 16-bit value, although a great many of the possible
+ * code points are not used in most character sets.  This gives a possible
+ * set of just over 32,000 valid code points.
+ *
+ * 0x00 - 0x7f         - 1 byte encoding
+ * 0x80 - 0x7fff       - illegal
+ * 0x8000 - 0xffff     - 2 byte encoding
+ */
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+static int
+towide_dbcs(wchar_t *wc, const char *mb, unsigned n)
+{
+       wchar_t c;
+
+       c = *(uint8_t *)mb;
+
+       if ((c & 0x80) == 0) {
+               /* 7-bit */
+               *wc = c;
+               return (1);
+       }
+       if (n < 2) {
+               werr("incomplete character sequence (%s)", show_mb(mb));
+               return (-1);
+       }
+
+       /* Store both bytes as a single 16-bit wide. */
+       c <<= 8;
+       c |= (uint8_t)(mb[1]);
+       *wc = c;
+       return (2);
+}
+
+/*
+ * Most multibyte locales just convert the wide character to the multibyte
+ * form by stripping leading null bytes, and writing the 32-bit quantity
+ * in big-endian order.
+ */
+int
+tomb_mbs(char *mb, wchar_t wc)
+{
+       uint8_t *s = (uint8_t *)mb;
+       int     n = 0, c;
+
+       if ((wc & 0xff000000U) != 0) {
+               n = 4;
+       } else if ((wc & 0x00ff0000U) != 0) {
+               n = 3;
+       } else if ((wc & 0x0000ff00U) != 0) {
+               n = 2;
+       } else {
+               n = 1;
+       }
+       c = n;
+       while (n) {
+               n--;
+               s[n] = wc & 0xff;
+               wc >>= 8;
+       }
+       /* ensure null termination */
+       s[c] = 0;
+       return (c);
+}
+
+
+/*
+ * big5 is a simple dual byte character set.
+ */
+int
+towide_big5(wchar_t *wc, const char *mb, unsigned n)
+{
+       return (towide_dbcs(wc, mb, n));
+}
+
+/*
+ * GBK encodes wides in the same way that big5 does, the high order
+ * bit of the first byte indicates a double byte character.
+ */
+int
+towide_gbk(wchar_t *wc, const char *mb, unsigned n)
+{
+       return (towide_dbcs(wc, mb, n));
+}
+
+/*
+ * GB2312 is another DBCS.  Its cleaner than others in that the second
+ * byte does not encode ASCII, but it supports characters.
+ */
+int
+towide_gb2312(wchar_t *wc, const char *mb, unsigned n)
+{
+       return (towide_dbcs(wc, mb, n));
+}
+
+/*
+ * GB18030.  This encodes as 8, 16, or 32-bits.
+ * 7-bit values are in 1 byte,  4 byte sequences are used when
+ * the second byte encodes 0x30-39 and all other sequences are 2 bytes.
+ */
+int
+towide_gb18030(wchar_t *wc, const char *mb, unsigned n)
+{
+       wchar_t c;
+
+       c = *(uint8_t *)mb;
+
+       if ((c & 0x80) == 0) {
+               /* 7-bit */
+               *wc = c;
+               return (1);
+       }
+       if (n < 2) {
+               werr("incomplete character sequence (%s)", show_mb(mb));
+               return (-1);
+       }
+
+       /* pull in the second byte */
+       c <<= 8;
+       c |= (uint8_t)(mb[1]);
+
+       if (((c & 0xff) >= 0x30) && ((c & 0xff) <= 0x39)) {
+               if (n < 4) {
+                       werr("incomplete 4-byte character sequence (%s)",
+                           show_mb(mb));
+                       return (-1);
+               }
+               c <<= 8;
+               c |= (uint8_t)(mb[2]);
+               c <<= 8;
+               c |= (uint8_t)(mb[3]);
+               *wc = c;
+               return (4);
+       }
+
+       *wc = c;
+       return (2);
+}
+
+/*
+ * MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it
+ * also has a range of single byte characters above 0x80.  (0xa1-0xdf).
+ */
+int
+towide_mskanji(wchar_t *wc, const char *mb, unsigned n)
+{
+       wchar_t c;
+
+       c = *(uint8_t *)mb;
+
+       if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) {
+               /* 7-bit */
+               *wc = c;
+               return (1);
+       }
+
+       if (n < 2) {
+               werr("incomplete character sequence (%s)", show_mb(mb));
+               return (-1);
+       }
+
+       /* Store both bytes as a single 16-bit wide. */
+       c <<= 8;
+       c |= (uint8_t)(mb[1]);
+       *wc = c;
+       return (2);
+}
+
+/*
+ * EUC forms.  EUC encodings are "variable".  FreeBSD carries some additional
+ * variable data to encode these, but we're going to treat each as independent
+ * instead.  Its the only way we can sensibly move forward.
+ *
+ * Note that the way in which the different EUC forms vary is how wide
+ * CS2 and CS3 are and what the first byte of them is.
+ */
+static int
+towide_euc_impl(wchar_t *wc, const char *mb, unsigned n,
+    uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
+{
+       int i;
+       int width = 2;
+       wchar_t c;
+
+       c = *(uint8_t *)mb;
+
+       /*
+        * All variations of EUC encode 7-bit ASCII as one byte, and use
+        * additional bytes for more than that.
+        */
+       if ((c & 0x80) == 0) {
+               /* 7-bit */
+               *wc = c;
+               return (1);
+       }
+
+       /*
+        * All EUC variants reserve 0xa1-0xff to identify CS1, which
+        * is always two bytes wide.  Note that unused CS will be zero,
+        * and that cannot be true because we know that the high order
+        * bit must be set.
+        */
+       if (c >= 0xa1) {
+               width = 2;
+       } else if (c == cs2) {
+               width = cs2width;
+       } else if (c == cs3) {
+               width = cs3width;
+       }
+
+       if ((int)n < width) {
+               werr("incomplete character sequence (%s)", show_mb(mb));
+               return (-1);
+       }
+
+       for (i = 1; i < width; i++) {
+               /* pull in the next byte */
+               c <<= 8;
+               c |= (uint8_t)(mb[i]);
+       }
+
+       *wc = c;
+       return (width);
+}
+
+#pragma GCC diagnostic pop
+
+/*
+ * EUC-CN encodes as follows:
+ *
+ * Code set 0 (ASCII):                         0x21-0x7E
+ * Code set 1 (CNS 11643-1992 Plane 1):                0xA1A1-0xFEFE
+ * Code set 2:                                 unused
+ * Code set 3:                                 unused
+ */
+int
+towide_euccn(wchar_t *wc, const char *mb, unsigned n)
+{
+       return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0));
+}
+
+/*
+ * EUC-JP encodes as follows:
+ *
+ * Code set 0 (ASCII or JIS X 0201-1976 Roman):        0x21-0x7E
+ * Code set 1 (JIS X 0208):                    0xA1A1-0xFEFE
+ * Code set 2 (half-width katakana):           0x8EA1-0x8EDF
+ * Code set 3 (JIS X 0212-1990):               0x8FA1A1-0x8FFEFE
+ */
+int
+towide_eucjp(wchar_t *wc, const char *mb, unsigned n)
+{
+       return (towide_euc_impl(wc, mb, n, 0x8e, 2, 0x8f, 3));
+}
+
+/*
+ * EUC-KR encodes as follows:
+ *
+ * Code set 0 (ASCII or KS C 5636-1993):       0x21-0x7E
+ * Code set 1 (KS C 5601-1992):                        0xA1A1-0xFEFE
+ * Code set 2:                                 unused
+ * Code set 3:                                 unused
+ */
+int
+towide_euckr(wchar_t *wc, const char *mb, unsigned n)
+{
+       return (towide_euc_impl(wc, mb, n, 0, 0, 0, 0));
+}
+
+/*
+ * EUC-TW encodes as follows:
+ *
+ * Code set 0 (ASCII):                         0x21-0x7E
+ * Code set 1 (CNS 11643-1992 Plane 1):                0xA1A1-0xFEFE
+ * Code set 2 (CNS 11643-1992 Planes 1-16):    0x8EA1A1A1-0x8EB0FEFE
+ * Code set 3:                                 unused
+ */
+int
+towide_euctw(wchar_t *wc, const char *mb, unsigned n)
+{
+       return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0));
+}
+
+/*
+ * Public entry points.
+ */
+
+int
+to_wide(wchar_t *wc, const char *mb)
+{
+       /* this won't fail hard */
+       return (_towide(wc, mb, strlen(mb)));
+}
+
+int
+to_mb(char *mb, wchar_t wc)
+{
+       int     rv;
+
+       if ((rv = _tomb(mb, wc)) < 0) {
+               errf(widemsg);
+               free(widemsg);
+               widemsg = NULL;
+       }
+       return (rv);
+}
+
+char *
+to_mb_string(const wchar_t *wcs)
+{
+       char    *mbs;
+       char    *ptr;
+       int     len;
+
+       mbs = malloc((wcslen(wcs) * mb_cur_max) + 1);
+       if (mbs == NULL) {
+               errf("out of memory");
+               return (NULL);
+       }
+       ptr = mbs;
+       while (*wcs) {
+               if ((len = to_mb(ptr, *wcs)) < 0) {
+                       INTERR;
+                       free(mbs);
+                       return (NULL);
+               }
+               wcs++;
+               ptr += len;
+       }
+       *ptr = 0;
+       return (mbs);
+}
+
+void
+set_wide_encoding(const char *encoding)
+{
+       int i;
+
+       _towide = towide_none;
+       _tomb = tomb_none;
+       _encoding = "NONE";
+       _nbits = 8;
+
+       for (i = 0; mb_encodings[i].name; i++) {
+               if (strcasecmp(encoding, mb_encodings[i].name) == 0) {
+                       _towide = mb_encodings[i].towide;
+                       _tomb = mb_encodings[i].tomb;
+                       _encoding = mb_encodings[i].cname;
+                       _nbits = mb_encodings[i].nbits;
+                       break;
+               }
+       }
+}
+
+const char *
+get_wide_encoding(void)
+{
+       return (_encoding);
+}
+
+int
+max_wide(void)
+{
+       return ((int)((1U << _nbits) - 1));
+}