usr.bin/localedef/collate.c

   1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11
  12 /*
  13  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
  14  * Copyright 2015 John Marino <draco@marino.st>
  15  */
  16
  17 /*
  18  * LC_COLLATE database generation routines for localedef.
  19  */
  20
  21 #include <stdio.h>
  22 #include <stddef.h>
  23 #include <stdlib.h>
  24 #include <errno.h>
  25 #include <string.h>
  26 #include <sys/types.h>
  27 #include <string.h>
  28 #include <unistd.h>
  29 #include <wchar.h>
  30 #include <limits.h>
  31 #include "localedef.h"
  32 #include "parser.h"
  33 #include "collate.h"
  34 #include "avl.h"
  35
  36 /*
  37  * Design notes.
  38  *
  39  * It will be extremely helpful to the reader if they have access to
  40  * the localedef and locale file format specifications available.
  41  * Latest versions of these are available from www.opengroup.org.
  42  *
  43  * The design for the collation code is a bit complex.  The goal is a
  44  * single collation database as described in collate.h (in
  45  * libc/port/locale).  However, there are some other tidbits:
  46  *
  47  * a) The substitution entries are now a directly indexable array.  A
  48  * priority elsewhere in the table is taken as an index into the
  49  * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
  50  * set.  (The bit is cleared and the result is the index into the
  51  * table.)
  52  *
  53  * b) We eliminate duplicate entries into the substitution table.
  54  * This saves a lot of space.
  55  *
  56  * c) The priorities for each level are "compressed", so that each
  57  * sorting level has consecutively numbered priorities starting at 1.
  58  * (0 is reserved for the ignore priority.)  This means sort levels
  59  * which only have a few distinct priorities can represent the
  60  * priority level in fewer bits, which makes the strxfrm output
  61  * smaller.
  62  *
  63  * d) We record the total number of priorities so that strxfrm can
  64  * figure out how many bytes to expand a numeric priority into.
  65  *
  66  * e) For the UNDEFINED pass (the last pass), we record the maximum
  67  * number of bits needed to uniquely prioritize these entries, so that
  68  * the last pass can also use smaller strxfrm output when possible.
  69  *
  70  * f) Priorities with the sign bit set are verboten.  This works out
  71  * because no active character set needs that bit to carry significant
  72  * information once the character is in wide form.
  73  *
  74  * To process the entire data to make the database, we actually run
  75  * multiple passes over the data.
  76  *
  77  * The first pass, which is done at parse time, identifies elements,
  78  * substitutions, and such, and records them in priority order.  As
  79  * some priorities can refer to other priorities, using forward
  80  * references, we use a table of references indicating whether the
  81  * priority's value has been resolved, or whether it is still a
  82  * reference.
  83  *
  84  * The second pass walks over all the items in priority order, noting
  85  * that they are used directly, and not just an indirect reference.
  86  * This is done by creating a "weight" structure for the item.  The
  87  * weights are stashed in an AVL tree sorted by relative "priority".
  88  *
  89  * The third pass walks over all the weight structures, in priority
  90  * order, and assigns a new monotonically increasing (per sort level)
  91  * weight value to them.  These are the values that will actually be
  92  * written to the file.
  93  *
  94  * The fourth pass just writes the data out.
  95  */
  96
  97 /*
  98  * In order to resolve the priorities, we create a table of priorities.
  99  * Entries in the table can be in one of three states.
 100  *
 101  * UNKNOWN is for newly allocated entries, and indicates that nothing
 102  * is known about the priority.  (For example, when new entries are created
 103  * for collating-symbols, this is the value assigned for them until the
 104  * collating symbol's order has been determined.)
 105  *
 106  * RESOLVED is used for an entry where the priority indicates the final
 107  * numeric weight.
 108  *
 109  * REFER is used for entries that reference other entries.  Typically
 110  * this is used for forward references.  A collating-symbol can never
 111  * have this value.
 112  *
 113  * The "pass" field is used during final resolution to aid in detection
 114  * of referencing loops.  (For example <A> depends on <B>, but <B> has its
 115  * priority dependent on <A>.)
 116  */
 117 typedef enum {
 118         UNKNOWN,        /* priority is totally unknown */
 119         RESOLVED,       /* priority value fully resolved */
 120         REFER           /* priority is a reference (index) */
 121 } res_t;
 122
 123 typedef struct weight {         /* one distinct priority recorded for a sort level */
 124         int32_t         pri;    /* priority value; the key compared by weight_compare() */
 125         int             opt;    /* not referenced in the visible code -- TODO confirm its use */
 126         avl_node_t      avl;    /* linkage for the per-level weights[] trees */
 127 } weight_t;
 128
 129 typedef struct priority {       /* one slot of the prilist[] priority table */
 130         res_t           res;    /* UNKNOWN, RESOLVED or REFER */
 131         int32_t         pri;    /* final value when RESOLVED; prilist[] index when REFER */
 132         int             pass;   /* stamp written by resolve_pri() to detect reference loops */
 133         int             lineno; /* input line recorded by set_pri() */
 134 } collpri_t;
 135
 136 #define NUM_WT  collinfo.directive_count        /* count advanced by add_order_directive() */
 137
 138 /*
 139  * These are the abstract collating symbols, which are just a symbolic
 140  * way to reference a priority.
 141  */
 142 struct collsym {
 143         char            *name;  /* symbol name; key compared by collsym_compare() */
 144         int32_t         ref;    /* priority-table index obtained from new_pri() */
 145         avl_node_t      avl;    /* linkage for the collsyms tree */
 146 };
 147
 148 /*
 149  * These are also abstract collating symbols, but we allow them to have
 150  * different priorities at different levels.
 151  */
 152 typedef struct collundef {
 153         char            *name;  /* strdup()ed copy made by get_collundef(); tree key */
 154         int32_t         ref[COLL_WEIGHTS_MAX];  /* one priority-table index per level */
 155         avl_node_t      avl;    /* linkage for the collundefs tree */
 156 } collundef_t;
 157
 158 /*
 159  * These are called "chains" in libc.  This records the fact that two or
 160  * more characters should be treated as a single collating entity when
 161  * they appear together.  For example, in Spanish <C><h> gets collated
 162  * as a character between <C> and <D>.
 163  */
 164 struct collelem {
 165         char            *symbol;        /* element name; key of elem_by_symbol */
 166         wchar_t         *expand;        /* character sequence; key of elem_by_expand */
 167         int32_t         ref[COLL_WEIGHTS_MAX];  /* one priority-table index per level */
 168         avl_node_t      avl_bysymbol;   /* linkage for the elem_by_symbol tree */
 169         avl_node_t      avl_byexpand;   /* linkage for the elem_by_expand tree */
 170 };
 171
 172 /*
 173  * Individual characters have a sequence of weights as well.
 174  */
 175 typedef struct collchar {
 176         wchar_t         wc;     /* the character; key compared by collchar_compare() */
 177         int32_t         ref[COLL_WEIGHTS_MAX];  /* one priority-table index per level */
 178         avl_node_t      avl;    /* linkage for the collchars tree */
 179 } collchar_t;
 180
 181 /*
 182  * Substitution entries.  The key is itself a priority.  Note that
 183  * when we create one of these, we *automatically* wind up with a
 184  * fully resolved priority for the key, because creation of
 185  * substitutions creates a resolved priority at the same time.
 186  */
 187 typedef struct {
 188         int32_t         key;    /* priority-table index; key compared by subst_compare() */
 189         int32_t         ref[COLLATE_STR_LEN];   /* 0-padded priority indices; key of substs_ref[] */
 190         avl_node_t      avl;    /* linkage for the substs[] trees */
 191         avl_node_t      avl_ref;        /* linkage for the substs_ref[] trees */
 192 } subst_t;
 193
 194 static avl_tree_t       collsyms;       /* collsym_t, ordered by name */
 195 static avl_tree_t       collundefs;     /* collundef_t, ordered by name */
 196 static avl_tree_t       elem_by_symbol; /* collelem_t, ordered by symbol */
 197 static avl_tree_t       elem_by_expand; /* collelem_t, ordered by expand */
 198 static avl_tree_t       collchars;      /* collchar_t, ordered by wc */
 199 static avl_tree_t       substs[COLL_WEIGHTS_MAX];       /* subst_t per level, by key */
 200 static avl_tree_t       substs_ref[COLL_WEIGHTS_MAX];   /* subst_t per level, by ref[] */
 201 static avl_tree_t       weights[COLL_WEIGHTS_MAX];      /* weight_t per level, by pri */
 202 static int32_t          nweight[COLL_WEIGHTS_MAX];      /* set to 1 per level by init_collate() */
 203
 204 /*
 205  * This is state tracking for the ellipsis token.  Note that we start
 206  * the initial values so that the ellipsis logic will think we got a
 207  * magic starting value of NUL.  It starts at minus one because the
 208  * starting point is exclusive -- i.e. the starting point is not
 209  * itself handled by the ellipsis code.
 210  */
 211 static int currorder = EOF;     /* token type of the order entry being built */
 212 static int lastorder = EOF;     /* token type of the previous order entry */
 213 static collelem_t *currelem;    /* set by start_order_collelem() */
 214 static collchar_t *currchar;    /* set by start_order_char(), cleared by end_order() */
 215 static collundef_t *currundef;  /* set by start_order_symbol() */
 216 static wchar_t ellipsis_start = 0;      /* next character a range would have to fill */
 217 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];      /* weights saved for a pending range */
 218
 219 /*
 220  * We keep a running tally of weights.
 221  */
 222 static int nextpri = 1;         /* next relative priority to hand out */
 223 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };         /* next substitution index per level */
 224
 225 /*
 226  * This array collects up the weights for each level.
 227  */
 228 static int32_t order_weights[COLL_WEIGHTS_MAX];         /* -1 marks a level with no weight given */
 229 static int curr_weight = 0;     /* level the next add_order_*() call fills */
 230 static int32_t subst_weights[COLLATE_STR_LEN];  /* priority indices of a substitution in progress */
 231 static int curr_subst = 0;      /* number of subst_weights[] entries in use */
 232
 233 /*
 234  * Some initial priority values.
 235  */
 236 static int32_t pri_undefined[COLL_WEIGHTS_MAX]; /* table indices reserved for UNDEFINED */
 237 static int32_t pri_ignore;      /* table index of the IGNORE priority (value 0) */
 238
 239 static collate_info_t collinfo; /* zeroed by init_collate() */
 240
 241 static collpri_t        *prilist = NULL;        /* the priority table, grown by new_pri() */
 242 static int              numpri = 0;     /* entries of prilist[] handed out so far */
 243 static int              maxpri = 0;     /* entries of prilist[] currently allocated */
 244
 245 static void start_order(int);   /* defined below; used by end_order_collsym() */
 246
 247 static int32_t
 248 new_pri(void)
 249 {
 250         int i;
 251
 252         if (numpri >= maxpri) {
 253                 maxpri = maxpri ? maxpri * 2 : 1024;
 254                 prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
 255                 if (prilist == NULL) {
 256                         fprintf(stderr,"out of memory");
 257                         return (-1);
 258                 }
 259                 for (i = numpri; i < maxpri; i++) {
 260                         prilist[i].res = UNKNOWN;
 261                         prilist[i].pri = 0;
 262                         prilist[i].pass = 0;
 263                 }
 264         }
 265         return (numpri++);
 266 }
 267
 268 static collpri_t *
 269 get_pri(int32_t ref)
 270 {
 271         if ((ref < 0) || (ref > numpri)) {
 272                 INTERR;
 273                 return (NULL);
 274         }
 275         return (&prilist[ref]);
 276 }
 277
 278 static void
 279 set_pri(int32_t ref, int32_t v, res_t res)
 280 {
 281         collpri_t       *pri;
 282
 283         pri = get_pri(ref);
 284
 285         if ((res == REFER) && ((v < 0) || (v >= numpri))) {
 286                 INTERR;
 287         }
 288
 289         /* Resolve self references */
 290         if ((res == REFER) && (ref == v)) {
 291                 v = nextpri;
 292                 res = RESOLVED;
 293         }
 294
 295         if (pri->res != UNKNOWN) {
 296                 warn("repeated item in order list (first on %d)",
 297                     pri->lineno);
 298                 return;
 299         }
 300         pri->lineno = lineno;
 301         pri->pri = v;
 302         pri->res = res;
 303 }
 304
 305 static int32_t
 306 resolve_pri(int32_t ref)
 307 {
 308         collpri_t       *pri;
 309         static int32_t  pass = 0;
 310
 311         pri = get_pri(ref);
 312         pass++;
 313         while (pri->res == REFER) {
 314                 if (pri->pass == pass) {
 315                         /* report a line with the circular symbol */
 316                         lineno = pri->lineno;
 317                         fprintf(stderr,"circular reference in order list");
 318                         return (-1);
 319                 }
 320                 if ((pri->pri < 0) || (pri->pri >= numpri)) {
 321                         INTERR;
 322                         return (-1);
 323                 }
 324                 pri->pass = pass;
 325                 pri = &prilist[pri->pri];
 326         }
 327
 328         if (pri->res == UNKNOWN) {
 329                 return (-1);
 330         }
 331         if (pri->res != RESOLVED)
 332                 INTERR;
 333
 334         return (pri->pri);
 335 }
 336
 337 static int
 338 weight_compare(const void *n1, const void *n2)
 339 {
 340         int32_t k1 = ((const weight_t *)n1)->pri;
 341         int32_t k2 = ((const weight_t *)n2)->pri;
 342
 343         return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
 344 }
 345
 346 static int
 347 collsym_compare(const void *n1, const void *n2)
 348 {
 349         const collsym_t *c1 = n1;
 350         const collsym_t *c2 = n2;
 351         int rv;
 352
 353         rv = strcmp(c1->name, c2->name);
 354         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 355 }
 356
 357 static int
 358 collundef_compare(const void *n1, const void *n2)
 359 {
 360         const collundef_t *c1 = n1;
 361         const collundef_t *c2 = n2;
 362         int rv;
 363
 364         rv = strcmp(c1->name, c2->name);
 365         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 366 }
 367
 368 static int
 369 element_compare_symbol(const void *n1, const void *n2)
 370 {
 371         const collelem_t *c1 = n1;
 372         const collelem_t *c2 = n2;
 373         int rv;
 374
 375         rv = strcmp(c1->symbol, c2->symbol);
 376         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 377 }
 378
 379 static int
 380 element_compare_expand(const void *n1, const void *n2)
 381 {
 382         const collelem_t *c1 = n1;
 383         const collelem_t *c2 = n2;
 384         int rv;
 385
 386         rv = wcscmp(c1->expand, c2->expand);
 387         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 388 }
 389
 390 static int
 391 collchar_compare(const void *n1, const void *n2)
 392 {
 393         wchar_t k1 = ((const collchar_t *)n1)->wc;
 394         wchar_t k2 = ((const collchar_t *)n2)->wc;
 395
 396         return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
 397 }
 398
 399 static int
 400 subst_compare(const void *n1, const void *n2)
 401 {
 402         int32_t k1 = ((const subst_t *)n1)->key;
 403         int32_t k2 = ((const subst_t *)n2)->key;
 404
 405         return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
 406 }
 407
 408 #pragma GCC diagnostic push
 409 #pragma GCC diagnostic ignored "-Wcast-qual"
 410
 411 static int
 412 subst_compare_ref(const void *n1, const void *n2)
 413 {
 414         int32_t *c1 = ((subst_t *)n1)->ref;
 415         int32_t *c2 = ((subst_t *)n2)->ref;
 416         int rv;
 417
 418         rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
 419         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 420 }
 421
 422 #pragma GCC diagnostic pop
 423
 424 void
 425 init_collate(void)
 426 {
 427         int i;
 428
 429         avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
 430             offsetof(collsym_t, avl));
 431
 432         avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
 433             offsetof(collundef_t, avl));
 434
 435         avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
 436             offsetof(collelem_t, avl_bysymbol));
 437         avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
 438             offsetof(collelem_t, avl_byexpand));
 439
 440         avl_create(&collchars, collchar_compare, sizeof (collchar_t),
 441             offsetof(collchar_t, avl));
 442
 443         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 444                 avl_create(&substs[i], subst_compare, sizeof (subst_t),
 445                     offsetof(subst_t, avl));
 446                 avl_create(&substs_ref[i], subst_compare_ref,
 447                     sizeof (subst_t), offsetof(subst_t, avl_ref));
 448                 avl_create(&weights[i], weight_compare, sizeof (weight_t),
 449                     offsetof(weight_t, avl));
 450                 nweight[i] = 1;
 451         }
 452
 453         (void) memset(&collinfo, 0, sizeof (collinfo));
 454
 455         /* allocate some initial priorities */
 456         pri_ignore = new_pri();
 457
 458         set_pri(pri_ignore, 0, RESOLVED);
 459
 460         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 461                 pri_undefined[i] = new_pri();
 462
 463                 /* we will override this later */
 464                 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
 465         }
 466 }
 467
 468 void
 469 define_collsym(char *name)
 470 {
 471         collsym_t       *sym;
 472         avl_index_t     where;
 473
 474         if ((sym = calloc(sizeof (*sym), 1)) == NULL) {
 475                 fprintf(stderr,"out of memory");
 476                 return;
 477         }
 478         sym->name = name;
 479         sym->ref = new_pri();
 480
 481         if (avl_find(&collsyms, sym, &where) != NULL) {
 482                 /*
 483                  * This should never happen because we are only called
 484                  * for undefined symbols.
 485                  */
 486                 INTERR;
 487                 return;
 488         }
 489         avl_insert(&collsyms, sym, where);
 490 }
 491
 492 collsym_t *
 493 lookup_collsym(char *name)
 494 {
 495         collsym_t       srch;
 496
 497         srch.name = name;
 498         return (avl_find(&collsyms, &srch, NULL));
 499 }
 500
 501 collelem_t *
 502 lookup_collelem(char *symbol)
 503 {
 504         collelem_t      srch;
 505
 506         srch.symbol = symbol;
 507         return (avl_find(&elem_by_symbol, &srch, NULL));
 508 }
 509
 510 static collundef_t *
 511 get_collundef(char *name)
 512 {
 513         collundef_t     srch;
 514         collundef_t     *ud;
 515         avl_index_t     where;
 516         int             i;
 517
 518         srch.name = name;
 519         if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
 520                 if (((ud = calloc(sizeof (*ud), 1)) == NULL) ||
 521                     ((ud->name = strdup(name)) == NULL)) {
 522                         fprintf(stderr,"out of memory");
 523                         return (NULL);
 524                 }
 525                 for (i = 0; i < NUM_WT; i++) {
 526                         ud->ref[i] = new_pri();
 527                 }
 528                 avl_insert(&collundefs, ud, where);
 529         }
 530         add_charmap_undefined(name);
 531         return (ud);
 532 }
 533
 534 static collchar_t *
 535 get_collchar(wchar_t wc, int create)
 536 {
 537         collchar_t      srch;
 538         collchar_t      *cc;
 539         avl_index_t     where;
 540         int             i;
 541
 542         srch.wc = wc;
 543         cc = avl_find(&collchars, &srch, &where);
 544         if ((cc == NULL) && create) {
 545                 if ((cc = calloc(sizeof (*cc), 1)) == NULL) {
 546                         fprintf(stderr, "out of memory");
 547                         return (NULL);
 548                 }
 549                 for (i = 0; i < NUM_WT; i++) {
 550                         cc->ref[i] = new_pri();
 551                 }
 552                 cc->wc = wc;
 553                 avl_insert(&collchars, cc, where);
 554         }
 555         return (cc);
 556 }
 557
 558 void
 559 end_order_collsym(collsym_t *sym)
 560 {
 561         start_order(T_COLLSYM);
 562         /* update the weight */
 563
 564         set_pri(sym->ref, nextpri, RESOLVED);
 565         nextpri++;
 566 }
 567
 568 void
 569 end_order(void)
 570 {
 571         int             i;
 572         int32_t         pri;
 573         int32_t         ref;
 574         collpri_t       *p;
 575
 576         /* advance the priority/weight */
 577         pri = nextpri;
 578
 579         switch (currorder) {
 580         case T_CHAR:
 581                 for (i = 0; i < NUM_WT; i++) {
 582                         if (((ref = order_weights[i]) < 0) ||
 583                             ((p = get_pri(ref)) == NULL) ||
 584                             (p->pri == -1)) {
 585                                 /* unspecified weight is a self reference */
 586                                 set_pri(currchar->ref[i], pri, RESOLVED);
 587                         } else {
 588                                 set_pri(currchar->ref[i], ref, REFER);
 589                         }
 590                         order_weights[i] = -1;
 591                 }
 592
 593                 /* leave a cookie trail in case next symbol is ellipsis */
 594                 ellipsis_start = currchar->wc + 1;
 595                 currchar = NULL;
 596                 break;
 597
 598         case T_ELLIPSIS:
 599                 /* save off the weights were we can find them */
 600                 for (i = 0; i < NUM_WT; i++) {
 601                         ellipsis_weights[i] = order_weights[i];
 602                         order_weights[i] = -1;
 603                 }
 604                 break;
 605
 606         case T_COLLELEM:
 607                 if (currelem == NULL) {
 608                         INTERR;
 609                 } else {
 610                         for (i = 0; i < NUM_WT; i++) {
 611
 612                                 if (((ref = order_weights[i]) < 0) ||
 613                                     ((p = get_pri(ref)) == NULL) ||
 614                                     (p->pri == -1)) {
 615                                         set_pri(currelem->ref[i], pri,
 616                                             RESOLVED);
 617                                 } else {
 618                                         set_pri(currelem->ref[i], ref, REFER);
 619                                 }
 620                                 order_weights[i] = -1;
 621                         }
 622                 }
 623                 break;
 624
 625         case T_UNDEFINED:
 626                 for (i = 0; i < NUM_WT; i++) {
 627                         if (((ref = order_weights[i]) < 0) ||
 628                             ((p = get_pri(ref)) == NULL) ||
 629                             (p->pri == -1)) {
 630                                 set_pri(pri_undefined[i], -1, RESOLVED);
 631                         } else {
 632                                 set_pri(pri_undefined[i], ref, REFER);
 633                         }
 634                         order_weights[i] = -1;
 635                 }
 636                 break;
 637
 638         case T_SYMBOL:
 639                 for (i = 0; i < NUM_WT; i++) {
 640                         if (((ref = order_weights[i]) < 0) ||
 641                             ((p = get_pri(ref)) == NULL) ||
 642                             (p->pri == -1)) {
 643                                 set_pri(currundef->ref[i], pri, RESOLVED);
 644                         } else {
 645                                 set_pri(currundef->ref[i], ref, REFER);
 646                         }
 647                         order_weights[i] = -1;
 648                 }
 649                 break;
 650
 651         default:
 652                 INTERR;
 653         }
 654
 655         nextpri++;
 656 }
 657
 658 static void
 659 start_order(int type)
 660 {
 661         int     i;
 662
 663         lastorder = currorder;
 664         currorder = type;
 665
 666         /* this is used to protect ELLIPSIS processing */
 667         if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
 668                 fprintf(stderr, "character value expected");
 669         }
 670
 671         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 672                 order_weights[i] = -1;
 673         }
 674         curr_weight = 0;
 675 }
 676
 677 void
 678 start_order_undefined(void)
 679 {
 680         start_order(T_UNDEFINED);
 681 }
 682
 683 void
 684 start_order_symbol(char *name)
 685 {
 686         currundef = get_collundef(name);
 687         start_order(T_SYMBOL);
 688 }
 689
 690 void
 691 start_order_char(wchar_t wc)
 692 {
 693         collchar_t      *cc;
 694         int32_t         ref;
 695
 696         start_order(T_CHAR);
 697
 698         /*
 699          * If we last saw an ellipsis, then we need to close the range.
 700          * Handle that here.  Note that we have to be careful because the
 701          * items *inside* the range are treated exclusiveley to the items
 702          * outside of the range.  The ends of the range can have quite
 703          * different weights than the range members.
 704          */
 705         if (lastorder == T_ELLIPSIS) {
 706                 int             i;
 707
 708                 if (wc < ellipsis_start) {
 709                         fprintf(stderr, "malformed range!");
 710                         return;
 711                 }
 712                 while (ellipsis_start < wc) {
 713                         /*
 714                          * pick all of the saved weights for the
 715                          * ellipsis.  note that -1 encodes for the
 716                          * ellipsis itself, which means to take the
 717                          * current relative priority.
 718                          */
 719                         if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
 720                                 INTERR;
 721                                 return;
 722                         }
 723                         for (i = 0; i < NUM_WT; i++) {
 724                                 collpri_t *p;
 725                                 if (((ref = ellipsis_weights[i]) == -1) ||
 726                                     ((p = get_pri(ref)) == NULL) ||
 727                                     (p->pri == -1)) {
 728                                         set_pri(cc->ref[i], nextpri, RESOLVED);
 729                                 } else {
 730                                         set_pri(cc->ref[i], ref, REFER);
 731                                 }
 732                                 ellipsis_weights[i] = 0;
 733                         }
 734                         ellipsis_start++;
 735                         nextpri++;
 736                 }
 737         }
 738
 739         currchar = get_collchar(wc, 1);
 740 }
 741
 742 void
 743 start_order_collelem(collelem_t *e)
 744 {
 745         start_order(T_COLLELEM);
 746         currelem = e;
 747 }
 748
 749 void
 750 start_order_ellipsis(void)
 751 {
 752         int     i;
 753
 754         start_order(T_ELLIPSIS);
 755
 756         if (lastorder != T_CHAR) {
 757                 fprintf(stderr, "illegal starting point for range");
 758                 return;
 759         }
 760
 761         for (i = 0; i < NUM_WT; i++) {
 762                 ellipsis_weights[i] = order_weights[i];
 763         }
 764 }
 765
 766 void
 767 define_collelem(char *name, wchar_t *wcs)
 768 {
 769         collelem_t      *e;
 770         avl_index_t     where1;
 771         avl_index_t     where2;
 772         int             i;
 773
 774         if (wcslen(wcs) >= COLLATE_STR_LEN) {
 775                 fprintf(stderr,"expanded collation element too long");
 776                 return;
 777         }
 778
 779         if ((e = calloc(sizeof (*e), 1)) == NULL) {
 780                 fprintf(stderr, "out of memory");
 781                 return;
 782         }
 783         e->expand = wcs;
 784         e->symbol = name;
 785
 786         /*
 787          * This is executed before the order statement, so we don't
 788          * know how many priorities we *really* need.  We allocate one
 789          * for each possible weight.  Not a big deal, as collating-elements
 790          * prove to be quite rare.
 791          */
 792         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 793                 e->ref[i] = new_pri();
 794         }
 795
 796         /* A character sequence can only reduce to one element. */
 797         if ((avl_find(&elem_by_symbol, e, &where1) != NULL) ||
 798             (avl_find(&elem_by_expand, e, &where2) != NULL)) {
 799                 fprintf(stderr, "duplicate collating element definition");
 800                 return;
 801         }
 802         avl_insert(&elem_by_symbol, e, where1);
 803         avl_insert(&elem_by_expand, e, where2);
 804 }
 805
 806 void
 807 add_order_bit(int kw)
 808 {
 809         uint8_t bit = DIRECTIVE_UNDEF;
 810
 811         switch (kw) {
 812         case T_FORWARD:
 813                 bit = DIRECTIVE_FORWARD;
 814                 break;
 815         case T_BACKWARD:
 816                 bit = DIRECTIVE_BACKWARD;
 817                 break;
 818         case T_POSITION:
 819                 bit = DIRECTIVE_POSITION;
 820                 break;
 821         default:
 822                 INTERR;
 823                 break;
 824         }
 825         collinfo.directive[collinfo.directive_count] |= bit;
 826 }
 827
 828 void
 829 add_order_directive(void)
 830 {
 831         if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
 832                 fprintf(stderr,"too many directives (max %d)", COLL_WEIGHTS_MAX);
 833         }
 834         collinfo.directive_count++;
 835 }
 836
 837 static void
 838 add_order_pri(int32_t ref)
 839 {
 840         if (curr_weight >= NUM_WT) {
 841                 fprintf(stderr,"too many weights (max %d)", NUM_WT);
 842                 return;
 843         }
 844         order_weights[curr_weight] = ref;
 845         curr_weight++;
 846 }
 847
 848 void
 849 add_order_collsym(collsym_t *s)
 850 {
 851         add_order_pri(s->ref);
 852 }
 853
 854 void
 855 add_order_char(wchar_t wc)
 856 {
 857         collchar_t *cc;
 858
 859         if ((cc = get_collchar(wc, 1)) == NULL) {
 860                 INTERR;
 861                 return;
 862         }
 863
 864         add_order_pri(cc->ref[curr_weight]);
 865 }
 866
 867 void
 868 add_order_collelem(collelem_t *e)
 869 {
 870         add_order_pri(e->ref[curr_weight]);
 871 }
 872
 873 void
 874 add_order_ignore(void)
 875 {
 876         add_order_pri(pri_ignore);
 877 }
 878
 879 void
 880 add_order_symbol(char *sym)
 881 {
 882         collundef_t *c;
 883         if ((c = get_collundef(sym)) == NULL) {
 884                 INTERR;
 885                 return;
 886         }
 887         add_order_pri(c->ref[curr_weight]);
 888 }
 889
 890 void
 891 add_order_ellipsis(void)
 892 {
 893         /* special NULL value indicates self reference */
 894         add_order_pri(0);
 895 }
 896
 897 void
 898 add_order_subst(void)
 899 {
 900         subst_t srch;
 901         subst_t *s;
 902         avl_index_t where;
 903         int i;
 904
 905         (void) memset(&srch, 0, sizeof (srch));
 906         for (i = 0; i < curr_subst; i++) {
 907                 srch.ref[i] = subst_weights[i];
 908                 subst_weights[i] = 0;
 909         }
 910         s = avl_find(&substs_ref[curr_weight], &srch, &where);
 911
 912         if (s == NULL) {
 913                 if ((s = calloc(sizeof (*s), 1)) == NULL) {
 914                         fprintf(stderr,"out of memory");
 915                         return;
 916                 }
 917                 s->key = new_pri();
 918
 919                 /*
 920                  * We use a self reference for our key, but we set a
 921                  * high bit to indicate that this is a substitution
 922                  * reference.  This will expedite table lookups later,
 923                  * and prevent table lookups for situations that don't
 924                  * require it.  (In short, its a big win, because we
 925                  * can skip a lot of binary searching.)
 926                  */
 927                 set_pri(s->key,
 928                     (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
 929                     RESOLVED);
 930                 nextsubst[curr_weight] += 1;
 931
 932                 for (i = 0; i < curr_subst; i++) {
 933                         s->ref[i] = srch.ref[i];
 934                 }
 935
 936                 avl_insert(&substs_ref[curr_weight], s, where);
 937
 938                 if (avl_find(&substs[curr_weight], s, &where) != NULL) {
 939                         INTERR;
 940                         return;
 941                 }
 942                 avl_insert(&substs[curr_weight], s, where);
 943         }
 944         curr_subst = 0;
 945
 946
 947         /*
 948          * We are using the current (unique) priority as a search key
 949          * in the substitution table.
 950          */
 951         add_order_pri(s->key);
 952 }
 953
 954 static void
 955 add_subst_pri(int32_t ref)
 956 {
 957         if (curr_subst >= COLLATE_STR_LEN) {
 958                 fprintf(stderr,"substitution string is too long");
 959                 return;
 960         }
 961         subst_weights[curr_subst] = ref;
 962         curr_subst++;
 963 }
 964
 965 void
 966 add_subst_char(wchar_t wc)
 967 {
 968         collchar_t *cc;
 969
 970
 971         if (((cc = get_collchar(wc, 1)) == NULL) ||
 972             (cc->wc != wc)) {
 973                 INTERR;
 974                 return;
 975         }
 976         /* we take the weight for the character at that position */
 977         add_subst_pri(cc->ref[curr_weight]);
 978 }
 979
 980 void
 981 add_subst_collelem(collelem_t *e)
 982 {
 983         add_subst_pri(e->ref[curr_weight]);
 984 }
 985
 986 void
 987 add_subst_collsym(collsym_t *s)
 988 {
 989         add_subst_pri(s->ref);
 990 }
 991
 992 void
 993 add_subst_symbol(char *ptr)
 994 {
 995         collundef_t *cu;
 996
 997         if ((cu = get_collundef(ptr)) != NULL) {
 998                 add_subst_pri(cu->ref[curr_weight]);
 999         }
1000 }
1001
1002 void
1003 add_weight(int32_t ref, int pass)
1004 {
1005         weight_t srch;
1006         weight_t *w;
1007         avl_index_t where;
1008
1009         srch.pri = resolve_pri(ref);
1010
1011         /* No translation of ignores */
1012         if (srch.pri == 0)
1013                 return;
1014
1015         /* Substitution priorities are not weights */
1016         if (srch.pri & COLLATE_SUBST_PRIORITY)
1017                 return;
1018
1019         if (avl_find(&weights[pass], &srch, &where) != NULL)
1020                 return;
1021
1022         if ((w = calloc(sizeof (*w), 1)) == NULL) {
1023                 fprintf(stderr, "out of memory");
1024                 return;
1025         }
1026         w->pri = srch.pri;
1027         avl_insert(&weights[pass], w, where);
1028 }
1029
1030 void
1031 add_weights(int32_t *refs)
1032 {
1033         int i;
1034         for (i = 0; i < NUM_WT; i++) {
1035                 add_weight(refs[i], i);
1036         }
1037 }
1038
1039 int32_t
1040 get_weight(int32_t ref, int pass)
1041 {
1042         weight_t        srch;
1043         weight_t        *w;
1044         int32_t         pri;
1045
1046         pri = resolve_pri(ref);
1047         if (pri & COLLATE_SUBST_PRIORITY) {
1048                 return (pri);
1049         }
1050         if (pri <= 0) {
1051                 return (pri);
1052         }
1053         srch.pri = pri;
1054         if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
1055                 INTERR;
1056                 return (-1);
1057         }
1058         return (w->opt);
1059 }
1060
/*
 * Copy at most n wide characters from s2 into s1 and return s1.
 *
 * If s2 is shorter than n, the remainder of s1 (up to n characters in
 * total) is filled with null wide characters.  If s2 holds n or more
 * characters, exactly n are copied and s1 is NOT null terminated.
 *
 * These are precisely the semantics of the standard wcsncpy(), so the
 * work is delegated to it rather than being open coded.
 */
wchar_t *
wsncpy(wchar_t *s1, const wchar_t *s2, size_t n)
{
        return (wcsncpy(s1, s2, n));
}
1074
1075 void
1076 dump_collate(void)
1077 {
1078         FILE                    *f;
1079         int                     i, j, n;
1080         size_t                  sz;
1081         int32_t                 pri;
1082         collelem_t              *ce;
1083         collchar_t              *cc;
1084         subst_t                 *sb;
1085         char                    vers[COLLATE_STR_LEN];
1086         collate_char_t          chars[UCHAR_MAX + 1];
1087         collate_large_t         *large;
1088         collate_subst_t         *subst[COLL_WEIGHTS_MAX];
1089         collate_chain_t         *chain;
1090
1091         /*
1092          * We have to run throught a preliminary pass to identify all the
1093          * weights that we use for each sorting level.
1094          */
1095         for (i = 0; i < NUM_WT; i++) {
1096                 add_weight(pri_ignore, i);
1097         }
1098         for (i = 0; i < NUM_WT; i++) {
1099                 for (sb = avl_first(&substs[i]); sb;
1100                     sb = AVL_NEXT(&substs[i], sb)) {
1101                         for (j = 0; sb->ref[j]; j++) {
1102                                 add_weight(sb->ref[j], i);
1103                         }
1104                 }
1105         }
1106         for (ce = avl_first(&elem_by_expand);
1107             ce != NULL;
1108             ce = AVL_NEXT(&elem_by_expand, ce)) {
1109                 add_weights(ce->ref);
1110         }
1111         for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1112                 add_weights(cc->ref);
1113         }
1114
1115         /*
1116          * Now we walk the entire set of weights, removing the gaps
1117          * in the weights.  This gives us optimum usage.  The walk
1118          * occurs in priority.
1119          */
1120         for (i = 0; i < NUM_WT; i++) {
1121                 weight_t *w;
1122                 for (w = avl_first(&weights[i]); w;
1123                     w = AVL_NEXT(&weights[i], w)) {
1124                         w->opt = nweight[i];
1125                         nweight[i] += 1;
1126                 }
1127         }
1128
1129         (void) memset(&chars, 0, sizeof (chars));
1130         (void) memset(vers, 0, COLLATE_STR_LEN);
1131         (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1132
1133         /*
1134          * We need to make sure we arrange for the UNDEFINED field
1135          * to show up.  Also, set the total weight counts.
1136          */
1137         for (i = 0; i < NUM_WT; i++) {
1138                 if (resolve_pri(pri_undefined[i]) == -1) {
1139                         set_pri(pri_undefined[i], -1, RESOLVED);
1140                         /* they collate at the end of everything else */
1141                         collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1142                 }
1143                 collinfo.pri_count[i] = nweight[i];
1144         }
1145
1146         collinfo.pri_count[NUM_WT] = max_wide();
1147         collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1148         collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1149
1150         /*
1151          * Ordinary character priorities
1152          */
1153         for (i = 0; i <= UCHAR_MAX; i++) {
1154                 if ((cc = get_collchar(i, 0)) != NULL) {
1155                         for (j = 0; j < NUM_WT; j++) {
1156                                 chars[i].pri[j] = get_weight(cc->ref[j], j);
1157                         }
1158                 } else {
1159                         for (j = 0; j < NUM_WT; j++) {
1160                                 chars[i].pri[j] =
1161                                     get_weight(pri_undefined[j], j);
1162                         }
1163                         /*
1164                          * Per POSIX, for undefined characters, we
1165                          * also have to add a last item, which is the
1166                          * character code.
1167                          */
1168                         chars[i].pri[NUM_WT] = i;
1169                 }
1170         }
1171
1172         /*
1173          * Substitution tables
1174          */
1175         for (i = 0; i < NUM_WT; i++) {
1176                 collate_subst_t *st = NULL;
1177                 n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
1178                 if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
1179                         fprintf(stderr, "out of memory");
1180                         return;
1181                 }
1182                 n = 0;
1183                 for (sb = avl_first(&substs[i]); sb;
1184                     sb = AVL_NEXT(&substs[i], sb)) {
1185                         if ((st[n].key = resolve_pri(sb->key)) < 0) {
1186                                 /* by definition these resolve! */
1187                                 INTERR;
1188                         }
1189                         if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1190                                 INTERR;
1191                         }
1192                         for (j = 0; sb->ref[j]; j++) {
1193                                 st[n].pri[j] = get_weight(sb->ref[j], i);
1194                         }
1195                         n++;
1196                 }
1197                 if (n != collinfo.subst_count[i])
1198                         INTERR;
1199                 subst[i] = st;
1200         }
1201
1202
1203         /*
1204          * Chains, i.e. collating elements
1205          */
1206         collinfo.chain_count = avl_numnodes(&elem_by_expand);
1207         chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
1208         if (chain == NULL) {
1209                 fprintf(stderr, "out of memory");
1210                 return;
1211         }
1212         for (n = 0, ce = avl_first(&elem_by_expand);
1213             ce != NULL;
1214             ce = AVL_NEXT(&elem_by_expand, ce), n++) {
1215                 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1216                 for (i = 0; i < NUM_WT; i++) {
1217                         chain[n].pri[i] = get_weight(ce->ref[i], i);
1218                 }
1219         }
1220         if (n != collinfo.chain_count)
1221                 INTERR;
1222
1223         /*
1224          * Large (> UCHAR_MAX) character priorities
1225          */
1226         large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
1227         if (large == NULL) {
1228                 fprintf(stderr, "out of memory");
1229                 return;
1230         }
1231
1232         i = 0;
1233         for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1234                 int     undef = 0;
1235                 /* we already gathered those */
1236                 if (cc->wc <= UCHAR_MAX)
1237                         continue;
1238                 for (j = 0; j < NUM_WT; j++) {
1239                         if ((pri = get_weight(cc->ref[j], j)) < 0) {
1240                                 undef = 1;
1241                         }
1242                         if (undef && (pri >= 0)) {
1243                                 /* if undefined, then all priorities are */
1244                                 INTERR;
1245                         } else {
1246                                 large[i].pri.pri[j] = pri;
1247                         }
1248                 }
1249                 if (!undef) {
1250                         large[i].val = cc->wc;
1251                         collinfo.large_count = i++;
1252                 }
1253         }
1254
1255         if ((f = open_category()) == NULL) {
1256                 return;
1257         }
1258
1259         /* Time to write the entire data set out */
1260
1261         if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1262             (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1263             (wr_category(&chars, sizeof (chars), f) < 0)) {
1264                 return;
1265         }
1266
1267         for (i = 0; i < NUM_WT; i++) {
1268                 sz =  sizeof (collate_subst_t) * collinfo.subst_count[i];
1269                 if (wr_category(subst[i], sz, f) < 0) {
1270                         return;
1271                 }
1272         }
1273         sz = sizeof (collate_chain_t) * collinfo.chain_count;
1274         if (wr_category(chain, sz, f) < 0) {
1275                 return;
1276         }
1277         sz = sizeof (collate_large_t) * collinfo.large_count;
1278         if (wr_category(large, sz, f) < 0) {
1279                 return;
1280         }
1281
1282         close_category(f);
1283 }