localedef - Fix buildworld bootstrapping issue
[dragonfly.git] / usr.bin / localedef / collate.c
CommitLineData
cd1c6085
JM
1/*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12/*
13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
14 * Copyright 2015 John Marino <draco@marino.st>
15 */
16
17/*
18 * LC_COLLATE database generation routines for localedef.
19 */
20
21#include <stdio.h>
22#include <stddef.h>
23#include <stdlib.h>
24#include <errno.h>
25#include <string.h>
26#include <sys/types.h>
27#include <string.h>
28#include <unistd.h>
29#include <wchar.h>
30#include <limits.h>
31#include "localedef.h"
32#include "parser.h"
33#include "collate.h"
34#include "avl.h"
35
5c0fb8c4
MD
36/*
37 * Work around buildworld bootstrapping from older systems whos limits.h
38 * sets COLL_WEIGHTS_MAX to 0.
39 */
40#if COLL_WEIGHTS_MAX == 0
41#undef COLL_WEIGHTS_MAX
42#define COLL_WEIGHTS_MAX 10
43#endif
44
cd1c6085
JM
45/*
46 * Design notes.
47 *
48 * It will be extremely helpful to the reader if they have access to
49 * the localedef and locale file format specifications available.
50 * Latest versions of these are available from www.opengroup.org.
51 *
52 * The design for the collation code is a bit complex. The goal is a
53 * single collation database as described in collate.h (in
54 * libc/port/locale). However, there are some other tidbits:
55 *
56 * a) The substitution entries are now a directly indexable array. A
57 * priority elsewhere in the table is taken as an index into the
58 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
59 * set. (The bit is cleared and the result is the index into the
60 * table.
61 *
62 * b) We eliminate duplicate entries into the substitution table.
63 * This saves a lot of space.
64 *
65 * c) The priorities for each level are "compressed", so that each
66 * sorting level has consecutively numbered priorities starting at 1.
67 * (O is reserved for the ignore priority.) This means sort levels
68 * which only have a few distinct priorities can represent the
69 * priority level in fewer bits, which makes the strxfrm output
70 * smaller.
71 *
72 * d) We record the total number of priorities so that strxfrm can
73 * figure out how many bytes to expand a numeric priority into.
74 *
75 * e) For the UNDEFINED pass (the last pass), we record the maximum
76 * number of bits needed to uniquely prioritize these entries, so that
77 * the last pass can also use smaller strxfrm output when possible.
78 *
79 * f) Priorities with the sign bit set are verboten. This works out
80 * because no active character set needs that bit to carry significant
81 * information once the character is in wide form.
82 *
83 * To process the entire data to make the database, we actually run
84 * multiple passes over the data.
85 *
86 * The first pass, which is done at parse time, identifies elements,
87 * substitutions, and such, and records them in priority order. As
88 * some priorities can refer to other priorities, using forward
89 * references, we use a table of references indicating whether the
90 * priority's value has been resolved, or whether it is still a
91 * reference.
92 *
93 * The second pass walks over all the items in priority order, noting
94 * that they are used directly, and not just an indirect reference.
95 * This is done by creating a "weight" structure for the item. The
96 * weights are stashed in an AVL tree sorted by relative "priority".
97 *
98 * The third pass walks over all the weight structures, in priority
99 * order, and assigns a new monotonically increasing (per sort level)
100 * weight value to them. These are the values that will actually be
101 * written to the file.
102 *
103 * The fourth pass just writes the data out.
104 */
105
106/*
107 * In order to resolve the priorities, we create a table of priorities.
108 * Entries in the table can be in one of three states.
109 *
110 * UNKNOWN is for newly allocated entries, and indicates that nothing
111 * is known about the priority. (For example, when new entries are created
112 * for collating-symbols, this is the value assigned for them until the
113 * collating symbol's order has been determined.
114 *
115 * RESOLVED is used for an entry where the priority indicates the final
116 * numeric weight.
117 *
118 * REFER is used for entries that reference other entries. Typically
119 * this is used for forward references. A collating-symbol can never
120 * have this value.
121 *
122 * The "pass" field is used during final resolution to aid in detection
123 * of referencing loops. (For example <A> depends on <B>, but <B> has its
124 * priority dependent on <A>.)
125 */
126typedef enum {
127 UNKNOWN, /* priority is totally unknown */
128 RESOLVED, /* priority value fully resolved */
129 REFER /* priority is a reference (index) */
130} res_t;
131
132typedef struct weight {
133 int32_t pri;
134 int opt;
135 avl_node_t avl;
136} weight_t;
137
138typedef struct priority {
139 res_t res;
140 int32_t pri;
141 int pass;
142 int lineno;
143} collpri_t;
144
145#define NUM_WT collinfo.directive_count
146
147/*
148 * These are the abstract collating symbols, which are just a symbolic
149 * way to reference a priority.
150 */
151struct collsym {
152 char *name;
153 int32_t ref;
154 avl_node_t avl;
155};
156
157/*
158 * These are also abstract collating symbols, but we allow them to have
159 * different priorities at different levels.
160 */
161typedef struct collundef {
162 char *name;
163 int32_t ref[COLL_WEIGHTS_MAX];
164 avl_node_t avl;
165} collundef_t;
166
167/*
168 * These are called "chains" in libc. This records the fact that two
169 * more characters should be treated as a single collating entity when
170 * they appear together. For example, in Spanish <C><h> gets collated
171 * as a character between <C> and <D>.
172 */
173struct collelem {
174 char *symbol;
175 wchar_t *expand;
176 int32_t ref[COLL_WEIGHTS_MAX];
177 avl_node_t avl_bysymbol;
178 avl_node_t avl_byexpand;
179};
180
181/*
182 * Individual characters have a sequence of weights as well.
183 */
184typedef struct collchar {
185 wchar_t wc;
186 int32_t ref[COLL_WEIGHTS_MAX];
187 avl_node_t avl;
188} collchar_t;
189
190/*
191 * Substitution entries. The key is itself a priority. Note that
192 * when we create one of these, we *automatically* wind up with a
193 * fully resolved priority for the key, because creation of
194 * substitutions creates a resolved priority at the same time.
195 */
196typedef struct {
197 int32_t key;
198 int32_t ref[COLLATE_STR_LEN];
199 avl_node_t avl;
200 avl_node_t avl_ref;
201} subst_t;
202
203static avl_tree_t collsyms;
204static avl_tree_t collundefs;
205static avl_tree_t elem_by_symbol;
206static avl_tree_t elem_by_expand;
207static avl_tree_t collchars;
208static avl_tree_t substs[COLL_WEIGHTS_MAX];
209static avl_tree_t substs_ref[COLL_WEIGHTS_MAX];
210static avl_tree_t weights[COLL_WEIGHTS_MAX];
211static int32_t nweight[COLL_WEIGHTS_MAX];
212
213/*
214 * This is state tracking for the ellipsis token. Note that we start
215 * the initial values so that the ellipsis logic will think we got a
216 * magic starting value of NUL. It starts at minus one because the
217 * starting point is exclusive -- i.e. the starting point is not
218 * itself handled by the ellipsis code.
219 */
220static int currorder = EOF;
221static int lastorder = EOF;
222static collelem_t *currelem;
223static collchar_t *currchar;
224static collundef_t *currundef;
225static wchar_t ellipsis_start = 0;
226static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
227
228/*
229 * We keep a running tally of weights.
230 */
231static int nextpri = 1;
232static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
233
234/*
235 * This array collects up the weights for each level.
236 */
237static int32_t order_weights[COLL_WEIGHTS_MAX];
238static int curr_weight = 0;
239static int32_t subst_weights[COLLATE_STR_LEN];
240static int curr_subst = 0;
241
242/*
243 * Some initial priority values.
244 */
245static int32_t pri_undefined[COLL_WEIGHTS_MAX];
246static int32_t pri_ignore;
247
248static collate_info_t collinfo;
249
250static collpri_t *prilist = NULL;
251static int numpri = 0;
252static int maxpri = 0;
253
254static void start_order(int);
255
256static int32_t
257new_pri(void)
258{
259 int i;
260
261 if (numpri >= maxpri) {
262 maxpri = maxpri ? maxpri * 2 : 1024;
263 prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
264 if (prilist == NULL) {
265 fprintf(stderr,"out of memory");
266 return (-1);
267 }
268 for (i = numpri; i < maxpri; i++) {
269 prilist[i].res = UNKNOWN;
270 prilist[i].pri = 0;
271 prilist[i].pass = 0;
272 }
273 }
274 return (numpri++);
275}
276
277static collpri_t *
278get_pri(int32_t ref)
279{
280 if ((ref < 0) || (ref > numpri)) {
281 INTERR;
282 return (NULL);
283 }
284 return (&prilist[ref]);
285}
286
287static void
288set_pri(int32_t ref, int32_t v, res_t res)
289{
290 collpri_t *pri;
291
292 pri = get_pri(ref);
293
294 if ((res == REFER) && ((v < 0) || (v >= numpri))) {
295 INTERR;
296 }
297
298 /* Resolve self references */
299 if ((res == REFER) && (ref == v)) {
300 v = nextpri;
301 res = RESOLVED;
302 }
303
304 if (pri->res != UNKNOWN) {
305 warn("repeated item in order list (first on %d)",
306 pri->lineno);
307 return;
308 }
309 pri->lineno = lineno;
310 pri->pri = v;
311 pri->res = res;
312}
313
314static int32_t
315resolve_pri(int32_t ref)
316{
317 collpri_t *pri;
318 static int32_t pass = 0;
319
320 pri = get_pri(ref);
321 pass++;
322 while (pri->res == REFER) {
323 if (pri->pass == pass) {
324 /* report a line with the circular symbol */
325 lineno = pri->lineno;
326 fprintf(stderr,"circular reference in order list");
327 return (-1);
328 }
329 if ((pri->pri < 0) || (pri->pri >= numpri)) {
330 INTERR;
331 return (-1);
332 }
333 pri->pass = pass;
334 pri = &prilist[pri->pri];
335 }
336
337 if (pri->res == UNKNOWN) {
338 return (-1);
339 }
340 if (pri->res != RESOLVED)
341 INTERR;
342
343 return (pri->pri);
344}
345
346static int
347weight_compare(const void *n1, const void *n2)
348{
349 int32_t k1 = ((const weight_t *)n1)->pri;
350 int32_t k2 = ((const weight_t *)n2)->pri;
351
352 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
353}
354
355static int
356collsym_compare(const void *n1, const void *n2)
357{
358 const collsym_t *c1 = n1;
359 const collsym_t *c2 = n2;
360 int rv;
361
362 rv = strcmp(c1->name, c2->name);
363 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
364}
365
366static int
367collundef_compare(const void *n1, const void *n2)
368{
369 const collundef_t *c1 = n1;
370 const collundef_t *c2 = n2;
371 int rv;
372
373 rv = strcmp(c1->name, c2->name);
374 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
375}
376
377static int
378element_compare_symbol(const void *n1, const void *n2)
379{
380 const collelem_t *c1 = n1;
381 const collelem_t *c2 = n2;
382 int rv;
383
384 rv = strcmp(c1->symbol, c2->symbol);
385 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
386}
387
388static int
389element_compare_expand(const void *n1, const void *n2)
390{
391 const collelem_t *c1 = n1;
392 const collelem_t *c2 = n2;
393 int rv;
394
395 rv = wcscmp(c1->expand, c2->expand);
396 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
397}
398
399static int
400collchar_compare(const void *n1, const void *n2)
401{
402 wchar_t k1 = ((const collchar_t *)n1)->wc;
403 wchar_t k2 = ((const collchar_t *)n2)->wc;
404
405 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
406}
407
408static int
409subst_compare(const void *n1, const void *n2)
410{
411 int32_t k1 = ((const subst_t *)n1)->key;
412 int32_t k2 = ((const subst_t *)n2)->key;
413
414 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
415}
416
417#pragma GCC diagnostic push
418#pragma GCC diagnostic ignored "-Wcast-qual"
419
420static int
421subst_compare_ref(const void *n1, const void *n2)
422{
423 int32_t *c1 = ((subst_t *)n1)->ref;
424 int32_t *c2 = ((subst_t *)n2)->ref;
425 int rv;
426
427 rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
428 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
429}
430
431#pragma GCC diagnostic pop
432
433void
434init_collate(void)
435{
436 int i;
437
438 avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
439 offsetof(collsym_t, avl));
440
441 avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
442 offsetof(collundef_t, avl));
443
444 avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
445 offsetof(collelem_t, avl_bysymbol));
446 avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
447 offsetof(collelem_t, avl_byexpand));
448
449 avl_create(&collchars, collchar_compare, sizeof (collchar_t),
450 offsetof(collchar_t, avl));
451
452 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
453 avl_create(&substs[i], subst_compare, sizeof (subst_t),
454 offsetof(subst_t, avl));
455 avl_create(&substs_ref[i], subst_compare_ref,
456 sizeof (subst_t), offsetof(subst_t, avl_ref));
457 avl_create(&weights[i], weight_compare, sizeof (weight_t),
458 offsetof(weight_t, avl));
459 nweight[i] = 1;
460 }
461
462 (void) memset(&collinfo, 0, sizeof (collinfo));
463
464 /* allocate some initial priorities */
465 pri_ignore = new_pri();
466
467 set_pri(pri_ignore, 0, RESOLVED);
468
469 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
470 pri_undefined[i] = new_pri();
471
472 /* we will override this later */
473 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
474 }
475}
476
477void
478define_collsym(char *name)
479{
480 collsym_t *sym;
481 avl_index_t where;
482
483 if ((sym = calloc(sizeof (*sym), 1)) == NULL) {
484 fprintf(stderr,"out of memory");
485 return;
486 }
487 sym->name = name;
488 sym->ref = new_pri();
489
490 if (avl_find(&collsyms, sym, &where) != NULL) {
491 /*
492 * This should never happen because we are only called
493 * for undefined symbols.
494 */
495 INTERR;
496 return;
497 }
498 avl_insert(&collsyms, sym, where);
499}
500
501collsym_t *
502lookup_collsym(char *name)
503{
504 collsym_t srch;
505
506 srch.name = name;
507 return (avl_find(&collsyms, &srch, NULL));
508}
509
510collelem_t *
511lookup_collelem(char *symbol)
512{
513 collelem_t srch;
514
515 srch.symbol = symbol;
516 return (avl_find(&elem_by_symbol, &srch, NULL));
517}
518
519static collundef_t *
520get_collundef(char *name)
521{
522 collundef_t srch;
523 collundef_t *ud;
524 avl_index_t where;
525 int i;
526
527 srch.name = name;
528 if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
529 if (((ud = calloc(sizeof (*ud), 1)) == NULL) ||
530 ((ud->name = strdup(name)) == NULL)) {
531 fprintf(stderr,"out of memory");
532 return (NULL);
533 }
534 for (i = 0; i < NUM_WT; i++) {
535 ud->ref[i] = new_pri();
536 }
537 avl_insert(&collundefs, ud, where);
538 }
539 add_charmap_undefined(name);
540 return (ud);
541}
542
543static collchar_t *
544get_collchar(wchar_t wc, int create)
545{
546 collchar_t srch;
547 collchar_t *cc;
548 avl_index_t where;
549 int i;
550
551 srch.wc = wc;
552 cc = avl_find(&collchars, &srch, &where);
553 if ((cc == NULL) && create) {
554 if ((cc = calloc(sizeof (*cc), 1)) == NULL) {
555 fprintf(stderr, "out of memory");
556 return (NULL);
557 }
558 for (i = 0; i < NUM_WT; i++) {
559 cc->ref[i] = new_pri();
560 }
561 cc->wc = wc;
562 avl_insert(&collchars, cc, where);
563 }
564 return (cc);
565}
566
567void
568end_order_collsym(collsym_t *sym)
569{
570 start_order(T_COLLSYM);
571 /* update the weight */
572
573 set_pri(sym->ref, nextpri, RESOLVED);
574 nextpri++;
575}
576
577void
578end_order(void)
579{
580 int i;
581 int32_t pri;
582 int32_t ref;
583 collpri_t *p;
584
585 /* advance the priority/weight */
586 pri = nextpri;
587
588 switch (currorder) {
589 case T_CHAR:
590 for (i = 0; i < NUM_WT; i++) {
591 if (((ref = order_weights[i]) < 0) ||
592 ((p = get_pri(ref)) == NULL) ||
593 (p->pri == -1)) {
594 /* unspecified weight is a self reference */
595 set_pri(currchar->ref[i], pri, RESOLVED);
596 } else {
597 set_pri(currchar->ref[i], ref, REFER);
598 }
599 order_weights[i] = -1;
600 }
601
602 /* leave a cookie trail in case next symbol is ellipsis */
603 ellipsis_start = currchar->wc + 1;
604 currchar = NULL;
605 break;
606
607 case T_ELLIPSIS:
608 /* save off the weights were we can find them */
609 for (i = 0; i < NUM_WT; i++) {
610 ellipsis_weights[i] = order_weights[i];
611 order_weights[i] = -1;
612 }
613 break;
614
615 case T_COLLELEM:
616 if (currelem == NULL) {
617 INTERR;
618 } else {
619 for (i = 0; i < NUM_WT; i++) {
620
621 if (((ref = order_weights[i]) < 0) ||
622 ((p = get_pri(ref)) == NULL) ||
623 (p->pri == -1)) {
624 set_pri(currelem->ref[i], pri,
625 RESOLVED);
626 } else {
627 set_pri(currelem->ref[i], ref, REFER);
628 }
629 order_weights[i] = -1;
630 }
631 }
632 break;
633
634 case T_UNDEFINED:
635 for (i = 0; i < NUM_WT; i++) {
636 if (((ref = order_weights[i]) < 0) ||
637 ((p = get_pri(ref)) == NULL) ||
638 (p->pri == -1)) {
639 set_pri(pri_undefined[i], -1, RESOLVED);
640 } else {
641 set_pri(pri_undefined[i], ref, REFER);
642 }
643 order_weights[i] = -1;
644 }
645 break;
646
647 case T_SYMBOL:
648 for (i = 0; i < NUM_WT; i++) {
649 if (((ref = order_weights[i]) < 0) ||
650 ((p = get_pri(ref)) == NULL) ||
651 (p->pri == -1)) {
652 set_pri(currundef->ref[i], pri, RESOLVED);
653 } else {
654 set_pri(currundef->ref[i], ref, REFER);
655 }
656 order_weights[i] = -1;
657 }
658 break;
659
660 default:
661 INTERR;
662 }
663
664 nextpri++;
665}
666
667static void
668start_order(int type)
669{
670 int i;
671
672 lastorder = currorder;
673 currorder = type;
674
675 /* this is used to protect ELLIPSIS processing */
676 if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
677 fprintf(stderr, "character value expected");
678 }
679
680 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
681 order_weights[i] = -1;
682 }
683 curr_weight = 0;
684}
685
686void
687start_order_undefined(void)
688{
689 start_order(T_UNDEFINED);
690}
691
692void
693start_order_symbol(char *name)
694{
695 currundef = get_collundef(name);
696 start_order(T_SYMBOL);
697}
698
699void
700start_order_char(wchar_t wc)
701{
702 collchar_t *cc;
703 int32_t ref;
704
705 start_order(T_CHAR);
706
707 /*
708 * If we last saw an ellipsis, then we need to close the range.
709 * Handle that here. Note that we have to be careful because the
710 * items *inside* the range are treated exclusiveley to the items
711 * outside of the range. The ends of the range can have quite
712 * different weights than the range members.
713 */
714 if (lastorder == T_ELLIPSIS) {
715 int i;
716
717 if (wc < ellipsis_start) {
718 fprintf(stderr, "malformed range!");
719 return;
720 }
721 while (ellipsis_start < wc) {
722 /*
723 * pick all of the saved weights for the
724 * ellipsis. note that -1 encodes for the
725 * ellipsis itself, which means to take the
726 * current relative priority.
727 */
728 if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
729 INTERR;
730 return;
731 }
732 for (i = 0; i < NUM_WT; i++) {
733 collpri_t *p;
734 if (((ref = ellipsis_weights[i]) == -1) ||
735 ((p = get_pri(ref)) == NULL) ||
736 (p->pri == -1)) {
737 set_pri(cc->ref[i], nextpri, RESOLVED);
738 } else {
739 set_pri(cc->ref[i], ref, REFER);
740 }
741 ellipsis_weights[i] = 0;
742 }
743 ellipsis_start++;
744 nextpri++;
745 }
746 }
747
748 currchar = get_collchar(wc, 1);
749}
750
751void
752start_order_collelem(collelem_t *e)
753{
754 start_order(T_COLLELEM);
755 currelem = e;
756}
757
758void
759start_order_ellipsis(void)
760{
761 int i;
762
763 start_order(T_ELLIPSIS);
764
765 if (lastorder != T_CHAR) {
766 fprintf(stderr, "illegal starting point for range");
767 return;
768 }
769
770 for (i = 0; i < NUM_WT; i++) {
771 ellipsis_weights[i] = order_weights[i];
772 }
773}
774
775void
776define_collelem(char *name, wchar_t *wcs)
777{
778 collelem_t *e;
779 avl_index_t where1;
780 avl_index_t where2;
781 int i;
782
783 if (wcslen(wcs) >= COLLATE_STR_LEN) {
784 fprintf(stderr,"expanded collation element too long");
785 return;
786 }
787
788 if ((e = calloc(sizeof (*e), 1)) == NULL) {
789 fprintf(stderr, "out of memory");
790 return;
791 }
792 e->expand = wcs;
793 e->symbol = name;
794
795 /*
796 * This is executed before the order statement, so we don't
797 * know how many priorities we *really* need. We allocate one
798 * for each possible weight. Not a big deal, as collating-elements
799 * prove to be quite rare.
800 */
801 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
802 e->ref[i] = new_pri();
803 }
804
805 /* A character sequence can only reduce to one element. */
806 if ((avl_find(&elem_by_symbol, e, &where1) != NULL) ||
807 (avl_find(&elem_by_expand, e, &where2) != NULL)) {
808 fprintf(stderr, "duplicate collating element definition");
809 return;
810 }
811 avl_insert(&elem_by_symbol, e, where1);
812 avl_insert(&elem_by_expand, e, where2);
813}
814
815void
816add_order_bit(int kw)
817{
818 uint8_t bit = DIRECTIVE_UNDEF;
819
820 switch (kw) {
821 case T_FORWARD:
822 bit = DIRECTIVE_FORWARD;
823 break;
824 case T_BACKWARD:
825 bit = DIRECTIVE_BACKWARD;
826 break;
827 case T_POSITION:
828 bit = DIRECTIVE_POSITION;
829 break;
830 default:
831 INTERR;
832 break;
833 }
834 collinfo.directive[collinfo.directive_count] |= bit;
835}
836
837void
838add_order_directive(void)
839{
840 if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
841 fprintf(stderr,"too many directives (max %d)", COLL_WEIGHTS_MAX);
842 }
843 collinfo.directive_count++;
844}
845
846static void
847add_order_pri(int32_t ref)
848{
849 if (curr_weight >= NUM_WT) {
850 fprintf(stderr,"too many weights (max %d)", NUM_WT);
851 return;
852 }
853 order_weights[curr_weight] = ref;
854 curr_weight++;
855}
856
857void
858add_order_collsym(collsym_t *s)
859{
860 add_order_pri(s->ref);
861}
862
863void
864add_order_char(wchar_t wc)
865{
866 collchar_t *cc;
867
868 if ((cc = get_collchar(wc, 1)) == NULL) {
869 INTERR;
870 return;
871 }
872
873 add_order_pri(cc->ref[curr_weight]);
874}
875
876void
877add_order_collelem(collelem_t *e)
878{
879 add_order_pri(e->ref[curr_weight]);
880}
881
882void
883add_order_ignore(void)
884{
885 add_order_pri(pri_ignore);
886}
887
888void
889add_order_symbol(char *sym)
890{
891 collundef_t *c;
892 if ((c = get_collundef(sym)) == NULL) {
893 INTERR;
894 return;
895 }
896 add_order_pri(c->ref[curr_weight]);
897}
898
899void
900add_order_ellipsis(void)
901{
902 /* special NULL value indicates self reference */
903 add_order_pri(0);
904}
905
906void
907add_order_subst(void)
908{
909 subst_t srch;
910 subst_t *s;
911 avl_index_t where;
912 int i;
913
914 (void) memset(&srch, 0, sizeof (srch));
915 for (i = 0; i < curr_subst; i++) {
916 srch.ref[i] = subst_weights[i];
917 subst_weights[i] = 0;
918 }
919 s = avl_find(&substs_ref[curr_weight], &srch, &where);
920
921 if (s == NULL) {
922 if ((s = calloc(sizeof (*s), 1)) == NULL) {
923 fprintf(stderr,"out of memory");
924 return;
925 }
926 s->key = new_pri();
927
928 /*
929 * We use a self reference for our key, but we set a
930 * high bit to indicate that this is a substitution
931 * reference. This will expedite table lookups later,
932 * and prevent table lookups for situations that don't
933 * require it. (In short, its a big win, because we
934 * can skip a lot of binary searching.)
935 */
936 set_pri(s->key,
937 (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
938 RESOLVED);
939 nextsubst[curr_weight] += 1;
940
941 for (i = 0; i < curr_subst; i++) {
942 s->ref[i] = srch.ref[i];
943 }
944
945 avl_insert(&substs_ref[curr_weight], s, where);
946
947 if (avl_find(&substs[curr_weight], s, &where) != NULL) {
948 INTERR;
949 return;
950 }
951 avl_insert(&substs[curr_weight], s, where);
952 }
953 curr_subst = 0;
954
955
956 /*
957 * We are using the current (unique) priority as a search key
958 * in the substitution table.
959 */
960 add_order_pri(s->key);
961}
962
963static void
964add_subst_pri(int32_t ref)
965{
966 if (curr_subst >= COLLATE_STR_LEN) {
967 fprintf(stderr,"substitution string is too long");
968 return;
969 }
970 subst_weights[curr_subst] = ref;
971 curr_subst++;
972}
973
974void
975add_subst_char(wchar_t wc)
976{
977 collchar_t *cc;
978
979
980 if (((cc = get_collchar(wc, 1)) == NULL) ||
981 (cc->wc != wc)) {
982 INTERR;
983 return;
984 }
985 /* we take the weight for the character at that position */
986 add_subst_pri(cc->ref[curr_weight]);
987}
988
989void
990add_subst_collelem(collelem_t *e)
991{
992 add_subst_pri(e->ref[curr_weight]);
993}
994
995void
996add_subst_collsym(collsym_t *s)
997{
998 add_subst_pri(s->ref);
999}
1000
1001void
1002add_subst_symbol(char *ptr)
1003{
1004 collundef_t *cu;
1005
1006 if ((cu = get_collundef(ptr)) != NULL) {
1007 add_subst_pri(cu->ref[curr_weight]);
1008 }
1009}
1010
1011void
1012add_weight(int32_t ref, int pass)
1013{
1014 weight_t srch;
1015 weight_t *w;
1016 avl_index_t where;
1017
1018 srch.pri = resolve_pri(ref);
1019
1020 /* No translation of ignores */
1021 if (srch.pri == 0)
1022 return;
1023
1024 /* Substitution priorities are not weights */
1025 if (srch.pri & COLLATE_SUBST_PRIORITY)
1026 return;
1027
1028 if (avl_find(&weights[pass], &srch, &where) != NULL)
1029 return;
1030
1031 if ((w = calloc(sizeof (*w), 1)) == NULL) {
1032 fprintf(stderr, "out of memory");
1033 return;
1034 }
1035 w->pri = srch.pri;
1036 avl_insert(&weights[pass], w, where);
1037}
1038
1039void
1040add_weights(int32_t *refs)
1041{
1042 int i;
1043 for (i = 0; i < NUM_WT; i++) {
1044 add_weight(refs[i], i);
1045 }
1046}
1047
1048int32_t
1049get_weight(int32_t ref, int pass)
1050{
1051 weight_t srch;
1052 weight_t *w;
1053 int32_t pri;
1054
1055 pri = resolve_pri(ref);
1056 if (pri & COLLATE_SUBST_PRIORITY) {
1057 return (pri);
1058 }
1059 if (pri <= 0) {
1060 return (pri);
1061 }
1062 srch.pri = pri;
1063 if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
1064 INTERR;
1065 return (-1);
1066 }
1067 return (w->opt);
1068}
1069
1070wchar_t *
1071wsncpy(wchar_t *s1, const wchar_t *s2, size_t n)
1072{
1073 wchar_t *os1 = s1;
1074
1075 n++;
1076 while (--n > 0 && (*s1++ = *s2++) != 0)
1077 continue;
1078 if (n > 0)
1079 while (--n > 0)
1080 *s1++ = 0;
1081 return (os1);
1082}
1083
1084void
1085dump_collate(void)
1086{
1087 FILE *f;
1088 int i, j, n;
1089 size_t sz;
1090 int32_t pri;
1091 collelem_t *ce;
1092 collchar_t *cc;
1093 subst_t *sb;
1094 char vers[COLLATE_STR_LEN];
1095 collate_char_t chars[UCHAR_MAX + 1];
1096 collate_large_t *large;
1097 collate_subst_t *subst[COLL_WEIGHTS_MAX];
1098 collate_chain_t *chain;
1099
1100 /*
1101 * We have to run throught a preliminary pass to identify all the
1102 * weights that we use for each sorting level.
1103 */
1104 for (i = 0; i < NUM_WT; i++) {
1105 add_weight(pri_ignore, i);
1106 }
1107 for (i = 0; i < NUM_WT; i++) {
1108 for (sb = avl_first(&substs[i]); sb;
1109 sb = AVL_NEXT(&substs[i], sb)) {
1110 for (j = 0; sb->ref[j]; j++) {
1111 add_weight(sb->ref[j], i);
1112 }
1113 }
1114 }
1115 for (ce = avl_first(&elem_by_expand);
1116 ce != NULL;
1117 ce = AVL_NEXT(&elem_by_expand, ce)) {
1118 add_weights(ce->ref);
1119 }
1120 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1121 add_weights(cc->ref);
1122 }
1123
1124 /*
1125 * Now we walk the entire set of weights, removing the gaps
1126 * in the weights. This gives us optimum usage. The walk
1127 * occurs in priority.
1128 */
1129 for (i = 0; i < NUM_WT; i++) {
1130 weight_t *w;
1131 for (w = avl_first(&weights[i]); w;
1132 w = AVL_NEXT(&weights[i], w)) {
1133 w->opt = nweight[i];
1134 nweight[i] += 1;
1135 }
1136 }
1137
1138 (void) memset(&chars, 0, sizeof (chars));
1139 (void) memset(vers, 0, COLLATE_STR_LEN);
1140 (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1141
1142 /*
1143 * We need to make sure we arrange for the UNDEFINED field
1144 * to show up. Also, set the total weight counts.
1145 */
1146 for (i = 0; i < NUM_WT; i++) {
1147 if (resolve_pri(pri_undefined[i]) == -1) {
1148 set_pri(pri_undefined[i], -1, RESOLVED);
1149 /* they collate at the end of everything else */
1150 collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1151 }
1152 collinfo.pri_count[i] = nweight[i];
1153 }
1154
1155 collinfo.pri_count[NUM_WT] = max_wide();
1156 collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1157 collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1158
1159 /*
1160 * Ordinary character priorities
1161 */
1162 for (i = 0; i <= UCHAR_MAX; i++) {
1163 if ((cc = get_collchar(i, 0)) != NULL) {
1164 for (j = 0; j < NUM_WT; j++) {
1165 chars[i].pri[j] = get_weight(cc->ref[j], j);
1166 }
1167 } else {
1168 for (j = 0; j < NUM_WT; j++) {
1169 chars[i].pri[j] =
1170 get_weight(pri_undefined[j], j);
1171 }
1172 /*
1173 * Per POSIX, for undefined characters, we
1174 * also have to add a last item, which is the
1175 * character code.
1176 */
1177 chars[i].pri[NUM_WT] = i;
1178 }
1179 }
1180
1181 /*
1182 * Substitution tables
1183 */
1184 for (i = 0; i < NUM_WT; i++) {
1185 collate_subst_t *st = NULL;
1186 n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
1187 if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
1188 fprintf(stderr, "out of memory");
1189 return;
1190 }
1191 n = 0;
1192 for (sb = avl_first(&substs[i]); sb;
1193 sb = AVL_NEXT(&substs[i], sb)) {
1194 if ((st[n].key = resolve_pri(sb->key)) < 0) {
1195 /* by definition these resolve! */
1196 INTERR;
1197 }
1198 if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1199 INTERR;
1200 }
1201 for (j = 0; sb->ref[j]; j++) {
1202 st[n].pri[j] = get_weight(sb->ref[j], i);
1203 }
1204 n++;
1205 }
1206 if (n != collinfo.subst_count[i])
1207 INTERR;
1208 subst[i] = st;
1209 }
1210
1211
1212 /*
1213 * Chains, i.e. collating elements
1214 */
1215 collinfo.chain_count = avl_numnodes(&elem_by_expand);
1216 chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
1217 if (chain == NULL) {
1218 fprintf(stderr, "out of memory");
1219 return;
1220 }
1221 for (n = 0, ce = avl_first(&elem_by_expand);
1222 ce != NULL;
1223 ce = AVL_NEXT(&elem_by_expand, ce), n++) {
1224 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1225 for (i = 0; i < NUM_WT; i++) {
1226 chain[n].pri[i] = get_weight(ce->ref[i], i);
1227 }
1228 }
1229 if (n != collinfo.chain_count)
1230 INTERR;
1231
1232 /*
1233 * Large (> UCHAR_MAX) character priorities
1234 */
1235 large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
1236 if (large == NULL) {
1237 fprintf(stderr, "out of memory");
1238 return;
1239 }
1240
1241 i = 0;
1242 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1243 int undef = 0;
1244 /* we already gathered those */
1245 if (cc->wc <= UCHAR_MAX)
1246 continue;
1247 for (j = 0; j < NUM_WT; j++) {
1248 if ((pri = get_weight(cc->ref[j], j)) < 0) {
1249 undef = 1;
1250 }
1251 if (undef && (pri >= 0)) {
1252 /* if undefined, then all priorities are */
1253 INTERR;
1254 } else {
1255 large[i].pri.pri[j] = pri;
1256 }
1257 }
1258 if (!undef) {
1259 large[i].val = cc->wc;
1260 collinfo.large_count = i++;
1261 }
1262 }
1263
1264 if ((f = open_category()) == NULL) {
1265 return;
1266 }
1267
1268 /* Time to write the entire data set out */
1269
1270 if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1271 (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1272 (wr_category(&chars, sizeof (chars), f) < 0)) {
1273 return;
1274 }
1275
1276 for (i = 0; i < NUM_WT; i++) {
1277 sz = sizeof (collate_subst_t) * collinfo.subst_count[i];
1278 if (wr_category(subst[i], sz, f) < 0) {
1279 return;
1280 }
1281 }
1282 sz = sizeof (collate_chain_t) * collinfo.chain_count;
1283 if (wr_category(chain, sz, f) < 0) {
1284 return;
1285 }
1286 sz = sizeof (collate_large_t) * collinfo.large_count;
1287 if (wr_category(large, sz, f) < 0) {
1288 return;
1289 }
1290
1291 close_category(f);
1292}