gitweb.dragonflybsd.org Git - dragonfly.git/blame

Commit	Line	Data
cd1c6085 JM	1	/*
	2	* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
	3	* Copyright 2015 John Marino <draco@marino.st>
8aa2b98b JM	4	*
	5	* This source code is derived from the illumos localedef command, and
	6	* provided under BSD-style license terms by Nexenta Systems, Inc.
	7	*
	8	* Redistribution and use in source and binary forms, with or without
	9	* modification, are permitted provided that the following conditions
	10	* are met:
	11	*
	12	* 1. Redistributions of source code must retain the above copyright
	13	* notice, this list of conditions and the following disclaimer.
	14	* 2. Redistributions in binary form must reproduce the above copyright
	15	* notice, this list of conditions and the following disclaimer in the
	16	* documentation and/or other materials provided with the distribution.
	17	*
	18	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	19	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	20	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	21	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	22	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	23	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	24	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	25	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	26	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	27	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	28	* POSSIBILITY OF SUCH DAMAGE.
cd1c6085 JM	29	*/
	30
	31	/*
	32	* LC_COLLATE database generation routines for localedef.
	33	*/
	34
	35	#include <stdio.h>
	36	#include <stddef.h>
	37	#include <stdlib.h>
	38	#include <errno.h>
	39	#include <string.h>
	40	#include <sys/types.h>
	41	#include <string.h>
	42	#include <unistd.h>
	43	#include <wchar.h>
	44	#include <limits.h>
	45	#include "localedef.h"
	46	#include "parser.h"
	47	#include "collate.h"
	48	#include "avl.h"
	49
	50	/*
	51	* Design notes.
	52	*
	53	* It will be extremely helpful to the reader if they have access to
	54	* the localedef and locale file format specifications available.
	55	* Latest versions of these are available from www.opengroup.org.
	56	*
	57	* The design for the collation code is a bit complex. The goal is a
	58	* single collation database as described in collate.h (in
	59	* libc/port/locale). However, there are some other tidbits:
	60	*
	61	* a) The substitution entries are now a directly indexable array. A
	62	* priority elsewhere in the table is taken as an index into the
	63	* substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
	64	* set. (The bit is cleared and the result is the index into the
	65	* table.)
	66	*
	67	* b) We eliminate duplicate entries into the substitution table.
	68	* This saves a lot of space.
	69	*
	70	* c) The priorities for each level are "compressed", so that each
	71	* sorting level has consecutively numbered priorities starting at 1.
	72	* (0 is reserved for the ignore priority.) This means sort levels
	73	* which only have a few distinct priorities can represent the
	74	* priority level in fewer bits, which makes the strxfrm output
	75	* smaller.
	76	*
	77	* d) We record the total number of priorities so that strxfrm can
	78	* figure out how many bytes to expand a numeric priority into.
	79	*
	80	* e) For the UNDEFINED pass (the last pass), we record the maximum
	81	* number of bits needed to uniquely prioritize these entries, so that
	82	* the last pass can also use smaller strxfrm output when possible.
	83	*
	84	* f) Priorities with the sign bit set are verboten. This works out
	85	* because no active character set needs that bit to carry significant
	86	* information once the character is in wide form.
	87	*
	88	* To process the entire data to make the database, we actually run
	89	* multiple passes over the data.
	90	*
	91	* The first pass, which is done at parse time, identifies elements,
	92	* substitutions, and such, and records them in priority order. As
93	* some priorities can refer to other priorities, using forward
94	* references, we use a table of references indicating whether the
95	* priority's value has been resolved, or whether it is still a
96	* reference.
97	*
98	* The second pass walks over all the items in priority order, noting
99	* that they are used directly, and not just an indirect reference.
100	* This is done by creating a "weight" structure for the item. The
101	* weights are stashed in an AVL tree sorted by relative "priority".
102	*
103	* The third pass walks over all the weight structures, in priority
104	* order, and assigns a new monotonically increasing (per sort level)
105	* weight value to them. These are the values that will actually be
106	* written to the file.
107	*
108	* The fourth pass just writes the data out.
109	*/
110
111	/*
112	* In order to resolve the priorities, we create a table of priorities.
113	* Entries in the table can be in one of three states.
114	*
115	* UNKNOWN is for newly allocated entries, and indicates that nothing
116	* is known about the priority. (For example, when new entries are created
117	* for collating-symbols, this is the value assigned for them until the
	118	* collating symbol's order has been determined.)
119	*
120	* RESOLVED is used for an entry where the priority indicates the final
121	* numeric weight.
122	*
123	* REFER is used for entries that reference other entries. Typically
124	* this is used for forward references. A collating-symbol can never
125	* have this value.
126	*
127	* The "pass" field is used during final resolution to aid in detection
128	* of referencing loops. (For example <A> depends on <B>, but <B> has its
129	* priority dependent on <A>.)
130	*/
/*
 * Resolution state of one entry in the priority table.
 */
typedef enum {
	UNKNOWN = 0,	/* nothing is known about the priority yet */
	RESOLVED = 1,	/* "pri" holds the final numeric value */
	REFER = 2	/* "pri" holds the index of another entry */
} res_t;
136
137	typedef struct weight {
138	int32_t pri;
139	int opt;
140	avl_node_t avl;
141	} weight_t;
142
143	typedef struct priority {
144	res_t res;
145	int32_t pri;
146	int pass;
147	int lineno;
148	} collpri_t;
149
/* Number of sort levels: one per order directive seen so far. */
#define	NUM_WT	collinfo.directive_count
151
152	/*
153	* These are the abstract collating symbols, which are just a symbolic
154	* way to reference a priority.
155	*/
156	struct collsym {
157	char *name;
158	int32_t ref;
159	avl_node_t avl;
160	};
161
162	/*
163	* These are also abstract collating symbols, but we allow them to have
164	* different priorities at different levels.
165	*/
166	typedef struct collundef {
167	char *name;
168	int32_t ref[COLL_WEIGHTS_MAX];
169	avl_node_t avl;
170	} collundef_t;
171
172	/*
173	* These are called "chains" in libc. This records the fact that two
174	* more characters should be treated as a single collating entity when
175	* they appear together. For example, in Spanish <C><h> gets collated
176	* as a character between <C> and <D>.
177	*/
178	struct collelem {
179	char *symbol;
180	wchar_t *expand;
181	int32_t ref[COLL_WEIGHTS_MAX];
182	avl_node_t avl_bysymbol;
183	avl_node_t avl_byexpand;
184	};
185
186	/*
187	* Individual characters have a sequence of weights as well.
188	*/
189	typedef struct collchar {
190	wchar_t wc;
191	int32_t ref[COLL_WEIGHTS_MAX];
192	avl_node_t avl;
193	} collchar_t;
194
195	/*
196	* Substitution entries. The key is itself a priority. Note that
197	* when we create one of these, we automatically wind up with a
198	* fully resolved priority for the key, because creation of
199	* substitutions creates a resolved priority at the same time.
200	*/
201	typedef struct {
202	int32_t key;
203	int32_t ref[COLLATE_STR_LEN];
204	avl_node_t avl;
205	avl_node_t avl_ref;
206	} subst_t;
207
208	static avl_tree_t collsyms;
209	static avl_tree_t collundefs;
210	static avl_tree_t elem_by_symbol;
211	static avl_tree_t elem_by_expand;
212	static avl_tree_t collchars;
213	static avl_tree_t substs[COLL_WEIGHTS_MAX];
214	static avl_tree_t substs_ref[COLL_WEIGHTS_MAX];
215	static avl_tree_t weights[COLL_WEIGHTS_MAX];
216	static int32_t nweight[COLL_WEIGHTS_MAX];
217
218	/*
219	* This is state tracking for the ellipsis token. Note that we start
220	* the initial values so that the ellipsis logic will think we got a
221	* magic starting value of NUL. It starts at minus one because the
222	* starting point is exclusive -- i.e. the starting point is not
223	* itself handled by the ellipsis code.
224	*/
225	static int currorder = EOF;
226	static int lastorder = EOF;
227	static collelem_t *currelem;
228	static collchar_t *currchar;
229	static collundef_t *currundef;
230	static wchar_t ellipsis_start = 0;
231	static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
232
233	/*
234	* We keep a running tally of weights.
235	*/
236	static int nextpri = 1;
237	static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
238
239	/*
240	* This array collects up the weights for each level.
241	*/
242	static int32_t order_weights[COLL_WEIGHTS_MAX];
243	static int curr_weight = 0;
244	static int32_t subst_weights[COLLATE_STR_LEN];
245	static int curr_subst = 0;
246
247	/*
248	* Some initial priority values.
249	*/
250	static int32_t pri_undefined[COLL_WEIGHTS_MAX];
251	static int32_t pri_ignore;
252
253	static collate_info_t collinfo;
254
255	static collpri_t *prilist = NULL;
256	static int numpri = 0;
257	static int maxpri = 0;
258
259	static void start_order(int);
260
261	static int32_t
262	new_pri(void)
263	{
264	int i;
265
266	if (numpri >= maxpri) {
267	maxpri = maxpri ? maxpri * 2 : 1024;
268	prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
269	if (prilist == NULL) {
270	fprintf(stderr,"out of memory");
271	return (-1);
272	}
273	for (i = numpri; i < maxpri; i++) {
274	prilist[i].res = UNKNOWN;
275	prilist[i].pri = 0;
276	prilist[i].pass = 0;
277	}
278	}
279	return (numpri++);
280	}
281
282	static collpri_t *
283	get_pri(int32_t ref)
284	{
285	if ((ref < 0) \|\| (ref > numpri)) {
286	INTERR;
287	return (NULL);
288	}
289	return (&prilist[ref]);
290	}
291
292	static void
293	set_pri(int32_t ref, int32_t v, res_t res)
294	{
295	collpri_t *pri;
296
297	pri = get_pri(ref);
298
299	if ((res == REFER) && ((v < 0) \|\| (v >= numpri))) {
300	INTERR;
301	}
302
303	/* Resolve self references */
304	if ((res == REFER) && (ref == v)) {
305	v = nextpri;
306	res = RESOLVED;
307	}
308
309	if (pri->res != UNKNOWN) {
310	warn("repeated item in order list (first on %d)",
311	pri->lineno);
312	return;
313	}
314	pri->lineno = lineno;
315	pri->pri = v;
316	pri->res = res;
317	}
318
319	static int32_t
320	resolve_pri(int32_t ref)
321	{
322	collpri_t *pri;
323	static int32_t pass = 0;
324
325	pri = get_pri(ref);
326	pass++;
327	while (pri->res == REFER) {
328	if (pri->pass == pass) {
329	/* report a line with the circular symbol */
330	lineno = pri->lineno;
331	fprintf(stderr,"circular reference in order list");
332	return (-1);
333	}
334	if ((pri->pri < 0) \|\| (pri->pri >= numpri)) {
335	INTERR;
336	return (-1);
337	}
338	pri->pass = pass;
339	pri = &prilist[pri->pri];
340	}
341
342	if (pri->res == UNKNOWN) {
343	return (-1);
344	}
345	if (pri->res != RESOLVED)
346	INTERR;
347
348	return (pri->pri);
349	}
350
351	static int
352	weight_compare(const void n1, const void n2)
353	{
354	int32_t k1 = ((const weight_t *)n1)->pri;
355	int32_t k2 = ((const weight_t *)n2)->pri;
356
357	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
358	}
359
360	static int
361	collsym_compare(const void n1, const void n2)
362	{
363	const collsym_t *c1 = n1;
364	const collsym_t *c2 = n2;
365	int rv;
366
367	rv = strcmp(c1->name, c2->name);
368	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
369	}
370
371	static int
372	collundef_compare(const void n1, const void n2)
373	{
374	const collundef_t *c1 = n1;
375	const collundef_t *c2 = n2;
376	int rv;
377
378	rv = strcmp(c1->name, c2->name);
379	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
380	}
381
382	static int
383	element_compare_symbol(const void n1, const void n2)
384	{
385	const collelem_t *c1 = n1;
386	const collelem_t *c2 = n2;
387	int rv;
388
389	rv = strcmp(c1->symbol, c2->symbol);
390	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
391	}
392
393	static int
394	element_compare_expand(const void n1, const void n2)
395	{
396	const collelem_t *c1 = n1;
397	const collelem_t *c2 = n2;
398	int rv;
399
400	rv = wcscmp(c1->expand, c2->expand);
401	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
402	}
403
404	static int
405	collchar_compare(const void n1, const void n2)
406	{
407	wchar_t k1 = ((const collchar_t *)n1)->wc;
408	wchar_t k2 = ((const collchar_t *)n2)->wc;
409
410	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
411	}
412
413	static int
414	subst_compare(const void n1, const void n2)
415	{
416	int32_t k1 = ((const subst_t *)n1)->key;
417	int32_t k2 = ((const subst_t *)n2)->key;
418
419	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
420	}
421
422	#pragma GCC diagnostic push
423	#pragma GCC diagnostic ignored "-Wcast-qual"
424
425	static int
426	subst_compare_ref(const void n1, const void n2)
427	{
428	int32_t c1 = ((subst_t )n1)->ref;
429	int32_t c2 = ((subst_t )n2)->ref;
430	int rv;
431
432	rv = wcscmp((wchar_t )c1, (wchar_t )c2);
433	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
434	}
435
436	#pragma GCC diagnostic pop
437
438	void
439	init_collate(void)
440	{
441	int i;
442
443	avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
444	offsetof(collsym_t, avl));
445
446	avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
447	offsetof(collundef_t, avl));
448
449	avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
450	offsetof(collelem_t, avl_bysymbol));
451	avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
452	offsetof(collelem_t, avl_byexpand));
453
454	avl_create(&collchars, collchar_compare, sizeof (collchar_t),
455	offsetof(collchar_t, avl));
456
457	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
458	avl_create(&substs[i], subst_compare, sizeof (subst_t),
459	offsetof(subst_t, avl));
460	avl_create(&substs_ref[i], subst_compare_ref,
461	sizeof (subst_t), offsetof(subst_t, avl_ref));
462	avl_create(&weights[i], weight_compare, sizeof (weight_t),
463	offsetof(weight_t, avl));
464	nweight[i] = 1;
465	}
466
467	(void) memset(&collinfo, 0, sizeof (collinfo));
468
469	/* allocate some initial priorities */
470	pri_ignore = new_pri();
471
472	set_pri(pri_ignore, 0, RESOLVED);
473
474	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
475	pri_undefined[i] = new_pri();
476
477	/* we will override this later */
478	set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
479	}
480	}
481
482	void
483	define_collsym(char *name)
484	{
485	collsym_t *sym;
486	avl_index_t where;
487
488	if ((sym = calloc(sizeof (*sym), 1)) == NULL) {
489	fprintf(stderr,"out of memory");
490	return;
491	}
492	sym->name = name;
493	sym->ref = new_pri();
494
495	if (avl_find(&collsyms, sym, &where) != NULL) {
496	/*
497	* This should never happen because we are only called
498	* for undefined symbols.
499	*/
500	INTERR;
501	return;
502	}
503	avl_insert(&collsyms, sym, where);
504	}
505
506	collsym_t *
507	lookup_collsym(char *name)
508	{
509	collsym_t srch;
510
511	srch.name = name;
512	return (avl_find(&collsyms, &srch, NULL));
513	}
514
515	collelem_t *
516	lookup_collelem(char *symbol)
517	{
518	collelem_t srch;
519
520	srch.symbol = symbol;
521	return (avl_find(&elem_by_symbol, &srch, NULL));
522	}
523
524	static collundef_t *
525	get_collundef(char *name)
526	{
527	collundef_t srch;
528	collundef_t *ud;
529	avl_index_t where;
530	int i;
531
532	srch.name = name;
533	if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
534	if (((ud = calloc(sizeof (*ud), 1)) == NULL) \|\|
535	((ud->name = strdup(name)) == NULL)) {
536	fprintf(stderr,"out of memory");
537	return (NULL);
538	}
539	for (i = 0; i < NUM_WT; i++) {
540	ud->ref[i] = new_pri();
541	}
542	avl_insert(&collundefs, ud, where);
543	}
544	add_charmap_undefined(name);
545	return (ud);
546	}
547
548	static collchar_t *
549	get_collchar(wchar_t wc, int create)
550	{
551	collchar_t srch;
552	collchar_t *cc;
553	avl_index_t where;
554	int i;
555
556	srch.wc = wc;
557	cc = avl_find(&collchars, &srch, &where);
558	if ((cc == NULL) && create) {
559	if ((cc = calloc(sizeof (*cc), 1)) == NULL) {
560	fprintf(stderr, "out of memory");
561	return (NULL);
562	}
563	for (i = 0; i < NUM_WT; i++) {
564	cc->ref[i] = new_pri();
565	}
566	cc->wc = wc;
567	avl_insert(&collchars, cc, where);
568	}
569	return (cc);
570	}
571
572	void
573	end_order_collsym(collsym_t *sym)
574	{
575	start_order(T_COLLSYM);
576	/* update the weight */
577
578	set_pri(sym->ref, nextpri, RESOLVED);
579	nextpri++;
580	}
581
582	void
583	end_order(void)
584	{
585	int i;
586	int32_t pri;
587	int32_t ref;
588	collpri_t *p;
589
590	/* advance the priority/weight */
591	pri = nextpri;
592
593	switch (currorder) {
594	case T_CHAR:
595	for (i = 0; i < NUM_WT; i++) {
596	if (((ref = order_weights[i]) < 0) \|\|
597	((p = get_pri(ref)) == NULL) \|\|
598	(p->pri == -1)) {
599	/* unspecified weight is a self reference */
600	set_pri(currchar->ref[i], pri, RESOLVED);
601	} else {
602	set_pri(currchar->ref[i], ref, REFER);
603	}
604	order_weights[i] = -1;
605	}
606
607	/* leave a cookie trail in case next symbol is ellipsis */
608	ellipsis_start = currchar->wc + 1;
609	currchar = NULL;
610	break;
611
612	case T_ELLIPSIS:
613	/* save off the weights were we can find them */
614	for (i = 0; i < NUM_WT; i++) {
615	ellipsis_weights[i] = order_weights[i];
616	order_weights[i] = -1;
617	}
618	break;
619
620	case T_COLLELEM:
621	if (currelem == NULL) {
622	INTERR;
623	} else {
624	for (i = 0; i < NUM_WT; i++) {
625
626	if (((ref = order_weights[i]) < 0) \|\|
627	((p = get_pri(ref)) == NULL) \|\|
628	(p->pri == -1)) {
629	set_pri(currelem->ref[i], pri,
630	RESOLVED);
631	} else {
632	set_pri(currelem->ref[i], ref, REFER);
633	}
634	order_weights[i] = -1;
635	}
636	}
637	break;
638
639	case T_UNDEFINED:
640	for (i = 0; i < NUM_WT; i++) {
641	if (((ref = order_weights[i]) < 0) \|\|
642	((p = get_pri(ref)) == NULL) \|\|
643	(p->pri == -1)) {
644	set_pri(pri_undefined[i], -1, RESOLVED);
645	} else {
646	set_pri(pri_undefined[i], ref, REFER);
647	}
648	order_weights[i] = -1;
649	}
650	break;
651
652	case T_SYMBOL:
653	for (i = 0; i < NUM_WT; i++) {
654	if (((ref = order_weights[i]) < 0) \|\|
655	((p = get_pri(ref)) == NULL) \|\|
656	(p->pri == -1)) {
657	set_pri(currundef->ref[i], pri, RESOLVED);
658	} else {
659	set_pri(currundef->ref[i], ref, REFER);
660	}
661	order_weights[i] = -1;
662	}
663	break;
664
665	default:
666	INTERR;
667	}
668
669	nextpri++;
670	}
671
672	static void
673	start_order(int type)
674	{
675	int i;
676
677	lastorder = currorder;
678	currorder = type;
679
680	/* this is used to protect ELLIPSIS processing */
681	if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
682	fprintf(stderr, "character value expected");
683	}
684
685	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
686	order_weights[i] = -1;
687	}
688	curr_weight = 0;
689	}
690
691	void
692	start_order_undefined(void)
693	{
694	start_order(T_UNDEFINED);
695	}
696
697	void
698	start_order_symbol(char *name)
699	{
700	currundef = get_collundef(name);
701	start_order(T_SYMBOL);
702	}
703
704	void
705	start_order_char(wchar_t wc)
706	{
707	collchar_t *cc;
708	int32_t ref;
709
710	start_order(T_CHAR);
711
712	/*
713	* If we last saw an ellipsis, then we need to close the range.
714	* Handle that here. Note that we have to be careful because the
715	* items inside the range are treated exclusiveley to the items
716	* outside of the range. The ends of the range can have quite
717	* different weights than the range members.
718	*/
719	if (lastorder == T_ELLIPSIS) {
720	int i;
721
722	if (wc < ellipsis_start) {
723	fprintf(stderr, "malformed range!");
724	return;
725	}
726	while (ellipsis_start < wc) {
727	/*
728	* pick all of the saved weights for the
729	* ellipsis. note that -1 encodes for the
730	* ellipsis itself, which means to take the
731	* current relative priority.
732	*/
733	if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
734	INTERR;
735	return;
736	}
737	for (i = 0; i < NUM_WT; i++) {
738	collpri_t *p;
739	if (((ref = ellipsis_weights[i]) == -1) \|\|
740	((p = get_pri(ref)) == NULL) \|\|
741	(p->pri == -1)) {
742	set_pri(cc->ref[i], nextpri, RESOLVED);
743	} else {
744	set_pri(cc->ref[i], ref, REFER);
745	}
746	ellipsis_weights[i] = 0;
747	}
748	ellipsis_start++;
749	nextpri++;
750	}
751	}
752
753	currchar = get_collchar(wc, 1);
754	}
755
756	void
757	start_order_collelem(collelem_t *e)
758	{
759	start_order(T_COLLELEM);
760	currelem = e;
761	}
762
763	void
764	start_order_ellipsis(void)
765	{
766	int i;
767
768	start_order(T_ELLIPSIS);
769
770	if (lastorder != T_CHAR) {
771	fprintf(stderr, "illegal starting point for range");
772	return;
773	}
774
775	for (i = 0; i < NUM_WT; i++) {
776	ellipsis_weights[i] = order_weights[i];
777	}
778	}
779
780	void
781	define_collelem(char name, wchar_t wcs)
782	{
783	collelem_t *e;
784	avl_index_t where1;
785	avl_index_t where2;
786	int i;
787
788	if (wcslen(wcs) >= COLLATE_STR_LEN) {
789	fprintf(stderr,"expanded collation element too long");
790	return;
791	}
792
793	if ((e = calloc(sizeof (*e), 1)) == NULL) {
794	fprintf(stderr, "out of memory");
795	return;
796	}
797	e->expand = wcs;
798	e->symbol = name;
799
800	/*
801	* This is executed before the order statement, so we don't
802	* know how many priorities we really need. We allocate one
803	* for each possible weight. Not a big deal, as collating-elements
804	* prove to be quite rare.
805	*/
806	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
807	e->ref[i] = new_pri();
808	}
809
810	/* A character sequence can only reduce to one element. */
811	if ((avl_find(&elem_by_symbol, e, &where1) != NULL) \|\|
812	(avl_find(&elem_by_expand, e, &where2) != NULL)) {
813	fprintf(stderr, "duplicate collating element definition");
814	return;
815	}
816	avl_insert(&elem_by_symbol, e, where1);
817	avl_insert(&elem_by_expand, e, where2);
818	}
819
820	void
821	add_order_bit(int kw)
822	{
823	uint8_t bit = DIRECTIVE_UNDEF;
824
825	switch (kw) {
826	case T_FORWARD:
827	bit = DIRECTIVE_FORWARD;
828	break;
829	case T_BACKWARD:
830	bit = DIRECTIVE_BACKWARD;
831	break;
832	case T_POSITION:
833	bit = DIRECTIVE_POSITION;
834	break;
835	default:
836	INTERR;
837	break;
838	}
839	collinfo.directive[collinfo.directive_count] \|= bit;
840	}
841
842	void
843	add_order_directive(void)
844	{
845	if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
846	fprintf(stderr,"too many directives (max %d)", COLL_WEIGHTS_MAX);
847	}
848	collinfo.directive_count++;
849	}
850
851	static void
852	add_order_pri(int32_t ref)
853	{
854	if (curr_weight >= NUM_WT) {
855	fprintf(stderr,"too many weights (max %d)", NUM_WT);
856	return;
857	}
858	order_weights[curr_weight] = ref;
859	curr_weight++;
860	}
861
862	void
863	add_order_collsym(collsym_t *s)
864	{
865	add_order_pri(s->ref);
866	}
867
868	void
869	add_order_char(wchar_t wc)
870	{
871	collchar_t *cc;
872
873	if ((cc = get_collchar(wc, 1)) == NULL) {
874	INTERR;
875	return;
876	}
877
878	add_order_pri(cc->ref[curr_weight]);
879	}
880
881	void
882	add_order_collelem(collelem_t *e)
883	{
884	add_order_pri(e->ref[curr_weight]);
885	}
886
887	void
888	add_order_ignore(void)
889	{
890	add_order_pri(pri_ignore);
891	}
892
893	void
894	add_order_symbol(char *sym)
895	{
896	collundef_t *c;
897	if ((c = get_collundef(sym)) == NULL) {
898	INTERR;
899	return;
900	}
901	add_order_pri(c->ref[curr_weight]);
902	}
903
/*
 * Use an ellipsis as the weight for the next sort level.
 */
void
add_order_ellipsis(void)
{
	const int32_t self = 0;	/* special NULL value indicates self reference */

	add_order_pri(self);
}
910
911	void
912	add_order_subst(void)
913	{
914	subst_t srch;
915	subst_t *s;
916	avl_index_t where;
917	int i;
918
919	(void) memset(&srch, 0, sizeof (srch));
920	for (i = 0; i < curr_subst; i++) {
921	srch.ref[i] = subst_weights[i];
922	subst_weights[i] = 0;
923	}
924	s = avl_find(&substs_ref[curr_weight], &srch, &where);
925
926	if (s == NULL) {
927	if ((s = calloc(sizeof (*s), 1)) == NULL) {
928	fprintf(stderr,"out of memory");
929	return;
930	}
931	s->key = new_pri();
932
933	/*
934	* We use a self reference for our key, but we set a
935	* high bit to indicate that this is a substitution
936	* reference. This will expedite table lookups later,
937	* and prevent table lookups for situations that don't
938	* require it. (In short, its a big win, because we
939	* can skip a lot of binary searching.)
940	*/
941	set_pri(s->key,
942	(nextsubst[curr_weight] \| COLLATE_SUBST_PRIORITY),
943	RESOLVED);
944	nextsubst[curr_weight] += 1;
945
946	for (i = 0; i < curr_subst; i++) {
947	s->ref[i] = srch.ref[i];
948	}
949
950	avl_insert(&substs_ref[curr_weight], s, where);
951
952	if (avl_find(&substs[curr_weight], s, &where) != NULL) {
953	INTERR;
954	return;
955	}
956	avl_insert(&substs[curr_weight], s, where);
957	}
958	curr_subst = 0;
959
960
961	/*
962	* We are using the current (unique) priority as a search key
963	* in the substitution table.
964	*/
965	add_order_pri(s->key);
966	}
967
968	static void
969	add_subst_pri(int32_t ref)
970	{
971	if (curr_subst >= COLLATE_STR_LEN) {
972	fprintf(stderr,"substitution string is too long");
973	return;
974	}
975	subst_weights[curr_subst] = ref;
976	curr_subst++;
977	}
978
979	void
980	add_subst_char(wchar_t wc)
981	{
982	collchar_t *cc;
983
984
985	if (((cc = get_collchar(wc, 1)) == NULL) \|\|
986	(cc->wc != wc)) {
987	INTERR;
988	return;
989	}
990	/* we take the weight for the character at that position */
991	add_subst_pri(cc->ref[curr_weight]);
992	}
993
994	void
995	add_subst_collelem(collelem_t *e)
996	{
997	add_subst_pri(e->ref[curr_weight]);
998	}
999
1000	void
1001	add_subst_collsym(collsym_t *s)
1002	{
1003	add_subst_pri(s->ref);
1004	}
1005
1006	void
1007	add_subst_symbol(char *ptr)
1008	{
1009	collundef_t *cu;
1010
1011	if ((cu = get_collundef(ptr)) != NULL) {
1012	add_subst_pri(cu->ref[curr_weight]);
1013	}
1014	}
1015
1016	void
1017	add_weight(int32_t ref, int pass)
1018	{
1019	weight_t srch;
1020	weight_t *w;
1021	avl_index_t where;
1022
1023	srch.pri = resolve_pri(ref);
1024
1025	/* No translation of ignores */
1026	if (srch.pri == 0)
1027	return;
1028
1029	/* Substitution priorities are not weights */
1030	if (srch.pri & COLLATE_SUBST_PRIORITY)
1031	return;
1032
1033	if (avl_find(&weights[pass], &srch, &where) != NULL)
1034	return;
1035
1036	if ((w = calloc(sizeof (*w), 1)) == NULL) {
1037	fprintf(stderr, "out of memory");
1038	return;
1039	}
1040	w->pri = srch.pri;
1041	avl_insert(&weights[pass], w, where);
1042	}
1043
1044	void
1045	add_weights(int32_t *refs)
1046	{
1047	int i;
1048	for (i = 0; i < NUM_WT; i++) {
1049	add_weight(refs[i], i);
1050	}
1051	}
1052
1053	int32_t
1054	get_weight(int32_t ref, int pass)
1055	{
1056	weight_t srch;
1057	weight_t *w;
1058	int32_t pri;
1059
1060	pri = resolve_pri(ref);
1061	if (pri & COLLATE_SUBST_PRIORITY) {
1062	return (pri);
1063	}
1064	if (pri <= 0) {
1065	return (pri);
1066	}
1067	srch.pri = pri;
1068	if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
1069	INTERR;
1070	return (-1);
1071	}
1072	return (w->opt);
1073	}
1074
/*
 * Copy at most "n" wide characters from "s2" into "s1".  If "s2" is
 * shorter than "n", the rest of "s1" is filled with null wide
 * characters; if it is not, "s1" is left unterminated.  Returns "s1".
 *
 * This is the contract of the standard wcsncpy(), so it is used
 * directly.
 */
wchar_t *
wsncpy(wchar_t *s1, const wchar_t *s2, size_t n)
{
	return (wcsncpy(s1, s2, n));
}
1088
1089	void
1090	dump_collate(void)
1091	{
1092	FILE *f;
1093	int i, j, n;
1094	size_t sz;
1095	int32_t pri;
1096	collelem_t *ce;
1097	collchar_t *cc;
1098	subst_t *sb;
1099	char vers[COLLATE_STR_LEN];
1100	collate_char_t chars[UCHAR_MAX + 1];
1101	collate_large_t *large;
1102	collate_subst_t *subst[COLL_WEIGHTS_MAX];
1103	collate_chain_t *chain;
1104
1105	/*
1106	* We have to run throught a preliminary pass to identify all the
1107	* weights that we use for each sorting level.
1108	*/
1109	for (i = 0; i < NUM_WT; i++) {
1110	add_weight(pri_ignore, i);
1111	}
1112	for (i = 0; i < NUM_WT; i++) {
1113	for (sb = avl_first(&substs[i]); sb;
1114	sb = AVL_NEXT(&substs[i], sb)) {
1115	for (j = 0; sb->ref[j]; j++) {
1116	add_weight(sb->ref[j], i);
1117	}
1118	}
1119	}
1120	for (ce = avl_first(&elem_by_expand);
1121	ce != NULL;
1122	ce = AVL_NEXT(&elem_by_expand, ce)) {
1123	add_weights(ce->ref);
1124	}
1125	for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1126	add_weights(cc->ref);
1127	}
1128
1129	/*
1130	* Now we walk the entire set of weights, removing the gaps
1131	* in the weights. This gives us optimum usage. The walk
1132	* occurs in priority.
1133	*/
1134	for (i = 0; i < NUM_WT; i++) {
1135	weight_t *w;
1136	for (w = avl_first(&weights[i]); w;
1137	w = AVL_NEXT(&weights[i], w)) {
1138	w->opt = nweight[i];
1139	nweight[i] += 1;
1140	}
1141	}
1142
1143	(void) memset(&chars, 0, sizeof (chars));
1144	(void) memset(vers, 0, COLLATE_STR_LEN);
1145	(void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1146
1147	/*
1148	* We need to make sure we arrange for the UNDEFINED field
1149	* to show up. Also, set the total weight counts.
1150	*/
1151	for (i = 0; i < NUM_WT; i++) {
1152	if (resolve_pri(pri_undefined[i]) == -1) {
1153	set_pri(pri_undefined[i], -1, RESOLVED);
1154	/* they collate at the end of everything else */
1155	collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1156	}
1157	collinfo.pri_count[i] = nweight[i];
1158	}
1159
1160	collinfo.pri_count[NUM_WT] = max_wide();
1161	collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1162	collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1163
1164	/*
1165	* Ordinary character priorities
1166	*/
1167	for (i = 0; i <= UCHAR_MAX; i++) {
1168	if ((cc = get_collchar(i, 0)) != NULL) {
1169	for (j = 0; j < NUM_WT; j++) {
1170	chars[i].pri[j] = get_weight(cc->ref[j], j);
1171	}
1172	} else {
1173	for (j = 0; j < NUM_WT; j++) {
1174	chars[i].pri[j] =
1175	get_weight(pri_undefined[j], j);
1176	}
1177	/*
1178	* Per POSIX, for undefined characters, we
1179	* also have to add a last item, which is the
1180	* character code.
1181	*/
1182	chars[i].pri[NUM_WT] = i;
1183	}
1184	}
1185
1186	/*
1187	* Substitution tables
1188	*/
1189	for (i = 0; i < NUM_WT; i++) {
1190	collate_subst_t *st = NULL;
1191	n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
1192	if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
1193	fprintf(stderr, "out of memory");
1194	return;
1195	}
1196	n = 0;
1197	for (sb = avl_first(&substs[i]); sb;
1198	sb = AVL_NEXT(&substs[i], sb)) {
1199	if ((st[n].key = resolve_pri(sb->key)) < 0) {
1200	/* by definition these resolve! */
1201	INTERR;
1202	}
1203	if (st[n].key != (n \| COLLATE_SUBST_PRIORITY)) {
1204	INTERR;
1205	}
1206	for (j = 0; sb->ref[j]; j++) {
1207	st[n].pri[j] = get_weight(sb->ref[j], i);
1208	}
1209	n++;
1210	}
1211	if (n != collinfo.subst_count[i])
1212	INTERR;
1213	subst[i] = st;
1214	}
1215
1216
1217	/*
1218	* Chains, i.e. collating elements
1219	*/
1220	collinfo.chain_count = avl_numnodes(&elem_by_expand);
1221	chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
1222	if (chain == NULL) {
1223	fprintf(stderr, "out of memory");
1224	return;
1225	}
1226	for (n = 0, ce = avl_first(&elem_by_expand);
1227	ce != NULL;
1228	ce = AVL_NEXT(&elem_by_expand, ce), n++) {
1229	(void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1230	for (i = 0; i < NUM_WT; i++) {
1231	chain[n].pri[i] = get_weight(ce->ref[i], i);
1232	}
1233	}
1234	if (n != collinfo.chain_count)
1235	INTERR;
1236
1237	/*
1238	* Large (> UCHAR_MAX) character priorities
1239	*/
1240	large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
1241	if (large == NULL) {
1242	fprintf(stderr, "out of memory");
1243	return;
1244	}
1245
1246	i = 0;
1247	for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1248	int undef = 0;
1249	/* we already gathered those */
1250	if (cc->wc <= UCHAR_MAX)
1251	continue;
1252	for (j = 0; j < NUM_WT; j++) {
1253	if ((pri = get_weight(cc->ref[j], j)) < 0) {
1254	undef = 1;
1255	}
1256	if (undef && (pri >= 0)) {
1257	/* if undefined, then all priorities are */
1258	INTERR;
1259	} else {
1260	large[i].pri.pri[j] = pri;
1261	}
1262	}
1263	if (!undef) {
1264	large[i].val = cc->wc;
1265	collinfo.large_count = i++;
1266	}
1267	}
1268
1269	if ((f = open_category()) == NULL) {
1270	return;
1271	}
1272
1273	/* Time to write the entire data set out */
1274
1275	if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) \|\|
1276	(wr_category(&collinfo, sizeof (collinfo), f) < 0) \|\|
1277	(wr_category(&chars, sizeof (chars), f) < 0)) {
1278	return;
1279	}
1280
1281	for (i = 0; i < NUM_WT; i++) {
1282	sz = sizeof (collate_subst_t) * collinfo.subst_count[i];
1283	if (wr_category(subst[i], sz, f) < 0) {
1284	return;
1285	}
1286	}
1287	sz = sizeof (collate_chain_t) * collinfo.chain_count;
1288	if (wr_category(chain, sz, f) < 0) {
1289	return;
1290	}
1291	sz = sizeof (collate_large_t) * collinfo.large_count;
1292	if (wr_category(large, sz, f) < 0) {
1293	return;
1294	}
1295
1296	close_category(f);
1297	}