libc/locale: Add __collate_equiv_value functionality
[dragonfly.git] / lib / libc / locale / collate.c
CommitLineData
16db8bac
JM
1/*
2 * Copright 2010 Nexenta Systems, Inc. All rights reserved.
984263bc
MD
3 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4 * at Electronni Visti IA, Kiev, Ukraine.
5 * All rights reserved.
6 *
0d5acd74
JM
7 * Copyright (c) 2011 The FreeBSD Foundation
8 * All rights reserved.
9 * Portions of this software were developed by David Chisnall
10 * under sponsorship from the FreeBSD Foundation.
11 *
984263bc
MD
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
1de703da 32 *
16db8bac 33 * Adapted to xlocale by John Marino <draco@marino.st>
984263bc
MD
34 */
35
17ea2221 36#include "namespace.h"
984263bc
MD
37#include <stdio.h>
38#include <stdlib.h>
39#include <string.h>
16db8bac 40#include <wchar.h>
984263bc
MD
41#include <errno.h>
42#include <unistd.h>
16db8bac
JM
43#include <fcntl.h>
44#include <sys/types.h>
45#include <sys/stat.h>
46#include <sys/mman.h>
17ea2221 47#include "un-namespace.h"
984263bc
MD
48
49#include "collate.h"
0d5acd74
JM
50#include "setlocale.h"
51#include "ldpart.h"
52
0d5acd74 53struct xlocale_collate __xlocale_global_collate = {
16db8bac 54 {{0}, "C"}, 1, 0, 0, 0
0d5acd74 55};
984263bc 56
16db8bac
JM
57struct xlocale_collate __xlocale_C_collate = {
58 {{0}, "C"}, 1, 0, 0, 0
0d5acd74 59};
984263bc 60
16db8bac 61#include "libc_private.h"
984263bc 62
0d5acd74
JM
63int
64__collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
65
66static void
67destruct_collate(void *t)
68{
69 struct xlocale_collate *table = t;
16db8bac
JM
70 if (table->map && (table->maplen > 0)) {
71 (void) munmap(table->map, table->maplen);
0d5acd74
JM
72 }
73 free(t);
74}
75
76void *
de950ab3 77__collate_load(const char *encoding, __unused locale_t unused)
0d5acd74
JM
78{
79 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
80 return &__xlocale_C_collate;
81 }
82 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
83 table->header.header.destructor = destruct_collate;
84 // FIXME: Make sure that _LDP_CACHE is never returned. We should be doing
85 // the caching outside of this section
86 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
87 xlocale_release(table);
88 return NULL;
89 }
90 return table;
91}
92
93/**
94 * Load the collation tables for the specified encoding into the global table.
95 */
984263bc
MD
96int
97__collate_load_tables(const char *encoding)
0d5acd74
JM
98{
99 int ret = __collate_load_tables_l(encoding, &__xlocale_global_collate);
0d5acd74
JM
100 return ret;
101}
102
103int
104__collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
984263bc 105{
16db8bac
JM
106 int i, chains, z;
107 char buf[PATH_MAX];
108 char *TMP;
109 char *map;
110 collate_info_t *info;
111 struct stat sbuf;
112 int fd;
984263bc
MD
113
114 /* 'encoding' must be already checked. */
115 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
0d5acd74 116 table->__collate_load_error = 1;
984263bc
MD
117 return (_LDP_CACHE);
118 }
119
16db8bac
JM
120 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE",
121 _PathLocale, encoding);
984263bc 122
16db8bac 123 if ((fd = _open(buf, O_RDONLY)) < 0)
984263bc 124 return (_LDP_ERROR);
16db8bac
JM
125 if (_fstat(fd, &sbuf) < 0) {
126 (void) _close(fd);
984263bc
MD
127 return (_LDP_ERROR);
128 }
16db8bac
JM
129 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
130 (void) _close(fd);
131 errno = EINVAL;
984263bc
MD
132 return (_LDP_ERROR);
133 }
16db8bac
JM
134 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
135 (void) _close(fd);
136 if ((TMP = map) == NULL) {
984263bc
MD
137 return (_LDP_ERROR);
138 }
16db8bac
JM
139
140 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
141 (void) munmap(map, sbuf.st_size);
142 errno = EINVAL;
984263bc
MD
143 return (_LDP_ERROR);
144 }
16db8bac 145 TMP += COLLATE_STR_LEN;
984263bc 146
16db8bac
JM
147 info = (void *)TMP;
148 TMP += sizeof (*info);
984263bc 149
16db8bac
JM
150 if ((info->directive_count < 1) ||
151 (info->directive_count >= COLL_WEIGHTS_MAX) ||
152 ((chains = info->chain_count) < 0)) {
153 (void) munmap(map, sbuf.st_size);
154 errno = EINVAL;
155 return (_LDP_ERROR);
0d5acd74 156 }
16db8bac
JM
157
158 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
159 (sizeof (collate_chain_t) * chains) +
160 (sizeof (collate_large_t) * info->large_count);
161 for (z = 0; z < (info->directive_count); z++) {
162 i += sizeof (collate_subst_t) * info->subst_count[z];
163 }
164 if (i != (sbuf.st_size - (TMP - map))) {
165 (void) munmap(map, sbuf.st_size);
166 errno = EINVAL;
167 return (_LDP_ERROR);
0d5acd74 168 }
16db8bac
JM
169
170 table->char_pri_table = (void *)TMP;
171 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
172
173 for (z = 0; z < info->directive_count; z++) {
174 if (info->subst_count[z] > 0) {
175 table->subst_table[z] = (void *)TMP;
176 TMP += info->subst_count[z] * sizeof (collate_subst_t);
177 } else {
178 table->subst_table[z] = NULL;
984263bc
MD
179 }
180 }
16db8bac
JM
181
182 if (chains > 0) {
183 table->chain_pri_table = (void *)TMP;
184 TMP += chains * sizeof (collate_chain_t);
185 } else
186 table->chain_pri_table = NULL;
187 if (info->large_count > 0)
188 table->large_pri_table = (void *)TMP;
189 else
190 table->large_pri_table = NULL;
191
192 table->info = info;
0d5acd74 193 table->__collate_load_error = 0;
984263bc
MD
194
195 return (_LDP_LOADED);
196}
197
16db8bac
JM
198/*
199 * Note: for performance reasons, we have expanded bsearch here. This avoids
200 * function call overhead with each comparison.
201 */
202
203static int32_t *
204substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
205{
206 collate_subst_t *p;
207 int n = table->info->subst_count[pass];
208
209 if (n == 0)
210 return (NULL);
211
212 if (pass >= table->info->directive_count)
213 return (NULL);
214
215 if (!(key & COLLATE_SUBST_PRIORITY))
216 return (NULL);
217
218 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
219 return (p->pri);
220}
221
222static collate_chain_t *
223chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
984263bc 224{
16db8bac
JM
225 int low;
226 int high;
227 int next, compar, l;
228 collate_chain_t *p;
229 collate_chain_t *tab;
230
231 if (table->info->chain_count == 0)
232 return (NULL);
233
234 low = 0;
235 high = table->info->chain_count - 1;
236 tab = table->chain_pri_table;
237
238 while (low <= high) {
239 next = (low + high) / 2;
240 p = tab + next;
241 compar = *key - *p->str;
242 if (compar == 0) {
243 l = wcsnlen(p->str, COLLATE_STR_LEN);
244 compar = wcsncmp(key, p->str, l);
245 if (compar == 0) {
246 *len = l;
247 return (p);
248 }
984263bc 249 }
16db8bac
JM
250 if (compar > 0)
251 low = next + 1;
252 else
253 high = next - 1;
984263bc 254 }
16db8bac
JM
255 return (NULL);
256}
257
258static collate_large_t *
259largesearch(struct xlocale_collate *table, const wchar_t key)
260{
261 int low = 0;
262 int high = table->info->large_count - 1;
263 int next, compar;
264 collate_large_t *p;
265 collate_large_t *tab = table->large_pri_table;
266
267 if (table->info->large_count == 0)
268 return (NULL);
269
270 while (low <= high) {
271 next = (low + high) / 2;
272 p = tab + next;
273 compar = key - p->val;
274 if (compar == 0)
275 return (p);
276 if (compar > 0)
277 low = next + 1;
278 else
279 high = next - 1;
280 }
281 return (NULL);
984263bc
MD
282}
283
284void
16db8bac
JM
285_collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
286 int *pri, int which, const int **state)
984263bc 287{
16db8bac
JM
288 collate_chain_t *p2;
289 collate_large_t *match;
290 int p, l;
291 const int *sptr;
292
293 /*
294 * If this is the "last" pass for the UNDEFINED, then
295 * we just return the priority itself.
296 */
297 if (which >= table->info->directive_count) {
298 *pri = *t;
299 *len = 1;
300 *state = NULL;
301 return;
302 }
984263bc 303
16db8bac
JM
304 /*
305 * If we have remaining substitution data from a previous
306 * call, consume it first.
307 */
308 if ((sptr = *state) != NULL) {
309 *pri = *sptr;
310 sptr++;
311 *state = *sptr ? sptr : NULL;
312 *len = 0;
313 return;
314 }
315
316 /* No active substitutions */
984263bc 317 *len = 1;
16db8bac
JM
318
319 /*
320 * Check for composites such as dipthongs that collate as a
321 * single element (aka chains or collating-elements).
322 */
323 if (((p2 = chainsearch(table, t, &l)) != NULL) &&
324 ((p = p2->pri[which]) >= 0)) {
325
326 *len = l;
327 *pri = p;
328
329 } else if (*t <= UCHAR_MAX) {
330
331 /*
332 * Character is a small (8-bit) character.
333 * We just look these up directly for speed.
334 */
335 *pri = table->char_pri_table[*t].pri[which];
336
337 } else if ((table->info->large_count > 0) &&
338 ((match = largesearch(table, *t)) != NULL)) {
339
340 /*
341 * Character was found in the extended table.
342 */
343 *pri = match->pri.pri[which];
344
345 } else {
346 /*
347 * Character lacks a specific definition.
348 */
349 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
350 /* Mask off sign bit to prevent ordering confusion. */
351 *pri = (*t & COLLATE_MAX_PRIORITY);
352 } else {
353 *pri = table->info->undef_pri[which];
354 }
355 /* No substitutions for undefined characters! */
356 return;
357 }
358
359 /*
360 * Try substituting (expanding) the character. We are
361 * currently doing this *after* the chain compression. I
362 * think it should not matter, but this way might be slightly
363 * faster.
364 *
365 * We do this after the priority search, as this will help us
366 * to identify a single key value. In order for this to work,
367 * its important that the priority assigned to a given element
368 * to be substituted be unique for that level. The localedef
369 * code ensures this for us.
370 */
371 if ((sptr = substsearch(table, *pri, which)) != NULL) {
372 if ((*pri = *sptr) != 0) {
373 sptr++;
374 *state = *sptr ? sptr : NULL;
984263bc
MD
375 }
376 }
16db8bac 377
984263bc
MD
378}
379
16db8bac
JM
380/*
381 * This is the meaty part of wcsxfrm & strxfrm. Note that it does
382 * NOT NULL terminate. That is left to the caller.
383 */
384size_t
385_collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
386 size_t room)
984263bc 387{
16db8bac
JM
388 int pri;
389 int len;
390 const wchar_t *t;
391 wchar_t *tr = NULL;
392 int direc;
393 int pass;
394 const int32_t *state;
395 size_t want = 0;
396 size_t need = 0;
397
398 for (pass = 0; pass <= table->info->directive_count; pass++) {
399
400 state = NULL;
401
402 if (pass != 0) {
403 /* insert level separator from the previous pass */
404 if (room) {
405 *xf++ = 1;
406 room--;
407 }
408 want++;
409 }
410
411 /* special pass for undefined */
412 if (pass == table->info->directive_count) {
413 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
414 } else {
415 direc = table->info->directive[pass];
416 }
417
418 t = src;
419
420 if (direc & DIRECTIVE_BACKWARD) {
421 wchar_t *bp, *fp, c;
422 if (tr)
423 free(tr);
424 if ((tr = wcsdup(t)) == NULL) {
425 errno = ENOMEM;
426 goto fail;
427 }
428 bp = tr;
429 fp = tr + wcslen(tr) - 1;
430 while (bp < fp) {
431 c = *bp;
432 *bp++ = *fp;
433 *fp-- = c;
434 }
435 t = (const wchar_t *)tr;
436 }
984263bc 437
16db8bac
JM
438 if (direc & DIRECTIVE_POSITION) {
439 while (*t || state) {
440 _collate_lookup(table, t, &len, &pri, pass, &state);
441 t += len;
442 if (pri <= 0) {
443 if (pri < 0) {
444 errno = EINVAL;
445 goto fail;
446 }
447 pri = COLLATE_MAX_PRIORITY;
448 }
449 if (room) {
450 *xf++ = pri;
451 room--;
452 }
453 want++;
454 need = want;
455 }
456 } else {
457 while (*t || state) {
458 _collate_lookup(table, t, &len, &pri, pass, &state);
459 t += len;
460 if (pri <= 0) {
461 if (pri < 0) {
462 errno = EINVAL;
463 goto fail;
464 }
465 continue;
466 }
467 if (room) {
468 *xf++ = pri;
469 room--;
470 }
471 want++;
472 need = want;
473 }
474 }
475 }
476 if (tr)
477 free(tr);
478 return (need);
479
480fail:
481 if (tr)
482 free(tr);
483 return ((size_t)(-1));
984263bc
MD
484}
485
16db8bac
JM
486/*
487 * In the non-POSIX case, we transform each character into a string of
488 * characters representing the character's priority. Since char is usually
489 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add
490 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6
491 * bits per byte.
492 *
493 * It turns out that we sometimes have real priorities that are
494 * 31-bits wide. (But: be careful using priorities where the high
495 * order bit is set -- i.e. the priority is negative. The sort order
496 * may be surprising!)
497 *
498 * TODO: This would be a good area to optimize somewhat. It turns out
499 * that real prioririties *except for the last UNDEFINED pass* are generally
500 * very small. We need the localedef code to precalculate the max
501 * priority for us, and ideally also give us a mask, and then we could
502 * severely limit what we expand to.
503 */
504#define XFRM_BYTES 6
505#define XFRM_OFFSET ('0') /* make all printable characters */
506#define XFRM_SHIFT 6
507#define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
508#define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
509
510static int
511xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
984263bc 512{
16db8bac
JM
513 /* we use unsigned to ensure zero fill on right shift */
514 uint32_t val = (uint32_t)table->info->pri_count[pass];
515 int nc = 0;
516
517 while (val) {
518 *p = (pri & XFRM_MASK) + XFRM_OFFSET;
519 pri >>= XFRM_SHIFT;
520 val >>= XFRM_SHIFT;
521 p++;
522 nc++;
523 }
524 return (nc);
984263bc
MD
525}
526
16db8bac
JM
527size_t
528_collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
529 size_t room)
984263bc 530{
16db8bac
JM
531 int pri;
532 int len;
533 const wchar_t *t;
534 wchar_t *tr = NULL;
535 int direc;
536 int pass;
537 const int32_t *state;
538 size_t want = 0;
539 size_t need = 0;
540 int b;
541 uint8_t buf[XFRM_BYTES];
542
543 for (pass = 0; pass <= table->info->directive_count; pass++) {
544
545 state = NULL;
546
547 if (pass != 0) {
548 /* insert level separator from the previous pass */
549 if (room) {
550 *xf++ = XFRM_SEP;
551 room--;
552 }
553 want++;
554 }
555
556 /* special pass for undefined */
557 if (pass == table->info->directive_count) {
558 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
559 } else {
560 direc = table->info->directive[pass];
561 }
562
563 t = src;
564
565 if (direc & DIRECTIVE_BACKWARD) {
566 wchar_t *bp, *fp, c;
567 if (tr)
568 free(tr);
569 if ((tr = wcsdup(t)) == NULL) {
570 errno = ENOMEM;
571 goto fail;
572 }
573 bp = tr;
574 fp = tr + wcslen(tr) - 1;
575 while (bp < fp) {
576 c = *bp;
577 *bp++ = *fp;
578 *fp-- = c;
579 }
580 t = (const wchar_t *)tr;
581 }
582
583 if (direc & DIRECTIVE_POSITION) {
584 while (*t || state) {
585
586 _collate_lookup(table, t, &len, &pri, pass, &state);
587 t += len;
588 if (pri <= 0) {
589 if (pri < 0) {
590 errno = EINVAL;
591 goto fail;
592 }
593 pri = COLLATE_MAX_PRIORITY;
594 }
595
596 b = xfrm(table, buf, pri, pass);
597 want += b;
598 if (room) {
599 while (b) {
600 b--;
601 if (room) {
602 *xf++ = buf[b];
603 room--;
604 }
605 }
606 }
607 need = want;
608 }
609 } else {
610 while (*t || state) {
611 _collate_lookup(table, t, &len, &pri, pass, &state);
612 t += len;
613 if (pri <= 0) {
614 if (pri < 0) {
615 errno = EINVAL;
616 goto fail;
617 }
618 continue;
619 }
620
621 b = xfrm(table, buf, pri, pass);
622 want += b;
623 if (room) {
624
625 while (b) {
626 b--;
627 if (room) {
628 *xf++ = buf[b];
629 room--;
630 }
631 }
632 }
633 need = want;
634 }
635 }
636 }
637 if (tr)
638 free(tr);
639 return (need);
640
641fail:
642 if (tr)
643 free(tr);
644 return ((size_t)(-1));
984263bc 645}
903faafa
JM
646
647/*
648 * __collate_equiv_value returns the primary collation value for the given
649 * collating symbol specified by str and len. Zero or negative is returned
650 * if the collating symbol was not found. This function is used by bracket
651 * code in the TRE regex library.
652 */
653int
654__collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
655{
656 int32_t e;
657
658 if (len < 1 || len >= COLLATE_STR_LEN)
659 return (-1);
660
661 FIX_LOCALE(locale);
662 struct xlocale_collate *table =
663 (struct xlocale_collate*)locale->components[XLC_COLLATE];
664
665 if (table->__collate_load_error)
666 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
667
668 if (len == 1) {
669 e = -1;
670 if (*str <= UCHAR_MAX)
671 e = table->char_pri_table[*str].pri[0];
672 else if (table->info->large_count > 0) {
673 collate_large_t *match_large;
674 match_large = largesearch(table, *str);
675 if (match_large)
676 e = match_large->pri.pri[0];
677 }
678 if (e == 0)
679 return (1);
680 return (e > 0 ? e : 0);
681 }
682 if (table->info->chain_count > 0) {
683 wchar_t name[COLLATE_STR_LEN];
684 collate_chain_t *match_chain;
685 int clen;
686
687 wcsncpy (name, str, len);
688 name[len] = 0;
689 match_chain = chainsearch(table, name, &clen);
690 if (match_chain) {
691 e = match_chain->pri[0];
692 if (e == 0)
693 return (1);
694 return (e < 0 ? -e : e);
695 }
696 }
697 return (0);
698}