collate 1/4: Add support for LC_COLLATE format "DragonFly 4.4" in libc
[dragonfly.git] / lib / libc / locale / collate.c
1 /*
2  * Copright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4  *              at Electronni Visti IA, Kiev, Ukraine.
5  *                      All rights reserved.
6  *
7  * Copyright (c) 2011 The FreeBSD Foundation
8  * All rights reserved.
9  * Portions of this software were developed by David Chisnall
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * Adapted to xlocale by John Marino <draco@marino.st>
34  */
35
36 #include "namespace.h"
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include <errno.h>
42 #include <unistd.h>
43 #include <fcntl.h>
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <sys/mman.h>
47 #include "un-namespace.h"
48
49 #include "collate.h"
50 #include "setlocale.h"
51 #include "ldpart.h"
52
53 struct xlocale_collate __xlocale_global_collate = {
54         {{0}, "C"}, 1, 0, 0, 0
55 };
56
57 struct xlocale_collate __xlocale_C_collate = {
58         {{0}, "C"}, 1, 0, 0, 0
59 };
60
61 #include "libc_private.h"
62
63 int
64 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
65
66 static void
67 destruct_collate(void *t)
68 {
69         struct xlocale_collate *table = t;
70         if (table->map && (table->maplen > 0)) {
71                 (void) munmap(table->map, table->maplen);
72         }
73         free(t);
74 }
75
76 void *
77 __collate_load(const char *encoding, __unused locale_t unused)
78 {
79         if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
80                 return &__xlocale_C_collate;
81         }
82         struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
83         table->header.header.destructor = destruct_collate;
84         // FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
85         // the caching outside of this section
86         if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
87                 xlocale_release(table);
88                 return NULL;
89         }
90         return table;
91 }
92
93 /**
94  * Load the collation tables for the specified encoding into the global table.
95  */
96 int
97 __collate_load_tables(const char *encoding)
98 {
99         int ret = __collate_load_tables_l(encoding, &__xlocale_global_collate);
100         return ret;
101 }
102
103 int
104 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
105 {
106         int i, chains, z;
107         char buf[PATH_MAX];
108         char *TMP;
109         char *map;
110         collate_info_t *info;
111         struct stat sbuf;
112         int fd;
113
114         /* 'encoding' must be already checked. */
115         if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
116                 table->__collate_load_error = 1;
117                 return (_LDP_CACHE);
118         }
119
120         (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE",
121             _PathLocale, encoding);
122
123         if ((fd = _open(buf, O_RDONLY)) < 0)
124                 return (_LDP_ERROR);
125         if (_fstat(fd, &sbuf) < 0) {
126                 (void) _close(fd);
127                 return (_LDP_ERROR);
128         }
129         if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
130                 (void) _close(fd);
131                 errno = EINVAL;
132                 return (_LDP_ERROR);
133         }
134         map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
135         (void) _close(fd);
136         if ((TMP = map) == NULL) {
137                 return (_LDP_ERROR);
138         }
139
140         if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
141                 (void) munmap(map, sbuf.st_size);
142                 errno = EINVAL;
143                 return (_LDP_ERROR);
144         }
145         TMP += COLLATE_STR_LEN;
146
147         info = (void *)TMP;
148         TMP += sizeof (*info);
149
150         if ((info->directive_count < 1) ||
151             (info->directive_count >= COLL_WEIGHTS_MAX) ||
152             ((chains = info->chain_count) < 0)) {
153                 (void) munmap(map, sbuf.st_size);
154                 errno = EINVAL;
155                 return (_LDP_ERROR);
156         }
157
158         i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
159             (sizeof (collate_chain_t) * chains) +
160             (sizeof (collate_large_t) * info->large_count);
161         for (z = 0; z < (info->directive_count); z++) {
162                 i += sizeof (collate_subst_t) * info->subst_count[z];
163         }
164         if (i != (sbuf.st_size - (TMP - map))) {
165                 (void) munmap(map, sbuf.st_size);
166                 errno = EINVAL;
167                 return (_LDP_ERROR);
168         }
169
170         table->char_pri_table = (void *)TMP;
171         TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
172
173         for (z = 0; z < info->directive_count; z++) {
174                 if (info->subst_count[z] > 0) {
175                         table->subst_table[z] = (void *)TMP;
176                         TMP += info->subst_count[z] * sizeof (collate_subst_t);
177                 } else {
178                         table->subst_table[z] = NULL;
179                 }
180         }
181
182         if (chains > 0) {
183                 table->chain_pri_table = (void *)TMP;
184                 TMP += chains * sizeof (collate_chain_t);
185         } else
186                 table->chain_pri_table = NULL;
187         if (info->large_count > 0)
188                 table->large_pri_table = (void *)TMP;
189         else
190                 table->large_pri_table = NULL;
191
192         table->info = info;
193         table->__collate_load_error = 0;
194
195         return (_LDP_LOADED);
196 }
197
198 /*
199  * Note: for performance reasons, we have expanded bsearch here.  This avoids
200  * function call overhead with each comparison.
201  */
202
203 static int32_t *
204 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
205 {
206         collate_subst_t *p;
207         int n = table->info->subst_count[pass];
208
209         if (n == 0)
210                 return (NULL);
211
212         if (pass >= table->info->directive_count)
213                 return (NULL);
214
215         if (!(key & COLLATE_SUBST_PRIORITY))
216                 return (NULL);
217
218         p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
219         return (p->pri);
220 }
221
222 static collate_chain_t *
223 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
224 {
225         int low;
226         int high;
227         int next, compar, l;
228         collate_chain_t *p;
229         collate_chain_t *tab;
230
231         if (table->info->chain_count == 0)
232                 return (NULL);
233
234         low = 0;
235         high = table->info->chain_count - 1;
236         tab = table->chain_pri_table;
237
238         while (low <= high) {
239                 next = (low + high) / 2;
240                 p = tab + next;
241                 compar = *key - *p->str;
242                 if (compar == 0) {
243                         l = wcsnlen(p->str, COLLATE_STR_LEN);
244                         compar = wcsncmp(key, p->str, l);
245                         if (compar == 0) {
246                                 *len = l;
247                                 return (p);
248                         }
249                 }
250                 if (compar > 0)
251                         low = next + 1;
252                 else
253                         high = next - 1;
254         }
255         return (NULL);
256 }
257
258 static collate_large_t *
259 largesearch(struct xlocale_collate *table, const wchar_t key)
260 {
261         int low = 0;
262         int high = table->info->large_count - 1;
263         int next, compar;
264         collate_large_t *p;
265         collate_large_t *tab = table->large_pri_table;
266
267         if (table->info->large_count == 0)
268                 return (NULL);
269
270         while (low <= high) {
271                 next = (low + high) / 2;
272                 p = tab + next;
273                 compar = key - p->val;
274                 if (compar == 0)
275                         return (p);
276                 if (compar > 0)
277                         low = next + 1;
278                 else
279                         high = next - 1;
280         }
281         return (NULL);
282 }
283
284 void
285 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
286     int *pri, int which, const int **state)
287 {
288         collate_chain_t *p2;
289         collate_large_t *match;
290         int p, l;
291         const int *sptr;
292
293         /*
294          * If this is the "last" pass for the UNDEFINED, then
295          * we just return the priority itself.
296          */
297         if (which >= table->info->directive_count) {
298                 *pri = *t;
299                 *len = 1;
300                 *state = NULL;
301                 return;
302         }
303
304         /*
305          * If we have remaining substitution data from a previous
306          * call, consume it first.
307          */
308         if ((sptr = *state) != NULL) {
309                 *pri = *sptr;
310                 sptr++;
311                 *state = *sptr ? sptr : NULL;
312                 *len = 0;
313                 return;
314         }
315
316         /* No active substitutions */
317         *len = 1;
318
319         /*
320          * Check for composites such as dipthongs that collate as a
321          * single element (aka chains or collating-elements).
322          */
323         if (((p2 = chainsearch(table, t, &l)) != NULL) &&
324             ((p = p2->pri[which]) >= 0)) {
325
326                 *len = l;
327                 *pri = p;
328
329         } else if (*t <= UCHAR_MAX) {
330
331                 /*
332                  * Character is a small (8-bit) character.
333                  * We just look these up directly for speed.
334                  */
335                 *pri = table->char_pri_table[*t].pri[which];
336
337         } else if ((table->info->large_count > 0) &&
338             ((match = largesearch(table, *t)) != NULL)) {
339
340                 /*
341                  * Character was found in the extended table.
342                  */
343                 *pri = match->pri.pri[which];
344
345         } else {
346                 /*
347                  * Character lacks a specific definition.
348                  */
349                 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
350                         /* Mask off sign bit to prevent ordering confusion. */
351                         *pri = (*t & COLLATE_MAX_PRIORITY);
352                 } else {
353                         *pri = table->info->undef_pri[which];
354                 }
355                 /* No substitutions for undefined characters! */
356                 return;
357         }
358
359         /*
360          * Try substituting (expanding) the character.  We are
361          * currently doing this *after* the chain compression.  I
362          * think it should not matter, but this way might be slightly
363          * faster.
364          *
365          * We do this after the priority search, as this will help us
366          * to identify a single key value.  In order for this to work,
367          * its important that the priority assigned to a given element
368          * to be substituted be unique for that level.  The localedef
369          * code ensures this for us.
370          */
371         if ((sptr = substsearch(table, *pri, which)) != NULL) {
372                 if ((*pri = *sptr) != 0) {
373                         sptr++;
374                         *state = *sptr ? sptr : NULL;
375                 }
376         }
377
378 }
379
380 /*
381  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
382  * NOT NULL terminate.  That is left to the caller.
383  */
384 size_t
385 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
386     size_t room)
387 {
388         int             pri;
389         int             len;
390         const wchar_t   *t;
391         wchar_t         *tr = NULL;
392         int             direc;
393         int             pass;
394         const int32_t   *state;
395         size_t          want = 0;
396         size_t          need = 0;
397
398         for (pass = 0; pass <= table->info->directive_count; pass++) {
399
400                 state = NULL;
401
402                 if (pass != 0) {
403                         /* insert level separator from the previous pass */
404                         if (room) {
405                                 *xf++ = 1;
406                                 room--;
407                         }
408                         want++;
409                 }
410
411                 /* special pass for undefined */
412                 if (pass == table->info->directive_count) {
413                         direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
414                 } else {
415                         direc = table->info->directive[pass];
416                 }
417
418                 t = src;
419
420                 if (direc & DIRECTIVE_BACKWARD) {
421                         wchar_t *bp, *fp, c;
422                         if (tr)
423                                 free(tr);
424                         if ((tr = wcsdup(t)) == NULL) {
425                                 errno = ENOMEM;
426                                 goto fail;
427                         }
428                         bp = tr;
429                         fp = tr + wcslen(tr) - 1;
430                         while (bp < fp) {
431                                 c = *bp;
432                                 *bp++ = *fp;
433                                 *fp-- = c;
434                         }
435                         t = (const wchar_t *)tr;
436                 }
437
438                 if (direc & DIRECTIVE_POSITION) {
439                         while (*t || state) {
440                                 _collate_lookup(table, t, &len, &pri, pass, &state);
441                                 t += len;
442                                 if (pri <= 0) {
443                                         if (pri < 0) {
444                                                 errno = EINVAL;
445                                                 goto fail;
446                                         }
447                                         pri = COLLATE_MAX_PRIORITY;
448                                 }
449                                 if (room) {
450                                         *xf++ = pri;
451                                         room--;
452                                 }
453                                 want++;
454                                 need = want;
455                         }
456                 } else {
457                         while (*t || state) {
458                                 _collate_lookup(table, t, &len, &pri, pass, &state);
459                                 t += len;
460                                 if (pri <= 0) {
461                                         if (pri < 0) {
462                                                 errno = EINVAL;
463                                                 goto fail;
464                                         }
465                                         continue;
466                                 }
467                                 if (room) {
468                                         *xf++ = pri;
469                                         room--;
470                                 }
471                                 want++;
472                                 need = want;
473                         }
474                 }
475         }
476         if (tr)
477                 free(tr);
478         return (need);
479
480 fail:
481         if (tr)
482                 free(tr);
483         return ((size_t)(-1));
484 }
485
486 /*
487  * In the non-POSIX case, we transform each character into a string of
488  * characters representing the character's priority.  Since char is usually
489  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
490  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
491  * bits per byte.
492  *
493  * It turns out that we sometimes have real priorities that are
494  * 31-bits wide.  (But: be careful using priorities where the high
495  * order bit is set -- i.e. the priority is negative.  The sort order
496  * may be surprising!)
497  *
498  * TODO: This would be a good area to optimize somewhat.  It turns out
499  * that real prioririties *except for the last UNDEFINED pass* are generally
500  * very small.  We need the localedef code to precalculate the max
501  * priority for us, and ideally also give us a mask, and then we could
502  * severely limit what we expand to.
503  */
504 #define XFRM_BYTES      6
505 #define XFRM_OFFSET     ('0')   /* make all printable characters */
506 #define XFRM_SHIFT      6
507 #define XFRM_MASK       ((1 << XFRM_SHIFT) - 1)
508 #define XFRM_SEP        ('.')   /* chosen to be less than XFRM_OFFSET */
509
510 static int
511 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
512 {
513         /* we use unsigned to ensure zero fill on right shift */
514         uint32_t val = (uint32_t)table->info->pri_count[pass];
515         int nc = 0;
516
517         while (val) {
518                 *p = (pri & XFRM_MASK) + XFRM_OFFSET;
519                 pri >>= XFRM_SHIFT;
520                 val >>= XFRM_SHIFT;
521                 p++;
522                 nc++;
523         }
524         return (nc);
525 }
526
527 size_t
528 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
529     size_t room)
530 {
531         int             pri;
532         int             len;
533         const wchar_t   *t;
534         wchar_t         *tr = NULL;
535         int             direc;
536         int             pass;
537         const int32_t   *state;
538         size_t          want = 0;
539         size_t          need = 0;
540         int             b;
541         uint8_t         buf[XFRM_BYTES];
542
543         for (pass = 0; pass <= table->info->directive_count; pass++) {
544
545                 state = NULL;
546
547                 if (pass != 0) {
548                         /* insert level separator from the previous pass */
549                         if (room) {
550                                 *xf++ = XFRM_SEP;
551                                 room--;
552                         }
553                         want++;
554                 }
555
556                 /* special pass for undefined */
557                 if (pass == table->info->directive_count) {
558                         direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
559                 } else {
560                         direc = table->info->directive[pass];
561                 }
562
563                 t = src;
564
565                 if (direc & DIRECTIVE_BACKWARD) {
566                         wchar_t *bp, *fp, c;
567                         if (tr)
568                                 free(tr);
569                         if ((tr = wcsdup(t)) == NULL) {
570                                 errno = ENOMEM;
571                                 goto fail;
572                         }
573                         bp = tr;
574                         fp = tr + wcslen(tr) - 1;
575                         while (bp < fp) {
576                                 c = *bp;
577                                 *bp++ = *fp;
578                                 *fp-- = c;
579                         }
580                         t = (const wchar_t *)tr;
581                 }
582
583                 if (direc & DIRECTIVE_POSITION) {
584                         while (*t || state) {
585
586                                 _collate_lookup(table, t, &len, &pri, pass, &state);
587                                 t += len;
588                                 if (pri <= 0) {
589                                         if (pri < 0) {
590                                                 errno = EINVAL;
591                                                 goto fail;
592                                         }
593                                         pri = COLLATE_MAX_PRIORITY;
594                                 }
595
596                                 b = xfrm(table, buf, pri, pass);
597                                 want += b;
598                                 if (room) {
599                                         while (b) {
600                                                 b--;
601                                                 if (room) {
602                                                         *xf++ = buf[b];
603                                                         room--;
604                                                 }
605                                         }
606                                 }
607                                 need = want;
608                         }
609                 } else {
610                         while (*t || state) {
611                                 _collate_lookup(table, t, &len, &pri, pass, &state);
612                                 t += len;
613                                 if (pri <= 0) {
614                                         if (pri < 0) {
615                                                 errno = EINVAL;
616                                                 goto fail;
617                                         }
618                                         continue;
619                                 }
620
621                                 b = xfrm(table, buf, pri, pass);
622                                 want += b;
623                                 if (room) {
624
625                                         while (b) {
626                                                 b--;
627                                                 if (room) {
628                                                         *xf++ = buf[b];
629                                                         room--;
630                                                 }
631                                         }
632                                 }
633                                 need = want;
634                         }
635                 }
636         }
637         if (tr)
638                 free(tr);
639         return (need);
640
641 fail:
642         if (tr)
643                 free(tr);
644         return ((size_t)(-1));
645 }