tic(1): Add missing beforedepend for termsort.c.
[dragonfly.git] / usr.bin / localedef / ctype.c
1 /*
2  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
4  * Copyright 2015 John Marino <draco@marino.st>
5  *
6  * This source code is derived from the illumos localedef command, and
7  * provided under BSD-style license terms by Nexenta Systems, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31
32 /*
33  * LC_CTYPE database generation routines for localedef.
34  */
35
36 #include <sys/tree.h>
37
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <stddef.h>
41 #include <string.h>
42 #include <sys/types.h>
43 #include <wchar.h>
44 #include <ctype.h>
45 #include <wctype.h>
46 #include <unistd.h>
47 #include "localedef.h"
48 #include "parser.h"
49 #include "runefile.h"
50
51
/*
 * Bootstrapping shim: hosts whose <ctype.h> predates 1 Sep 2015 do not
 * provide _CTYPE_N, so supply the value here when it is missing.
 */
#ifndef _CTYPE_N
#define _CTYPE_N        0x00400000L
#endif

/* Short local names for the standard character class bits. */
#define _ISUPPER        _CTYPE_U
#define _ISLOWER        _CTYPE_L
#define _ISALPHA        _CTYPE_A
#define _ISDIGIT        _CTYPE_D
#define _ISXDIGIT       _CTYPE_X
#define _ISSPACE        _CTYPE_S
#define _ISBLANK        _CTYPE_B
#define _ISPUNCT        _CTYPE_P
#define _ISGRAPH        _CTYPE_G
#define _ISPRINT        _CTYPE_R
#define _ISCNTRL        _CTYPE_C

/*
 * Extension class bits, applied by add_ctype_impl() for the phonogram (_E1),
 * ideogram (_E2), english (_E3), number (_E4) and special (_E5) keywords.
 * _E3 is 0, so the english class contributes no bit of its own.
 */
#define _E1             _CTYPE_Q
#define _E2             _CTYPE_I
#define _E3             0
#define _E4             _CTYPE_N
#define _E5             _CTYPE_T
73
74 static wchar_t          last_ctype;
75 static int ctype_compare(const void *n1, const void *n2);
76
77 typedef struct ctype_node {
78         wchar_t wc;
79         int32_t ctype;
80         int32_t toupper;
81         int32_t tolower;
82         RB_ENTRY(ctype_node) entry;
83 } ctype_node_t;
84
85 static RB_HEAD(ctypes, ctype_node) ctypes;
86 RB_PROTOTYPE_STATIC(ctypes, ctype_node, entry, ctype_compare);
87 RB_GENERATE(ctypes, ctype_node, entry, ctype_compare);
88
89 static int
90 ctype_compare(const void *n1, const void *n2)
91 {
92         const ctype_node_t *c1 = n1;
93         const ctype_node_t *c2 = n2;
94
95         return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
96 }
97
98 void
99 init_ctype(void)
100 {
101         RB_INIT(&ctypes);
102 }
103
104
105 static void
106 add_ctype_impl(ctype_node_t *ctn)
107 {
108         switch (last_kw) {
109         case T_ISUPPER:
110                 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
111                 break;
112         case T_ISLOWER:
113                 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
114                 break;
115         case T_ISALPHA:
116                 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
117                 break;
118         case T_ISDIGIT:
119                 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
120                 break;
121         case T_ISSPACE:
122                 ctn->ctype |= _ISSPACE;
123                 break;
124         case T_ISCNTRL:
125                 ctn->ctype |= _ISCNTRL;
126                 break;
127         case T_ISGRAPH:
128                 ctn->ctype |= (_ISGRAPH | _ISPRINT);
129                 break;
130         case T_ISPRINT:
131                 ctn->ctype |= _ISPRINT;
132                 break;
133         case T_ISPUNCT:
134                 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
135                 break;
136         case T_ISXDIGIT:
137                 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
138                 break;
139         case T_ISBLANK:
140                 ctn->ctype |= (_ISBLANK | _ISSPACE);
141                 break;
142         case T_ISPHONOGRAM:
143                 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
144                 break;
145         case T_ISIDEOGRAM:
146                 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
147                 break;
148         case T_ISENGLISH:
149                 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
150                 break;
151         case T_ISNUMBER:
152                 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
153                 break;
154         case T_ISSPECIAL:
155                 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
156                 break;
157         case T_ISALNUM:
158                 /*
159                  * We can't do anything with this.  The character
160                  * should already be specified as a digit or alpha.
161                  */
162                 break;
163         default:
164                 errf("not a valid character class");
165         }
166 }
167
168 static ctype_node_t *
169 get_ctype(wchar_t wc)
170 {
171         ctype_node_t    srch;
172         ctype_node_t    *ctn;
173
174         srch.wc = wc;
175         if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
176                 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
177                         errf("out of memory");
178                         return (NULL);
179                 }
180                 ctn->wc = wc;
181
182                 RB_INSERT(ctypes, &ctypes, ctn);
183         }
184         return (ctn);
185 }
186
187 void
188 add_ctype(int val)
189 {
190         ctype_node_t    *ctn;
191
192         if ((ctn = get_ctype(val)) == NULL) {
193                 INTERR;
194                 return;
195         }
196         add_ctype_impl(ctn);
197         last_ctype = ctn->wc;
198 }
199
200 void
201 add_ctype_range(wchar_t end)
202 {
203         ctype_node_t    *ctn;
204         wchar_t         cur;
205
206         if (end < last_ctype) {
207                 errf("malformed character range (%u ... %u))",
208                     last_ctype, end);
209                 return;
210         }
211         for (cur = last_ctype + 1; cur <= end; cur++) {
212                 if ((ctn = get_ctype(cur)) == NULL) {
213                         INTERR;
214                         return;
215                 }
216                 add_ctype_impl(ctn);
217         }
218         last_ctype = end;
219
220 }
221
222 /*
223  * A word about widths: if the width mask is specified, then libc
224  * unconditionally honors it.  Otherwise, it assumes printable
225  * characters have width 1, and non-printable characters have width
226  * -1 (except for NULL which is special with with 0).  Hence, we have
227  * no need to inject defaults here -- the "default" unset value of 0
228  * indicates that libc should use its own logic in wcwidth as described.
229  */
230 void
231 add_width(int wc, int width)
232 {
233         ctype_node_t    *ctn;
234
235         if ((ctn = get_ctype(wc)) == NULL) {
236                 INTERR;
237                 return;
238         }
239         ctn->ctype &= ~(_CTYPE_SWM);
240         switch (width) {
241         case 0:
242                 ctn->ctype |= _CTYPE_SW0;
243                 break;
244         case 1:
245                 ctn->ctype |= _CTYPE_SW1;
246                 break;
247         case 2:
248                 ctn->ctype |= _CTYPE_SW2;
249                 break;
250         case 3:
251                 ctn->ctype |= _CTYPE_SW3;
252                 break;
253         }
254 }
255
/*
 * Record the same display width for every character from start through
 * end, inclusive.  Nothing happens when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
        int     wc = start;

        while (wc <= end) {
                add_width(wc, width);
                wc++;
        }
}
263
264 void
265 add_caseconv(int val, int wc)
266 {
267         ctype_node_t    *ctn;
268
269         ctn = get_ctype(val);
270         if (ctn == NULL) {
271                 INTERR;
272                 return;
273         }
274
275         switch (last_kw) {
276         case T_TOUPPER:
277                 ctn->toupper = wc;
278                 break;
279         case T_TOLOWER:
280                 ctn->tolower = wc;
281                 break;
282         default:
283                 INTERR;
284                 break;
285         }
286 }
287
288 void
289 dump_ctype(void)
290 {
291         FILE            *f;
292         _FileRuneLocale rl;
293         ctype_node_t    *ctn, *last_ct, *last_lo, *last_up;
294         _FileRuneEntry  *ct = NULL;
295         _FileRuneEntry  *lo = NULL;
296         _FileRuneEntry  *up = NULL;
297         wchar_t         wc;
298
299         (void) memset(&rl, 0, sizeof (rl));
300         last_ct = NULL;
301         last_lo = NULL;
302         last_up = NULL;
303
304         if ((f = open_category()) == NULL)
305                 return;
306
307         (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
308         (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
309
310         /*
311          * Initialize the identity map.
312          */
313         for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
314                 rl.maplower[wc] = wc;
315                 rl.mapupper[wc] = wc;
316         }
317
318         RB_FOREACH(ctn, ctypes, &ctypes) {
319                 int conflict = 0;
320
321                 wc = ctn->wc;
322
323                 /*
324                  * POSIX requires certain portable characters have
325                  * certain types.  Add them if they are missing.
326                  */
327                 if ((wc >= 1) && (wc <= 127)) {
328                         if ((wc >= 'A') && (wc <= 'Z'))
329                                 ctn->ctype |= _ISUPPER;
330                         if ((wc >= 'a') && (wc <= 'z'))
331                                 ctn->ctype |= _ISLOWER;
332                         if ((wc >= '0') && (wc <= '9'))
333                                 ctn->ctype |= _ISDIGIT;
334                         if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
335                                 ctn->ctype |= _ISSPACE;
336                         if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
337                                 ctn->ctype |= _ISXDIGIT;
338                         if (strchr(" \t", (char)wc))
339                                 ctn->ctype |= _ISBLANK;
340                         if (wc == ' ')
341                                 ctn->ctype |= _ISPRINT;
342
343                         /*
344                          * Technically these settings are only
345                          * required for the C locale.  However, it
346                          * turns out that because of the historical
347                          * version of isprint(), we need them for all
348                          * locales as well.  Note that these are not
349                          * necessarily valid punctation characters in
350                          * the current language, but ispunct() needs
351                          * to return TRUE for them.
352                          */
353                         if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
354                             (char)wc))
355                                 ctn->ctype |= _ISPUNCT;
356                 }
357
358                 /*
359                  * POSIX also requires that certain types imply
360                  * others.  Add any inferred types here.
361                  */
362                 if (ctn->ctype & (_ISUPPER |_ISLOWER))
363                         ctn->ctype |= _ISALPHA;
364                 if (ctn->ctype & _ISDIGIT)
365                         ctn->ctype |= _ISXDIGIT;
366                 if (ctn->ctype & _ISBLANK)
367                         ctn->ctype |= _ISSPACE;
368                 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
369                         ctn->ctype |= _ISGRAPH;
370                 if (ctn->ctype & _ISGRAPH)
371                         ctn->ctype |= _ISPRINT;
372
373                 /*
374                  * Finally, POSIX requires that certain combinations
375                  * are invalid.  We don't flag this as a fatal error,
376                  * but we will warn about.
377                  */
378                 if ((ctn->ctype & _ISALPHA) &&
379                     (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
380                         conflict++;
381                 if ((ctn->ctype & _ISPUNCT) &
382                     (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
383                         conflict++;
384                 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
385                         conflict++;
386                 if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
387                         conflict++;
388                 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
389                         conflict++;
390
391                 if (conflict) {
392                         warn("conflicting classes for character 0x%x (%x)",
393                             wc, ctn->ctype);
394                 }
395                 /*
396                  * Handle the lower 256 characters using the simple
397                  * optimization.  Note that if we have not defined the
398                  * upper/lower case, then we identity map it.
399                  */
400                 if ((unsigned)wc < _CACHED_RUNES) {
401                         rl.runetype[wc] = ctn->ctype;
402                         if (ctn->tolower)
403                                 rl.maplower[wc] = ctn->tolower;
404                         if (ctn->toupper)
405                                 rl.mapupper[wc] = ctn->toupper;
406                         continue;
407                 }
408
409                 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
410                     (last_ct->wc + 1 == wc)) {
411                         ct[rl.runetype_ext_nranges-1].max = wc;
412                 } else {
413                         rl.runetype_ext_nranges++;
414                         ct = realloc(ct,
415                             sizeof (*ct) * rl.runetype_ext_nranges);
416                         ct[rl.runetype_ext_nranges - 1].min = wc;
417                         ct[rl.runetype_ext_nranges - 1].max = wc;
418                         ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
419                 }
420                 last_ct = ctn;
421                 if (ctn->tolower == 0) {
422                         last_lo = NULL;
423                 } else if ((last_lo != NULL) &&
424                     (last_lo->tolower + 1 == ctn->tolower)) {
425                         lo[rl.maplower_ext_nranges-1].max = wc;
426                         last_lo = ctn;
427                 } else {
428                         rl.maplower_ext_nranges++;
429                         lo = realloc(lo,
430                             sizeof (*lo) * rl.maplower_ext_nranges);
431                         lo[rl.maplower_ext_nranges - 1].min = wc;
432                         lo[rl.maplower_ext_nranges - 1].max = wc;
433                         lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
434                         last_lo = ctn;
435                 }
436
437                 if (ctn->toupper == 0) {
438                         last_up = NULL;
439                 } else if ((last_up != NULL) &&
440                     (last_up->toupper + 1 == ctn->toupper)) {
441                         up[rl.mapupper_ext_nranges-1].max = wc;
442                         last_up = ctn;
443                 } else {
444                         rl.mapupper_ext_nranges++;
445                         up = realloc(up,
446                             sizeof (*up) * rl.mapupper_ext_nranges);
447                         up[rl.mapupper_ext_nranges - 1].min = wc;
448                         up[rl.mapupper_ext_nranges - 1].max = wc;
449                         up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
450                         last_up = ctn;
451                 }
452         }
453
454         if ((wr_category(&rl, sizeof (rl), f) < 0) ||
455             (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
456             (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
457             (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
458                 return;
459         }
460
461         close_category(f);
462 }