Add localedef(1), a locale definition generator tool
[dragonfly.git] / usr.bin / localedef / scanner.c
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11
12 /*
13  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
14  * Copyright 2013 DEY Storage Systems, Inc.
15  * Copyright 2015 John Marino <draco@marino.st>
16  */
17
18 /*
19  * This file contains the "scanner", which tokenizes the input files
20  * for localedef for processing by the higher level grammar processor.
21  */
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <ctype.h>
26 #include <limits.h>
27 #include <string.h>
28 #include <wchar.h>
29 #include <sys/types.h>
30 #include <assert.h>
31 #include "localedef.h"
32 #include "parser.h"
33
/*
 * Scanner state that is visible to the rest of the program.
 */
int                     com_char = '#';         /* comment introducer */
int                     esc_char = '\\';        /* escape character */
int                     mb_cur_min = 1;
int                     mb_cur_max = 1;         /* byte limit in get_wide() */
int                     lineno = 1;             /* line of last char read */
int                     warnings = 0;           /* bumped by warn() */
int                     is_stdin = 1;           /* nonzero: read from stdin */
FILE                    *input;                 /* set by reset_scanner() */

/*
 * Scanner state that is private to this file.
 */
static int              nextline;               /* line of next char read */
static const char       *filename = "<stdin>";  /* name used in messages */
static int              instring = 0;           /* inside a quoted string */
static int              escaped = 0;            /* last char was esc_char */

/*
 * Token space ... grows on demand.
 */
static char *token = NULL;      /* NUL terminated token text */
static int tokidx = 0;          /* number of characters stored in token */
static int toksz = 0;           /* allocated size of token, in bytes */
static int hadtok = 0;          /* a token was seen on the current line */

/*
 * Wide string space ... grows on demand.
 */
static wchar_t *widestr = NULL; /* NUL terminated wide string */
static int wideidx = 0;         /* number of wide chars stored */
static int widesz = 0;          /* allocated size, in wide chars */
62
63 /*
64  * The last keyword seen.  This is useful to trigger the special lexer rules
65  * for "copy" and also collating symbols and elements.
66  */
67 int     last_kw = 0;
68 static int      category = T_END;
69
70 static struct token {
71         int id;
72         const char *name;
73 } keywords[] = {
74         { T_COM_CHAR,           "comment_char" },
75         { T_ESC_CHAR,           "escape_char" },
76         { T_END,                "END" },
77         { T_COPY,               "copy" },
78         { T_MESSAGES,           "LC_MESSAGES" },
79         { T_YESSTR,             "yesstr" },
80         { T_YESEXPR,            "yesexpr" },
81         { T_NOSTR,              "nostr" },
82         { T_NOEXPR,             "noexpr" },
83         { T_MONETARY,           "LC_MONETARY" },
84         { T_INT_CURR_SYMBOL,    "int_curr_symbol" },
85         { T_CURRENCY_SYMBOL,    "currency_symbol" },
86         { T_MON_DECIMAL_POINT,  "mon_decimal_point" },
87         { T_MON_THOUSANDS_SEP,  "mon_thousands_sep" },
88         { T_POSITIVE_SIGN,      "positive_sign" },
89         { T_NEGATIVE_SIGN,      "negative_sign" },
90         { T_MON_GROUPING,       "mon_grouping" },
91         { T_INT_FRAC_DIGITS,    "int_frac_digits" },
92         { T_FRAC_DIGITS,        "frac_digits" },
93         { T_P_CS_PRECEDES,      "p_cs_precedes" },
94         { T_P_SEP_BY_SPACE,     "p_sep_by_space" },
95         { T_N_CS_PRECEDES,      "n_cs_precedes" },
96         { T_N_SEP_BY_SPACE,     "n_sep_by_space" },
97         { T_P_SIGN_POSN,        "p_sign_posn" },
98         { T_N_SIGN_POSN,        "n_sign_posn" },
99         { T_INT_P_CS_PRECEDES,  "int_p_cs_precedes" },
100         { T_INT_N_CS_PRECEDES,  "int_n_cs_precedes" },
101         { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" },
102         { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" },
103         { T_INT_P_SIGN_POSN,    "int_p_sign_posn" },
104         { T_INT_N_SIGN_POSN,    "int_n_sign_posn" },
105         { T_COLLATE,            "LC_COLLATE" },
106         { T_COLLATING_SYMBOL,   "collating-symbol" },
107         { T_COLLATING_ELEMENT,  "collating-element" },
108         { T_FROM,               "from" },
109         { T_ORDER_START,        "order_start" },
110         { T_ORDER_END,          "order_end" },
111         { T_FORWARD,            "forward" },
112         { T_BACKWARD,           "backward" },
113         { T_POSITION,           "position" },
114         { T_IGNORE,             "IGNORE" },
115         { T_UNDEFINED,          "UNDEFINED" },
116         { T_NUMERIC,            "LC_NUMERIC" },
117         { T_DECIMAL_POINT,      "decimal_point" },
118         { T_THOUSANDS_SEP,      "thousands_sep" },
119         { T_GROUPING,           "grouping" },
120         { T_TIME,               "LC_TIME" },
121         { T_ABDAY,              "abday" },
122         { T_DAY,                "day" },
123         { T_ABMON,              "abmon" },
124         { T_MON,                "mon" },
125         { T_D_T_FMT,            "d_t_fmt" },
126         { T_D_FMT,              "d_fmt" },
127         { T_T_FMT,              "t_fmt" },
128         { T_AM_PM,              "am_pm" },
129         { T_T_FMT_AMPM,         "t_fmt_ampm" },
130         { T_ERA,                "era" },
131         { T_ERA_D_FMT,          "era_d_fmt" },
132         { T_ERA_T_FMT,          "era_t_fmt" },
133         { T_ERA_D_T_FMT,        "era_d_t_fmt" },
134         { T_ALT_DIGITS,         "alt_digits" },
135         { T_CTYPE,              "LC_CTYPE" },
136         { T_ISUPPER,            "upper" },
137         { T_ISLOWER,            "lower" },
138         { T_ISALPHA,            "alpha" },
139         { T_ISDIGIT,            "digit" },
140         { T_ISPUNCT,            "punct" },
141         { T_ISXDIGIT,           "xdigit" },
142         { T_ISSPACE,            "space" },
143         { T_ISPRINT,            "print" },
144         { T_ISGRAPH,            "graph" },
145         { T_ISBLANK,            "blank" },
146         { T_ISCNTRL,            "cntrl" },
147         /*
148          * These entries are local additions, and not specified by
149          * TOG.  Note that they are not guaranteed to be accurate for
150          * all locales, and so applications should not depend on them.
151          */
152         { T_ISSPECIAL,          "special" },
153         { T_ISENGLISH,          "english" },
154         { T_ISPHONOGRAM,        "phonogram" },
155         { T_ISIDEOGRAM,         "ideogram" },
156         { T_ISNUMBER,           "number" },
157         /*
158          * We have to support this in the grammar, but it would be a
159          * syntax error to define a character as one of these without
160          * also defining it as an alpha or digit.  We ignore it in our
161          * parsing.
162          */
163         { T_ISALNUM,            "alnum" },
164         { T_TOUPPER,            "toupper" },
165         { T_TOLOWER,            "tolower" },
166
167         /*
168          * These are keywords used in the charmap file.  Note that
169          * Solaris orginally used angle brackets to wrap some of them,
170          * but we removed that to simplify our parser.  The first of these
171          * items are "global items."
172          */
173         { T_CHARMAP,            "CHARMAP" },
174         { T_WIDTH,              "WIDTH" },
175
176         { -1, NULL },
177 };
178
179 /*
180  * These special words are only used in a charmap file, enclosed in <>.
181  */
182 static struct token symwords[] = {
183         { T_COM_CHAR,           "comment_char" },
184         { T_ESC_CHAR,           "escape_char" },
185         { T_CODE_SET,           "code_set_name" },
186         { T_MB_CUR_MAX,         "mb_cur_max" },
187         { T_MB_CUR_MIN,         "mb_cur_min" },
188         { -1, NULL },
189 };
190
191 static int categories[] = {
192         T_CHARMAP,
193         T_CTYPE,
194         T_COLLATE,
195         T_MESSAGES,
196         T_MONETARY,
197         T_NUMERIC,
198         T_TIME,
199         T_WIDTH,
200         0
201 };
202
203 void
204 reset_scanner(const char *fname)
205 {
206         if (fname == NULL) {
207                 filename = "<stdin>";
208                 is_stdin = 1;
209         } else {
210                 if (!is_stdin)
211                         (void) fclose(input);
212                 if ((input = fopen(fname, "r")) == NULL) {
213                         perror("fopen");
214                         exit(4);
215                 } else {
216                         is_stdin = 0;
217                 }
218                 filename = fname;
219         }
220         com_char = '#';
221         esc_char = '\\';
222         instring = 0;
223         escaped = 0;
224         lineno = 1;
225         nextline = 1;
226         tokidx = 0;
227         wideidx = 0;
228 }
229
/*
 * hex(x) yields the value (0 to 15) of the hexadecimal digit character x;
 * the caller must already know that x is a hexadecimal digit.
 * isodigit(x) is true when x is an octal digit character.
 *
 * Every use of the argument is parenthesized so that an expression may be
 * passed safely.  Both macros still evaluate x more than once, so the
 * argument must not have side effects.
 */
#define hex(x)  \
        (isdigit(x) ? ((x) - '0') : \
            ((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define isodigit(x)     (((x) >= '0') && ((x) <= '7'))
233
234 static int
235 scanc(void)
236 {
237         int     c;
238
239         if (is_stdin)
240                 c = getc(stdin);
241         else
242                 c = getc(input);
243         lineno = nextline;
244         if (c == '\n') {
245                 nextline++;
246         }
247         return (c);
248 }
249
250 static void
251 unscanc(int c)
252 {
253         if (c == '\n') {
254                 nextline--;
255         }
256         if (ungetc(c, is_stdin ? stdin : input) < 0) {
257                 yyerror("ungetc failed");
258         }
259 }
260
/*
 * Read exactly two hexadecimal digits and return the value they encode.
 * Anything else is reported through yyerror(); 0 is returned in that case.
 */
static int
scan_hex_byte(void)
{
        int     hi, lo;

        hi = scanc();
        if (isxdigit(hi)) {
                lo = scanc();
                if (isxdigit(lo))
                        return ((hex(hi) << 4) | hex(lo));
        }

        yyerror("malformed hex digit");
        return (0);
}
280
/*
 * Read a decimal value made of two or three digits and return it.  A
 * missing first or second digit is reported through yyerror() and yields
 * 0.  A character that is not a third digit is pushed back.  The result
 * is not range checked here.
 */
static int
scan_dec_byte(void)
{
        int     value = 0;
        int     digit;
        int     i;

        /* the first two digits are mandatory */
        for (i = 0; i < 2; i++) {
                digit = scanc();
                if (!isdigit(digit)) {
                        yyerror("malformed decimal digit");
                        return (0);
                }
                value = (value * 10) + (digit - '0');
        }

        /* the third one is optional */
        digit = scanc();
        if (isdigit(digit))
                value = (value * 10) + (digit - '0');
        else
                unscanc(digit);

        return (value);
}
309
/*
 * Read an octal value made of two or three digits and return it.  A
 * missing first or second digit is reported through yyerror() and yields
 * 0.  A character that is not a third digit is pushed back.  The result
 * is not range checked here.
 */
static int
scan_oct_byte(void)
{
        int     value = 0;
        int     digit;
        int     i;

        /* the first two digits are mandatory */
        for (i = 0; i < 2; i++) {
                digit = scanc();
                if (!isodigit(digit)) {
                        yyerror("malformed octal digit");
                        return (0);
                }
                value = (value * 8) + (digit - '0');
        }

        /* the third one is optional */
        digit = scanc();
        if (isodigit(digit))
                value = (value * 8) + (digit - '0');
        else
                unscanc(digit);

        return (value);
}
340
341 void
342 add_tok(int c)
343 {
344         if ((tokidx + 1) >= toksz) {
345                 toksz += 64;
346                 if ((token = realloc(token, toksz)) == NULL) {
347                         yyerror("out of memory");
348                         tokidx = 0;
349                         toksz = 0;
350                         return;
351                 }
352         }
353
354         token[tokidx++] = (char)c;
355         token[tokidx] = 0;
356 }
357 void
358 add_wcs(wchar_t c)
359 {
360         if ((wideidx + 1) >= widesz) {
361                 widesz += 64;
362                 widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
363                 if (widestr == NULL) {
364                         yyerror("out of memory");
365                         wideidx = 0;
366                         widesz = 0;
367                         return;
368                 }
369         }
370
371         widestr[wideidx++] = c;
372         widestr[wideidx] = 0;
373 }
374
375 wchar_t *
376 get_wcs(void)
377 {
378         wchar_t *ws = widestr;
379         wideidx = 0;
380         widestr = NULL;
381         widesz = 0;
382         if (ws == NULL) {
383                 if ((ws = wcsdup(L"")) == NULL) {
384                         yyerror("out of memory");
385                 }
386         }
387         return (ws);
388 }
389
390 static int
391 get_byte(void)
392 {
393         int     c;
394
395         if ((c = scanc()) != esc_char) {
396                 unscanc(c);
397                 return (EOF);
398         }
399         c = scanc();
400
401         switch (c) {
402         case 'd':
403         case 'D':
404                 return (scan_dec_byte());
405         case 'x':
406         case 'X':
407                 return (scan_hex_byte());
408         case '0':
409         case '1':
410         case '2':
411         case '3':
412         case '4':
413         case '5':
414         case '6':
415         case '7':
416                 /* put the character back so we can get it */
417                 unscanc(c);
418                 return (scan_oct_byte());
419         default:
420                 unscanc(c);
421                 unscanc(esc_char);
422                 return (EOF);
423         }
424 }
425
/*
 * Translate the character following an escape into the character it
 * stands for.  The letters n, r, t, f, v, b and a map to the matching
 * control characters; every other value is returned unchanged.
 */
int
get_escaped(int c)
{
        static const struct {
                int     letter;
                int     value;
        } escapes[] = {
                { 'n', '\n' },
                { 'r', '\r' },
                { 't', '\t' },
                { 'f', '\f' },
                { 'v', '\v' },
                { 'b', '\b' },
                { 'a', '\a' },
        };
        unsigned int    i;

        for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++) {
                if (escapes[i].letter == c)
                        return (escapes[i].value);
        }
        return (c);
}
448
449 int
450 get_wide(void)
451 {
452         static char mbs[MB_LEN_MAX + 1] = "";
453         static int mbi = 0;
454         int c;
455         wchar_t wc;
456
457         if (mb_cur_max >= (int)sizeof (mbs)) {
458                 yyerror("max multibyte character size too big");
459                 mbi = 0;
460                 return (T_NULL);
461         }
462         for (;;) {
463                 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
464                         /*
465                          * end of the byte sequence reached, but no
466                          * valid wide decoding.  fatal error.
467                          */
468                         mbi = 0;
469                         yyerror("not a valid character encoding");
470                         return (T_NULL);
471                 }
472                 mbs[mbi++] = c;
473                 mbs[mbi] = 0;
474
475                 /* does it decode? */
476                 if (to_wide(&wc, mbs) >= 0) {
477                         break;
478                 }
479         }
480
481         mbi = 0;
482         if ((category != T_CHARMAP) && (category != T_WIDTH)) {
483                 if (check_charmap(wc) < 0) {
484                         yyerror("no symbolic name for character");
485                         return (T_NULL);
486                 }
487         }
488
489         yylval.wc = wc;
490         return (T_CHAR);
491 }
492
493 int
494 get_symbol(void)
495 {
496         int     c;
497
498         while ((c = scanc()) != EOF) {
499                 if (escaped) {
500                         escaped = 0;
501                         if (c == '\n')
502                                 continue;
503                         add_tok(get_escaped(c));
504                         continue;
505                 }
506                 if (c == esc_char) {
507                         escaped = 1;
508                         continue;
509                 }
510                 if (c == '\n') {        /* well that's strange! */
511                         yyerror("unterminated symbolic name");
512                         continue;
513                 }
514                 if (c == '>') {         /* end of symbol */
515
516                         /*
517                          * This restarts the token from the beginning
518                          * the next time we scan a character.  (This
519                          * token is complete.)
520                          */
521
522                         if (token == NULL) {
523                                 yyerror("missing symbolic name");
524                                 return (T_NULL);
525                         }
526                         tokidx = 0;
527
528                         /*
529                          * A few symbols are handled as keywords outside
530                          * of the normal categories.
531                          */
532                         if (category == T_END) {
533                                 int i;
534                                 for (i = 0; symwords[i].name != 0; i++) {
535                                         if (strcmp(token, symwords[i].name) ==
536                                             0) {
537                                                 last_kw = symwords[i].id;
538                                                 return (last_kw);
539                                         }
540                                 }
541                         }
542                         /*
543                          * Contextual rule: Only literal characters are
544                          * permitted in CHARMAP.  Anywhere else the symbolic
545                          * forms are fine.
546                          */
547                         if ((category != T_CHARMAP) &&
548                             (lookup_charmap(token, &yylval.wc)) != -1) {
549                                 return (T_CHAR);
550                         }
551                         if ((yylval.collsym = lookup_collsym(token)) != NULL) {
552                                 return (T_COLLSYM);
553                         }
554                         if ((yylval.collelem = lookup_collelem(token)) !=
555                             NULL) {
556                                 return (T_COLLELEM);
557                         }
558                         /* its an undefined symbol */
559                         yylval.token = strdup(token);
560                         token = NULL;
561                         toksz = 0;
562                         tokidx = 0;
563                         return (T_SYMBOL);
564                 }
565                 add_tok(c);
566         }
567
568         yyerror("unterminated symbolic name");
569         return (EOF);
570 }
571
572 int
573 get_category(void)
574 {
575         return (category);
576 }
577
578 static int
579 consume_token(void)
580 {
581         int     len = tokidx;
582         int     i;
583
584         tokidx = 0;
585         if (token == NULL)
586                 return (T_NULL);
587
588         /*
589          * this one is special, because we don't want it to alter the
590          * last_kw field.
591          */
592         if (strcmp(token, "...") == 0) {
593                 return (T_ELLIPSIS);
594         }
595
596         /* search for reserved words first */
597         for (i = 0; keywords[i].name; i++) {
598                 int j;
599                 if (strcmp(keywords[i].name, token) != 0) {
600                         continue;
601                 }
602
603                 last_kw = keywords[i].id;
604
605                 /* clear the top level category if we're done with it */
606                 if (last_kw == T_END) {
607                         category = T_END;
608                 }
609
610                 /* set the top level category if we're changing */
611                 for (j = 0; categories[j]; j++) {
612                         if (categories[j] != last_kw)
613                                 continue;
614                         category = last_kw;
615                 }
616
617                 return (keywords[i].id);
618         }
619
620         /* maybe its a numeric constant? */
621         if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
622                 char *eptr;
623                 yylval.num = strtol(token, &eptr, 10);
624                 if (*eptr != 0)
625                         yyerror("malformed number");
626                 return (T_NUMBER);
627         }
628
629         /*
630          * A single lone character is treated as a character literal.
631          * To avoid duplication of effort, we stick in the charmap.
632          */
633         if (len == 1) {
634                 yylval.wc = token[0];
635                 return (T_CHAR);
636         }
637
638         /* anything else is treated as a symbolic name */
639         yylval.token = strdup(token);
640         token = NULL;
641         toksz = 0;
642         tokidx = 0;
643         return (T_NAME);
644 }
645
/*
 * Discard input up to and including the next newline.  Reaching the end
 * of the input first is reported through errf().
 */
void
scan_to_eol(void)
{
        int     ch;

        for (;;) {
                ch = scanc();
                if (ch == '\n')
                        break;
                if (ch == EOF) {
                        /* end of file without newline! */
                        errf("missing newline");
                        return;
                }
        }
        assert(ch == '\n');
}
659
660 int
661 yylex(void)
662 {
663         int             c;
664
665         while ((c = scanc()) != EOF) {
666
667                 /* special handling for quoted string */
668                 if (instring) {
669                         if (escaped) {
670                                 escaped = 0;
671
672                                 /* if newline, just eat and forget it */
673                                 if (c == '\n')
674                                         continue;
675
676                                 if (strchr("xXd01234567", c)) {
677                                         unscanc(c);
678                                         unscanc(esc_char);
679                                         return (get_wide());
680                                 }
681                                 yylval.wc = get_escaped(c);
682                                 return (T_CHAR);
683                         }
684                         if (c == esc_char) {
685                                 escaped = 1;
686                                 continue;
687                         }
688                         switch (c) {
689                         case '<':
690                                 return (get_symbol());
691                         case '>':
692                                 /* oops! should generate syntax error  */
693                                 return (T_GT);
694                         case '"':
695                                 instring = 0;
696                                 return (T_QUOTE);
697                         default:
698                                 yylval.wc = c;
699                                 return (T_CHAR);
700                         }
701                 }
702
703                 /* escaped characters first */
704                 if (escaped) {
705                         escaped = 0;
706                         if (c == '\n') {
707                                 /* eat the newline */
708                                 continue;
709                         }
710                         hadtok = 1;
711                         if (tokidx) {
712                                 /* an escape mid-token is nonsense */
713                                 return (T_NULL);
714                         }
715
716                         /* numeric escapes are treated as wide characters */
717                         if (strchr("xXd01234567", c)) {
718                                 unscanc(c);
719                                 unscanc(esc_char);
720                                 return (get_wide());
721                         }
722
723                         add_tok(get_escaped(c));
724                         continue;
725                 }
726
727                 /* if it is the escape charter itself note it */
728                 if (c == esc_char) {
729                         escaped = 1;
730                         continue;
731                 }
732
733                 /* remove from the comment char to end of line */
734                 if (c == com_char) {
735                         while (c != '\n') {
736                                 if ((c = scanc()) == EOF) {
737                                         /* end of file without newline! */
738                                         return (EOF);
739                                 }
740                         }
741                         assert(c == '\n');
742                         if (!hadtok) {
743                                 /*
744                                  * If there were no tokens on this line,
745                                  * then just pretend it didn't exist at all.
746                                  */
747                                 continue;
748                         }
749                         hadtok = 0;
750                         return (T_NL);
751                 }
752
753                 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
754                         /*
755                          * These are all token delimiters.  If there
756                          * is a token already in progress, we need to
757                          * process it.
758                          */
759                         unscanc(c);
760                         return (consume_token());
761                 }
762
763                 switch (c) {
764                 case '\n':
765                         if (!hadtok) {
766                                 /*
767                                  * If the line was completely devoid of tokens,
768                                  * then just ignore it.
769                                  */
770                                 continue;
771                         }
772                         /* we're starting a new line, reset the token state */
773                         hadtok = 0;
774                         return (T_NL);
775                 case ',':
776                         hadtok = 1;
777                         return (T_COMMA);
778                 case ';':
779                         hadtok = 1;
780                         return (T_SEMI);
781                 case '(':
782                         hadtok = 1;
783                         return (T_LPAREN);
784                 case ')':
785                         hadtok = 1;
786                         return (T_RPAREN);
787                 case '>':
788                         hadtok = 1;
789                         return (T_GT);
790                 case '<':
791                         /* symbol start! */
792                         hadtok = 1;
793                         return (get_symbol());
794                 case ' ':
795                 case '\t':
796                         /* whitespace, just ignore it */
797                         continue;
798                 case '"':
799                         hadtok = 1;
800                         instring = 1;
801                         return (T_QUOTE);
802                 default:
803                         hadtok = 1;
804                         add_tok(c);
805                         continue;
806                 }
807         }
808         return (EOF);
809 }
810
811 void
812 yyerror(const char *msg)
813 {
814         (void) fprintf(stderr, "%s: %d: error: %s\n",
815             filename, lineno, msg);
816         exit(4);
817 }
818
819 void
820 errf(const char *fmt, ...)
821 {
822         char    *msg;
823
824         va_list va;
825         va_start(va, fmt);
826         (void) vasprintf(&msg, fmt, va);
827         va_end(va);
828
829         (void) fprintf(stderr, "%s: %d: error: %s\n",
830             filename, lineno, msg);
831         free(msg);
832         exit(4);
833 }
834
835 void
836 warn(const char *fmt, ...)
837 {
838         char    *msg;
839
840         va_list va;
841         va_start(va, fmt);
842         (void) vasprintf(&msg, fmt, va);
843         va_end(va);
844
845         (void) fprintf(stderr, "%s: %d: warning: %s\n",
846             filename, lineno, msg);
847         free(msg);
848         warnings++;
849         if (!warnok)
850                 exit(4);
851 }