2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
14 * Copyright 2013 DEY Storage Systems, Inc.
15 * Copyright 2015 John Marino <draco@marino.st>
19 * This file contains the "scanner", which tokenizes the input files
20 * for localedef for processing by the higher level grammar processor.
29 #include <sys/types.h>
31 #include "localedef.h"
43 //static FILE *input = stdin;
44 static const char *filename = "<stdin>";
45 static int instring = 0;
46 static int escaped = 0;
49 * Token space ... grows on demand.
51 static char *token = NULL;
54 static int hadtok = 0;
57 * Wide string space ... grows on demand.
59 static wchar_t *widestr = NULL;
60 static int wideidx = 0;
61 static int widesz = 0;
64 * The last keyword seen. This is useful to trigger the special lexer rules
65 * for "copy" and also collating symbols and elements.
68 static int category = T_END;
74 { T_COM_CHAR, "comment_char" },
75 { T_ESC_CHAR, "escape_char" },
78 { T_MESSAGES, "LC_MESSAGES" },
79 { T_YESSTR, "yesstr" },
80 { T_YESEXPR, "yesexpr" },
82 { T_NOEXPR, "noexpr" },
83 { T_MONETARY, "LC_MONETARY" },
84 { T_INT_CURR_SYMBOL, "int_curr_symbol" },
85 { T_CURRENCY_SYMBOL, "currency_symbol" },
86 { T_MON_DECIMAL_POINT, "mon_decimal_point" },
87 { T_MON_THOUSANDS_SEP, "mon_thousands_sep" },
88 { T_POSITIVE_SIGN, "positive_sign" },
89 { T_NEGATIVE_SIGN, "negative_sign" },
90 { T_MON_GROUPING, "mon_grouping" },
91 { T_INT_FRAC_DIGITS, "int_frac_digits" },
92 { T_FRAC_DIGITS, "frac_digits" },
93 { T_P_CS_PRECEDES, "p_cs_precedes" },
94 { T_P_SEP_BY_SPACE, "p_sep_by_space" },
95 { T_N_CS_PRECEDES, "n_cs_precedes" },
96 { T_N_SEP_BY_SPACE, "n_sep_by_space" },
97 { T_P_SIGN_POSN, "p_sign_posn" },
98 { T_N_SIGN_POSN, "n_sign_posn" },
99 { T_INT_P_CS_PRECEDES, "int_p_cs_precedes" },
100 { T_INT_N_CS_PRECEDES, "int_n_cs_precedes" },
101 { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" },
102 { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" },
103 { T_INT_P_SIGN_POSN, "int_p_sign_posn" },
104 { T_INT_N_SIGN_POSN, "int_n_sign_posn" },
105 { T_COLLATE, "LC_COLLATE" },
106 { T_COLLATING_SYMBOL, "collating-symbol" },
107 { T_COLLATING_ELEMENT, "collating-element" },
109 { T_ORDER_START, "order_start" },
110 { T_ORDER_END, "order_end" },
111 { T_FORWARD, "forward" },
112 { T_BACKWARD, "backward" },
113 { T_POSITION, "position" },
114 { T_IGNORE, "IGNORE" },
115 { T_UNDEFINED, "UNDEFINED" },
116 { T_NUMERIC, "LC_NUMERIC" },
117 { T_DECIMAL_POINT, "decimal_point" },
118 { T_THOUSANDS_SEP, "thousands_sep" },
119 { T_GROUPING, "grouping" },
120 { T_TIME, "LC_TIME" },
121 { T_ABDAY, "abday" },
123 { T_ABMON, "abmon" },
125 { T_D_T_FMT, "d_t_fmt" },
126 { T_D_FMT, "d_fmt" },
127 { T_T_FMT, "t_fmt" },
128 { T_AM_PM, "am_pm" },
129 { T_T_FMT_AMPM, "t_fmt_ampm" },
131 { T_ERA_D_FMT, "era_d_fmt" },
132 { T_ERA_T_FMT, "era_t_fmt" },
133 { T_ERA_D_T_FMT, "era_d_t_fmt" },
134 { T_ALT_DIGITS, "alt_digits" },
135 { T_CTYPE, "LC_CTYPE" },
136 { T_ISUPPER, "upper" },
137 { T_ISLOWER, "lower" },
138 { T_ISALPHA, "alpha" },
139 { T_ISDIGIT, "digit" },
140 { T_ISPUNCT, "punct" },
141 { T_ISXDIGIT, "xdigit" },
142 { T_ISSPACE, "space" },
143 { T_ISPRINT, "print" },
144 { T_ISGRAPH, "graph" },
145 { T_ISBLANK, "blank" },
146 { T_ISCNTRL, "cntrl" },
148 * These entries are local additions, and not specified by
149 * TOG. Note that they are not guaranteed to be accurate for
150 * all locales, and so applications should not depend on them.
152 { T_ISSPECIAL, "special" },
153 { T_ISENGLISH, "english" },
154 { T_ISPHONOGRAM, "phonogram" },
155 { T_ISIDEOGRAM, "ideogram" },
156 { T_ISNUMBER, "number" },
158 * We have to support this in the grammar, but it would be a
159 * syntax error to define a character as one of these without
160 * also defining it as an alpha or digit. We ignore it in our
163 { T_ISALNUM, "alnum" },
164 { T_TOUPPER, "toupper" },
165 { T_TOLOWER, "tolower" },
168 * These are keywords used in the charmap file. Note that
169 * Solaris originally used angle brackets to wrap some of them,
170 * but we removed that to simplify our parser. The first of these
171 * items are "global items."
173 { T_CHARMAP, "CHARMAP" },
174 { T_WIDTH, "WIDTH" },
180 * These special words are only used in a charmap file, enclosed in <>.
182 static struct token symwords[] = {
183 { T_COM_CHAR, "comment_char" },
184 { T_ESC_CHAR, "escape_char" },
185 { T_CODE_SET, "code_set_name" },
186 { T_MB_CUR_MAX, "mb_cur_max" },
187 { T_MB_CUR_MIN, "mb_cur_min" },
191 static int categories[] = {
204 reset_scanner(const char *fname)
207 filename = "<stdin>";
211 (void) fclose(input);
212 if ((input = fopen(fname, "r")) == NULL) {
231 (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10))
/*
 * Evaluates to nonzero when x is an octal digit ('0' through '7').
 * The parameter is parenthesized at each use so that expression
 * arguments (for example "c & 0x7f" or a conditional expression) group
 * as the caller intended.  Note that x is still evaluated twice, so do
 * not pass an argument with side effects.
 */
#define isodigit(x) (((x) >= '0') && ((x) <= '7'))
256 if (ungetc(c, is_stdin ? stdin : input) < 0) {
257 yyerror("ungetc failed");
269 yyerror("malformed hex digit");
274 yyerror("malformed hex digit");
277 v = ((hex(c1) << 4) | hex(c2));
289 yyerror("malformed decimal digit");
295 yyerror("malformed decimal digit");
320 yyerror("malformed octal digit");
326 yyerror("malformed octal digit");
344 if ((tokidx + 1) >= toksz) {
346 if ((token = realloc(token, toksz)) == NULL) {
347 yyerror("out of memory");
354 token[tokidx++] = (char)c;
360 if ((wideidx + 1) >= widesz) {
362 widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
363 if (widestr == NULL) {
364 yyerror("out of memory");
371 widestr[wideidx++] = c;
372 widestr[wideidx] = 0;
378 wchar_t *ws = widestr;
383 if ((ws = wcsdup(L"")) == NULL) {
384 yyerror("out of memory");
395 if ((c = scanc()) != esc_char) {
404 return (scan_dec_byte());
407 return (scan_hex_byte());
416 /* put the character back so we can get it */
418 return (scan_oct_byte());
452 static char mbs[MB_LEN_MAX + 1] = "";
457 if (mb_cur_max >= (int)sizeof (mbs)) {
458 yyerror("max multibyte character size too big");
463 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
465 * end of the byte sequence reached, but no
466 * valid wide decoding. fatal error.
469 yyerror("not a valid character encoding");
475 /* does it decode? */
476 if (to_wide(&wc, mbs) >= 0) {
482 if ((category != T_CHARMAP) && (category != T_WIDTH)) {
483 if (check_charmap(wc) < 0) {
484 yyerror("no symbolic name for character");
498 while ((c = scanc()) != EOF) {
503 add_tok(get_escaped(c));
510 if (c == '\n') { /* well that's strange! */
511 yyerror("unterminated symbolic name");
514 if (c == '>') { /* end of symbol */
517 * This restarts the token from the beginning
518 * the next time we scan a character. (This
519 * token is complete.)
523 yyerror("missing symbolic name");
529 * A few symbols are handled as keywords outside
530 * of the normal categories.
532 if (category == T_END) {
534 for (i = 0; symwords[i].name != 0; i++) {
535 if (strcmp(token, symwords[i].name) ==
537 last_kw = symwords[i].id;
543 * Contextual rule: Only literal characters are
544 * permitted in CHARMAP. Anywhere else the symbolic
547 if ((category != T_CHARMAP) &&
548 (lookup_charmap(token, &yylval.wc)) != -1) {
551 if ((yylval.collsym = lookup_collsym(token)) != NULL) {
554 if ((yylval.collelem = lookup_collelem(token)) !=
558 /* it's an undefined symbol */
559 yylval.token = strdup(token);
568 yyerror("unterminated symbolic name");
589 * this one is special, because we don't want it to alter the
592 if (strcmp(token, "...") == 0) {
596 /* search for reserved words first */
597 for (i = 0; keywords[i].name; i++) {
599 if (strcmp(keywords[i].name, token) != 0) {
603 last_kw = keywords[i].id;
605 /* clear the top level category if we're done with it */
606 if (last_kw == T_END) {
610 /* set the top level category if we're changing */
611 for (j = 0; categories[j]; j++) {
612 if (categories[j] != last_kw)
617 return (keywords[i].id);
620 /* maybe it's a numeric constant? */
621 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
623 yylval.num = strtol(token, &eptr, 10);
625 yyerror("malformed number");
630 * A single lone character is treated as a character literal.
631 * To avoid duplication of effort, we stick in the charmap.
634 yylval.wc = token[0];
638 /* anything else is treated as a symbolic name */
639 yylval.token = strdup(token);
650 while ((c = scanc()) != '\n') {
652 /* end of file without newline! */
653 errf("missing newline");
665 while ((c = scanc()) != EOF) {
667 /* special handling for quoted string */
672 /* if newline, just eat and forget it */
676 if (strchr("xXd01234567", c)) {
681 yylval.wc = get_escaped(c);
690 return (get_symbol());
692 /* oops! should generate syntax error */
703 /* escaped characters first */
707 /* eat the newline */
712 /* an escape mid-token is nonsense */
716 /* numeric escapes are treated as wide characters */
717 if (strchr("xXd01234567", c)) {
723 add_tok(get_escaped(c));
727 /* if it is the escape character itself, note it */
733 /* remove from the comment char to end of line */
736 if ((c = scanc()) == EOF) {
737 /* end of file without newline! */
744 * If there were no tokens on this line,
745 * then just pretend it didn't exist at all.
753 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
755 * These are all token delimiters. If there
756 * is a token already in progress, we need to
760 return (consume_token());
767 * If the line was completely devoid of tokens,
768 * then just ignore it.
772 /* we're starting a new line, reset the token state */
793 return (get_symbol());
796 /* whitespace, just ignore it */
812 yyerror(const char *msg)
814 (void) fprintf(stderr, "%s: %d: error: %s\n",
815 filename, lineno, msg);
820 errf(const char *fmt, ...)
826 (void) vasprintf(&msg, fmt, va);
829 (void) fprintf(stderr, "%s: %d: error: %s\n",
830 filename, lineno, msg);
836 warn(const char *fmt, ...)
842 (void) vasprintf(&msg, fmt, va);
845 (void) fprintf(stderr, "%s: %d: warning: %s\n",
846 filename, lineno, msg);