2 tre-parse.c - Regexp parser
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
10 This parser is just a simple recursive descent parser for POSIX.2
11 regexps. The parser supports both the obsolete default syntax and
12 the "extended" syntax, and some nonstandard extensions.
18 #endif /* HAVE_CONFIG_H */
26 #include "tre-stack.h"
27 #include "tre-parse.h"
30 /* Characters with special meanings in regexp syntax. */
31 #define CHAR_PIPE L'|'
32 #define CHAR_LPAREN L'('
33 #define CHAR_RPAREN L')'
34 #define CHAR_LBRACE L'{'
35 #define CHAR_RBRACE L'}'
36 #define CHAR_LBRACKET L'['
37 #define CHAR_RBRACKET L']'
38 #define CHAR_MINUS L'-'
39 #define CHAR_STAR L'*'
40 #define CHAR_QUESTIONMARK L'?'
41 #define CHAR_PLUS L'+'
42 #define CHAR_PERIOD L'.'
43 #define CHAR_COLON L':'
44 #define CHAR_EQUAL L'='
45 #define CHAR_COMMA L','
46 #define CHAR_CARET L'^'
47 #define CHAR_DOLLAR L'$'
48 #define CHAR_BACKSLASH L'\\'
49 #define CHAR_HASH L'#'
50 #define CHAR_TILDE L'~'
53 /* Some macros for expanding \w, \s, etc. */
54 static const struct tre_macro_struct {
56 const char *expansion;
58 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
59 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
60 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
61 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
66 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
67 must have at least `len' items. Sets buf[0] to zero if the there
68 is no match in `tre_macros'. */
70 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
71 tre_char_t *buf, size_t buf_len)
76 if (regex >= regex_end)
79 for (i = 0; tre_macros[i].expansion; i++)
81 if (tre_macros[i].c == *regex)
84 DPRINT(("Expanding macro '%c' => '%s'\n",
85 tre_macros[i].c, tre_macros[i].expansion));
86 for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
87 buf[j] = tre_macros[i].expansion[j];
95 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
96 tre_ast_node_t ***items)
99 tre_ast_node_t **array = *items;
100 /* Allocate more space if necessary. */
103 tre_ast_node_t **new_items;
104 DPRINT(("out of array space, i = %d\n", *i));
105 /* If the array is already 1024 items large, give up -- there's
106 probably an error in the regexp (e.g. not a '\0' terminated
107 string and missing ']') */
111 new_items = xrealloc(array, sizeof(*items) * *max_i);
112 if (new_items == NULL)
114 *items = array = new_items;
116 array[*i] = tre_ast_new_literal(mem, min, max, -1);
117 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
123 /* Expands a character class to character ranges. */
125 tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
126 int *i, int *max_i, int cflags)
128 reg_errcode_t status = REG_OK;
130 int j, min = -1, max = 0;
131 assert(TRE_MB_CUR_MAX == 1);
133 DPRINT((" expanding class to character ranges\n"));
134 for (j = 0; (j < 256) && (status == REG_OK); j++)
137 if (tre_isctype(c, class)
138 || ((cflags & REG_ICASE)
139 && (tre_isctype(tre_tolower(c), class)
140 || tre_isctype(tre_toupper(c), class))))
148 DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max));
149 status = tre_new_item(mem, min, max, i, max_i, items);
153 if (min >= 0 && status == REG_OK)
154 status = tre_new_item(mem, min, max, i, max_i, items);
160 tre_compare_items(const void *a, const void *b)
162 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
163 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
164 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
165 int a_min = l_a->code_min, b_min = l_b->code_min;
169 else if (a_min > b_min)
175 #ifndef TRE_USE_SYSTEM_WCTYPE
177 /* isalnum() and the rest may be macros, so wrap them to functions. */
178 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
179 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
182 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
183 #else /* !tre_isascii */
184 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
185 #endif /* !tre_isascii */
188 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
189 #else /* !tre_isblank */
190 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
191 #endif /* !tre_isblank */
193 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
194 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
195 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
196 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
197 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
198 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
199 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
200 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
201 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
205 int (*func)(tre_cint_t);
206 } tre_ctype_map[] = {
207 { "alnum", &tre_isalnum_func },
208 { "alpha", &tre_isalpha_func },
210 { "ascii", &tre_isascii_func },
211 #endif /* tre_isascii */
213 { "blank", &tre_isblank_func },
214 #endif /* tre_isblank */
215 { "cntrl", &tre_iscntrl_func },
216 { "digit", &tre_isdigit_func },
217 { "graph", &tre_isgraph_func },
218 { "lower", &tre_islower_func },
219 { "print", &tre_isprint_func },
220 { "punct", &tre_ispunct_func },
221 { "space", &tre_isspace_func },
222 { "upper", &tre_isupper_func },
223 { "xdigit", &tre_isxdigit_func },
227 tre_ctype_t tre_ctype(const char *name)
230 for (i = 0; tre_ctype_map[i].name != NULL; i++)
232 if (strcmp(name, tre_ctype_map[i].name) == 0)
233 return tre_ctype_map[i].func;
235 return (tre_ctype_t)0;
237 #endif /* !TRE_USE_SYSTEM_WCTYPE */
239 /* Maximum number of character classes that can occur in a negated bracket
241 #define MAX_NEG_CLASSES 64
243 /* Maximum length of character class names. */
244 #define MAX_CLASS_NAME
246 #define REST(re) (int)(ctx->re_end - (re)), (re)
249 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
250 tre_ctype_t neg_classes[], int *num_neg_classes,
251 tre_ast_node_t ***items, int *num_items,
254 const tre_char_t *re = ctx->re;
255 reg_errcode_t status = REG_OK;
256 tre_ctype_t class = (tre_ctype_t)0;
258 int max_i = *items_size;
261 /* Build an array of the items in the bracket expression. */
262 while (status == REG_OK)
265 if (re == ctx->re_end)
269 else if (*re == CHAR_RBRACKET && re > ctx->re)
271 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
277 tre_cint_t min = 0, max = 0;
279 class = (tre_ctype_t)0;
280 if (re + 2 < ctx->re_end
281 && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
283 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
287 /* XXX - Should use collation order instead of encoding values
288 in character ranges. */
292 else if (re + 1 < ctx->re_end
293 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
294 status = REG_ECOLLATE;
295 else if (re + 1 < ctx->re_end
296 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
297 status = REG_ECOLLATE;
298 else if (re + 1 < ctx->re_end
299 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
302 const tre_char_t *endptr = re + 2;
304 DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re)));
305 while (endptr < ctx->re_end && *endptr != CHAR_COLON)
307 if (endptr != ctx->re_end)
309 len = MIN(endptr - re - 2, 63);
312 tre_char_t tmp_wcs[64];
313 wcsncpy(tmp_wcs, re + 2, (size_t)len);
314 tmp_wcs[len] = L'\0';
315 #if defined HAVE_WCSRTOMBS
318 const tre_char_t *src = tmp_wcs;
319 memset(&state, '\0', sizeof(state));
320 len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
322 #elif defined HAVE_WCSTOMBS
323 len = wcstombs(tmp_str, tmp_wcs, 63);
324 #endif /* defined HAVE_WCSTOMBS */
326 #else /* !TRE_WCHAR */
327 strncpy(tmp_str, (const char*)re + 2, len);
328 #endif /* !TRE_WCHAR */
330 DPRINT((" class name: %s\n", tmp_str));
331 class = tre_ctype(tmp_str);
334 /* Optimize character classes for 8 bit character sets. */
335 if (status == REG_OK && TRE_MB_CUR_MAX == 1)
337 status = tre_expand_ctype(ctx->mem, class, items,
338 &i, &max_i, ctx->cflags);
339 class = (tre_ctype_t)0;
351 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
352 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
354 /* Two ranges are not allowed to share and endpoint. */
359 if (status != REG_OK)
363 if (*num_neg_classes >= MAX_NEG_CLASSES)
366 neg_classes[(*num_neg_classes)++] = class;
369 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
370 if (status != REG_OK)
372 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
375 /* Add opposite-case counterpoints if REG_ICASE is present.
376 This is broken if there are more than two "same" characters. */
377 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
379 tre_cint_t cmin, ccurr;
381 DPRINT(("adding opposite-case counterpoints\n"));
384 if (tre_islower(min))
386 cmin = ccurr = tre_toupper(min++);
387 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
389 ccurr = tre_toupper(min++);
390 status = tre_new_item(ctx->mem, cmin, ccurr,
393 else if (tre_isupper(min))
395 cmin = ccurr = tre_tolower(min++);
396 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
398 ccurr = tre_tolower(min++);
399 status = tre_new_item(ctx->mem, cmin, ccurr,
403 if (status != REG_OK)
406 if (status != REG_OK)
418 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
420 tre_ast_node_t *node = NULL;
422 reg_errcode_t status = REG_OK;
423 tre_ast_node_t **items, *u, *n;
424 int i = 0, j, max_i = 32, curr_max, curr_min;
425 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
426 int num_neg_classes = 0;
428 /* Start off with an array of `max_i' elements. */
429 items = xmalloc(sizeof(*items) * max_i);
433 if (*ctx->re == CHAR_CARET)
435 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
440 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
443 if (status != REG_OK)
444 goto parse_bracket_done;
446 /* Sort the array if we need to negate it. */
448 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
450 curr_max = curr_min = 0;
451 /* Build a union of the items in the array, negated if necessary. */
452 for (j = 0; j < i && status == REG_OK; j++)
455 tre_literal_t *l = items[j]->obj;
459 DPRINT(("item: %d - %d, class %ld, curr_max = %d\n",
460 (int)l->code_min, (int)l->code_max, (long)l->u.class, curr_max));
467 curr_max = MAX(max + 1, curr_max);
468 DPRINT(("overlap, curr_max = %d\n", curr_max));
475 if (curr_max >= curr_min)
477 DPRINT(("no overlap\n"));
478 l->code_min = curr_min;
479 l->code_max = curr_max;
483 DPRINT(("no overlap, zero room\n"));
486 curr_min = curr_max = max + 1;
493 DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
494 l->position = ctx->position;
495 if (num_neg_classes > 0)
497 l->neg_classes = tre_mem_alloc(ctx->mem,
498 (sizeof(l->neg_classes)
499 * (num_neg_classes + 1)));
500 if (l->neg_classes == NULL)
505 for (k = 0; k < num_neg_classes; k++)
506 l->neg_classes[k] = neg_classes[k];
507 l->neg_classes[k] = (tre_ctype_t)0;
510 l->neg_classes = NULL;
515 u = tre_ast_new_union(ctx->mem, node, items[j]);
523 if (status != REG_OK)
524 goto parse_bracket_done;
529 DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
530 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
535 tre_literal_t *l = n->obj;
536 if (num_neg_classes > 0)
538 l->neg_classes = tre_mem_alloc(ctx->mem,
539 (sizeof(l->neg_classes)
540 * (num_neg_classes + 1)));
541 if (l->neg_classes == NULL)
544 goto parse_bracket_done;
546 for (k = 0; k < num_neg_classes; k++)
547 l->neg_classes[k] = neg_classes[k];
548 l->neg_classes[k] = (tre_ctype_t)0;
551 l->neg_classes = NULL;
556 u = tre_ast_new_union(ctx->mem, node, n);
564 if (status != REG_OK)
565 goto parse_bracket_done;
569 #endif /* TRE_DEBUG */
579 /* Parses a positive decimal integer. Returns -1 if the string does not
580 contain a valid number. */
582 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
585 const tre_char_t *r = *regex;
586 while (r < regex_end && *r >= L'0' && *r <= L'9')
590 num = num * 10 + *r - L'0';
599 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
602 int cost_ins, cost_del, cost_subst, cost_max;
603 int limit_ins, limit_del, limit_subst, limit_err;
604 const tre_char_t *r = ctx->re;
605 const tre_char_t *start;
606 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
611 cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
612 limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
614 /* Parse number (minimum repetition count). */
616 if (r < ctx->re_end && *r >= L'0' && *r <= L'9') {
617 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r)));
618 min = tre_parse_int(&r, ctx->re_end);
621 /* Parse comma and second number (maximum repetition count). */
623 if (r < ctx->re_end && *r == CHAR_COMMA)
626 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r)));
627 max = tre_parse_int(&r, ctx->re_end);
630 /* Check that the repeat counts are sane. */
631 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
637 optionally followed immediately by a number == minimum repcount
638 optionally followed by , then a number == maximum repcount
639 + then a number == maximum insertion count
640 - then a number == maximum deletion count
641 # then a number == maximum substitution count
642 ~ then a number == maximum number of errors
643 Any of +, -, # or ~ without followed by a number means that
644 the maximum count/number of errors is infinite.
646 An equation of the form
648 can be specified to set costs and the cost limit to a value
649 different from the default value:
650 - X is the cost of an insertion
651 - Y is the cost of a deletion
652 - Z is the cost of a substitution
653 - C is the maximum cost
655 If no count limit or cost is set for an operation, the operation
656 is not allowed at all.
664 /* Parse count limit settings */
667 while (r + 1 < ctx->re_end && !done)
671 case CHAR_PLUS: /* Insert limit */
672 DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r)));
674 limit_ins = tre_parse_int(&r, ctx->re_end);
679 case CHAR_MINUS: /* Delete limit */
680 DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r)));
682 limit_del = tre_parse_int(&r, ctx->re_end);
687 case CHAR_HASH: /* Substitute limit */
688 DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
690 limit_subst = tre_parse_int(&r, ctx->re_end);
692 limit_subst = INT_MAX;
695 case CHAR_TILDE: /* Maximum number of changes */
696 DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
698 limit_err = tre_parse_int(&r, ctx->re_end);
718 /* Parse cost restriction equation. */
721 while (r + 1 < ctx->re_end && !done)
730 DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r)));
734 cost_max = tre_parse_int(&r, ctx->re_end);
746 if (*r >= L'0' && *r <= L'9')
749 const tre_char_t *sr = r;
750 #endif /* TRE_DEBUG */
751 int cost = tre_parse_int(&r, ctx->re_end);
752 /* XXX - make sure r is not past end. */
755 case L'i': /* Insert cost */
756 DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n",
762 case L'd': /* Delete cost */
763 DPRINT(("tre_parse: del cost: '%.*" STRF "'\n",
769 case L's': /* Substitute cost */
770 DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n",
787 } while (start != r);
790 if (r >= ctx->re_end)
793 /* Empty contents of {}. */
797 /* Parse the ending '}' or '\}'.*/
798 if (ctx->cflags & REG_EXTENDED)
800 if (r >= ctx->re_end || *r != CHAR_RBRACE)
806 if (r + 1 >= ctx->re_end
807 || *r != CHAR_BACKSLASH
808 || *(r + 1) != CHAR_RBRACE)
814 /* Parse trailing '?' marking minimal repetition. */
817 if (*r == CHAR_QUESTIONMARK)
819 minimal = !(ctx->cflags & REG_UNGREEDY);
822 else if (*r == CHAR_STAR || *r == CHAR_PLUS)
824 /* These are reserved for future extensions. */
829 /* Create the AST node(s). */
830 if (min == 0 && max == 0)
832 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
838 if (min < 0 && max < 0)
839 /* Only approximate parameters set, no repetitions. */
842 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
846 /* If approximate matching parameters are set, add them to the
848 if (approx || costs_set || counts_set)
851 tre_iteration_t *iter = (*result)->obj;
853 if (costs_set || counts_set)
855 if (limit_ins == TRE_PARAM_UNSET)
857 if (cost_ins == TRE_PARAM_UNSET)
863 if (limit_del == TRE_PARAM_UNSET)
865 if (cost_del == TRE_PARAM_UNSET)
871 if (limit_subst == TRE_PARAM_UNSET)
873 if (cost_subst == TRE_PARAM_UNSET)
876 limit_subst = INT_MAX;
880 if (cost_max == TRE_PARAM_UNSET)
882 if (limit_err == TRE_PARAM_UNSET)
885 ctx->have_approx = 1;
886 params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
889 for (i = 0; i < TRE_PARAM_LAST; i++)
890 params[i] = TRE_PARAM_UNSET;
891 params[TRE_PARAM_COST_INS] = cost_ins;
892 params[TRE_PARAM_COST_DEL] = cost_del;
893 params[TRE_PARAM_COST_SUBST] = cost_subst;
894 params[TRE_PARAM_COST_MAX] = cost_max;
895 params[TRE_PARAM_MAX_INS] = limit_ins;
896 params[TRE_PARAM_MAX_DEL] = limit_del;
897 params[TRE_PARAM_MAX_SUBST] = limit_subst;
898 params[TRE_PARAM_MAX_ERR] = limit_err;
899 iter->params = params;
903 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
904 "limits [%d,%d,%d, total %d]\n",
905 min, max, cost_ins, cost_del, cost_subst, cost_max,
906 limit_ins, limit_del, limit_subst, limit_err));
916 PARSE_MARK_FOR_SUBMATCH,
920 PARSE_POST_CATENATION,
925 } tre_parse_re_stack_symbol_t;
929 tre_parse(tre_parse_ctx_t *ctx)
931 tre_ast_node_t *result = NULL;
932 tre_parse_re_stack_symbol_t symbol;
933 reg_errcode_t status = REG_OK;
934 tre_stack_t *stack = ctx->stack;
935 int bottom = tre_stack_num_objects(stack);
937 int temporary_cflags = 0;
939 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
940 ctx->len, ctx->re, ctx->len));
942 if (!ctx->nofirstsub)
944 STACK_PUSH(stack, int, ctx->submatch_id);
945 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
948 STACK_PUSH(stack, int, PARSE_RE);
949 ctx->re_start = ctx->re;
950 ctx->re_end = ctx->re + ctx->len;
953 /* The following is basically just a recursive descent parser. I use
954 an explicit stack instead of recursive functions mostly because of
955 two reasons: compatibility with systems which have an overflowable
956 call stack, and efficiency (both in lines of code and speed). */
957 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
959 if (status != REG_OK)
961 symbol = tre_stack_pop_int(stack);
965 /* Parse a full regexp. A regexp is one or more branches,
966 separated by the union operator `|'. */
968 if (!(ctx->cflags & REG_LITERAL)
969 && ctx->cflags & REG_EXTENDED)
970 #endif /* REG_LITERAL */
971 STACK_PUSHX(stack, int, PARSE_UNION);
972 STACK_PUSHX(stack, int, PARSE_BRANCH);
976 /* Parse a branch. A branch is one or more pieces, concatenated.
977 A piece is an atom possibly followed by a postfix operator. */
978 STACK_PUSHX(stack, int, PARSE_CATENATION);
979 STACK_PUSHX(stack, int, PARSE_PIECE);
983 /* Parse a piece. A piece is an atom possibly followed by one
984 or more postfix operators. */
986 if (!(ctx->cflags & REG_LITERAL))
987 #endif /* REG_LITERAL */
988 STACK_PUSHX(stack, int, PARSE_POSTFIX);
989 STACK_PUSHX(stack, int, PARSE_ATOM);
992 case PARSE_CATENATION:
993 /* If the expression has not ended, parse another piece. */
996 if (ctx->re >= ctx->re_end)
1000 if (!(ctx->cflags & REG_LITERAL))
1002 #endif /* REG_LITERAL */
1003 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1005 if ((ctx->cflags & REG_EXTENDED
1006 && c == CHAR_RPAREN && depth > 0)
1007 || (!(ctx->cflags & REG_EXTENDED)
1008 && (c == CHAR_BACKSLASH
1009 && *(ctx->re + 1) == CHAR_RPAREN)))
1011 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1012 status = REG_EPAREN;
1013 DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
1016 if (!(ctx->cflags & REG_EXTENDED))
1022 #endif /* REG_LITERAL */
1024 #ifdef REG_RIGHT_ASSOC
1025 if (ctx->cflags & REG_RIGHT_ASSOC)
1027 /* Right associative concatenation. */
1028 STACK_PUSHX(stack, voidptr, result);
1029 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1030 STACK_PUSHX(stack, int, PARSE_CATENATION);
1031 STACK_PUSHX(stack, int, PARSE_PIECE);
1034 #endif /* REG_RIGHT_ASSOC */
1036 /* Default case, left associative concatenation. */
1037 STACK_PUSHX(stack, int, PARSE_CATENATION);
1038 STACK_PUSHX(stack, voidptr, result);
1039 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1040 STACK_PUSHX(stack, int, PARSE_PIECE);
1045 case PARSE_POST_CATENATION:
1047 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1048 tre_ast_node_t *tmp_node;
1049 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1057 if (ctx->re >= ctx->re_end)
1060 if (ctx->cflags & REG_LITERAL)
1062 #endif /* REG_LITERAL */
1066 DPRINT(("tre_parse: union: '%.*" STRF "'\n",
1068 STACK_PUSHX(stack, int, PARSE_UNION);
1069 STACK_PUSHX(stack, voidptr, result);
1070 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1071 STACK_PUSHX(stack, int, PARSE_BRANCH);
1084 case PARSE_POST_UNION:
1086 tre_ast_node_t *tmp_node;
1087 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1088 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1096 /* Parse postfix operators. */
1097 if (ctx->re >= ctx->re_end)
1100 if (ctx->cflags & REG_LITERAL)
1102 #endif /* REG_LITERAL */
1106 case CHAR_QUESTIONMARK:
1107 if (!(ctx->cflags & REG_EXTENDED))
1112 tre_ast_node_t *tmp_node;
1113 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1117 const tre_char_t *tmp_re;
1120 if (*ctx->re == CHAR_PLUS)
1122 if (*ctx->re == CHAR_QUESTIONMARK)
1128 if (ctx->re + 1 < ctx->re_end)
1130 if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1132 minimal = !(ctx->cflags & REG_UNGREEDY);
1135 else if (*(ctx->re + 1) == CHAR_STAR
1136 || *(ctx->re + 1) == CHAR_PLUS)
1138 /* These are reserved for future extensions. */
1143 DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
1144 minimal ? " minimal" : "greedy", REST(tmp_re)));
1146 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1148 if (tmp_node == NULL)
1151 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1155 case CHAR_BACKSLASH:
1156 /* "\{" is special without REG_EXTENDED */
1157 if (!(ctx->cflags & REG_EXTENDED)
1158 && ctx->re + 1 < ctx->re_end
1159 && *(ctx->re + 1) == CHAR_LBRACE)
1168 /* "{" is literal without REG_EXTENDED */
1169 if (!(ctx->cflags & REG_EXTENDED))
1173 DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1177 status = tre_parse_bound(ctx, &result);
1178 if (status != REG_OK)
1180 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1186 /* Parse an atom. An atom is a regular expression enclosed in `()',
1187 an empty set of `()', a bracket expression, `.', `^', `$',
1188 a `\' followed by a character, or a single character. */
1190 /* End of regexp? (empty string). */
1191 if (ctx->re >= ctx->re_end)
1195 if (ctx->cflags & REG_LITERAL)
1197 #endif /* REG_LITERAL */
1201 case CHAR_LPAREN: /* parenthesized subexpression */
1203 /* Handle "(?...)" extensions. They work in a way similar
1204 to Perls corresponding extensions. */
1205 if (ctx->cflags & REG_EXTENDED
1206 && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1208 int new_cflags = ctx->cflags;
1210 DPRINT(("tre_parse: extension: '%.*" STRF "\n",
1213 while (/*CONSTCOND*/1)
1215 if (*ctx->re == L'i')
1217 DPRINT(("tre_parse: icase: '%.*" STRF "\n",
1220 new_cflags |= REG_ICASE;
1222 new_cflags &= ~REG_ICASE;
1225 else if (*ctx->re == L'n')
1227 DPRINT(("tre_parse: newline: '%.*" STRF "\n",
1230 new_cflags |= REG_NEWLINE;
1232 new_cflags &= ~REG_NEWLINE;
1235 #ifdef REG_RIGHT_ASSOC
1236 else if (*ctx->re == L'r')
1238 DPRINT(("tre_parse: right assoc: '%.*" STRF "\n",
1241 new_cflags |= REG_RIGHT_ASSOC;
1243 new_cflags &= ~REG_RIGHT_ASSOC;
1246 #endif /* REG_RIGHT_ASSOC */
1248 else if (*ctx->re == L'U')
1250 DPRINT(("tre_parse: ungreedy: '%.*" STRF "\n",
1253 new_cflags |= REG_UNGREEDY;
1255 new_cflags &= ~REG_UNGREEDY;
1258 #endif /* REG_UNGREEDY */
1259 else if (*ctx->re == CHAR_MINUS)
1261 DPRINT(("tre_parse: turn off: '%.*" STRF "\n",
1266 else if (*ctx->re == CHAR_COLON)
1268 DPRINT(("tre_parse: no group: '%.*" STRF "\n",
1274 else if (*ctx->re == CHAR_HASH)
1276 DPRINT(("tre_parse: comment: '%.*" STRF "\n",
1278 /* A comment can contain any character except a
1279 right parenthesis */
1280 while (*ctx->re != CHAR_RPAREN
1281 && ctx->re < ctx->re_end)
1283 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1291 else if (*ctx->re == CHAR_RPAREN)
1300 /* Turn on the cflags changes for the rest of the
1302 STACK_PUSHX(stack, int, ctx->cflags);
1303 STACK_PUSHX(stack, int, PARSE_RESTORE_CFLAGS);
1304 STACK_PUSHX(stack, int, PARSE_RE);
1305 ctx->cflags = new_cflags;
1309 if (ctx->cflags & REG_EXTENDED
1310 || (ctx->re > ctx->re_start
1311 && *(ctx->re - 1) == CHAR_BACKSLASH))
1314 if (ctx->re + 2 < ctx->re_end
1315 && *(ctx->re + 1) == CHAR_QUESTIONMARK
1316 && *(ctx->re + 2) == CHAR_COLON)
1318 DPRINT(("tre_parse: group begin: '%.*" STRF
1319 "', no submatch\n", REST(ctx->re)));
1320 /* Don't mark for submatching. */
1322 STACK_PUSHX(stack, int, PARSE_RE);
1326 DPRINT(("tre_parse: group begin: '%.*" STRF
1327 "', submatch %d\n", REST(ctx->re),
1330 /* First parse a whole RE, then mark the resulting tree
1332 STACK_PUSHX(stack, int, ctx->submatch_id);
1333 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1334 STACK_PUSHX(stack, int, PARSE_RE);
1342 case CHAR_RPAREN: /* end of current subexpression */
1343 if ((ctx->cflags & REG_EXTENDED && depth > 0)
1344 || (ctx->re > ctx->re_start
1345 && *(ctx->re - 1) == CHAR_BACKSLASH))
1347 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1349 /* We were expecting an atom, but instead the current
1350 subexpression was closed. POSIX leaves the meaning of
1351 this to be implementation-defined. We interpret this as
1352 an empty expression (which matches an empty string). */
1353 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1356 if (!(ctx->cflags & REG_EXTENDED))
1363 case CHAR_LBRACKET: /* bracket expression */
1364 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
1367 status = tre_parse_bracket(ctx, &result);
1368 if (status != REG_OK)
1372 case CHAR_BACKSLASH:
1373 /* If this is "\(" or "\)" chew off the backslash and
1375 if (!(ctx->cflags & REG_EXTENDED)
1376 && ctx->re + 1 < ctx->re_end
1377 && (*(ctx->re + 1) == CHAR_LPAREN
1378 || *(ctx->re + 1) == CHAR_RPAREN))
1381 STACK_PUSHX(stack, int, PARSE_ATOM);
1385 /* If a macro is used, parse the expanded macro recursively. */
1388 tre_expand_macro(ctx->re + 1, ctx->re_end,
1389 buf, elementsof(buf));
1392 tre_parse_ctx_t subctx;
1393 memcpy(&subctx, ctx, sizeof(subctx));
1395 subctx.len = tre_strlen(buf);
1396 subctx.nofirstsub = 1;
1397 status = tre_parse(&subctx);
1398 if (status != REG_OK)
1401 ctx->position = subctx.position;
1402 result = subctx.result;
1407 if (ctx->re + 1 >= ctx->re_end)
1408 /* Trailing backslash. */
1412 if (*(ctx->re + 1) == L'Q')
1414 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1416 ctx->cflags |= REG_LITERAL;
1417 temporary_cflags |= REG_LITERAL;
1419 STACK_PUSHX(stack, int, PARSE_ATOM);
1422 #endif /* REG_LITERAL */
1424 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re)));
1429 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1434 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1435 ASSERT_AT_WB_NEG, -1);
1439 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1444 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1450 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1452 /* 8 bit hex char. */
1453 char tmp[3] = {0, 0, 0};
1455 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
1456 REST(ctx->re - 2)));
1458 if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1460 tmp[0] = (char)ctx->re[0];
1463 if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1465 tmp[1] = (char)ctx->re[0];
1468 val = strtol(tmp, NULL, 16);
1469 result = tre_ast_new_literal(ctx->mem, (int)val,
1470 (int)val, ctx->position);
1474 else if (ctx->re < ctx->re_end)
1481 while (ctx->re_end - ctx->re >= 0)
1483 if (ctx->re[0] == CHAR_RBRACE)
1485 if (tre_isxdigit(ctx->re[0]))
1487 tmp[i] = (char)ctx->re[0];
1496 val = strtol(tmp, NULL, 16);
1497 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1505 if (tre_isdigit(*ctx->re))
1507 /* Back reference. */
1508 int val = *ctx->re - L'0';
1509 DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
1510 REST(ctx->re - 1)));
1511 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1516 ctx->max_backref = MAX(val, ctx->max_backref);
1521 /* Escaped character. */
1522 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
1523 REST(ctx->re - 1)));
1524 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1535 case CHAR_PERIOD: /* the any-symbol */
1536 DPRINT(("tre_parse: any: '%.*" STRF "'\n",
1538 if (ctx->cflags & REG_NEWLINE)
1540 tre_ast_node_t *tmp1;
1541 tre_ast_node_t *tmp2;
1542 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
1546 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
1550 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1557 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1566 case CHAR_CARET: /* beginning of line assertion */
1567 /* '^' has a special meaning everywhere in EREs, and in the
1568 beginning of the RE and after \( is BREs. */
1569 if (ctx->cflags & REG_EXTENDED
1570 || (ctx->re - 2 >= ctx->re_start
1571 && *(ctx->re - 2) == CHAR_BACKSLASH
1572 && *(ctx->re - 1) == CHAR_LPAREN)
1573 || ctx->re == ctx->re_start)
1575 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
1577 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1587 case CHAR_DOLLAR: /* end of line assertion. */
1588 /* '$' is special everywhere in EREs, and in the end of the
1589 string and before \) is BREs. */
1590 if (ctx->cflags & REG_EXTENDED
1591 || (ctx->re + 2 < ctx->re_end
1592 && *(ctx->re + 1) == CHAR_BACKSLASH
1593 && *(ctx->re + 2) == CHAR_RPAREN)
1594 || ctx->re + 1 == ctx->re_end)
1596 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
1598 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1611 if (temporary_cflags && ctx->re + 1 < ctx->re_end
1612 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
1614 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
1616 ctx->cflags &= ~temporary_cflags;
1617 temporary_cflags = 0;
1619 STACK_PUSHX(stack, int, PARSE_PIECE);
1624 /* We are expecting an atom. If the subexpression (or the whole
1625 regexp ends here, we interpret it as an empty expression
1626 (which matches an empty string). */
1629 !(ctx->cflags & REG_LITERAL) &&
1630 #endif /* REG_LITERAL */
1631 (ctx->re >= ctx->re_end
1632 || *ctx->re == CHAR_STAR
1633 || (ctx->cflags & REG_EXTENDED
1634 && (*ctx->re == CHAR_PIPE
1635 || *ctx->re == CHAR_LBRACE
1636 || *ctx->re == CHAR_PLUS
1637 || *ctx->re == CHAR_QUESTIONMARK))
1638 /* Test for "\)" in BRE mode. */
1639 || (!(ctx->cflags & REG_EXTENDED)
1640 && ctx->re + 1 < ctx->re_end
1641 && *ctx->re == CHAR_BACKSLASH
1642 && *(ctx->re + 1) == CHAR_LBRACE)))
1644 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1646 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1652 DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
1654 /* Note that we can't use an tre_isalpha() test here, since there
1655 may be characters which are alphabetic but neither upper or
1657 if (ctx->cflags & REG_ICASE
1658 && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
1660 tre_ast_node_t *tmp1;
1661 tre_ast_node_t *tmp2;
1663 /* XXX - Can there be more than one opposite-case
1664 counterpoints for some character in some locale? Or
1665 more than two characters which all should be regarded
1666 the same character if case is ignored? If yes, there
1667 does not seem to be a portable way to detect it. I guess
1668 that at least for multi-character collating elements there
1669 could be several opposite-case counterpoints, but they
1670 cannot be supported portably anyway. */
1671 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
1672 tre_toupper(*ctx->re),
1676 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
1677 tre_tolower(*ctx->re),
1681 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1687 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1698 case PARSE_MARK_FOR_SUBMATCH:
1700 int submatch_id = tre_stack_pop_int(stack);
1702 if (result->submatch_id >= 0)
1704 tre_ast_node_t *n, *tmp_node;
1705 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1708 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1709 if (tmp_node == NULL)
1711 tmp_node->num_submatches = result->num_submatches;
1714 result->submatch_id = submatch_id;
1715 result->num_submatches++;
1719 case PARSE_RESTORE_CFLAGS:
1720 ctx->cflags = tre_stack_pop_int(stack);
1729 /* Check for missing closing parentheses. */
1733 if (status == REG_OK)
1734 ctx->result = result;