gdb - Local mods (compile)
[dragonfly.git] / contrib / tre / lib / tre-parse.c
CommitLineData
5f2eab64
JM
1/*
2 tre-parse.c - Regexp parser
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7*/
8
9/*
10 This parser is just a simple recursive descent parser for POSIX.2
11 regexps. The parser supports both the obsolete default syntax and
12 the "extended" syntax, and some nonstandard extensions.
13*/
14
15
16#ifdef HAVE_CONFIG_H
17#include <config.h>
18#endif /* HAVE_CONFIG_H */
19#include <string.h>
20#include <assert.h>
21#include <limits.h>
d5f8dde1 22#include <stddef.h>
5f2eab64
JM
23
24#include "xmalloc.h"
25#include "tre-mem.h"
26#include "tre-ast.h"
27#include "tre-stack.h"
28#include "tre-parse.h"
29
d5f8dde1
JM
30#include "xlocale_private.h"
31#include "collate.h"
32
33/* BSD compatibility:
34 Before looking up a collating symbol, check if the name matches in
35 the character names (cnames) array; if so, use the corresponding
36 character.
37
38 Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
39 the implementation choice that for ERE, a non-numeric character following
40 a left brace that would normally be a bound, causes the left brace to be
41 literal. */
42#define BSD_COMPATIBILITY
43#ifdef BSD_COMPATIBILITY
44#include "cname.h"
45#define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
46#endif /* BSD_COMPATIBILITY */
5f2eab64
JM
47
48/* Characters with special meanings in regexp syntax. */
49#define CHAR_PIPE L'|'
50#define CHAR_LPAREN L'('
51#define CHAR_RPAREN L')'
52#define CHAR_LBRACE L'{'
53#define CHAR_RBRACE L'}'
54#define CHAR_LBRACKET L'['
55#define CHAR_RBRACKET L']'
56#define CHAR_MINUS L'-'
57#define CHAR_STAR L'*'
58#define CHAR_QUESTIONMARK L'?'
59#define CHAR_PLUS L'+'
60#define CHAR_PERIOD L'.'
61#define CHAR_COLON L':'
62#define CHAR_EQUAL L'='
63#define CHAR_COMMA L','
64#define CHAR_CARET L'^'
65#define CHAR_DOLLAR L'$'
66#define CHAR_BACKSLASH L'\\'
67#define CHAR_HASH L'#'
68#define CHAR_TILDE L'~'
69
70
71/* Some macros for expanding \w, \s, etc. */
72static const struct tre_macro_struct {
73 const char c;
74 const char *expansion;
75} tre_macros[] =
76 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
77 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
78 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
79 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
80 { 0, NULL }
81 };
82
83
84/* Expands a macro delimited by `regex' and `regex_end' to `buf', which
85 must have at least `len' items. Sets buf[0] to zero if the there
86 is no match in `tre_macros'. */
87static void
88tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
89 tre_char_t *buf, size_t buf_len)
90{
91 int i;
92
93 buf[0] = 0;
94 if (regex >= regex_end)
95 return;
96
97 for (i = 0; tre_macros[i].expansion; i++)
98 {
99 if (tre_macros[i].c == *regex)
100 {
101 unsigned int j;
102 DPRINT(("Expanding macro '%c' => '%s'\n",
103 tre_macros[i].c, tre_macros[i].expansion));
104 for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
105 buf[j] = tre_macros[i].expansion[j];
106 buf[j] = 0;
107 break;
108 }
109 }
110}
111
112static reg_errcode_t
d5f8dde1
JM
113tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
114 tre_bracket_match_list_t **items)
5f2eab64 115{
d5f8dde1
JM
116 reg_errcode_t status = REG_OK;
117 tre_bracket_match_list_t *array = *items;
118 int i = array->num_bracket_matches;
5f2eab64 119 /* Allocate more space if necessary. */
d5f8dde1 120 if (i >= *max_i)
5f2eab64 121 {
d5f8dde1
JM
122 tre_bracket_match_list_t *new_items;
123 DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
5f2eab64
JM
124 /* If the array is already 1024 items large, give up -- there's
125 probably an error in the regexp (e.g. not a '\0' terminated
126 string and missing ']') */
d5f8dde1 127 if (*max_i >= 1024)
5f2eab64
JM
128 return REG_ESPACE;
129 *max_i *= 2;
d5f8dde1 130 new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
5f2eab64
JM
131 if (new_items == NULL)
132 return REG_ESPACE;
133 *items = array = new_items;
134 }
d5f8dde1
JM
135 array->bracket_matches[i].type = type;
136 array->bracket_matches[i].value = val;
137 array->num_bracket_matches++;
5f2eab64
JM
138 return status;
139}
140
5f2eab64
JM
141#ifndef TRE_USE_SYSTEM_WCTYPE
142
143/* isalnum() and the rest may be macros, so wrap them to functions. */
144int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
145int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
146
147#ifdef tre_isascii
148int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
149#else /* !tre_isascii */
150int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
151#endif /* !tre_isascii */
152
153#ifdef tre_isblank
154int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
155#else /* !tre_isblank */
156int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
157#endif /* !tre_isblank */
158
159int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
160int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
161int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
162int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
163int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
164int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
165int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
166int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
167int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
168
169struct {
170 char *name;
171 int (*func)(tre_cint_t);
172} tre_ctype_map[] = {
173 { "alnum", &tre_isalnum_func },
174 { "alpha", &tre_isalpha_func },
175#ifdef tre_isascii
176 { "ascii", &tre_isascii_func },
177#endif /* tre_isascii */
178#ifdef tre_isblank
179 { "blank", &tre_isblank_func },
180#endif /* tre_isblank */
181 { "cntrl", &tre_iscntrl_func },
182 { "digit", &tre_isdigit_func },
183 { "graph", &tre_isgraph_func },
184 { "lower", &tre_islower_func },
185 { "print", &tre_isprint_func },
186 { "punct", &tre_ispunct_func },
187 { "space", &tre_isspace_func },
188 { "upper", &tre_isupper_func },
189 { "xdigit", &tre_isxdigit_func },
190 { NULL, NULL}
191};
192
193tre_ctype_t tre_ctype(const char *name)
194{
195 int i;
196 for (i = 0; tre_ctype_map[i].name != NULL; i++)
197 {
198 if (strcmp(name, tre_ctype_map[i].name) == 0)
199 return tre_ctype_map[i].func;
200 }
201 return (tre_ctype_t)0;
202}
203#endif /* !TRE_USE_SYSTEM_WCTYPE */
204
d5f8dde1 205#define REST(re) (int)(ctx->re_end - (re)), (re)
5f2eab64 206
d5f8dde1
JM
207#define START_COLLATING_SYMBOLS 16
208#define MAX_COLLATING_SYMBOL_LEN 4
5f2eab64 209
d5f8dde1
JM
210typedef struct {
211 const tre_char_t *start;
212 int len;
213} tre_collating_symbol;
214
215#ifdef BSD_COMPATIBILITY
216static wchar_t
217tre_search_cnames(const wchar_t *name, size_t len)
218{
219 size_t low = 0;
220 size_t high = NCNAMES - 1;
221 size_t cur;
222 int cmp;
5f2eab64 223
d5f8dde1
JM
224 while(low <= high)
225 {
226 cur = (low + high) / 2;
227 cmp = wcsncmp(name, cnames[cur].name, len);
228 if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
229 if (cmp > 0) low = cur + 1;
230 else high = cur - 1;
231 }
232 return (wchar_t)-1;
233}
234#endif /* BSD_COMPATIBILITY */
235
236/* Scan the contents of a bracket expression, and create a
237 * tre_bracket_match_list_t encoding the bracket expression. If during
238 * the scan, multi-character collating symbols are detected, switch
239 * into a mode to collect those MCCSs into a tre_collating_symbol
240 * list and pass them back. tre_parse_bracket will use that to
241 * create a new string composed of a union of the bracket expression
242 * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
243 * call tre_parse (recursive) to parse that new string (which will
244 * call tre_parse_bracket and tre_parse_bracket_items again. */
5f2eab64 245static reg_errcode_t
d5f8dde1
JM
246tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
247 int *items_size, tre_collating_symbol **result)
5f2eab64
JM
248{
249 const tre_char_t *re = ctx->re;
d5f8dde1
JM
250 const tre_char_t *re_end = ctx->re_end;
251 tre_collating_symbol *col_syms = NULL;
252 tre_collating_symbol *cp = NULL;
253 int n_col_syms = 0;
254 reg_errcode_t status;
5f2eab64 255 int max_i = *items_size;
d5f8dde1
JM
256 int other = 0; /* contains content other than multi-character collating
257 * symbols */
258 int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
259 tre_cint_t min, c;
260 int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
261 int collect_MCCS = 0;
262 const tre_char_t *start;
5f2eab64 263
d5f8dde1 264 for ( ;re < re_end; re++)
5f2eab64 265 {
d5f8dde1 266 switch (*re)
5f2eab64 267 {
d5f8dde1
JM
268 case CHAR_MINUS:
269 /* A first hyphen */
270 if (re == ctx->re)
271 {
272 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
273 min = CHAR_MINUS;
274 other++;
275 range = 0;
276 break;
277 }
278 /* The hyphen is the end range */
279 if (range > 0)
280 {
281 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
282 c = CHAR_MINUS;
283 goto process_end_range;
284 }
285 if (re + 1 >= re_end)
286 {
287 status = REG_EBRACK;
288 goto error;
289 }
290 /* The hyphen is at the end */
291 if (re[1] == CHAR_RBRACKET)
292 {
293 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
294 c = CHAR_MINUS;
295 goto process_begin_range;
296 }
297 /* Two ranges are not allowed to share an endpoint, or begin
298 * range is illegal. */
299 if (range < 0)
300 {
301 status = REG_ERANGE;
302 goto error;
303 }
304 range = 1; /* Expect end range */
305 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
5f2eab64 306 break;
5f2eab64 307
d5f8dde1
JM
308 case CHAR_LBRACKET:
309 if (re + 1 >= re_end)
5f2eab64 310 {
d5f8dde1
JM
311 status = REG_EBRACK;
312 goto error;
5f2eab64 313 }
d5f8dde1 314 switch (re[1])
5f2eab64 315 {
d5f8dde1
JM
316 case CHAR_PERIOD:
317 {
318 re += 2;
319 start = re;
320 for (;; re++)
321 {
322 if (re >= re_end)
323 {
324 status = REG_ECOLLATE;
325 goto error;
326 }
327 if (*re == CHAR_PERIOD)
328 {
329 if (re + 1 >= re_end)
330 {
331 status = REG_ECOLLATE;
332 goto error;
333 }
334 /* Found end */
335 if (re[1] == CHAR_RBRACKET)
336 {
337 DPRINT(("tre_parse_bracket: collating "
338 "symbol: '%.*" STRF "'\n",
339 REST(start - 2)));
340 /* Empty name */
341 if (re == start)
342 {
343 status = REG_ECOLLATE;
344 goto error;
345 }
346#ifdef BSD_COMPATIBILITY
347 /* Check if the name is in cnames; if so, use
348 the corresponding code */
349 c = tre_search_cnames(start, re - start);
350 if (c != (wchar_t)-1)
351 {
352 re++;
353 goto process_single_character;
354 }
355#endif /* BSD_COMPATIBILITY */
356 /* Verify this is a known sequence */
357 if (__collate_equiv_value(ctx->loc, start,
358 re - start) <= 0)
359 {
360 status = REG_ECOLLATE;
361 goto error;
362 }
363 /* Process single character collating symbols */
364 if (re - start == 1)
365 {
366 c = *start;
367 re++;
368 goto process_single_character;
369 }
370 /* Inverted MCCSs are undefined */
371 if (invert)
372 {
373 status = REG_ECOLLATE;
374 goto error;
375 }
376 /* Can't have MCCSs as an endpoint to a range */
377 if (range > 0)
378 {
379 status = REG_ERANGE;
380 goto error;
381 }
382 range = -1;
383 /* Switch into MCCS collection mode (if not
384 * already there */
385#if TRE_DEBUG
386 if (!collect_MCCS)
387 {
388 collect_MCCS = 1;
389 DPRINT(("tre_parse_bracket: Detected MCCS\n"));
390 }
391#else /* !TRE_DEBUG */
392 collect_MCCS = 1;
393#endif /* !TRE_DEBUG */
394 /* Allocate a memory block the first time */
395 if (!cp)
396 {
397 if ((col_syms = xmalloc(sizeof(*col_syms) *
398 (START_COLLATING_SYMBOLS + 2)))
399 == NULL)
400 return REG_ESPACE;
401 cp = col_syms + 1;
402 n_col_syms = START_COLLATING_SYMBOLS;
403 }
404 /* Enlarge the memory block is more is needed */
405 if ((cp - col_syms) - 1 >= n_col_syms)
406 {
407 int i = n_col_syms;
408 tre_collating_symbol *tmp =
409 xrealloc(col_syms, sizeof(*col_syms) *
410 ((n_col_syms *= 2) + 2));
411 if (tmp == NULL)
412 {
413 xfree(col_syms);
414 return REG_ESPACE;
415 }
416 DPRINT(("tre_list_collating_symbols: "
417 "Enlarging col_syms to %d\n",
418 n_col_syms));
419 col_syms = tmp;
420 cp = col_syms + i + 1;
421 }
422 cp->start = start;
423 cp->len = re - start;
424 cp++;
425 re++;
426 break;
427 }
428 }
429 }
430 break;
431 }
432
433 case CHAR_EQUAL:
434 case CHAR_COLON:
435 {
436 /* Process equivalence and character classes */
437 tre_char_t kind = re[1];
438
439 /* Can't have a class as an endpoint to a range */
440 if (range > 0)
441 {
442 status = REG_ERANGE;
443 goto error;
444 }
445 if (!collect_MCCS && range == 0)
446 {
447 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
448 min, &max_i, items);
449 if (status != REG_OK)
450 goto error;
451 }
452 range = -1;
453 re += 2;
454 start = re;
455 for (;; re++)
5f2eab64 456 {
d5f8dde1
JM
457 if (re >= re_end)
458 {
459 status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
460 goto error;
461 }
462 if (*re == kind)
463 {
464 if (re + 1 >= re_end)
465 {
466 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
467 REG_ECTYPE;
468 goto error;
469 }
470 /* Found end */
471 if (re[1] == CHAR_RBRACKET)
472 {
473 if (re == start)
474 {
475 /* Empty class name */
476 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
477 REG_ECTYPE;
478 goto error;
479 }
480 /* Process equivalence class */
481 if (kind == CHAR_EQUAL)
482 {
483 int equiv;
484
485 DPRINT(("tre_parse_bracket: equivalence: '%.*"
486 STRF "'\n", REST(start - 2)));
487
488 /* While we find the collation value even for
489 multi-character collating elements , we
490 don't (yet) match any collation values
491 against multi-character sequences. We'd have
492 to enumerate those multi-character sequences
493 and like multi-character collating symbols,
494 create a union of those sequences with the
495 rest of the bracket expression. While
496 doable, a bracket expression matching
497 multiple characters, that doesn't explicitly
498 contain multi-character sequences, might
499 be unexpected, so we punt for now. */
500 if ((equiv = __collate_equiv_value(ctx->loc,
501 start, re - start)) <= 0)
502 {
503 /* The standard says that if no collating
504 element if found, we use the collating
505 symbol itself. But __collate_equiv_value
506 doesn't make a distinction between
507 an element that is in a equvalence
508 class with others, or is the only member,
509 so we already know there is no collating
510 symbol. (Note that in the case of a
511 collating element whose collation value
512 is unique, matching against the
513 collating element itself, or against
514 its collation value, is equivalent.) */
515#ifdef BSD_COMPATIBILITY
516 /* Check if the name is in cnames; if so,
517 use the corresponding code */
518 c = tre_search_cnames(start, re - start);
519 if (c != (wchar_t)-1)
520 {
521 re++;
522 goto process_single_character;
523 }
524#endif /* BSD_COMPATIBILITY */
525 status = REG_ECOLLATE;
526 goto error;
527 }
528 if (!collect_MCCS)
529 {
530 status = tre_new_item(ctx->mem,
531 TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
532 equiv, &max_i, items);
533 if (status != REG_OK)
534 goto error;
535 }
536 }
537 else
538 {
539 /* Process character class */
540 DPRINT(("tre_parse_bracket: class: '%.*" STRF
541 "'\n", REST(start - 2)));
542 if (!collect_MCCS)
543 {
544 char tmp_str[64];
545 tre_ctype_t class;
546 int len = MIN(re - start, 63);
547#ifdef TRE_WCHAR
548 {
549 tre_char_t tmp_wcs[64];
550 wcsncpy(tmp_wcs, start, (size_t)len);
551 tmp_wcs[len] = L'\0';
5f2eab64 552#if defined HAVE_WCSRTOMBS
d5f8dde1
JM
553 {
554 mbstate_t state;
555 const tre_char_t *src = tmp_wcs;
556 memset(&state, '\0', sizeof(state));
557 len = wcsrtombs_l(tmp_str, &src,
558 sizeof(tmp_str), &state,
559 ctx->loc);
560 }
5f2eab64 561#elif defined HAVE_WCSTOMBS
d5f8dde1 562 len = wcstombs(tmp_str, tmp_wcs, 63);
5f2eab64 563#endif /* defined HAVE_WCSTOMBS */
d5f8dde1 564 }
5f2eab64 565#else /* !TRE_WCHAR */
d5f8dde1 566 strncpy(tmp_str, (const char*)start, len);
5f2eab64 567#endif /* !TRE_WCHAR */
d5f8dde1
JM
568 tmp_str[len] = '\0';
569 DPRINT((" class name: %s\n", tmp_str));
570 class = tre_ctype_l(tmp_str, ctx->loc);
571 if (!class)
572 {
573 status = REG_ECTYPE;
574 goto error;
575 }
576 status = tre_new_item(ctx->mem,
577 TRE_BRACKET_MATCH_TYPE_CLASS,
578 class, &max_i, items);
579 if (status != REG_OK)
580 goto error;
581 }
582 }
583 re++;
584 break;
585 }
586 }
587 }
588 other++;
589 break;
590 }
591
592 default:
593 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
594 c = CHAR_LBRACKET;
595 goto process_single_character;
596 break;
5f2eab64 597 }
d5f8dde1
JM
598 break;
599
600 case CHAR_RBRACKET:
601 /* A first right bracket */
602 if (re == ctx->re)
5f2eab64
JM
603 {
604 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
d5f8dde1
JM
605 min = CHAR_RBRACKET;
606 range = 0;
607 other++;
608 break;
5f2eab64 609 }
d5f8dde1
JM
610 /* Done */
611 if (collect_MCCS)
612 {
613 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
614 REST(re)));
615 if (col_syms)
616 {
617 /* Mark the character following the right bracket. Set len
618 * to whether there are other things besides the
619 * multi-character collating symbols */
620 col_syms->start = re + 1;
621 col_syms->len = other;
622 /* Mark the end of the list */
623 cp->start = NULL;
624 }
625 *result = col_syms;
626 return REG_OK;
627 }
628 /* range > 0 is not possible, since we did a lookahead after the
629 * hyphen */
630 if (range == 0)
5f2eab64 631 {
d5f8dde1
JM
632 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
633 min, &max_i, items);
5f2eab64 634 if (status != REG_OK)
d5f8dde1 635 goto error;
5f2eab64 636 }
d5f8dde1
JM
637 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
638 *items_size = max_i;
639 ctx->re = re + 1;
640 return REG_OK;
5f2eab64 641
d5f8dde1
JM
642 default:
643 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
644 c = *re;
645process_single_character:
646 /* Process single character */
647 if (range > 0)
5f2eab64 648 {
d5f8dde1 649 int mine, maxe;
5f2eab64 650
d5f8dde1
JM
651process_end_range:
652 /* Get collation equivalence values */
653 mine = __collate_equiv_value(ctx->loc, &min, 1);
654 maxe = __collate_equiv_value(ctx->loc, &c, 1);
655 if (maxe < mine)
5f2eab64 656 {
d5f8dde1
JM
657 status = REG_ERANGE;
658 goto error;
659 }
660 if (!collect_MCCS)
661 {
662 status = tre_new_item(ctx->mem,
663 TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
664 mine, &max_i, items);
665 if (status != REG_OK)
666 goto error;
667 status = tre_new_item(ctx->mem,
668 TRE_BRACKET_MATCH_TYPE_RANGE_END,
669 maxe, &max_i, items);
670 if (status != REG_OK)
671 goto error;
672 }
673 range = -1;
674 }
675 else
676 {
677process_begin_range:
678 if (!collect_MCCS)
679 {
680 if (range == 0)
5f2eab64 681 {
d5f8dde1
JM
682 status = tre_new_item(ctx->mem,
683 TRE_BRACKET_MATCH_TYPE_CHAR,
684 min, &max_i, items);
685 if (status != REG_OK)
686 goto error;
5f2eab64 687 }
d5f8dde1 688 min = c;
5f2eab64 689 }
d5f8dde1 690 range = 0;
5f2eab64 691 }
d5f8dde1
JM
692 other++;
693 break;
5f2eab64
JM
694 }
695 }
d5f8dde1
JM
696 status = REG_EBRACK;
697error:
698 DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n",
699 REST(re), status));
700 if (col_syms)
701 xfree(col_syms);
5f2eab64
JM
702 return status;
703}
704
d5f8dde1
JM
705#ifdef TRE_DEBUG
706static const char *bracket_match_type_str[] = {
707 "unused",
708 "char",
709 "range begin",
710 "range end",
711 "class",
712 "equivalence value",
713};
714#endif /* TRE_DEBUG */
715
5f2eab64
JM
716static reg_errcode_t
717tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
718{
d5f8dde1 719 tre_ast_node_t *node;
5f2eab64 720 reg_errcode_t status = REG_OK;
d5f8dde1
JM
721 tre_bracket_match_list_t *items;
722 int max_i = 32;
723 tre_collating_symbol *col_syms = NULL;
724
725 /* Handle special cases [[:<:]] and [[:>:]] */
726 if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
727 && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
728 && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
729 && ctx->re[5] == CHAR_RBRACKET)
730 {
731 *result = tre_ast_new_literal(ctx->mem, ASSERTION,
732 (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
733 -1);
734 DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
735 "[[:<:]]" : "[[:>:]]"));
736 ctx->re += 6;
737 return *result ? REG_OK : REG_ESPACE;
738 }
5f2eab64
JM
739
740 /* Start off with an array of `max_i' elements. */
d5f8dde1 741 items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
5f2eab64
JM
742 if (items == NULL)
743 return REG_ESPACE;
744
745 if (*ctx->re == CHAR_CARET)
746 {
747 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
d5f8dde1 748 items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
5f2eab64
JM
749 ctx->re++;
750 }
751
d5f8dde1 752 status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
5f2eab64
JM
753
754 if (status != REG_OK)
755 goto parse_bracket_done;
756
d5f8dde1
JM
757 /* If there are collating symbols, split off the multi-character ones
758 * into a union of the bracket expression (without the collating symbols)
759 * and the multiple-character sequences. We create an equivalent input
760 * string and run tre_parse() recursively */
761 if (col_syms)
5f2eab64 762 {
d5f8dde1
JM
763 tre_char_t *str, *sp;
764 tre_collating_symbol *cp;
765 tre_parse_ctx_t subctx;
766
767 /* Allocate a new string. We start with the size of the original
768 * bracket expression (minus 1) and add 2 (for a leading "[" and
769 * a trailing nil; don't need a "^", since it is illegal to have
770 * inverted MCCSs). Since a multi-character collating symbols
771 * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
772 * need to worry about the new string getting too long. */
773 xfree(items);
774 str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
775 if (str == NULL)
5f2eab64 776 {
d5f8dde1
JM
777 xfree(col_syms);
778 return REG_ESPACE;
5f2eab64 779 }
d5f8dde1
JM
780 sp = str;
781 if (col_syms->len > 0)
5f2eab64 782 {
d5f8dde1
JM
783 /* There are other items in the bracket expression besides the
784 * multi-character collating symbols, so create a new bracket
785 * expression with only those other itmes. */
786 const tre_char_t *re;
787 ptrdiff_t i;
788
789 *sp++ = '[';
790 re = ctx->re;
791 for (cp = col_syms + 1; cp->start; cp++)
5f2eab64 792 {
d5f8dde1
JM
793 /* The "- 2" is to account for the "[." */
794 if ((i = ((cp->start - re) - 2)) > 0)
5f2eab64 795 {
d5f8dde1
JM
796 memcpy(sp, re, sizeof(*sp) * i);
797 sp += i;
5f2eab64 798 }
d5f8dde1
JM
799 /* The "+ 2" is to account for the ".]" */
800 re = cp->start + cp->len + 2;
5f2eab64 801 }
d5f8dde1
JM
802 i = col_syms->start - re; /* Includes the trailing right bracket */
803 memcpy(sp, re, sizeof(*sp) * i);
804 sp += i;
805 *sp++ = '|';
806 }
807 for (cp = col_syms + 1; cp->start; cp++)
808 {
809 memcpy(sp, cp->start, sizeof(*sp) * cp->len);
810 sp += cp->len;
811 if (cp[1].start)
812 *sp++ = '|';
5f2eab64 813 }
d5f8dde1
JM
814 *sp = 0;
815 DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
816 str));
817
818 memcpy(&subctx, ctx, sizeof(subctx));
819 subctx.re = str;
820 subctx.len = sp - str;
821 subctx.nofirstsub = 1;
822 subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
823 status = tre_parse(&subctx);
824 xfree(str);
825 if (status != REG_OK)
826 {
827 xfree(col_syms);
828 return status;
829 }
830 ctx->re = col_syms->start;
831 ctx->position = subctx.position;
832 xfree(col_syms);
833 *result = subctx.result;
834 DPRINT(("tre_parse_bracket: Returning to original string\n"));
835 return REG_OK;
5f2eab64
JM
836 }
837
d5f8dde1
JM
838 DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
839 node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
840 if (node == NULL)
841 {
842 status = REG_ESPACE;
843 goto parse_bracket_done;
844 }
845 else
5f2eab64 846 {
d5f8dde1
JM
847 tre_literal_t *l = node->obj;
848 l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
849 SIZEOF_BRACKET_MATCH_LIST(items));
850 if (l->u.bracket_match_list == NULL)
5f2eab64 851 {
d5f8dde1
JM
852 status = REG_ESPACE;
853 goto parse_bracket_done;
5f2eab64 854 }
d5f8dde1 855 memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
5f2eab64
JM
856 }
857
5f2eab64 858#ifdef TRE_DEBUG
d5f8dde1
JM
859 {
860 int i;
861 tre_bracket_match_t *b;
862 DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
863 items->num_bracket_matches, items->flags));
864 for (i = 0, b = items->bracket_matches;
865 i < items->num_bracket_matches; i++, b++)
866 {
867 DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type],
868 b->value));
869 }
870 }
5f2eab64
JM
871#endif /* TRE_DEBUG */
872
873 parse_bracket_done:
874 xfree(items);
875 ctx->position++;
876 *result = node;
877 return status;
878}
879
880
881/* Parses a positive decimal integer. Returns -1 if the string does not
882 contain a valid number. */
883static int
884tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
885{
886 int num = -1;
887 const tre_char_t *r = *regex;
888 while (r < regex_end && *r >= L'0' && *r <= L'9')
889 {
890 if (num < 0)
891 num = 0;
892 num = num * 10 + *r - L'0';
893 r++;
894 }
895 *regex = r;
896 return num;
897}
898
899
900static reg_errcode_t
901tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
902{
d5f8dde1
JM
903 int min, max;
904#ifdef TRE_APPROX
905 int i;
5f2eab64
JM
906 int cost_ins, cost_del, cost_subst, cost_max;
907 int limit_ins, limit_del, limit_subst, limit_err;
5f2eab64 908 const tre_char_t *start;
d5f8dde1
JM
909#endif /* TRE_APPROX */
910 const tre_char_t *r = ctx->re;
5f2eab64 911 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
d5f8dde1 912#ifdef TRE_APPROX
5f2eab64
JM
913 int approx = 0;
914 int costs_set = 0;
915 int counts_set = 0;
916
917 cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
918 limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
d5f8dde1 919#endif /* TRE_APPROX */
5f2eab64
JM
920
921 /* Parse number (minimum repetition count). */
922 min = -1;
d5f8dde1
JM
923 if (r >= ctx->re_end)
924#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
925 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
926#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
927 return REG_EBRACE;
928#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
929 if (*r >= L'0' && *r <= L'9') {
5f2eab64
JM
930 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r)));
931 min = tre_parse_int(&r, ctx->re_end);
932 }
d5f8dde1
JM
933#ifndef TRE_APPROX
934 else
935#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
936 /* For ERE, return REG_NOMATCH to signal that the lbrace should
937 be treated as a literal */
938 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
939#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
940 return REG_BADBR;
941#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
942#endif /* !TRE_APPROX */
5f2eab64
JM
943
944 /* Parse comma and second number (maximum repetition count). */
945 max = min;
946 if (r < ctx->re_end && *r == CHAR_COMMA)
947 {
948 r++;
949 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r)));
950 max = tre_parse_int(&r, ctx->re_end);
951 }
952
953 /* Check that the repeat counts are sane. */
d5f8dde1 954 if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
5f2eab64
JM
955 return REG_BADBR;
956
957
d5f8dde1 958#ifdef TRE_APPROX
5f2eab64
JM
959 /*
960 '{'
961 optionally followed immediately by a number == minimum repcount
962 optionally followed by , then a number == maximum repcount
963 + then a number == maximum insertion count
964 - then a number == maximum deletion count
965 # then a number == maximum substitution count
966 ~ then a number == maximum number of errors
967 Any of +, -, # or ~ without followed by a number means that
968 the maximum count/number of errors is infinite.
969
970 An equation of the form
971 Xi + Yd + Zs < C
972 can be specified to set costs and the cost limit to a value
973 different from the default value:
974 - X is the cost of an insertion
975 - Y is the cost of a deletion
976 - Z is the cost of a substitution
977 - C is the maximum cost
978
979 If no count limit or cost is set for an operation, the operation
980 is not allowed at all.
981 */
982
983
984 do {
985 int done;
986 start = r;
987
988 /* Parse count limit settings */
989 done = 0;
990 if (!counts_set)
991 while (r + 1 < ctx->re_end && !done)
992 {
993 switch (*r)
994 {
995 case CHAR_PLUS: /* Insert limit */
996 DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r)));
997 r++;
998 limit_ins = tre_parse_int(&r, ctx->re_end);
999 if (limit_ins < 0)
1000 limit_ins = INT_MAX;
1001 counts_set = 1;
1002 break;
1003 case CHAR_MINUS: /* Delete limit */
1004 DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r)));
1005 r++;
1006 limit_del = tre_parse_int(&r, ctx->re_end);
1007 if (limit_del < 0)
1008 limit_del = INT_MAX;
1009 counts_set = 1;
1010 break;
1011 case CHAR_HASH: /* Substitute limit */
1012 DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
1013 r++;
1014 limit_subst = tre_parse_int(&r, ctx->re_end);
1015 if (limit_subst < 0)
1016 limit_subst = INT_MAX;
1017 counts_set = 1;
1018 break;
1019 case CHAR_TILDE: /* Maximum number of changes */
1020 DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
1021 r++;
1022 limit_err = tre_parse_int(&r, ctx->re_end);
1023 if (limit_err < 0)
1024 limit_err = INT_MAX;
1025 approx = 1;
1026 break;
1027 case CHAR_COMMA:
1028 r++;
1029 break;
1030 case L' ':
1031 r++;
1032 break;
1033 case L'}':
1034 done = 1;
1035 break;
1036 default:
1037 done = 1;
1038 break;
1039 }
1040 }
1041
1042 /* Parse cost restriction equation. */
1043 done = 0;
1044 if (!costs_set)
1045 while (r + 1 < ctx->re_end && !done)
1046 {
1047 switch (*r)
1048 {
1049 case CHAR_PLUS:
1050 case L' ':
1051 r++;
1052 break;
1053 case L'<':
1054 DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r)));
1055 r++;
1056 while (*r == L' ')
1057 r++;
1058 cost_max = tre_parse_int(&r, ctx->re_end);
1059 if (cost_max < 0)
1060 cost_max = INT_MAX;
1061 else
1062 cost_max--;
1063 approx = 1;
1064 break;
1065 case CHAR_COMMA:
1066 r++;
1067 done = 1;
1068 break;
1069 default:
1070 if (*r >= L'0' && *r <= L'9')
1071 {
1072#ifdef TRE_DEBUG
1073 const tre_char_t *sr = r;
1074#endif /* TRE_DEBUG */
1075 int cost = tre_parse_int(&r, ctx->re_end);
1076 /* XXX - make sure r is not past end. */
1077 switch (*r)
1078 {
1079 case L'i': /* Insert cost */
1080 DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n",
1081 REST(sr)));
1082 r++;
1083 cost_ins = cost;
1084 costs_set = 1;
1085 break;
1086 case L'd': /* Delete cost */
1087 DPRINT(("tre_parse: del cost: '%.*" STRF "'\n",
1088 REST(sr)));
1089 r++;
1090 cost_del = cost;
1091 costs_set = 1;
1092 break;
1093 case L's': /* Substitute cost */
1094 DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n",
1095 REST(sr)));
1096 r++;
1097 cost_subst = cost;
1098 costs_set = 1;
1099 break;
1100 default:
1101 return REG_BADBR;
1102 }
1103 }
1104 else
1105 {
1106 done = 1;
1107 break;
1108 }
1109 }
1110 }
1111 } while (start != r);
d5f8dde1 1112#endif /* TRE_APPROX */
5f2eab64 1113
d5f8dde1 1114 /*{*//* Missing }. */
5f2eab64
JM
1115 if (r >= ctx->re_end)
1116 return REG_EBRACE;
1117
1118 /* Empty contents of {}. */
1119 if (r == ctx->re)
1120 return REG_BADBR;
1121
1122 /* Parse the ending '}' or '\}'.*/
1123 if (ctx->cflags & REG_EXTENDED)
1124 {
1125 if (r >= ctx->re_end || *r != CHAR_RBRACE)
1126 return REG_BADBR;
1127 r++;
d5f8dde1
JM
1128 /* Parse trailing '?' marking minimal repetition. */
1129 if (r < ctx->re_end)
1130 {
1131 if (*r == CHAR_QUESTIONMARK)
1132 {
1133 /* Process the question mark only in enhanced mode.
1134 Otherwise, the question mark is an error in ERE
1135 or a literal in BRE */
1136 if (ctx->cflags & REG_ENHANCED)
1137 {
1138 minimal = !(ctx->cflags & REG_UNGREEDY);
1139 r++;
1140 }
1141 else return REG_BADRPT;
1142 }
1143 else if (*r == CHAR_STAR || *r == CHAR_PLUS)
1144 {
1145 /* These are reserved for future extensions. */
1146 return REG_BADRPT;
1147 }
1148 }
5f2eab64
JM
1149 }
1150 else
1151 {
1152 if (r + 1 >= ctx->re_end
1153 || *r != CHAR_BACKSLASH
1154 || *(r + 1) != CHAR_RBRACE)
1155 return REG_BADBR;
1156 r += 2;
d5f8dde1 1157 if (r < ctx->re_end && *r == CHAR_STAR)
5f2eab64 1158 {
d5f8dde1 1159 /* This is reserved for future extensions. */
5f2eab64
JM
1160 return REG_BADRPT;
1161 }
1162 }
1163
d5f8dde1
JM
1164 if (minimal)
1165 ctx->num_reorder_tags++;
1166
1167 if (!result) goto parse_bound_exit;
5f2eab64 1168 /* Create the AST node(s). */
d5f8dde1
JM
1169 /* Originally, if min == 0 && max == 0, we immediately replace the whole
1170 iteration with EMPTY. This unfortunately drops any submatches, and
1171 messes up setting the pmatch values (we can get tags of -1, and
1172 tag values in the billions). So we leave it and process this case as
1173 usual, and wait until tre_expand_ast() to replace with EMPTY */
1174#ifdef TRE_APPROX
1175 if (min < 0 && max < 0)
1176 /* Only approximate parameters set, no repetitions. */
1177 min = max = 1;
1178#endif /* TRE_APPROX */
1179
1180 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
1181 if (!*result)
1182 return REG_ESPACE;
5f2eab64 1183
d5f8dde1
JM
1184#ifdef TRE_APPROX
1185 /* If approximate matching parameters are set, add them to the
1186 iteration node. */
1187 if (approx || costs_set || counts_set)
1188 {
1189 int *params;
1190 tre_iteration_t *iter = (*result)->obj;
5f2eab64 1191
d5f8dde1 1192 if (costs_set || counts_set)
5f2eab64 1193 {
d5f8dde1 1194 if (limit_ins == TRE_PARAM_UNSET)
5f2eab64 1195 {
d5f8dde1
JM
1196 if (cost_ins == TRE_PARAM_UNSET)
1197 limit_ins = 0;
1198 else
1199 limit_ins = INT_MAX;
1200 }
5f2eab64 1201
d5f8dde1
JM
1202 if (limit_del == TRE_PARAM_UNSET)
1203 {
1204 if (cost_del == TRE_PARAM_UNSET)
1205 limit_del = 0;
1206 else
1207 limit_del = INT_MAX;
5f2eab64
JM
1208 }
1209
d5f8dde1
JM
1210 if (limit_subst == TRE_PARAM_UNSET)
1211 {
1212 if (cost_subst == TRE_PARAM_UNSET)
1213 limit_subst = 0;
1214 else
1215 limit_subst = INT_MAX;
1216 }
5f2eab64 1217 }
d5f8dde1
JM
1218
1219 if (cost_max == TRE_PARAM_UNSET)
1220 cost_max = INT_MAX;
1221 if (limit_err == TRE_PARAM_UNSET)
1222 limit_err = INT_MAX;
1223
1224 ctx->have_approx = 1;
1225 params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
1226 if (!params)
1227 return REG_ESPACE;
1228 for (i = 0; i < TRE_PARAM_LAST; i++)
1229 params[i] = TRE_PARAM_UNSET;
1230 params[TRE_PARAM_COST_INS] = cost_ins;
1231 params[TRE_PARAM_COST_DEL] = cost_del;
1232 params[TRE_PARAM_COST_SUBST] = cost_subst;
1233 params[TRE_PARAM_COST_MAX] = cost_max;
1234 params[TRE_PARAM_MAX_INS] = limit_ins;
1235 params[TRE_PARAM_MAX_DEL] = limit_del;
1236 params[TRE_PARAM_MAX_SUBST] = limit_subst;
1237 params[TRE_PARAM_MAX_ERR] = limit_err;
1238 iter->params = params;
5f2eab64 1239 }
d5f8dde1 1240#endif /* TRE_APPROX */
5f2eab64 1241
d5f8dde1
JM
1242parse_bound_exit:
1243#ifdef TRE_APPROX
5f2eab64
JM
1244 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
1245 "limits [%d,%d,%d, total %d]\n",
1246 min, max, cost_ins, cost_del, cost_subst, cost_max,
1247 limit_ins, limit_del, limit_subst, limit_err));
d5f8dde1
JM
1248#else /* !TRE_APPROX */
1249 DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
1250#endif /* !TRE_APPROX */
5f2eab64
JM
1251
1252
1253 ctx->re = r;
1254 return REG_OK;
1255}
1256
d5f8dde1
JM
1257/* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1258 non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1259 treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1260 Because we now set up tags for even non-capturing parenthesized
1261 subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we
1262 pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1263 have it restore cflags after the subexpression, we don't need to have
1264 a separate PARSE_RESTORE_CFLAGS, and then after processing the
1265 non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1266 This has the side-benefit of now matching the perl behavior: the RE
1267 foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1268 of foo AND (?i) (bar OR zap). */
5f2eab64
JM
1269typedef enum {
1270 PARSE_RE = 0,
1271 PARSE_ATOM,
1272 PARSE_MARK_FOR_SUBMATCH,
1273 PARSE_BRANCH,
1274 PARSE_PIECE,
1275 PARSE_CATENATION,
1276 PARSE_POST_CATENATION,
1277 PARSE_UNION,
1278 PARSE_POST_UNION,
1279 PARSE_POSTFIX,
5f2eab64
JM
1280} tre_parse_re_stack_symbol_t;
1281
1282
1283reg_errcode_t
1284tre_parse(tre_parse_ctx_t *ctx)
1285{
1286 tre_ast_node_t *result = NULL;
1287 tre_parse_re_stack_symbol_t symbol;
1288 reg_errcode_t status = REG_OK;
1289 tre_stack_t *stack = ctx->stack;
1290 int bottom = tre_stack_num_objects(stack);
1291 int depth = 0;
1292 int temporary_cflags = 0;
d5f8dde1
JM
1293 int bre_branch_begin;
1294#ifdef TRE_DEBUG
1295 const tre_char_t *tmp_re;
1296#endif
5f2eab64 1297
d5f8dde1
JM
1298 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
1299 ctx->len, ctx->re, ctx->len, ctx->cflags));
5f2eab64 1300
d5f8dde1 1301 if (ctx->len <= 0) return REG_EMPTY;
5f2eab64
JM
1302 if (!ctx->nofirstsub)
1303 {
d5f8dde1 1304 STACK_PUSH(stack, int, ctx->cflags);
5f2eab64
JM
1305 STACK_PUSH(stack, int, ctx->submatch_id);
1306 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
1307 ctx->submatch_id++;
1308 }
d5f8dde1 1309 STACK_PUSH(stack, int, 0); // bre_branch_begin
5f2eab64
JM
1310 STACK_PUSH(stack, int, PARSE_RE);
1311 ctx->re_start = ctx->re;
1312 ctx->re_end = ctx->re + ctx->len;
1313
1314
1315 /* The following is basically just a recursive descent parser. I use
1316 an explicit stack instead of recursive functions mostly because of
1317 two reasons: compatibility with systems which have an overflowable
1318 call stack, and efficiency (both in lines of code and speed). */
d5f8dde1 1319 while (tre_stack_num_objects(stack) > bottom)
5f2eab64 1320 {
5f2eab64
JM
1321 symbol = tre_stack_pop_int(stack);
1322 switch (symbol)
1323 {
1324 case PARSE_RE:
1325 /* Parse a full regexp. A regexp is one or more branches,
1326 separated by the union operator `|'. */
d5f8dde1
JM
1327 bre_branch_begin = tre_stack_pop_int(stack);
1328 if (
5f2eab64 1329#ifdef REG_LITERAL
d5f8dde1 1330 !(ctx->cflags & REG_LITERAL) &&
5f2eab64 1331#endif /* REG_LITERAL */
d5f8dde1 1332 ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
5f2eab64 1333 STACK_PUSHX(stack, int, PARSE_UNION);
d5f8dde1 1334 STACK_PUSHX(stack, int, bre_branch_begin);
5f2eab64
JM
1335 STACK_PUSHX(stack, int, PARSE_BRANCH);
1336 break;
1337
1338 case PARSE_BRANCH:
1339 /* Parse a branch. A branch is one or more pieces, concatenated.
1340 A piece is an atom possibly followed by a postfix operator. */
d5f8dde1 1341 bre_branch_begin = tre_stack_pop_int(stack);
5f2eab64 1342 STACK_PUSHX(stack, int, PARSE_CATENATION);
d5f8dde1 1343 STACK_PUSHX(stack, int, bre_branch_begin);
5f2eab64
JM
1344 STACK_PUSHX(stack, int, PARSE_PIECE);
1345 break;
1346
1347 case PARSE_PIECE:
1348 /* Parse a piece. A piece is an atom possibly followed by one
1349 or more postfix operators. */
d5f8dde1
JM
1350 bre_branch_begin = tre_stack_pop_int(stack);
1351 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1352 STACK_PUSHX(stack, int, bre_branch_begin);
5f2eab64
JM
1353 STACK_PUSHX(stack, int, PARSE_ATOM);
1354 break;
1355
1356 case PARSE_CATENATION:
1357 /* If the expression has not ended, parse another piece. */
1358 {
1359 tre_char_t c;
1360 if (ctx->re >= ctx->re_end)
1361 break;
1362 c = *ctx->re;
1363#ifdef REG_LITERAL
1364 if (!(ctx->cflags & REG_LITERAL))
1365 {
1366#endif /* REG_LITERAL */
d5f8dde1
JM
1367 if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
1368 ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
1369 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1370 *(ctx->re + 1) == CHAR_PIPE))
5f2eab64
JM
1371 break;
1372 if ((ctx->cflags & REG_EXTENDED
1373 && c == CHAR_RPAREN && depth > 0)
1374 || (!(ctx->cflags & REG_EXTENDED)
d5f8dde1
JM
1375 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1376 && *(ctx->re + 1) == CHAR_RPAREN))
5f2eab64
JM
1377 {
1378 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
d5f8dde1 1379 return REG_EPAREN;
5f2eab64
JM
1380 DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
1381 REST(ctx->re)));
1382 depth--;
d5f8dde1 1383 if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
5f2eab64
JM
1384 ctx->re += 2;
1385 break;
1386 }
1387#ifdef REG_LITERAL
1388 }
1389#endif /* REG_LITERAL */
1390
d5f8dde1
JM
1391#ifdef REG_LEFT_ASSOC
1392 if (ctx->cflags & REG_LEFT_ASSOC)
5f2eab64 1393 {
d5f8dde1
JM
1394 /* Left associative concatenation. */
1395 STACK_PUSHX(stack, int, PARSE_CATENATION);
5f2eab64
JM
1396 STACK_PUSHX(stack, voidptr, result);
1397 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
d5f8dde1 1398 STACK_PUSHX(stack, int, 0); // bre_branch_begin
5f2eab64
JM
1399 STACK_PUSHX(stack, int, PARSE_PIECE);
1400 }
1401 else
d5f8dde1 1402#endif /* REG_LEFT_ASSOC */
5f2eab64 1403 {
d5f8dde1 1404 /* Default case, right associative concatenation. */
5f2eab64
JM
1405 STACK_PUSHX(stack, voidptr, result);
1406 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
d5f8dde1
JM
1407 STACK_PUSHX(stack, int, PARSE_CATENATION);
1408 STACK_PUSHX(stack, int, 0); // bre_branch_begin
5f2eab64
JM
1409 STACK_PUSHX(stack, int, PARSE_PIECE);
1410 }
1411 break;
1412 }
1413
1414 case PARSE_POST_CATENATION:
1415 {
1416 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1417 tre_ast_node_t *tmp_node;
1418 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1419 if (!tmp_node)
1420 return REG_ESPACE;
1421 result = tmp_node;
1422 break;
1423 }
1424
1425 case PARSE_UNION:
1426 if (ctx->re >= ctx->re_end)
1427 break;
1428#ifdef REG_LITERAL
1429 if (ctx->cflags & REG_LITERAL)
1430 break;
1431#endif /* REG_LITERAL */
d5f8dde1
JM
1432 if (!(ctx->cflags & REG_EXTENDED))
1433 {
1434 if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
1435 break;
1436 ctx->re++;
1437 }
5f2eab64
JM
1438 switch (*ctx->re)
1439 {
1440 case CHAR_PIPE:
1441 DPRINT(("tre_parse: union: '%.*" STRF "'\n",
1442 REST(ctx->re)));
1443 STACK_PUSHX(stack, int, PARSE_UNION);
d5f8dde1 1444 STACK_PUSHX(stack, voidptr, (void *)ctx->re);
5f2eab64
JM
1445 STACK_PUSHX(stack, voidptr, result);
1446 STACK_PUSHX(stack, int, PARSE_POST_UNION);
d5f8dde1
JM
1447 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1448 indicate if this is the beginning of a BRE extended branch. */
1449 STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
5f2eab64
JM
1450 STACK_PUSHX(stack, int, PARSE_BRANCH);
1451 ctx->re++;
1452 break;
1453
1454 case CHAR_RPAREN:
1455 ctx->re++;
1456 break;
1457
1458 default:
d5f8dde1
JM
1459 if (!(ctx->cflags & REG_EXTENDED))
1460 ctx->re--;
5f2eab64
JM
1461 break;
1462 }
1463 break;
1464
1465 case PARSE_POST_UNION:
1466 {
1467 tre_ast_node_t *tmp_node;
1468 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
d5f8dde1
JM
1469 const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1470 /* error on empty expression at end of union */
1471 if (pipechar == ctx->re - 1)
1472 {
1473 return REG_EMPTY;
1474 }
5f2eab64
JM
1475 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1476 if (!tmp_node)
1477 return REG_ESPACE;
1478 result = tmp_node;
1479 break;
1480 }
1481
1482 case PARSE_POSTFIX:
1483 /* Parse postfix operators. */
1484 if (ctx->re >= ctx->re_end)
1485 break;
1486#ifdef REG_LITERAL
1487 if (ctx->cflags & REG_LITERAL)
1488 break;
1489#endif /* REG_LITERAL */
d5f8dde1
JM
1490 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1491 int rep_min = 0;
1492 int rep_max = -1;
1493#ifdef TRE_DEBUG
1494 int lbrace_off;
1495#endif
5f2eab64
JM
1496 switch (*ctx->re)
1497 {
1498 case CHAR_PLUS:
1499 case CHAR_QUESTIONMARK:
1500 if (!(ctx->cflags & REG_EXTENDED))
1501 break;
1502 /*FALLTHROUGH*/
1503 case CHAR_STAR:
1504 {
1505 tre_ast_node_t *tmp_node;
5f2eab64 1506#ifdef TRE_DEBUG
d5f8dde1
JM
1507 const char *tstr = "star";
1508 tmp_re = ctx->re;
5f2eab64
JM
1509#endif
1510
d5f8dde1
JM
1511 handle_plus_or_question:
1512 /* error on iteration of raw assertion (not in subexpression) */
1513 if (result->type == LITERAL && result->submatch_id < 0 &&
1514 IS_ASSERTION((tre_literal_t *)result->obj))
1515 {
1516 if (!(ctx->cflags & REG_EXTENDED)) break;
1517 return REG_BADRPT;
1518 }
5f2eab64 1519 if (*ctx->re == CHAR_PLUS)
d5f8dde1
JM
1520 {
1521 rep_min = 1;
1522#ifdef TRE_DEBUG
1523 tstr = "plus";
1524#endif
1525 }
5f2eab64 1526 if (*ctx->re == CHAR_QUESTIONMARK)
d5f8dde1
JM
1527 {
1528 rep_max = 1;
5f2eab64 1529#ifdef TRE_DEBUG
d5f8dde1 1530 tstr = "questionmark";
5f2eab64 1531#endif
d5f8dde1 1532 }
5f2eab64 1533
d5f8dde1 1534 if (ctx->cflags & REG_EXTENDED)
5f2eab64 1535 {
d5f8dde1 1536 if (ctx->re + 1 < ctx->re_end)
5f2eab64 1537 {
d5f8dde1
JM
1538 if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1539 {
1540 /* Process the question mark only in enhanced mode.
1541 Otherwise, the question mark is an error in ERE */
1542 if (ctx->cflags & REG_ENHANCED)
1543 {
1544 minimal = !(ctx->cflags & REG_UNGREEDY);
1545 ctx->re++;
1546 }
1547 else return REG_BADRPT;
1548 }
1549 else if (*(ctx->re + 1) == CHAR_STAR
1550 || *(ctx->re + 1) == CHAR_PLUS)
1551 {
1552 /* These are reserved for future extensions. */
1553 return REG_BADRPT;
1554 }
5f2eab64 1555 }
d5f8dde1
JM
1556 }
1557 else
1558 {
1559 if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
5f2eab64 1560 {
d5f8dde1 1561 /* This is reserved for future extensions. */
5f2eab64
JM
1562 return REG_BADRPT;
1563 }
d5f8dde1
JM
1564 if (ctx->re + 2 < ctx->re_end)
1565 {
1566 if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1567 {
1568 /* Process the question mark only in enhanced mode.
1569 Otherwise, the question mark is a literal in BRE */
1570 if (ctx->cflags & REG_ENHANCED)
1571 {
1572 minimal = !(ctx->cflags & REG_UNGREEDY);
1573 ctx->re += 2;
1574 }
1575 }
1576 else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
1577 {
1578 /* This is reserved for future extensions. */
1579 return REG_BADRPT;
1580 }
1581 }
1582 }
5f2eab64 1583
d5f8dde1
JM
1584 if (minimal)
1585 ctx->num_reorder_tags++;
1586
1587 DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1588 minimal ? " minimal" : "greedy", tstr, REST(tmp_re)));
1589 if (result == NULL)
1590 {
1591 if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1592 else goto parse_literal;
1593 }
5f2eab64
JM
1594 ctx->re++;
1595 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1596 minimal);
1597 if (tmp_node == NULL)
1598 return REG_ESPACE;
1599 result = tmp_node;
d5f8dde1
JM
1600
1601 /* Set the iterator with a submatch id in the invisible range
1602 * (which will be overridden if a real submatch is needed) */
1603 result->submatch_id = ctx->submatch_id_invisible++;
1604
1605#if 0
1606 /* We don't allow multiple postfixes, but this might be needed
1607 to support approximate matching */
5f2eab64 1608 STACK_PUSHX(stack, int, PARSE_POSTFIX);
d5f8dde1 1609#endif
5f2eab64
JM
1610 }
1611 break;
1612
1613 case CHAR_BACKSLASH:
1614 /* "\{" is special without REG_EXTENDED */
d5f8dde1 1615 /* "\+" and "\?" are special with REG_ENHANCED for BRE */
5f2eab64 1616 if (!(ctx->cflags & REG_EXTENDED)
d5f8dde1 1617 && ctx->re + 1 < ctx->re_end)
5f2eab64 1618 {
d5f8dde1
JM
1619 switch (*(ctx->re + 1))
1620 {
1621 case CHAR_LBRACE:
1622 ctx->re++;
1623#ifdef TRE_DEBUG
1624 lbrace_off = 2;
1625#endif
1626 goto parse_brace;
1627 case CHAR_PLUS:
1628 case CHAR_QUESTIONMARK:
1629 if (ctx->cflags & REG_ENHANCED)
1630 {
1631#ifdef TRE_DEBUG
1632 tmp_re = ctx->re;
1633#endif
1634 ctx->re++;
1635 goto handle_plus_or_question;
1636 }
1637 break;
1638 }
1639 break;
5f2eab64
JM
1640 }
1641 else
1642 break;
1643
1644 case CHAR_LBRACE:
d5f8dde1
JM
1645 {
1646 int raw_assertion;
1647
1648 /* "{" is literal without REG_EXTENDED */
1649 if (!(ctx->cflags & REG_EXTENDED))
1650 break;
1651#ifdef TRE_DEBUG
1652 lbrace_off = 1;
1653#endif
5f2eab64
JM
1654
1655 parse_brace:
d5f8dde1
JM
1656 /* error on iteration of raw assertion (not in subexpression),
1657 but wait until after parsing bounds */
1658 raw_assertion = (result->type == LITERAL
1659 && result->submatch_id < 0
1660 && IS_ASSERTION((tre_literal_t *)result->obj));
1661 ctx->re++;
5f2eab64 1662
d5f8dde1
JM
1663 status = tre_parse_bound(ctx, &result);
1664#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1665 /* For ERE, if status is REG_NOMATCH, this mean the lbrace
1666 is to be treated as a literal. */
1667 if (status == REG_NOMATCH)
1668 {
1669 ctx->re--;
1670 break;
1671 }
1672#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1673 DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1674 REST(ctx->re - lbrace_off)));
1675 if (status != REG_OK)
1676 return status;
1677 if (raw_assertion) return REG_BADRPT;
1678
1679 /* Set the iterator with a submatch id in the invisible range
1680 * (which will be overridden if a real submatch is needed) */
1681 if (result->type == ITERATION)
1682 result->submatch_id = ctx->submatch_id_invisible++;
1683
1684#if 0
1685 /* We don't allow multiple postfixes, but this might be needed
1686 to support approximate matching */
1687 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1688#endif
1689 break;
1690 }
5f2eab64
JM
1691 }
1692 break;
1693
1694 case PARSE_ATOM:
d5f8dde1
JM
1695 {
1696 /* Parse an atom. An atom is a regular expression enclosed in `()',
1697 an empty set of `()', a bracket expression, `.', `^', `$',
1698 a `\' followed by a character, or a single character. */
5f2eab64 1699
d5f8dde1
JM
1700 /* The stack contains a boolean value, whether PARSE_ATOM is
1701 being called just after the start of a group (left paren)
1702 in a BRE */
1703 bre_branch_begin = tre_stack_pop_int(stack);
1704
1705 /* End of regexp? (empty string). */
1706 if (ctx->re >= ctx->re_end)
1707 goto parse_literal;
5f2eab64
JM
1708
1709#ifdef REG_LITERAL
d5f8dde1
JM
1710 if (ctx->cflags & REG_LITERAL)
1711 goto parse_literal;
5f2eab64
JM
1712#endif /* REG_LITERAL */
1713
d5f8dde1
JM
1714 switch (*ctx->re)
1715 {
1716 case CHAR_LPAREN: /* parenthesized subexpression */
5f2eab64 1717
d5f8dde1
JM
1718 /* Handle "(?...)" extensions. They work in a way similar
1719 to Perls corresponding extensions. */
1720 if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
1721 (REG_EXTENDED|REG_ENHANCED)
1722 && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1723 {
1724 int new_cflags = ctx->cflags;
1725 int bit = 1;
1726 int invisible_submatch = 0;
1727 DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
1728 REST(ctx->re)));
1729 ctx->re += 2;
1730 while (/*CONSTCOND*/1)
1731 {
1732 if (*ctx->re == L'i')
1733 {
1734 DPRINT(("tre_parse: icase: '%.*" STRF "'\n",
1735 REST(ctx->re)));
1736 if (bit)
1737 new_cflags |= REG_ICASE;
1738 else
1739 new_cflags &= ~REG_ICASE;
1740 ctx->re++;
1741 }
1742 else if (*ctx->re == L'n')
1743 {
1744 DPRINT(("tre_parse: newline: '%.*" STRF "'\n",
1745 REST(ctx->re)));
1746 if (bit)
1747 new_cflags |= REG_NEWLINE;
1748 else
1749 new_cflags &= ~REG_NEWLINE;
1750 ctx->re++;
1751 }
1752#ifdef REG_LEFT_ASSOC
1753 else if (*ctx->re == L'l')
1754 {
1755 DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
1756 REST(ctx->re)));
1757 if (bit)
1758 new_cflags |= REG_LEFT_ASSOC;
1759 else
1760 new_cflags &= ~REG_LEFT_ASSOC;
1761 ctx->re++;
1762 }
1763#endif /* REG_LEFT_ASSOC */
5f2eab64 1764#ifdef REG_UNGREEDY
d5f8dde1
JM
1765 else if (*ctx->re == L'U')
1766 {
1767 DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n",
1768 REST(ctx->re)));
1769 if (bit)
1770 new_cflags |= REG_UNGREEDY;
1771 else
1772 new_cflags &= ~REG_UNGREEDY;
1773 ctx->re++;
1774 }
5f2eab64 1775#endif /* REG_UNGREEDY */
d5f8dde1
JM
1776 else if (*ctx->re == CHAR_MINUS)
1777 {
1778 DPRINT(("tre_parse: turn off: '%.*" STRF "'\n",
1779 REST(ctx->re)));
5f2eab64 1780 ctx->re++;
d5f8dde1
JM
1781 bit = 0;
1782 }
1783 else if (*ctx->re == CHAR_COLON)
1784 {
1785 DPRINT(("tre_parse: no group: '%.*" STRF
1786 "', (invisible submatch %d)\n",
1787 REST(ctx->re), ctx->submatch_id_invisible));
1788 ctx->re++;
1789 depth++;
1790 invisible_submatch = 1;
1791 break;
1792 }
1793 else if (*ctx->re == CHAR_HASH)
1794 {
1795 DPRINT(("tre_parse: comment: '%.*" STRF "'\n",
1796 REST(ctx->re)));
1797 /* A comment can contain any character except a
1798 right parenthesis */
1799 while (*ctx->re != CHAR_RPAREN
1800 && ctx->re < ctx->re_end)
5f2eab64 1801 ctx->re++;
d5f8dde1
JM
1802 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1803 {
1804 ctx->re++;
1805 break;
1806 }
1807 else
1808 return REG_BADPAT;
1809 }
1810 else if (*ctx->re == CHAR_RPAREN)
1811 {
1812 ctx->re++;
1813 break;
1814 }
1815 else
1816 return REG_BADRPT;
1817 }
5f2eab64 1818
d5f8dde1
JM
1819 /* Turn on the cflags changes for the rest of the
1820 enclosing group. */
1821 if (invisible_submatch)
1822 {
1823 STACK_PUSHX(stack, int, ctx->cflags);
1824 STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1825 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1826 ctx->submatch_id_invisible++;
1827 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1828 STACK_PUSHX(stack, int, PARSE_RE);
1829 }
1830 else {
1831 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1832 STACK_PUSHX(stack, int, PARSE_ATOM);
5f2eab64 1833 }
d5f8dde1
JM
1834 ctx->cflags = new_cflags;
1835 break;
1836 }
5f2eab64 1837
d5f8dde1
JM
1838 if (ctx->cflags & REG_EXTENDED)
1839 {
1840 parse_bre_lparen:
1841 DPRINT(("tre_parse: group begin: '%.*" STRF
1842 "', submatch %d\n", REST(ctx->re),
1843 ctx->submatch_id));
1844 ctx->re++;
1845 /* First parse a whole RE, then mark the resulting tree
1846 for submatching. */
1847 STACK_PUSHX(stack, int, ctx->cflags);
1848 STACK_PUSHX(stack, int, ctx->submatch_id);
1849 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1850 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1851 indicate if this is the beginning of a BRE group. */
1852 STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
1853 STACK_PUSHX(stack, int, PARSE_RE);
1854 ctx->submatch_id++;
1855 depth++;
1856 }
1857 else
1858 goto parse_literal;
1859 break;
5f2eab64 1860
d5f8dde1
JM
1861 case CHAR_RPAREN: /* end of current subexpression */
1862 if (ctx->cflags & REG_EXTENDED && depth > 0)
1863 {
1864 parse_bre_rparen_empty:
1865 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1866 return REG_EPAREN;
1867 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1868 REST(ctx->re)));
1869 /* We were expecting an atom, but instead the current
1870 subexpression was closed. POSIX leaves the meaning of
1871 this to be implementation-defined. We interpret this as
1872 an empty expression (which matches an empty string). */
1873 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1874 if (result == NULL)
1875 return REG_ESPACE;
1876 if (!(ctx->cflags & REG_EXTENDED))
1877 ctx->re--;
1878 }
1879 else
1880 goto parse_literal;
1881 break;
5f2eab64 1882
d5f8dde1
JM
1883 case CHAR_LBRACKET: /* bracket expression */
1884 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
1885 REST(ctx->re)));
1886 ctx->re++;
1887 status = tre_parse_bracket(ctx, &result);
1888 if (status != REG_OK)
1889 return status;
1890 break;
5f2eab64 1891
d5f8dde1
JM
1892 case CHAR_BACKSLASH:
1893 /* Deal with "\(", "\)" or "\{" for BREs */
1894 if (!(ctx->cflags & REG_EXTENDED)
1895 && ctx->re + 1 < ctx->re_end)
5f2eab64 1896 {
d5f8dde1
JM
1897 if (*(ctx->re + 1) == CHAR_LPAREN)
1898 {
1899 ctx->re++;
1900 goto parse_bre_lparen;
1901 }
1902 else if (*(ctx->re + 1) == CHAR_RPAREN)
1903 {
1904 ctx->re++;
1905 goto parse_bre_rparen_empty;
1906 }
1907 if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
5f2eab64 1908 }
5f2eab64 1909
d5f8dde1
JM
1910 if (ctx->re + 1 >= ctx->re_end)
1911 /* Trailing backslash. */
1912 return REG_EESCAPE;
5f2eab64 1913
d5f8dde1
JM
1914 if (!(ctx->cflags & REG_ENHANCED))
1915 {
1916 DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1917 ctx->re++;
1918 goto unenhanced_backslash;
1919 }
5f2eab64 1920
d5f8dde1 1921 /* If a macro is used, parse the expanded macro recursively. */
5f2eab64 1922 {
d5f8dde1
JM
1923 tre_char_t buf[64];
1924 tre_expand_macro(ctx->re + 1, ctx->re_end,
1925 buf, elementsof(buf));
1926 if (buf[0] != 0)
5f2eab64 1927 {
d5f8dde1
JM
1928 tre_parse_ctx_t subctx;
1929 memcpy(&subctx, ctx, sizeof(subctx));
1930 subctx.re = buf;
1931 subctx.len = tre_strlen(buf);
1932 subctx.nofirstsub = 1;
1933 status = tre_parse(&subctx);
1934 if (status != REG_OK)
1935 return status;
1936 ctx->re += 2;
1937 ctx->position = subctx.position;
1938 result = subctx.result;
5f2eab64
JM
1939 break;
1940 }
5f2eab64 1941 }
5f2eab64 1942
d5f8dde1
JM
1943#ifdef REG_LITERAL
1944 if (*(ctx->re + 1) == L'Q')
1945 {
1946 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1947 REST(ctx->re)));
1948 ctx->cflags |= REG_LITERAL;
1949 temporary_cflags |= REG_LITERAL;
1950 ctx->re += 2;
1951 STACK_PUSHX(stack, int, 0);
1952 STACK_PUSHX(stack, int, PARSE_ATOM);
1953 break;
1954 }
1955#endif /* REG_LITERAL */
1956
1957 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re)));
1958 ctx->re++;
1959 switch (*ctx->re)
1960 {
1961 case L'b':
1962 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1963 ASSERT_AT_WB, -1);
1964 ctx->re++;
1965 break;
1966 case L'B':
1967 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1968 ASSERT_AT_WB_NEG, -1);
1969 ctx->re++;
1970 break;
1971 case L'<':
1972 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1973 ASSERT_AT_BOW, -1);
1974 ctx->re++;
1975 break;
1976 case L'>':
1977 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1978 ASSERT_AT_EOW, -1);
1979 ctx->re++;
1980 break;
1981 case L'x':
1982 ctx->re++;
1983 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1984 {
1985 /* 8 bit hex char. */
1986 char tmp[3] = {0, 0, 0};
1987 long val;
1988 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
1989 REST(ctx->re - 2)));
1990
1991 if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1992 ctx->re < ctx->re_end)
1993 {
1994 tmp[0] = (char)ctx->re[0];
1995 ctx->re++;
1996 }
1997 if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1998 ctx->re < ctx->re_end)
1999 {
2000 tmp[1] = (char)ctx->re[0];
2001 ctx->re++;
2002 }
2003 val = strtol(tmp, NULL, 16);
2004 result = tre_ast_new_literal(ctx->mem, (int)val,
2005 (int)val, ctx->position);
2006 ctx->position++;
2007 break;
2008 }
2009 else if (ctx->re < ctx->re_end)
2010 {
2011 /* Wide char. */
2012 char tmp[32];
2013 long val;
2014 int i = 0;
2015 ctx->re++;
2016 while (ctx->re_end - ctx->re >= 0)
2017 {
2018 if (ctx->re[0] == CHAR_RBRACE)
2019 break;
2020 if (tre_isxdigit_l(ctx->re[0], ctx->loc))
2021 {
2022 tmp[i] = (char)ctx->re[0];
2023 i++;
2024 ctx->re++;
2025 continue;
2026 }
2027 return REG_EBRACE;
2028 }
2029 ctx->re++;
2030 tmp[i] = 0;
2031 val = strtol(tmp, NULL, 16);
2032 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
2033 ctx->position);
2034 ctx->position++;
2035 break;
2036 }
2037 /*FALLTHROUGH*/
2038
2039 default:
2040 unenhanced_backslash:
2041 if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
2042 REG_EXTENDED &&
2043 tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
2044 {
2045 /* Back reference (only in BRE or enhanced). */
2046 int val = *ctx->re - L'0';
2047 DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
2048 REST(ctx->re - 1)));
2049 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
2050 ctx->position);
2051 if (result == NULL)
2052 return REG_ESPACE;
2053
2054 /* Set the backref with a submatch id in the invisible
2055 * range (which will be overridden if a real submatch
2056 * is needed) */
2057 result->submatch_id = ctx->submatch_id_invisible++;
2058
2059 ctx->position++;
2060 ctx->num_reorder_tags++;
2061 ctx->max_backref = MAX(val, ctx->max_backref);
2062 ctx->re++;
2063 }
2064 else
2065 {
2066 /* Escaped character. */
2067 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
2068 REST(ctx->re - 1)));
2069 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2070 ctx->position);
2071 ctx->position++;
2072 ctx->re++;
2073 }
2074 break;
2075 }
2076 if (result == NULL)
2077 return REG_ESPACE;
2078 break;
2079
2080 case CHAR_PERIOD: /* the any-symbol */
2081 DPRINT(("tre_parse: any: '%.*" STRF "'\n",
2082 REST(ctx->re)));
2083 if (ctx->cflags & REG_NEWLINE)
2084 {
2085 tre_ast_node_t *tmp1;
2086 tre_ast_node_t *tmp2;
2087 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
5f2eab64 2088 ctx->position);
d5f8dde1
JM
2089 if (!tmp1)
2090 return REG_ESPACE;
2091 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
2092 ctx->position + 1);
2093 if (!tmp2)
2094 return REG_ESPACE;
2095 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2096 if (!result)
2097 return REG_ESPACE;
2098 ctx->position += 2;
2099 }
2100 else
2101 {
2102 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
2103 ctx->position);
2104 if (!result)
2105 return REG_ESPACE;
2106 ctx->position++;
2107 }
2108 ctx->re++;
2109 break;
5f2eab64 2110
d5f8dde1
JM
2111 case CHAR_CARET: /* beginning of line assertion */
2112 /* '^' has a special meaning everywhere in EREs, at the
2113 beginning of the RE and after \( is BREs. It is also
2114 special in enhanced BREs at the beginning of each branches
2115 of a union */
2116 if (ctx->cflags & REG_EXTENDED
2117 || bre_branch_begin
2118 || ctx->re == ctx->re_start)
2119 {
2120 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
2121 REST(ctx->re)));
2122 result = tre_ast_new_literal(ctx->mem, ASSERTION,
2123 ASSERT_AT_BOL, -1);
2124 if (result == NULL)
2125 return REG_ESPACE;
2126 ctx->re++;
2127 }
2128 else
2129 goto parse_literal;
2130 break;
5f2eab64 2131
d5f8dde1
JM
2132 case CHAR_DOLLAR: /* end of line assertion. */
2133 /* '$' is special everywhere in EREs, and in the end of the
2134 string and before \) is BREs. */
2135 if (ctx->cflags & REG_EXTENDED
2136 || (ctx->re + 2 < ctx->re_end
2137 && *(ctx->re + 1) == CHAR_BACKSLASH
2138 && *(ctx->re + 2) == CHAR_RPAREN)
2139 || ctx->re + 1 == ctx->re_end)
2140 {
2141 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
2142 REST(ctx->re)));
2143 result = tre_ast_new_literal(ctx->mem, ASSERTION,
2144 ASSERT_AT_EOL, -1);
2145 if (result == NULL)
2146 return REG_ESPACE;
2147 ctx->re++;
2148 }
2149 else
2150 goto parse_literal;
2151 break;
5f2eab64 2152
d5f8dde1
JM
2153 default:
2154 parse_literal:
5f2eab64 2155
d5f8dde1
JM
2156 if (temporary_cflags && ctx->re + 1 < ctx->re_end
2157 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
2158 {
2159 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
2160 REST(ctx->re)));
2161 ctx->cflags &= ~temporary_cflags;
2162 temporary_cflags = 0;
2163 ctx->re += 2;
2164 if (ctx->re < ctx->re_end)
2165 {
2166 STACK_PUSHX(stack, int, 0);
2167 STACK_PUSHX(stack, int, PARSE_ATOM);
2168 }
2169 else
2170 {
2171 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2172 if (!result) return REG_ESPACE;
2173 }
2174 break;
2175 }
5f2eab64
JM
2176
2177
d5f8dde1
JM
2178 /* We are expecting an atom. If the subexpression (or the whole
2179 regexp ends here, we interpret it as an empty expression
2180 (which matches an empty string), which is an error.
2181 Iterations of an empty expression is also an error. */
5f2eab64 2182#ifdef REG_LITERAL
d5f8dde1
JM
2183 if (!(ctx->cflags & REG_LITERAL))
2184 {
2185#endif /* REG_LITERAL */
2186 /* error on end of string */
2187 if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
2188 : REG_EMPTY;
2189 /* error on unions and iterations of empty expressions */
2190 if (ctx->cflags & REG_EXTENDED)
2191 {
2192 if (ctx->re < ctx->re_end)
2193 {
2194 if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
2195 if (*ctx->re == CHAR_LBRACE)
2196 {
2197 ctx->re++;
2198 empty_parse_bound:
2199 /* We need to parse the bound first and return
2200 any error, before returning REG_BADRPT */
2201 status = tre_parse_bound(ctx, NULL);
2202#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2203 /* For ERE, if REG_NOMATCH is returned, we
2204 treat the lbrace as a literal. */
2205 if (status == REG_NOMATCH)
2206 {
2207 ctx->re--;
2208 /* Drop down to literal-handling code */
2209 }
2210 else
2211 {
2212#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2213 if (status != REG_OK)
2214 return status;
2215 return REG_BADRPT;
2216#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2217 }
2218#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2219 }
2220#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2221 else
2222#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2223 if (*ctx->re == CHAR_STAR
2224 || *ctx->re == CHAR_PLUS
2225 || *ctx->re == CHAR_QUESTIONMARK)
2226 {
2227 return REG_BADRPT;
2228 }
2229 }
2230 }
2231 else if (ctx->re + 1 < ctx->re_end
2232 && *ctx->re == CHAR_BACKSLASH
2233 && *(ctx->re + 1) == CHAR_LBRACE)
2234 {
2235 ctx->re += 2;
2236 goto empty_parse_bound;
2237 }
2238#ifdef REG_LITERAL
2239 }
5f2eab64 2240#endif /* REG_LITERAL */
5f2eab64 2241
d5f8dde1
JM
2242 DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
2243 REST(ctx->re)));
2244 /* Note that we can't use an tre_isalpha() test here, since there
2245 may be characters which are alphabetic but neither upper or
2246 lower case. */
2247 if (ctx->cflags & REG_ICASE
2248 && (tre_isupper_l(*ctx->re, ctx->loc) ||
2249 tre_islower_l(*ctx->re, ctx->loc)))
2250 {
2251 tre_ast_node_t *tmp1;
2252 tre_ast_node_t *tmp2;
2253
2254 /* XXX - Can there be more than one opposite-case
2255 counterpoints for some character in some locale? Or
2256 more than two characters which all should be regarded
2257 the same character if case is ignored? If yes, there
2258 does not seem to be a portable way to detect it. I guess
2259 that at least for multi-character collating elements there
2260 could be several opposite-case counterpoints, but they
2261 cannot be supported portably anyway. */
2262 tmp1 = tre_ast_new_literal(ctx->mem,
2263 tre_toupper_l(*ctx->re, ctx->loc),
2264 tre_toupper_l(*ctx->re, ctx->loc),
5f2eab64 2265 ctx->position);
d5f8dde1
JM
2266 if (!tmp1)
2267 return REG_ESPACE;
2268 tmp2 = tre_ast_new_literal(ctx->mem,
2269 tre_tolower_l(*ctx->re, ctx->loc),
2270 tre_tolower_l(*ctx->re, ctx->loc),
2271 ctx->position);
2272 if (!tmp2)
2273 return REG_ESPACE;
2274 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2275 if (!result)
2276 return REG_ESPACE;
2277 }
2278 else
2279 {
2280 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2281 ctx->position);
2282 if (!result)
2283 return REG_ESPACE;
2284 }
2285 ctx->position++;
2286 ctx->re++;
2287 break;
2288 }
2289 break;
2290 }
5f2eab64
JM
2291
2292 case PARSE_MARK_FOR_SUBMATCH:
2293 {
2294 int submatch_id = tre_stack_pop_int(stack);
2295
d5f8dde1
JM
2296 ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
2297 if (result->submatch_id >= 0 &&
2298 result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
5f2eab64
JM
2299 {
2300 tre_ast_node_t *n, *tmp_node;
d5f8dde1
JM
2301 if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
2302 break;
5f2eab64
JM
2303 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2304 if (n == NULL)
2305 return REG_ESPACE;
2306 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
2307 if (tmp_node == NULL)
2308 return REG_ESPACE;
2309 tmp_node->num_submatches = result->num_submatches;
2310 result = tmp_node;
2311 }
2312 result->submatch_id = submatch_id;
d5f8dde1
JM
2313 if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
2314 result->num_submatches++;
5f2eab64
JM
2315 break;
2316 }
2317
5f2eab64
JM
2318 default:
2319 assert(0);
2320 break;
2321 }
2322 }
2323
2324 /* Check for missing closing parentheses. */
2325 if (depth > 0)
2326 return REG_EPAREN;
2327
d5f8dde1 2328 ctx->result = result;
5f2eab64 2329
d5f8dde1 2330 return REG_OK;
5f2eab64
JM
2331}
2332
2333/* EOF */