TRE: Add local modifications to extend functionality
[dragonfly.git] / contrib / tre / lib / tre-parse.c
1 /*
2   tre-parse.c - Regexp parser
3
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10   This parser is just a simple recursive descent parser for POSIX.2
11   regexps.  The parser supports both the obsolete default syntax and
12   the "extended" syntax, and some nonstandard extensions.
13 */
14
15
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <string.h>
20 #include <assert.h>
21 #include <limits.h>
22 #include <stddef.h>
23
24 #include "xmalloc.h"
25 #include "tre-mem.h"
26 #include "tre-ast.h"
27 #include "tre-stack.h"
28 #include "tre-parse.h"
29
30 #include "xlocale_private.h"
31 #include "collate.h"
32
33 /* BSD compatibility:
34      Before looking up a collating symbol, check if the name matches in
35      the character names (cnames) array; if so, use the corresponding
36      character.
37
38      Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
39      the implementation choice that for ERE, a non-numeric character following
40      a left brace that would normally be a bound, causes the left brace to be
41      literal. */
42 #define BSD_COMPATIBILITY
43 #ifdef BSD_COMPATIBILITY
44 #include "cname.h"
45 #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
46 #endif /* BSD_COMPATIBILITY */
47
48 /* Characters with special meanings in regexp syntax. */
49 #define CHAR_PIPE          L'|'
50 #define CHAR_LPAREN        L'('
51 #define CHAR_RPAREN        L')'
52 #define CHAR_LBRACE        L'{'
53 #define CHAR_RBRACE        L'}'
54 #define CHAR_LBRACKET      L'['
55 #define CHAR_RBRACKET      L']'
56 #define CHAR_MINUS         L'-'
57 #define CHAR_STAR          L'*'
58 #define CHAR_QUESTIONMARK  L'?'
59 #define CHAR_PLUS          L'+'
60 #define CHAR_PERIOD        L'.'
61 #define CHAR_COLON         L':'
62 #define CHAR_EQUAL         L'='
63 #define CHAR_COMMA         L','
64 #define CHAR_CARET         L'^'
65 #define CHAR_DOLLAR        L'$'
66 #define CHAR_BACKSLASH     L'\\'
67 #define CHAR_HASH          L'#'
68 #define CHAR_TILDE         L'~'
69
70
71 /* Some macros for expanding \w, \s, etc. */
72 static const struct tre_macro_struct {
73   const char c;
74   const char *expansion;
75 } tre_macros[] =
76   { {'t', "\t"},           {'n', "\n"},            {'r', "\r"},
77     {'f', "\f"},           {'a', "\a"},            {'e', "\033"},
78     {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
79     {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
80     { 0, NULL }
81   };
82
83
84 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
85    must have at least `len' items.  Sets buf[0] to zero if the there
86    is no match in `tre_macros'. */
87 static void
88 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
89                  tre_char_t *buf, size_t buf_len)
90 {
91   int i;
92
93   buf[0] = 0;
94   if (regex >= regex_end)
95     return;
96
97   for (i = 0; tre_macros[i].expansion; i++)
98     {
99       if (tre_macros[i].c == *regex)
100         {
101           unsigned int j;
102           DPRINT(("Expanding macro '%c' => '%s'\n",
103                   tre_macros[i].c, tre_macros[i].expansion));
104           for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
105             buf[j] = tre_macros[i].expansion[j];
106           buf[j] = 0;
107           break;
108         }
109     }
110 }
111
112 static reg_errcode_t
113 tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
114          tre_bracket_match_list_t **items)
115 {
116   reg_errcode_t status = REG_OK;
117   tre_bracket_match_list_t *array = *items;
118   int i = array->num_bracket_matches;
119   /* Allocate more space if necessary. */
120   if (i >= *max_i)
121     {
122       tre_bracket_match_list_t *new_items;
123       DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
124       /* If the array is already 1024 items large, give up -- there's
125          probably an error in the regexp (e.g. not a '\0' terminated
126          string and missing ']') */
127       if (*max_i >= 1024)
128         return REG_ESPACE;
129       *max_i *= 2;
130       new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
131       if (new_items == NULL)
132         return REG_ESPACE;
133       *items = array = new_items;
134     }
135   array->bracket_matches[i].type = type;
136   array->bracket_matches[i].value = val;
137   array->num_bracket_matches++;
138   return status;
139 }
140
141 #ifndef TRE_USE_SYSTEM_WCTYPE
142
143 /* isalnum() and the rest may be macros, so wrap them to functions. */
144 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
145 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
146
147 #ifdef tre_isascii
148 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
149 #else /* !tre_isascii */
150 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
151 #endif /* !tre_isascii */
152
153 #ifdef tre_isblank
154 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
155 #else /* !tre_isblank */
156 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
157 #endif /* !tre_isblank */
158
159 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
160 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
161 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
162 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
163 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
164 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
165 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
166 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
167 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
168
169 struct {
170   char *name;
171   int (*func)(tre_cint_t);
172 } tre_ctype_map[] = {
173   { "alnum", &tre_isalnum_func },
174   { "alpha", &tre_isalpha_func },
175 #ifdef tre_isascii
176   { "ascii", &tre_isascii_func },
177 #endif /* tre_isascii */
178 #ifdef tre_isblank
179   { "blank", &tre_isblank_func },
180 #endif /* tre_isblank */
181   { "cntrl", &tre_iscntrl_func },
182   { "digit", &tre_isdigit_func },
183   { "graph", &tre_isgraph_func },
184   { "lower", &tre_islower_func },
185   { "print", &tre_isprint_func },
186   { "punct", &tre_ispunct_func },
187   { "space", &tre_isspace_func },
188   { "upper", &tre_isupper_func },
189   { "xdigit", &tre_isxdigit_func },
190   { NULL, NULL}
191 };
192
193 tre_ctype_t tre_ctype(const char *name)
194 {
195   int i;
196   for (i = 0; tre_ctype_map[i].name != NULL; i++)
197     {
198       if (strcmp(name, tre_ctype_map[i].name) == 0)
199         return tre_ctype_map[i].func;
200     }
201   return (tre_ctype_t)0;
202 }
203 #endif /* !TRE_USE_SYSTEM_WCTYPE */
204
205 #define REST(re) (int)(ctx->re_end - (re)), (re)
206
207 #define START_COLLATING_SYMBOLS         16
208 #define MAX_COLLATING_SYMBOL_LEN        4
209
210 typedef struct {
211   const tre_char_t *start;
212   int len;
213 } tre_collating_symbol;
214
215 #ifdef BSD_COMPATIBILITY
216 static wchar_t
217 tre_search_cnames(const wchar_t *name, size_t len)
218 {
219   size_t low = 0;
220   size_t high = NCNAMES - 1;
221   size_t cur;
222   int cmp;
223
224   while(low <= high)
225     {
226       cur = (low + high) / 2;
227       cmp = wcsncmp(name, cnames[cur].name, len);
228       if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
229       if (cmp > 0) low = cur + 1;
230       else high = cur - 1;
231     }
232   return (wchar_t)-1;
233 }
234 #endif /* BSD_COMPATIBILITY */
235
236 /* Scan the contents of a bracket expression, and create a
237  * tre_bracket_match_list_t encoding the bracket expression.  If during
238  * the scan, multi-character collating symbols are detected, switch
239  * into a mode to collect those MCCSs into a tre_collating_symbol
240  * list and pass them back.  tre_parse_bracket will use that to
241  * create a new string composed of a union of the bracket expression
242  * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
243  * call tre_parse (recursive) to parse that new string (which will
244  * call tre_parse_bracket and tre_parse_bracket_items again. */
245 static reg_errcode_t
246 tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
247                         int *items_size, tre_collating_symbol **result)
248 {
249   const tre_char_t *re = ctx->re;
250   const tre_char_t *re_end = ctx->re_end;
251   tre_collating_symbol *col_syms = NULL;
252   tre_collating_symbol *cp = NULL;
253   int n_col_syms = 0;
254   reg_errcode_t status;
255   int max_i = *items_size;
256   int other = 0;  /* contains content other than multi-character collating
257                    * symbols */
258   int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
259   tre_cint_t min, c;
260   int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
261   int collect_MCCS = 0;
262   const tre_char_t *start;
263
264   for ( ;re < re_end; re++)
265     {
266       switch (*re)
267         {
268         case CHAR_MINUS:
269           /* A first hyphen */
270           if (re == ctx->re)
271             {
272               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
273               min = CHAR_MINUS;
274               other++;
275               range = 0;
276               break;
277             }
278           /* The hyphen is the end range */
279           if (range > 0)
280             {
281               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
282               c = CHAR_MINUS;
283               goto process_end_range;
284             }
285           if (re + 1 >= re_end)
286             {
287               status = REG_EBRACK;
288               goto error;
289             }
290           /* The hyphen is at the end */
291           if (re[1] == CHAR_RBRACKET)
292             {
293               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
294               c = CHAR_MINUS;
295               goto process_begin_range;
296             }
297           /* Two ranges are not allowed to share an endpoint, or begin
298            * range is illegal. */
299           if (range < 0)
300             {
301               status = REG_ERANGE;
302               goto error;
303             }
304           range = 1; /* Expect end range */
305           DPRINT(("tre_parse_bracket:   range: '%.*" STRF "'\n", REST(re)));
306           break;
307
308         case CHAR_LBRACKET:
309           if (re + 1 >= re_end)
310             {
311               status = REG_EBRACK;
312               goto error;
313             }
314           switch (re[1])
315             {
316             case CHAR_PERIOD:
317               {
318                 re += 2;
319                 start = re;
320                 for (;; re++)
321                   {
322                     if (re >= re_end)
323                       {
324                         status = REG_ECOLLATE;
325                         goto error;
326                       }
327                     if (*re == CHAR_PERIOD)
328                       {
329                         if (re + 1 >= re_end)
330                           {
331                             status = REG_ECOLLATE;
332                             goto error;
333                           }
334                         /* Found end */
335                         if (re[1] == CHAR_RBRACKET)
336                           {
337                             DPRINT(("tre_parse_bracket:   collating "
338                                     "symbol: '%.*" STRF "'\n",
339                                     REST(start - 2)));
340                             /* Empty name */
341                             if (re == start)
342                               {
343                                 status = REG_ECOLLATE;
344                                 goto error;
345                               }
346 #ifdef BSD_COMPATIBILITY
347                             /* Check if the name is in cnames; if so, use
348                                the corresponding code */
349                             c = tre_search_cnames(start, re - start);
350                             if (c != (wchar_t)-1)
351                               {
352                                 re++;
353                                 goto process_single_character;
354                               }
355 #endif /* BSD_COMPATIBILITY */
356                             /* Verify this is a known sequence */
357                             if (__collate_equiv_value(ctx->loc, start,
358                                                           re - start) <= 0)
359                               {
360                                 status = REG_ECOLLATE;
361                                 goto error;
362                               }
363                             /* Process single character collating symbols */
364                             if (re - start == 1)
365                               {
366                                 c = *start;
367                                 re++;
368                                 goto process_single_character;
369                               }
370                             /* Inverted MCCSs are undefined */
371                             if (invert)
372                               {
373                                 status = REG_ECOLLATE;
374                                 goto error;
375                               }
376                             /* Can't have MCCSs as an endpoint to a range */
377                             if (range > 0)
378                               {
379                                 status = REG_ERANGE;
380                                 goto error;
381                               }
382                             range = -1;
383                             /* Switch into MCCS collection mode (if not
384                              * already there */
385 #if TRE_DEBUG
386                             if (!collect_MCCS)
387                               {
388                                 collect_MCCS = 1;
389                                 DPRINT(("tre_parse_bracket: Detected MCCS\n"));
390                               }
391 #else /* !TRE_DEBUG */
392                             collect_MCCS = 1;
393 #endif /* !TRE_DEBUG */
394                             /* Allocate a memory block the first time */
395                             if (!cp)
396                               {
397                                 if ((col_syms = xmalloc(sizeof(*col_syms) *
398                                             (START_COLLATING_SYMBOLS + 2)))
399                                             == NULL)
400                                   return REG_ESPACE;
401                                 cp = col_syms + 1;
402                                 n_col_syms = START_COLLATING_SYMBOLS;
403                               }
404                             /* Enlarge the memory block is more is needed */
405                             if ((cp - col_syms) - 1 >= n_col_syms)
406                               {
407                                 int i = n_col_syms;
408                                 tre_collating_symbol *tmp =
409                                     xrealloc(col_syms, sizeof(*col_syms) *
410                                              ((n_col_syms *= 2) + 2));
411                                 if (tmp == NULL)
412                                   {
413                                     xfree(col_syms);
414                                     return REG_ESPACE;
415                                   }
416                                 DPRINT(("tre_list_collating_symbols: "
417                                         "Enlarging col_syms to %d\n",
418                                         n_col_syms));
419                                 col_syms = tmp;
420                                 cp = col_syms + i + 1;
421                               }
422                             cp->start = start;
423                             cp->len = re - start;
424                             cp++;
425                             re++;
426                             break;
427                           }
428                       }
429                   }
430                 break;
431               }
432
433             case CHAR_EQUAL:
434             case CHAR_COLON:
435               {
436                 /* Process equivalence and character classes */
437                 tre_char_t kind = re[1];
438
439                 /* Can't have a class as an endpoint to a range */
440                 if (range > 0)
441                   {
442                     status = REG_ERANGE;
443                     goto error;
444                   }
445                 if (!collect_MCCS && range == 0)
446                   {
447                     status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
448                                           min, &max_i, items);
449                     if (status != REG_OK)
450                       goto error;
451                   }
452                 range = -1;
453                 re += 2;
454                 start = re;
455                 for (;; re++)
456                   {
457                     if (re >= re_end)
458                       {
459                         status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
460                         goto error;
461                       }
462                     if (*re == kind)
463                       {
464                         if (re + 1 >= re_end)
465                           {
466                             status = kind == CHAR_EQUAL ? REG_ECOLLATE :
467                                                           REG_ECTYPE;
468                             goto error;
469                           }
470                         /* Found end */
471                         if (re[1] == CHAR_RBRACKET)
472                           {
473                             if (re == start)
474                               {
475                                 /* Empty class name */
476                                 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
477                                                               REG_ECTYPE;
478                                 goto error;
479                               }
480                             /* Process equivalence class */
481                             if (kind == CHAR_EQUAL)
482                               {
483                                 int equiv;
484
485                                 DPRINT(("tre_parse_bracket:   equivalence: '%.*"
486                                         STRF "'\n", REST(start - 2)));
487
488                                 /* While we find the collation value even for
489                                    multi-character collating elements , we
490                                    don't (yet) match any collation values
491                                    against multi-character sequences.  We'd have
492                                    to enumerate those multi-character sequences
493                                    and like multi-character collating symbols,
494                                    create a union of those sequences with the
495                                    rest of the bracket expression.  While
496                                    doable, a bracket expression matching
497                                    multiple characters, that doesn't explicitly
498                                    contain multi-character sequences, might
499                                    be unexpected, so we punt for now. */
500                                 if ((equiv = __collate_equiv_value(ctx->loc,
501                                              start, re - start)) <= 0)
502                                   {
503                                     /* The standard says that if no collating
504                                        element if found, we use the collating
505                                        symbol itself.  But __collate_equiv_value
506                                        doesn't make a distinction between
507                                        an element that is in a equvalence
508                                        class with others, or is the only member,
509                                        so we already know there is no collating
510                                        symbol.  (Note that in the case of a
511                                        collating element whose collation value
512                                        is unique, matching against the
513                                        collating element itself, or against
514                                        its collation value, is equivalent.) */
515 #ifdef BSD_COMPATIBILITY
516                                     /* Check if the name is in cnames; if so,
517                                        use the corresponding code */
518                                     c = tre_search_cnames(start, re - start);
519                                     if (c != (wchar_t)-1)
520                                       {
521                                         re++;
522                                         goto process_single_character;
523                                       }
524 #endif /* BSD_COMPATIBILITY */
525                                     status = REG_ECOLLATE;
526                                     goto error;
527                                   }
528                                 if (!collect_MCCS)
529                                   {
530                                     status = tre_new_item(ctx->mem,
531                                              TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
532                                              equiv, &max_i, items);
533                                     if (status != REG_OK)
534                                       goto error;
535                                   }
536                               }
537                             else
538                               {
539                                 /* Process character class */
540                                 DPRINT(("tre_parse_bracket:  class: '%.*" STRF
541                                         "'\n", REST(start - 2)));
542                                 if (!collect_MCCS)
543                                   {
544                                     char tmp_str[64];
545                                     tre_ctype_t class;
546                                     int len = MIN(re - start, 63);
547 #ifdef TRE_WCHAR
548                                     {
549                                       tre_char_t tmp_wcs[64];
550                                       wcsncpy(tmp_wcs, start, (size_t)len);
551                                       tmp_wcs[len] = L'\0';
552 #if defined HAVE_WCSRTOMBS
553                                       {
554                                         mbstate_t state;
555                                         const tre_char_t *src = tmp_wcs;
556                                         memset(&state, '\0', sizeof(state));
557                                         len = wcsrtombs_l(tmp_str, &src,
558                                                       sizeof(tmp_str), &state,
559                                                       ctx->loc);
560                                       }
561 #elif defined HAVE_WCSTOMBS
562                                       len = wcstombs(tmp_str, tmp_wcs, 63);
563 #endif /* defined HAVE_WCSTOMBS */
564                                     }
565 #else /* !TRE_WCHAR */
566                                     strncpy(tmp_str, (const char*)start, len);
567 #endif /* !TRE_WCHAR */
568                                     tmp_str[len] = '\0';
569                                     DPRINT(("  class name: %s\n", tmp_str));
570                                     class = tre_ctype_l(tmp_str, ctx->loc);
571                                     if (!class)
572                                       {
573                                         status = REG_ECTYPE;
574                                         goto error;
575                                       }
576                                     status = tre_new_item(ctx->mem,
577                                              TRE_BRACKET_MATCH_TYPE_CLASS,
578                                              class, &max_i, items);
579                                     if (status != REG_OK)
580                                       goto error;
581                                   }
582                               }
583                             re++;
584                             break;
585                           }
586                       }
587                   }
588                 other++;
589                 break;
590               }
591
592             default:
593               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
594               c = CHAR_LBRACKET;
595               goto process_single_character;
596               break;
597             }
598           break;
599
600         case CHAR_RBRACKET:
601           /* A first right bracket */
602           if (re == ctx->re)
603             {
604               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
605               min = CHAR_RBRACKET;
606               range = 0;
607               other++;
608               break;
609             }
610           /* Done */
611           if (collect_MCCS)
612             {
613               DPRINT(("tre_parse_bracket:       done: '%.*" STRF "'\n",
614                       REST(re)));
615               if (col_syms)
616                 {
617                   /* Mark the character following the right bracket.  Set len
618                    * to whether there are other things besides the
619                    * multi-character collating symbols */
620                   col_syms->start = re + 1;
621                   col_syms->len = other;
622                   /* Mark the end of the list */
623                   cp->start = NULL;
624                 }
625               *result = col_syms;
626               return REG_OK;
627             }
628           /* range > 0 is not possible, since we did a lookahead after the
629            * hyphen */
630           if (range == 0)
631             {
632               status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
633                                     min, &max_i, items);
634               if (status != REG_OK)
635                 goto error;
636             }
637           DPRINT(("tre_parse_bracket:   done: '%.*" STRF "'\n", REST(re)));
638           *items_size = max_i;
639           ctx->re = re + 1;
640           return REG_OK;
641
642         default:
643           DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
644           c = *re;
645 process_single_character:
646           /* Process single character */
647           if (range > 0)
648             {
649               int mine, maxe;
650
651 process_end_range:
652               /* Get collation equivalence values */
653               mine = __collate_equiv_value(ctx->loc, &min, 1);
654               maxe = __collate_equiv_value(ctx->loc, &c, 1);
655               if (maxe < mine)
656                 {
657                   status = REG_ERANGE;
658                   goto error;
659                 }
660               if (!collect_MCCS)
661                 {
662                   status = tre_new_item(ctx->mem,
663                                         TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
664                                         mine, &max_i, items);
665                   if (status != REG_OK)
666                     goto error;
667                   status = tre_new_item(ctx->mem,
668                                         TRE_BRACKET_MATCH_TYPE_RANGE_END,
669                                         maxe, &max_i, items);
670                   if (status != REG_OK)
671                     goto error;
672                 }
673               range = -1;
674             }
675           else
676             {
677 process_begin_range:
678               if (!collect_MCCS)
679                 {
680                   if (range == 0)
681                     {
682                       status = tre_new_item(ctx->mem,
683                                             TRE_BRACKET_MATCH_TYPE_CHAR,
684                                             min, &max_i, items);
685                       if (status != REG_OK)
686                         goto error;
687                     }
688                   min = c;
689                 }
690               range = 0;
691             }
692           other++;
693           break;
694         }
695     }
696   status = REG_EBRACK;
697 error:
698   DPRINT(("tre_parse_bracket:   error: '%.*" STRF "', status=%d\n",
699           REST(re), status));
700   if (col_syms)
701     xfree(col_syms);
702   return status;
703 }
704
705 #ifdef TRE_DEBUG
706 static const char *bracket_match_type_str[] = {
707   "unused",
708   "char",
709   "range begin",
710   "range end",
711   "class",
712   "equivalence value",
713 };
714 #endif /* TRE_DEBUG */
715
716 static reg_errcode_t
717 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
718 {
719   tre_ast_node_t *node;
720   reg_errcode_t status = REG_OK;
721   tre_bracket_match_list_t *items;
722   int max_i = 32;
723   tre_collating_symbol *col_syms = NULL;
724
725   /* Handle special cases [[:<:]] and [[:>:]] */
726   if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
727       && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
728       && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
729       && ctx->re[5] == CHAR_RBRACKET)
730     {
731       *result = tre_ast_new_literal(ctx->mem, ASSERTION,
732                       (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
733                       -1);
734       DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
735               "[[:<:]]" : "[[:>:]]"));
736       ctx->re += 6;
737       return *result ? REG_OK : REG_ESPACE;
738     }
739
740   /* Start off with an array of `max_i' elements. */
741   items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
742   if (items == NULL)
743     return REG_ESPACE;
744
745   if (*ctx->re == CHAR_CARET)
746     {
747       DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
748       items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
749       ctx->re++;
750     }
751
752   status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
753
754   if (status != REG_OK)
755     goto parse_bracket_done;
756
757   /* If there are collating symbols, split off the multi-character ones
758    * into a union of the bracket expression (without the collating symbols)
759    * and the multiple-character sequences.  We create an equivalent input
760    * string and run tre_parse() recursively */
761   if (col_syms)
762     {
763       tre_char_t *str, *sp;
764       tre_collating_symbol *cp;
765       tre_parse_ctx_t subctx;
766
767       /* Allocate a new string.  We start with the size of the original
768        * bracket expression (minus 1) and add 2 (for a leading "[" and
769        * a trailing nil; don't need a "^", since it is illegal to have
770        * inverted MCCSs).  Since a multi-character collating symbols
771        * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
772        * need to worry about the new string getting too long. */
773       xfree(items);
774       str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
775       if (str == NULL)
776         {
777           xfree(col_syms);
778           return REG_ESPACE;
779         }
780       sp = str;
781       if (col_syms->len > 0)
782         {
783           /* There are other items in the bracket expression besides the
784            * multi-character collating symbols, so create a new bracket
785            * expression with only those other itmes. */
786           const tre_char_t *re;
787           ptrdiff_t i;
788
789           *sp++ = '[';
790           re = ctx->re;
791           for (cp = col_syms + 1; cp->start; cp++)
792             {
793               /* The "- 2" is to account for the "[." */
794               if ((i = ((cp->start - re) - 2)) > 0)
795                 {
796                   memcpy(sp, re, sizeof(*sp) * i);
797                   sp += i;
798                 }
799               /* The "+ 2" is to account for the ".]" */
800               re = cp->start + cp->len + 2;
801             }
802             i = col_syms->start - re; /* Includes the trailing right bracket */
803             memcpy(sp, re, sizeof(*sp) * i);
804             sp += i;
805             *sp++ = '|';
806         }
807       for (cp = col_syms + 1; cp->start; cp++)
808         {
809           memcpy(sp, cp->start, sizeof(*sp) * cp->len);
810           sp += cp->len;
811           if (cp[1].start)
812             *sp++ = '|';
813         }
814       *sp = 0;
815       DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
816               str));
817
818       memcpy(&subctx, ctx, sizeof(subctx));
819       subctx.re = str;
820       subctx.len = sp - str;
821       subctx.nofirstsub = 1;
822       subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
823       status = tre_parse(&subctx);
824       xfree(str);
825       if (status != REG_OK)
826         {
827           xfree(col_syms);
828           return status;
829         }
830       ctx->re = col_syms->start;
831       ctx->position = subctx.position;
832       xfree(col_syms);
833       *result = subctx.result;
834       DPRINT(("tre_parse_bracket: Returning to original string\n"));
835       return REG_OK;
836     }
837
838   DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
839   node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
840   if (node == NULL)
841     {
842       status = REG_ESPACE;
843       goto parse_bracket_done;
844     }
845   else
846     {
847       tre_literal_t *l = node->obj;
848       l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
849                                          SIZEOF_BRACKET_MATCH_LIST(items));
850       if (l->u.bracket_match_list == NULL)
851         {
852           status = REG_ESPACE;
853           goto parse_bracket_done;
854         }
855       memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
856     }
857
858 #ifdef TRE_DEBUG
859   {
860     int i;
861     tre_bracket_match_t *b;
862     DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
863             items->num_bracket_matches, items->flags));
864     for (i = 0, b = items->bracket_matches;
865          i < items->num_bracket_matches; i++, b++)
866       {
867         DPRINT(("   %d: %s %d\n", i, bracket_match_type_str[b->type],
868                 b->value));
869       }
870   }
871 #endif /* TRE_DEBUG */
872
873  parse_bracket_done:
874   xfree(items);
875   ctx->position++;
876   *result = node;
877   return status;
878 }
879
880
881 /* Parses a positive decimal integer.  Returns -1 if the string does not
882    contain a valid number. */
883 static int
884 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
885 {
886   int num = -1;
887   const tre_char_t *r = *regex;
888   while (r < regex_end && *r >= L'0' && *r <= L'9')
889     {
890       if (num < 0)
891         num = 0;
892       num = num * 10 + *r - L'0';
893       r++;
894     }
895   *regex = r;
896   return num;
897 }
898
899
900 static reg_errcode_t
901 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
902 {
903   int min, max;
904 #ifdef TRE_APPROX
905   int i;
906   int cost_ins, cost_del, cost_subst, cost_max;
907   int limit_ins, limit_del, limit_subst, limit_err;
908   const tre_char_t *start;
909 #endif /* TRE_APPROX */
910   const tre_char_t *r = ctx->re;
911   int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
912 #ifdef TRE_APPROX
913   int approx = 0;
914   int costs_set = 0;
915   int counts_set = 0;
916
917   cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
918   limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
919 #endif /* TRE_APPROX */
920
921   /* Parse number (minimum repetition count). */
922   min = -1;
923   if (r >= ctx->re_end)
924 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
925     return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
926 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
927     return REG_EBRACE;
928 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
929   if (*r >= L'0' && *r <= L'9') {
930     DPRINT(("tre_parse:   min count: '%.*" STRF "'\n", REST(r)));
931     min = tre_parse_int(&r, ctx->re_end);
932   }
933 #ifndef TRE_APPROX
934   else
935 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
936       /* For ERE, return REG_NOMATCH to signal that the lbrace should
937          be treated as a literal */
938       return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
939 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
940       return REG_BADBR;
941 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
942 #endif /* !TRE_APPROX */
943
944   /* Parse comma and second number (maximum repetition count). */
945   max = min;
946   if (r < ctx->re_end && *r == CHAR_COMMA)
947     {
948       r++;
949       DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", REST(r)));
950       max = tre_parse_int(&r, ctx->re_end);
951     }
952
953   /* Check that the repeat counts are sane. */
954   if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
955     return REG_BADBR;
956
957
958 #ifdef TRE_APPROX
959   /*
960    '{'
961      optionally followed immediately by a number == minimum repcount
962      optionally followed by , then a number == maximum repcount
963       + then a number == maximum insertion count
964       - then a number == maximum deletion count
965       # then a number == maximum substitution count
966       ~ then a number == maximum number of errors
967       Any of +, -, # or ~ without followed by a number means that
968       the maximum count/number of errors is infinite.
969
970       An equation of the form
971         Xi + Yd + Zs < C
972       can be specified to set costs and the cost limit to a value
973       different from the default value:
974         - X is the cost of an insertion
975         - Y is the cost of a deletion
976         - Z is the cost of a substitution
977         - C is the maximum cost
978
979       If no count limit or cost is set for an operation, the operation
980       is not allowed at all.
981   */
982
983
984   do {
985     int done;
986     start = r;
987
988     /* Parse count limit settings */
989     done = 0;
990     if (!counts_set)
991       while (r + 1 < ctx->re_end && !done)
992         {
993           switch (*r)
994             {
995             case CHAR_PLUS:  /* Insert limit */
996               DPRINT(("tre_parse:   ins limit: '%.*" STRF "'\n", REST(r)));
997               r++;
998               limit_ins = tre_parse_int(&r, ctx->re_end);
999               if (limit_ins < 0)
1000                 limit_ins = INT_MAX;
1001               counts_set = 1;
1002               break;
1003             case CHAR_MINUS: /* Delete limit */
1004               DPRINT(("tre_parse:   del limit: '%.*" STRF "'\n", REST(r)));
1005               r++;
1006               limit_del = tre_parse_int(&r, ctx->re_end);
1007               if (limit_del < 0)
1008                 limit_del = INT_MAX;
1009               counts_set = 1;
1010               break;
1011             case CHAR_HASH:  /* Substitute limit */
1012               DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
1013               r++;
1014               limit_subst = tre_parse_int(&r, ctx->re_end);
1015               if (limit_subst < 0)
1016                 limit_subst = INT_MAX;
1017               counts_set = 1;
1018               break;
1019             case CHAR_TILDE: /* Maximum number of changes */
1020               DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
1021               r++;
1022               limit_err = tre_parse_int(&r, ctx->re_end);
1023               if (limit_err < 0)
1024                 limit_err = INT_MAX;
1025               approx = 1;
1026               break;
1027             case CHAR_COMMA:
1028               r++;
1029               break;
1030             case L' ':
1031               r++;
1032               break;
1033             case L'}':
1034               done = 1;
1035               break;
1036             default:
1037               done = 1;
1038               break;
1039             }
1040         }
1041
1042     /* Parse cost restriction equation. */
1043     done = 0;
1044     if (!costs_set)
1045       while (r + 1 < ctx->re_end && !done)
1046         {
1047           switch (*r)
1048             {
1049             case CHAR_PLUS:
1050             case L' ':
1051               r++;
1052               break;
1053             case L'<':
1054               DPRINT(("tre_parse:    max cost: '%.*" STRF "'\n", REST(r)));
1055               r++;
1056               while (*r == L' ')
1057                 r++;
1058               cost_max = tre_parse_int(&r, ctx->re_end);
1059               if (cost_max < 0)
1060                 cost_max = INT_MAX;
1061               else
1062                 cost_max--;
1063               approx = 1;
1064               break;
1065             case CHAR_COMMA:
1066               r++;
1067               done = 1;
1068               break;
1069             default:
1070               if (*r >= L'0' && *r <= L'9')
1071                 {
1072 #ifdef TRE_DEBUG
1073                   const tre_char_t *sr = r;
1074 #endif /* TRE_DEBUG */
1075                   int cost = tre_parse_int(&r, ctx->re_end);
1076                   /* XXX - make sure r is not past end. */
1077                   switch (*r)
1078                     {
1079                     case L'i':  /* Insert cost */
1080                       DPRINT(("tre_parse:    ins cost: '%.*" STRF "'\n",
1081                               REST(sr)));
1082                       r++;
1083                       cost_ins = cost;
1084                       costs_set = 1;
1085                       break;
1086                     case L'd':  /* Delete cost */
1087                       DPRINT(("tre_parse:    del cost: '%.*" STRF "'\n",
1088                               REST(sr)));
1089                       r++;
1090                       cost_del = cost;
1091                       costs_set = 1;
1092                       break;
1093                     case L's':  /* Substitute cost */
1094                       DPRINT(("tre_parse:  subst cost: '%.*" STRF "'\n",
1095                               REST(sr)));
1096                       r++;
1097                       cost_subst = cost;
1098                       costs_set = 1;
1099                       break;
1100                     default:
1101                       return REG_BADBR;
1102                     }
1103                 }
1104               else
1105                 {
1106                   done = 1;
1107                   break;
1108                 }
1109             }
1110         }
1111   } while (start != r);
1112 #endif /* TRE_APPROX */
1113
1114   /*{*//* Missing }. */
1115   if (r >= ctx->re_end)
1116     return REG_EBRACE;
1117
1118   /* Empty contents of {}. */
1119   if (r == ctx->re)
1120     return REG_BADBR;
1121
1122   /* Parse the ending '}' or '\}'.*/
1123   if (ctx->cflags & REG_EXTENDED)
1124     {
1125       if (r >= ctx->re_end || *r != CHAR_RBRACE)
1126         return REG_BADBR;
1127       r++;
1128       /* Parse trailing '?' marking minimal repetition. */
1129       if (r < ctx->re_end)
1130         {
1131           if (*r == CHAR_QUESTIONMARK)
1132             {
1133               /* Process the question mark only in enhanced mode.
1134                  Otherwise, the question mark is an error in ERE
1135                  or a literal in BRE */
1136               if (ctx->cflags & REG_ENHANCED)
1137                 {
1138                   minimal = !(ctx->cflags & REG_UNGREEDY);
1139                   r++;
1140                 }
1141               else return REG_BADRPT;
1142             }
1143           else if (*r == CHAR_STAR || *r == CHAR_PLUS)
1144             {
1145               /* These are reserved for future extensions. */
1146               return REG_BADRPT;
1147             }
1148         }
1149     }
1150   else
1151     {
1152       if (r + 1 >= ctx->re_end
1153           || *r != CHAR_BACKSLASH
1154           || *(r + 1) != CHAR_RBRACE)
1155         return REG_BADBR;
1156       r += 2;
1157       if (r < ctx->re_end && *r == CHAR_STAR)
1158         {
1159           /* This is reserved for future extensions. */
1160           return REG_BADRPT;
1161         }
1162     }
1163
1164   if (minimal)
1165     ctx->num_reorder_tags++;
1166
1167   if (!result) goto parse_bound_exit;
1168   /* Create the AST node(s). */
1169   /* Originally, if min == 0 && max == 0, we immediately replace the whole
1170      iteration with EMPTY.  This unfortunately drops any submatches, and
1171      messes up setting the pmatch values (we can get tags of -1, and
1172      tag values in the billions).  So we leave it and process this case as
1173      usual, and wait until tre_expand_ast() to replace with EMPTY */
1174 #ifdef TRE_APPROX
1175   if (min < 0 && max < 0)
1176     /* Only approximate parameters set, no repetitions. */
1177     min = max = 1;
1178 #endif /* TRE_APPROX */
1179
1180   *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
1181   if (!*result)
1182     return REG_ESPACE;
1183
1184 #ifdef TRE_APPROX
1185   /* If approximate matching parameters are set, add them to the
1186      iteration node. */
1187   if (approx || costs_set || counts_set)
1188     {
1189       int *params;
1190       tre_iteration_t *iter = (*result)->obj;
1191
1192       if (costs_set || counts_set)
1193         {
1194           if (limit_ins == TRE_PARAM_UNSET)
1195             {
1196               if (cost_ins == TRE_PARAM_UNSET)
1197                 limit_ins = 0;
1198               else
1199                 limit_ins = INT_MAX;
1200             }
1201
1202           if (limit_del == TRE_PARAM_UNSET)
1203             {
1204               if (cost_del == TRE_PARAM_UNSET)
1205                 limit_del = 0;
1206               else
1207                 limit_del = INT_MAX;
1208             }
1209
1210           if (limit_subst == TRE_PARAM_UNSET)
1211             {
1212               if (cost_subst == TRE_PARAM_UNSET)
1213                 limit_subst = 0;
1214               else
1215                 limit_subst = INT_MAX;
1216             }
1217         }
1218
1219       if (cost_max == TRE_PARAM_UNSET)
1220         cost_max = INT_MAX;
1221       if (limit_err == TRE_PARAM_UNSET)
1222         limit_err = INT_MAX;
1223
1224       ctx->have_approx = 1;
1225       params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
1226       if (!params)
1227         return REG_ESPACE;
1228       for (i = 0; i < TRE_PARAM_LAST; i++)
1229         params[i] = TRE_PARAM_UNSET;
1230       params[TRE_PARAM_COST_INS] = cost_ins;
1231       params[TRE_PARAM_COST_DEL] = cost_del;
1232       params[TRE_PARAM_COST_SUBST] = cost_subst;
1233       params[TRE_PARAM_COST_MAX] = cost_max;
1234       params[TRE_PARAM_MAX_INS] = limit_ins;
1235       params[TRE_PARAM_MAX_DEL] = limit_del;
1236       params[TRE_PARAM_MAX_SUBST] = limit_subst;
1237       params[TRE_PARAM_MAX_ERR] = limit_err;
1238       iter->params = params;
1239     }
1240 #endif /* TRE_APPROX */
1241
1242 parse_bound_exit:
1243 #ifdef TRE_APPROX
1244   DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
1245           "limits [%d,%d,%d, total %d]\n",
1246           min, max, cost_ins, cost_del, cost_subst, cost_max,
1247           limit_ins, limit_del, limit_subst, limit_err));
1248 #else /* !TRE_APPROX */
1249   DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
1250 #endif /* !TRE_APPROX */
1251
1252
1253   ctx->re = r;
1254   return REG_OK;
1255 }
1256
1257 /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1258    non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1259    treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1260    Because we now set up tags for even non-capturing parenthesized
1261    subexpressions, we always call PARSE_MARK_FOR_SUBMATCH.  So if we
1262    pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1263    have it restore cflags after the subexpression, we don't need to have
1264    a separate PARSE_RESTORE_CFLAGS, and then after processing the
1265    non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1266    This has the side-benefit of now matching the perl behavior: the RE
1267    foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1268    of foo AND (?i) (bar OR zap). */
1269 typedef enum {
1270   PARSE_RE = 0,
1271   PARSE_ATOM,
1272   PARSE_MARK_FOR_SUBMATCH,
1273   PARSE_BRANCH,
1274   PARSE_PIECE,
1275   PARSE_CATENATION,
1276   PARSE_POST_CATENATION,
1277   PARSE_UNION,
1278   PARSE_POST_UNION,
1279   PARSE_POSTFIX,
1280 } tre_parse_re_stack_symbol_t;
1281
1282
1283 reg_errcode_t
1284 tre_parse(tre_parse_ctx_t *ctx)
1285 {
1286   tre_ast_node_t *result = NULL;
1287   tre_parse_re_stack_symbol_t symbol;
1288   reg_errcode_t status = REG_OK;
1289   tre_stack_t *stack = ctx->stack;
1290   int bottom = tre_stack_num_objects(stack);
1291   int depth = 0;
1292   int temporary_cflags = 0;
1293   int bre_branch_begin;
1294 #ifdef TRE_DEBUG
1295   const tre_char_t *tmp_re;
1296 #endif
1297
1298   DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
1299           ctx->len, ctx->re, ctx->len, ctx->cflags));
1300
1301   if (ctx->len <= 0) return REG_EMPTY;
1302   if (!ctx->nofirstsub)
1303     {
1304       STACK_PUSH(stack, int, ctx->cflags);
1305       STACK_PUSH(stack, int, ctx->submatch_id);
1306       STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
1307       ctx->submatch_id++;
1308     }
1309   STACK_PUSH(stack, int, 0); // bre_branch_begin
1310   STACK_PUSH(stack, int, PARSE_RE);
1311   ctx->re_start = ctx->re;
1312   ctx->re_end = ctx->re + ctx->len;
1313
1314
1315   /* The following is basically just a recursive descent parser.  I use
1316      an explicit stack instead of recursive functions mostly because of
1317      two reasons: compatibility with systems which have an overflowable
1318      call stack, and efficiency (both in lines of code and speed).  */
1319   while (tre_stack_num_objects(stack) > bottom)
1320     {
1321       symbol = tre_stack_pop_int(stack);
1322       switch (symbol)
1323         {
1324         case PARSE_RE:
1325           /* Parse a full regexp.  A regexp is one or more branches,
1326              separated by the union operator `|'. */
1327           bre_branch_begin = tre_stack_pop_int(stack);
1328           if (
1329 #ifdef REG_LITERAL
1330               !(ctx->cflags & REG_LITERAL) &&
1331 #endif /* REG_LITERAL */
1332               ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
1333             STACK_PUSHX(stack, int, PARSE_UNION);
1334           STACK_PUSHX(stack, int, bre_branch_begin);
1335           STACK_PUSHX(stack, int, PARSE_BRANCH);
1336           break;
1337
1338         case PARSE_BRANCH:
1339           /* Parse a branch.  A branch is one or more pieces, concatenated.
1340              A piece is an atom possibly followed by a postfix operator. */
1341           bre_branch_begin = tre_stack_pop_int(stack);
1342           STACK_PUSHX(stack, int, PARSE_CATENATION);
1343           STACK_PUSHX(stack, int, bre_branch_begin);
1344           STACK_PUSHX(stack, int, PARSE_PIECE);
1345           break;
1346
1347         case PARSE_PIECE:
1348           /* Parse a piece.  A piece is an atom possibly followed by one
1349              or more postfix operators. */
1350           bre_branch_begin = tre_stack_pop_int(stack);
1351           STACK_PUSHX(stack, int, PARSE_POSTFIX);
1352           STACK_PUSHX(stack, int, bre_branch_begin);
1353           STACK_PUSHX(stack, int, PARSE_ATOM);
1354           break;
1355
1356         case PARSE_CATENATION:
1357           /* If the expression has not ended, parse another piece. */
1358           {
1359             tre_char_t c;
1360             if (ctx->re >= ctx->re_end)
1361               break;
1362             c = *ctx->re;
1363 #ifdef REG_LITERAL
1364             if (!(ctx->cflags & REG_LITERAL))
1365               {
1366 #endif /* REG_LITERAL */
1367                 if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
1368                     ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
1369                     && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1370                     *(ctx->re + 1) == CHAR_PIPE))
1371                   break;
1372                 if ((ctx->cflags & REG_EXTENDED
1373                      && c == CHAR_RPAREN && depth > 0)
1374                     || (!(ctx->cflags & REG_EXTENDED)
1375                         && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1376                             && *(ctx->re + 1) == CHAR_RPAREN))
1377                   {
1378                     if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1379                       return REG_EPAREN;
1380                     DPRINT(("tre_parse:   group end: '%.*" STRF "'\n",
1381                             REST(ctx->re)));
1382                     depth--;
1383                     if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
1384                       ctx->re += 2;
1385                     break;
1386                   }
1387 #ifdef REG_LITERAL
1388               }
1389 #endif /* REG_LITERAL */
1390
1391 #ifdef REG_LEFT_ASSOC
1392             if (ctx->cflags & REG_LEFT_ASSOC)
1393               {
1394                 /* Left associative concatenation. */
1395                 STACK_PUSHX(stack, int, PARSE_CATENATION);
1396                 STACK_PUSHX(stack, voidptr, result);
1397                 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1398                 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1399                 STACK_PUSHX(stack, int, PARSE_PIECE);
1400               }
1401             else
1402 #endif /* REG_LEFT_ASSOC */
1403               {
1404                 /* Default case, right associative concatenation. */
1405                 STACK_PUSHX(stack, voidptr, result);
1406                 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1407                 STACK_PUSHX(stack, int, PARSE_CATENATION);
1408                 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1409                 STACK_PUSHX(stack, int, PARSE_PIECE);
1410               }
1411             break;
1412           }
1413
1414         case PARSE_POST_CATENATION:
1415           {
1416             tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1417             tre_ast_node_t *tmp_node;
1418             tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1419             if (!tmp_node)
1420               return REG_ESPACE;
1421             result = tmp_node;
1422             break;
1423           }
1424
1425         case PARSE_UNION:
1426           if (ctx->re >= ctx->re_end)
1427             break;
1428 #ifdef REG_LITERAL
1429           if (ctx->cflags & REG_LITERAL)
1430             break;
1431 #endif /* REG_LITERAL */
1432           if (!(ctx->cflags & REG_EXTENDED))
1433             {
1434               if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
1435                 break;
1436               ctx->re++;
1437             }
1438           switch (*ctx->re)
1439             {
1440             case CHAR_PIPE:
1441               DPRINT(("tre_parse:       union: '%.*" STRF "'\n",
1442                       REST(ctx->re)));
1443               STACK_PUSHX(stack, int, PARSE_UNION);
1444               STACK_PUSHX(stack, voidptr, (void *)ctx->re);
1445               STACK_PUSHX(stack, voidptr, result);
1446               STACK_PUSHX(stack, int, PARSE_POST_UNION);
1447               /* We need to pass a boolean (eventually) to PARSE_ATOM to
1448                  indicate if this is the beginning of a BRE extended branch. */
1449               STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
1450               STACK_PUSHX(stack, int, PARSE_BRANCH);
1451               ctx->re++;
1452               break;
1453
1454             case CHAR_RPAREN:
1455               ctx->re++;
1456               break;
1457
1458             default:
1459               if (!(ctx->cflags & REG_EXTENDED))
1460                 ctx->re--;
1461               break;
1462             }
1463           break;
1464
1465         case PARSE_POST_UNION:
1466           {
1467             tre_ast_node_t *tmp_node;
1468             tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1469             const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1470             /* error on empty expression at end of union */
1471             if (pipechar == ctx->re - 1)
1472               {
1473                 return REG_EMPTY;
1474               }
1475             tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1476             if (!tmp_node)
1477               return REG_ESPACE;
1478             result = tmp_node;
1479             break;
1480           }
1481
1482         case PARSE_POSTFIX:
1483           /* Parse postfix operators. */
1484           if (ctx->re >= ctx->re_end)
1485             break;
1486 #ifdef REG_LITERAL
1487           if (ctx->cflags & REG_LITERAL)
1488             break;
1489 #endif /* REG_LITERAL */
1490           int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1491           int rep_min = 0;
1492           int rep_max = -1;
1493 #ifdef TRE_DEBUG
1494           int lbrace_off;
1495 #endif
1496           switch (*ctx->re)
1497             {
1498             case CHAR_PLUS:
1499             case CHAR_QUESTIONMARK:
1500               if (!(ctx->cflags & REG_EXTENDED))
1501                 break;
1502                 /*FALLTHROUGH*/
1503             case CHAR_STAR:
1504               {
1505                 tre_ast_node_t *tmp_node;
1506 #ifdef TRE_DEBUG
1507                 const char *tstr = "star";
1508                 tmp_re = ctx->re;
1509 #endif
1510
1511         handle_plus_or_question:
1512                 /* error on iteration of raw assertion (not in subexpression) */
1513                 if (result->type == LITERAL && result->submatch_id < 0 &&
1514                     IS_ASSERTION((tre_literal_t *)result->obj))
1515                   {
1516                     if (!(ctx->cflags & REG_EXTENDED)) break;
1517                     return REG_BADRPT;
1518                   }
1519                 if (*ctx->re == CHAR_PLUS)
1520                   {
1521                     rep_min = 1;
1522 #ifdef TRE_DEBUG
1523                     tstr = "plus";
1524 #endif
1525                   }
1526                 if (*ctx->re == CHAR_QUESTIONMARK)
1527                   {
1528                     rep_max = 1;
1529 #ifdef TRE_DEBUG
1530                     tstr = "questionmark";
1531 #endif
1532                   }
1533
1534                 if (ctx->cflags & REG_EXTENDED)
1535                   {
1536                     if (ctx->re + 1 < ctx->re_end)
1537                       {
1538                         if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1539                           {
1540                             /* Process the question mark only in enhanced mode.
1541                                Otherwise, the question mark is an error in ERE */
1542                             if (ctx->cflags & REG_ENHANCED)
1543                               {
1544                                 minimal = !(ctx->cflags & REG_UNGREEDY);
1545                                 ctx->re++;
1546                               }
1547                             else return REG_BADRPT;
1548                           }
1549                         else if (*(ctx->re + 1) == CHAR_STAR
1550                                  || *(ctx->re + 1) == CHAR_PLUS)
1551                           {
1552                             /* These are reserved for future extensions. */
1553                             return REG_BADRPT;
1554                           }
1555                       }
1556                   }
1557                 else
1558                   {
1559                     if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
1560                       {
1561                         /* This is reserved for future extensions. */
1562                         return REG_BADRPT;
1563                       }
1564                     if (ctx->re + 2 < ctx->re_end)
1565                       {
1566                         if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1567                           {
1568                             /* Process the question mark only in enhanced mode.
1569                                Otherwise, the question mark is a literal in BRE */
1570                             if (ctx->cflags & REG_ENHANCED)
1571                               {
1572                                 minimal = !(ctx->cflags & REG_UNGREEDY);
1573                                 ctx->re += 2;
1574                               }
1575                           }
1576                         else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
1577                           {
1578                             /* This is reserved for future extensions. */
1579                             return REG_BADRPT;
1580                           }
1581                       }
1582                   }
1583
1584                 if (minimal)
1585                   ctx->num_reorder_tags++;
1586
1587                 DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1588                         minimal ? "  minimal" : "greedy", tstr, REST(tmp_re)));
1589                 if (result == NULL)
1590                   {
1591                     if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1592                     else goto parse_literal;
1593                   }
1594                 ctx->re++;
1595                 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1596                                             minimal);
1597                 if (tmp_node == NULL)
1598                   return REG_ESPACE;
1599                 result = tmp_node;
1600
1601                 /* Set the iterator with a submatch id in the invisible range
1602                  * (which will be overridden if a real submatch is needed) */
1603                 result->submatch_id = ctx->submatch_id_invisible++;
1604
1605 #if 0
1606                 /* We don't allow multiple postfixes, but this might be needed
1607                    to support approximate matching */
1608                 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1609 #endif
1610               }
1611               break;
1612
1613             case CHAR_BACKSLASH:
1614               /* "\{" is special without REG_EXTENDED */
1615               /* "\+" and "\?" are special with REG_ENHANCED for BRE */
1616               if (!(ctx->cflags & REG_EXTENDED)
1617                   && ctx->re + 1 < ctx->re_end)
1618                 {
1619                   switch (*(ctx->re + 1))
1620                     {
1621                     case CHAR_LBRACE:
1622                       ctx->re++;
1623 #ifdef TRE_DEBUG
1624                       lbrace_off = 2;
1625 #endif
1626                       goto parse_brace;
1627                     case CHAR_PLUS:
1628                     case CHAR_QUESTIONMARK:
1629                       if (ctx->cflags & REG_ENHANCED)
1630                         {
1631 #ifdef TRE_DEBUG
1632                           tmp_re = ctx->re;
1633 #endif
1634                           ctx->re++;
1635                           goto handle_plus_or_question;
1636                         }
1637                       break;
1638                     }
1639                   break;
1640                 }
1641               else
1642                 break;
1643
1644             case CHAR_LBRACE:
1645               {
1646                 int raw_assertion;
1647
1648                 /* "{" is literal without REG_EXTENDED */
1649                 if (!(ctx->cflags & REG_EXTENDED))
1650                   break;
1651 #ifdef TRE_DEBUG
1652                 lbrace_off = 1;
1653 #endif
1654
1655             parse_brace:
1656                 /* error on iteration of raw assertion (not in subexpression),
1657                    but wait until after parsing bounds */
1658                 raw_assertion = (result->type == LITERAL
1659                                  && result->submatch_id < 0
1660                                  && IS_ASSERTION((tre_literal_t *)result->obj));
1661                 ctx->re++;
1662
1663                 status = tre_parse_bound(ctx, &result);
1664 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1665                 /* For ERE, if status is REG_NOMATCH, this mean the lbrace
1666                    is to be treated as a literal. */
1667                 if (status == REG_NOMATCH)
1668                   {
1669                     ctx->re--;
1670                     break;
1671                   }
1672 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1673                 DPRINT(("tre_parse:     bound: '%.*" STRF "'\n",
1674                         REST(ctx->re - lbrace_off)));
1675                 if (status != REG_OK)
1676                   return status;
1677                 if (raw_assertion) return REG_BADRPT;
1678
1679                 /* Set the iterator with a submatch id in the invisible range
1680                  * (which will be overridden if a real submatch is needed) */
1681                 if (result->type == ITERATION)
1682                   result->submatch_id = ctx->submatch_id_invisible++;
1683
1684 #if 0
1685                 /* We don't allow multiple postfixes, but this might be needed
1686                    to support approximate matching */
1687                 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1688 #endif
1689                 break;
1690               }
1691             }
1692           break;
1693
1694         case PARSE_ATOM:
1695           {
1696             /* Parse an atom.  An atom is a regular expression enclosed in `()',
1697                an empty set of `()', a bracket expression, `.', `^', `$',
1698                a `\' followed by a character, or a single character. */
1699
1700             /* The stack contains a boolean value, whether PARSE_ATOM is
1701                being called just after the start of a group (left paren)
1702                in a BRE */
1703             bre_branch_begin = tre_stack_pop_int(stack);
1704
1705             /* End of regexp? (empty string). */
1706             if (ctx->re >= ctx->re_end)
1707               goto parse_literal;
1708
1709 #ifdef REG_LITERAL
1710             if (ctx->cflags & REG_LITERAL)
1711               goto parse_literal;
1712 #endif /* REG_LITERAL */
1713
1714             switch (*ctx->re)
1715               {
1716               case CHAR_LPAREN:  /* parenthesized subexpression */
1717
1718                 /* Handle "(?...)" extensions.  They work in a way similar
1719                    to Perls corresponding extensions. */
1720                 if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
1721                     (REG_EXTENDED|REG_ENHANCED)
1722                     && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1723                   {
1724                     int new_cflags = ctx->cflags;
1725                     int bit = 1;
1726                     int invisible_submatch = 0;
1727                     DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
1728                             REST(ctx->re)));
1729                     ctx->re += 2;
1730                     while (/*CONSTCOND*/1)
1731                       {
1732                         if (*ctx->re == L'i')
1733                           {
1734                             DPRINT(("tre_parse:     icase: '%.*" STRF "'\n",
1735                                     REST(ctx->re)));
1736                             if (bit)
1737                               new_cflags |= REG_ICASE;
1738                             else
1739                               new_cflags &= ~REG_ICASE;
1740                             ctx->re++;
1741                           }
1742                         else if (*ctx->re == L'n')
1743                           {
1744                             DPRINT(("tre_parse:   newline: '%.*" STRF "'\n",
1745                                     REST(ctx->re)));
1746                             if (bit)
1747                               new_cflags |= REG_NEWLINE;
1748                             else
1749                               new_cflags &= ~REG_NEWLINE;
1750                             ctx->re++;
1751                           }
1752 #ifdef REG_LEFT_ASSOC
1753                         else if (*ctx->re == L'l')
1754                           {
1755                             DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
1756                                     REST(ctx->re)));
1757                             if (bit)
1758                               new_cflags |= REG_LEFT_ASSOC;
1759                             else
1760                               new_cflags &= ~REG_LEFT_ASSOC;
1761                             ctx->re++;
1762                           }
1763 #endif /* REG_LEFT_ASSOC */
1764 #ifdef REG_UNGREEDY
1765                         else if (*ctx->re == L'U')
1766                           {
1767                             DPRINT(("tre_parse:    ungreedy: '%.*" STRF "'\n",
1768                                     REST(ctx->re)));
1769                             if (bit)
1770                               new_cflags |= REG_UNGREEDY;
1771                             else
1772                               new_cflags &= ~REG_UNGREEDY;
1773                             ctx->re++;
1774                           }
1775 #endif /* REG_UNGREEDY */
1776                         else if (*ctx->re == CHAR_MINUS)
1777                           {
1778                             DPRINT(("tre_parse:  turn off: '%.*" STRF "'\n",
1779                                     REST(ctx->re)));
1780                             ctx->re++;
1781                             bit = 0;
1782                           }
1783                         else if (*ctx->re == CHAR_COLON)
1784                           {
1785                             DPRINT(("tre_parse:  no group: '%.*" STRF
1786                                     "', (invisible submatch %d)\n",
1787                                     REST(ctx->re), ctx->submatch_id_invisible));
1788                             ctx->re++;
1789                             depth++;
1790                             invisible_submatch = 1;
1791                             break;
1792                           }
1793                         else if (*ctx->re == CHAR_HASH)
1794                           {
1795                             DPRINT(("tre_parse:    comment: '%.*" STRF "'\n",
1796                                     REST(ctx->re)));
1797                             /* A comment can contain any character except a
1798                                right parenthesis */
1799                             while (*ctx->re != CHAR_RPAREN
1800                                    && ctx->re < ctx->re_end)
1801                               ctx->re++;
1802                             if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1803                               {
1804                                 ctx->re++;
1805                                 break;
1806                               }
1807                             else
1808                               return REG_BADPAT;
1809                           }
1810                         else if (*ctx->re == CHAR_RPAREN)
1811                           {
1812                             ctx->re++;
1813                             break;
1814                           }
1815                         else
1816                           return REG_BADRPT;
1817                       }
1818
1819                     /* Turn on the cflags changes for the rest of the
1820                        enclosing group. */
1821                     if (invisible_submatch)
1822                       {
1823                         STACK_PUSHX(stack, int, ctx->cflags);
1824                         STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1825                         STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1826                         ctx->submatch_id_invisible++;
1827                         STACK_PUSHX(stack, int, 0); // bre_branch_begin
1828                         STACK_PUSHX(stack, int, PARSE_RE);
1829                       }
1830                     else {
1831                         STACK_PUSHX(stack, int, 0); // bre_branch_begin
1832                         STACK_PUSHX(stack, int, PARSE_ATOM);
1833                     }
1834                     ctx->cflags = new_cflags;
1835                     break;
1836                   }
1837
1838                 if (ctx->cflags & REG_EXTENDED)
1839                   {
1840                 parse_bre_lparen:
1841                     DPRINT(("tre_parse: group begin: '%.*" STRF
1842                             "', submatch %d\n", REST(ctx->re),
1843                             ctx->submatch_id));
1844                     ctx->re++;
1845                     /* First parse a whole RE, then mark the resulting tree
1846                        for submatching. */
1847                     STACK_PUSHX(stack, int, ctx->cflags);
1848                     STACK_PUSHX(stack, int, ctx->submatch_id);
1849                     STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1850                     /* We need to pass a boolean (eventually) to PARSE_ATOM to
1851                        indicate if this is the beginning of a BRE group. */
1852                     STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
1853                     STACK_PUSHX(stack, int, PARSE_RE);
1854                     ctx->submatch_id++;
1855                     depth++;
1856                   }
1857                 else
1858                   goto parse_literal;
1859                 break;
1860
1861               case CHAR_RPAREN:  /* end of current subexpression */
1862                 if (ctx->cflags & REG_EXTENDED && depth > 0)
1863                   {
1864               parse_bre_rparen_empty:
1865                     if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1866                       return REG_EPAREN;
1867                     DPRINT(("tre_parse:     empty: '%.*" STRF "'\n",
1868                             REST(ctx->re)));
1869                     /* We were expecting an atom, but instead the current
1870                        subexpression was closed.  POSIX leaves the meaning of
1871                        this to be implementation-defined.  We interpret this as
1872                        an empty expression (which matches an empty string).  */
1873                     result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1874                     if (result == NULL)
1875                       return REG_ESPACE;
1876                     if (!(ctx->cflags & REG_EXTENDED))
1877                       ctx->re--;
1878                   }
1879                 else
1880                   goto parse_literal;
1881                 break;
1882
1883               case CHAR_LBRACKET: /* bracket expression */
1884                 DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n",
1885                         REST(ctx->re)));
1886                 ctx->re++;
1887                 status = tre_parse_bracket(ctx, &result);
1888                 if (status != REG_OK)
1889                   return status;
1890                 break;
1891
1892               case CHAR_BACKSLASH:
1893                 /* Deal with "\(", "\)" or "\{" for BREs */
1894                 if (!(ctx->cflags & REG_EXTENDED)
1895                     && ctx->re + 1 < ctx->re_end)
1896                   {
1897                     if (*(ctx->re + 1) == CHAR_LPAREN)
1898                       {
1899                         ctx->re++;
1900                         goto parse_bre_lparen;
1901                       }
1902                     else if (*(ctx->re + 1) == CHAR_RPAREN)
1903                       {
1904                         ctx->re++;
1905                         goto parse_bre_rparen_empty;
1906                       }
1907                     if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
1908                   }
1909
1910                 if (ctx->re + 1 >= ctx->re_end)
1911                   /* Trailing backslash. */
1912                   return REG_EESCAPE;
1913
1914                 if (!(ctx->cflags & REG_ENHANCED))
1915                   {
1916                     DPRINT(("tre_parse:  unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1917                     ctx->re++;
1918                     goto unenhanced_backslash;
1919                   }
1920
1921                 /* If a macro is used, parse the expanded macro recursively. */
1922                 {
1923                   tre_char_t buf[64];
1924                   tre_expand_macro(ctx->re + 1, ctx->re_end,
1925                                    buf, elementsof(buf));
1926                   if (buf[0] != 0)
1927                     {
1928                       tre_parse_ctx_t subctx;
1929                       memcpy(&subctx, ctx, sizeof(subctx));
1930                       subctx.re = buf;
1931                       subctx.len = tre_strlen(buf);
1932                       subctx.nofirstsub = 1;
1933                       status = tre_parse(&subctx);
1934                       if (status != REG_OK)
1935                         return status;
1936                       ctx->re += 2;
1937                       ctx->position = subctx.position;
1938                       result = subctx.result;
1939                       break;
1940                     }
1941                 }
1942
1943 #ifdef REG_LITERAL
1944                 if (*(ctx->re + 1) == L'Q')
1945                   {
1946                     DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1947                             REST(ctx->re)));
1948                     ctx->cflags |= REG_LITERAL;
1949                     temporary_cflags |= REG_LITERAL;
1950                     ctx->re += 2;
1951                     STACK_PUSHX(stack, int, 0);
1952                     STACK_PUSHX(stack, int, PARSE_ATOM);
1953                     break;
1954                   }
1955 #endif /* REG_LITERAL */
1956
1957                 DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n", REST(ctx->re)));
1958                 ctx->re++;
1959                 switch (*ctx->re)
1960                   {
1961                   case L'b':
1962                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1963                                                  ASSERT_AT_WB, -1);
1964                     ctx->re++;
1965                     break;
1966                   case L'B':
1967                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1968                                                  ASSERT_AT_WB_NEG, -1);
1969                     ctx->re++;
1970                     break;
1971                   case L'<':
1972                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1973                                                  ASSERT_AT_BOW, -1);
1974                     ctx->re++;
1975                     break;
1976                   case L'>':
1977                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1978                                                  ASSERT_AT_EOW, -1);
1979                     ctx->re++;
1980                     break;
1981                   case L'x':
1982                     ctx->re++;
1983                     if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1984                       {
1985                         /* 8 bit hex char. */
1986                         char tmp[3] = {0, 0, 0};
1987                         long val;
1988                         DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
1989                                 REST(ctx->re - 2)));
1990
1991                         if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1992                             ctx->re < ctx->re_end)
1993                           {
1994                             tmp[0] = (char)ctx->re[0];
1995                             ctx->re++;
1996                           }
1997                         if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1998                             ctx->re < ctx->re_end)
1999                           {
2000                             tmp[1] = (char)ctx->re[0];
2001                             ctx->re++;
2002                           }
2003                         val = strtol(tmp, NULL, 16);
2004                         result = tre_ast_new_literal(ctx->mem, (int)val,
2005                                                      (int)val, ctx->position);
2006                         ctx->position++;
2007                         break;
2008                       }
2009                     else if (ctx->re < ctx->re_end)
2010                       {
2011                         /* Wide char. */
2012                         char tmp[32];
2013                         long val;
2014                         int i = 0;
2015                         ctx->re++;
2016                         while (ctx->re_end - ctx->re >= 0)
2017                           {
2018                             if (ctx->re[0] == CHAR_RBRACE)
2019                               break;
2020                             if (tre_isxdigit_l(ctx->re[0], ctx->loc))
2021                               {
2022                                 tmp[i] = (char)ctx->re[0];
2023                                 i++;
2024                                 ctx->re++;
2025                                 continue;
2026                               }
2027                             return REG_EBRACE;
2028                           }
2029                         ctx->re++;
2030                         tmp[i] = 0;
2031                         val = strtol(tmp, NULL, 16);
2032                         result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
2033                                                      ctx->position);
2034                         ctx->position++;
2035                         break;
2036                       }
2037                     /*FALLTHROUGH*/
2038
2039                   default:
2040                   unenhanced_backslash:
2041                     if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
2042                         REG_EXTENDED &&
2043                         tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
2044                       {
2045                         /* Back reference (only in BRE or enhanced). */
2046                         int val = *ctx->re - L'0';
2047                         DPRINT(("tre_parse:     backref: '%.*" STRF "'\n",
2048                                 REST(ctx->re - 1)));
2049                         result = tre_ast_new_literal(ctx->mem, BACKREF, val,
2050                                                      ctx->position);
2051                         if (result == NULL)
2052                           return REG_ESPACE;
2053
2054                         /* Set the backref with a submatch id in the invisible
2055                          * range (which will be overridden if a real submatch
2056                          * is needed) */
2057                         result->submatch_id = ctx->submatch_id_invisible++;
2058
2059                         ctx->position++;
2060                         ctx->num_reorder_tags++;
2061                         ctx->max_backref = MAX(val, ctx->max_backref);
2062                         ctx->re++;
2063                       }
2064                     else
2065                       {
2066                         /* Escaped character. */
2067                         DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n",
2068                                 REST(ctx->re - 1)));
2069                         result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2070                                                      ctx->position);
2071                         ctx->position++;
2072                         ctx->re++;
2073                       }
2074                     break;
2075                   }
2076                 if (result == NULL)
2077                   return REG_ESPACE;
2078                 break;
2079
2080               case CHAR_PERIOD:  /* the any-symbol */
2081                 DPRINT(("tre_parse:       any: '%.*" STRF "'\n",
2082                         REST(ctx->re)));
2083                 if (ctx->cflags & REG_NEWLINE)
2084                   {
2085                     tre_ast_node_t *tmp1;
2086                     tre_ast_node_t *tmp2;
2087                     tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
2088                                                ctx->position);
2089                     if (!tmp1)
2090                       return REG_ESPACE;
2091                     tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
2092                                                ctx->position + 1);
2093                     if (!tmp2)
2094                       return REG_ESPACE;
2095                     result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2096                     if (!result)
2097                       return REG_ESPACE;
2098                     ctx->position += 2;
2099                   }
2100                 else
2101                   {
2102                     result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
2103                                                  ctx->position);
2104                     if (!result)
2105                       return REG_ESPACE;
2106                     ctx->position++;
2107                   }
2108                 ctx->re++;
2109                 break;
2110
2111               case CHAR_CARET:   /* beginning of line assertion */
2112                 /* '^' has a special meaning everywhere in EREs, at the
2113                    beginning of the RE and after \( is BREs.  It is also
2114                    special in enhanced BREs at the beginning of each branches
2115                    of a union */
2116                 if (ctx->cflags & REG_EXTENDED
2117                     || bre_branch_begin
2118                     || ctx->re == ctx->re_start)
2119                   {
2120                     DPRINT(("tre_parse:       BOL: '%.*" STRF "'\n",
2121                             REST(ctx->re)));
2122                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
2123                                                  ASSERT_AT_BOL, -1);
2124                     if (result == NULL)
2125                       return REG_ESPACE;
2126                     ctx->re++;
2127                   }
2128                 else
2129                   goto parse_literal;
2130                 break;
2131
2132               case CHAR_DOLLAR:  /* end of line assertion. */
2133                 /* '$' is special everywhere in EREs, and in the end of the
2134                    string and before \) is BREs. */
2135                 if (ctx->cflags & REG_EXTENDED
2136                     || (ctx->re + 2 < ctx->re_end
2137                         && *(ctx->re + 1) == CHAR_BACKSLASH
2138                         && *(ctx->re + 2) == CHAR_RPAREN)
2139                     || ctx->re + 1 == ctx->re_end)
2140                   {
2141                     DPRINT(("tre_parse:       EOL: '%.*" STRF "'\n",
2142                             REST(ctx->re)));
2143                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
2144                                                  ASSERT_AT_EOL, -1);
2145                     if (result == NULL)
2146                       return REG_ESPACE;
2147                     ctx->re++;
2148                   }
2149                 else
2150                   goto parse_literal;
2151                 break;
2152
2153               default:
2154               parse_literal:
2155
2156                 if (temporary_cflags && ctx->re + 1 < ctx->re_end
2157                     && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
2158                   {
2159                     DPRINT(("tre_parse:  end tmps: '%.*" STRF "'\n",
2160                             REST(ctx->re)));
2161                     ctx->cflags &= ~temporary_cflags;
2162                     temporary_cflags = 0;
2163                     ctx->re += 2;
2164                     if (ctx->re < ctx->re_end)
2165                       {
2166                         STACK_PUSHX(stack, int, 0);
2167                         STACK_PUSHX(stack, int, PARSE_ATOM);
2168                       }
2169                     else
2170                       {
2171                         result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2172                         if (!result) return REG_ESPACE;
2173                       }
2174                     break;
2175                   }
2176
2177
2178                 /* We are expecting an atom.  If the subexpression (or the whole
2179                    regexp ends here, we interpret it as an empty expression
2180                    (which matches an empty string), which is an error.
2181                    Iterations of an empty expression is also an error. */
2182 #ifdef REG_LITERAL
2183                 if (!(ctx->cflags & REG_LITERAL))
2184                   {
2185 #endif /* REG_LITERAL */
2186                     /* error on end of string */
2187                     if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
2188                                                        : REG_EMPTY;
2189                     /* error on unions and iterations of empty expressions */
2190                     if (ctx->cflags & REG_EXTENDED)
2191                       {
2192                         if (ctx->re < ctx->re_end)
2193                           {
2194                             if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
2195                             if (*ctx->re == CHAR_LBRACE)
2196                               {
2197                                 ctx->re++;
2198                   empty_parse_bound:
2199                                 /* We need to parse the bound first and return
2200                                    any error, before returning REG_BADRPT */
2201                                 status = tre_parse_bound(ctx, NULL);
2202 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2203                                 /* For ERE, if REG_NOMATCH is returned, we
2204                                    treat the lbrace as a literal. */
2205                                 if (status == REG_NOMATCH)
2206                                   {
2207                                     ctx->re--;
2208                                     /* Drop down to literal-handling code */
2209                                   }
2210                                 else
2211                                   {
2212 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2213                                     if (status != REG_OK)
2214                                       return status;
2215                                     return REG_BADRPT;
2216 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2217                                   }
2218 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2219                               }
2220 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2221                             else
2222 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2223                             if (*ctx->re == CHAR_STAR
2224                                 || *ctx->re == CHAR_PLUS
2225                                 || *ctx->re == CHAR_QUESTIONMARK)
2226                               {
2227                                 return REG_BADRPT;
2228                               }
2229                           }
2230                       }
2231                     else if (ctx->re + 1 < ctx->re_end
2232                              && *ctx->re == CHAR_BACKSLASH
2233                              && *(ctx->re + 1) == CHAR_LBRACE)
2234                       {
2235                         ctx->re += 2;
2236                         goto empty_parse_bound;
2237                       }
2238 #ifdef REG_LITERAL
2239                   }
2240 #endif /* REG_LITERAL */
2241
2242                 DPRINT(("tre_parse:     literal: '%.*" STRF "'\n",
2243                         REST(ctx->re)));
2244                 /* Note that we can't use an tre_isalpha() test here, since there
2245                    may be characters which are alphabetic but neither upper or
2246                    lower case. */
2247                 if (ctx->cflags & REG_ICASE
2248                     && (tre_isupper_l(*ctx->re, ctx->loc) ||
2249                     tre_islower_l(*ctx->re, ctx->loc)))
2250                   {
2251                     tre_ast_node_t *tmp1;
2252                     tre_ast_node_t *tmp2;
2253
2254                     /* XXX - Can there be more than one opposite-case
2255                        counterpoints for some character in some locale?  Or
2256                        more than two characters which all should be regarded
2257                        the same character if case is ignored?  If yes, there
2258                        does not seem to be a portable way to detect it.  I guess
2259                        that at least for multi-character collating elements there
2260                        could be several opposite-case counterpoints, but they
2261                        cannot be supported portably anyway. */
2262                     tmp1 = tre_ast_new_literal(ctx->mem,
2263                                                tre_toupper_l(*ctx->re, ctx->loc),
2264                                                tre_toupper_l(*ctx->re, ctx->loc),
2265                                                ctx->position);
2266                     if (!tmp1)
2267                       return REG_ESPACE;
2268                     tmp2 = tre_ast_new_literal(ctx->mem,
2269                                                tre_tolower_l(*ctx->re, ctx->loc),
2270                                                tre_tolower_l(*ctx->re, ctx->loc),
2271                                                ctx->position);
2272                     if (!tmp2)
2273                       return REG_ESPACE;
2274                     result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2275                     if (!result)
2276                       return REG_ESPACE;
2277                   }
2278                 else
2279                   {
2280                     result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2281                                                  ctx->position);
2282                     if (!result)
2283                       return REG_ESPACE;
2284                   }
2285                 ctx->position++;
2286                 ctx->re++;
2287                 break;
2288               }
2289             break;
2290           }
2291
2292         case PARSE_MARK_FOR_SUBMATCH:
2293           {
2294             int submatch_id = tre_stack_pop_int(stack);
2295
2296             ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
2297             if (result->submatch_id >= 0 &&
2298                 result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
2299               {
2300                 tre_ast_node_t *n, *tmp_node;
2301                 if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
2302                   break;
2303                 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2304                 if (n == NULL)
2305                   return REG_ESPACE;
2306                 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
2307                 if (tmp_node == NULL)
2308                   return REG_ESPACE;
2309                 tmp_node->num_submatches = result->num_submatches;
2310                 result = tmp_node;
2311               }
2312             result->submatch_id = submatch_id;
2313             if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
2314               result->num_submatches++;
2315             break;
2316           }
2317
2318         default:
2319           assert(0);
2320           break;
2321         }
2322     }
2323
2324   /* Check for missing closing parentheses. */
2325   if (depth > 0)
2326     return REG_EPAREN;
2327
2328   ctx->result = result;
2329
2330   return REG_OK;
2331 }
2332
2333 /* EOF */