TRE: Add local modifications to extend functionality
authorJohn Marino <draco@marino.st>
Thu, 6 Aug 2015 20:58:06 +0000 (22:58 +0200)
committerJohn Marino <draco@marino.st>
Thu, 6 Aug 2015 22:07:04 +0000 (00:07 +0200)
The stock TRE regex library is very good, but it is missing three
key functionalities:
  1) collation support
  2) equivalence classes
  3) legacy support for character-name table

Luckily, TRE was imported in Apple's Libc and they solved these issues.
This commit represents the modifications Apple made (under the same
2-clause license the author Ville Laurikari issued) minus differences
in xlocale support and the exclusion of "if 0" equivalent blocks.

13 files changed:
contrib/tre/lib/regcomp.c
contrib/tre/lib/regerror.c
contrib/tre/lib/regexec.c
contrib/tre/lib/tre-ast.c
contrib/tre/lib/tre-ast.h
contrib/tre/lib/tre-compile.c
contrib/tre/lib/tre-compile.h
contrib/tre/lib/tre-internal.h
contrib/tre/lib/tre-match-backtrack.c
contrib/tre/lib/tre-match-parallel.c
contrib/tre/lib/tre-match-utils.h
contrib/tre/lib/tre-parse.c
contrib/tre/lib/tre-parse.h

index 281b38e..fefd429 100644 (file)
@@ -19,7 +19,8 @@
 #include "xmalloc.h"
 
 int
-tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
+tre_regncomp_l(regex_t *preg, const char *regex, size_t n, int cflags,
+    locale_t loc)
 {
   int ret;
 #if TRE_WCHAR
@@ -30,13 +31,15 @@ tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
   if (wregex == NULL)
     return REG_ESPACE;
 
+  FIX_LOCALE(loc);
+
   /* If the current locale uses the standard single byte encoding of
      characters, we don't do a multibyte string conversion.  If we did,
      many applications which use the default locale would break since
      the default "C" locale uses the 7-bit ASCII character set, and
      all characters with the eighth bit set would be considered invalid. */
 #if TRE_MULTIBYTE
-  if (TRE_MB_CUR_MAX == 1)
+  if (TRE_MB_CUR_MAX_L(loc) == 1)
 #endif /* TRE_MULTIBYTE */
     {
       unsigned int i;
@@ -50,7 +53,7 @@ tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
 #if TRE_MULTIBYTE
   else
     {
-      int consumed;
+      size_t consumed;
       tre_char_t *wcptr = wregex;
 #ifdef HAVE_MBSTATE_T
       mbstate_t state;
@@ -58,7 +61,7 @@ tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
 #endif /* HAVE_MBSTATE_T */
       while (n > 0)
        {
-         consumed = tre_mbrtowc(wcptr, regex, n, &state);
+         consumed = tre_mbrtowc_l(wcptr, regex, n, &state, loc);
 
          switch (consumed)
            {
@@ -71,15 +74,11 @@ tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
                  return REG_BADPAT;
                }
              break;
-           case -1:
+           case (size_t)-1:
+           case (size_t)-2:
              DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno)));
              xfree(wregex);
-             return REG_BADPAT;
-           case -2:
-             /* The last character wasn't complete.  Let's not call it a
-                fatal error. */
-             consumed = n;
-             break;
+             return REG_ILLSEQ;
            }
          regex += consumed;
          n -= consumed;
@@ -90,33 +89,70 @@ tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
 #endif /* TRE_MULTIBYTE */
 
   wregex[wlen] = L'\0';
-  ret = tre_compile(preg, wregex, wlen, cflags);
+  ret = tre_compile(preg, wregex, wlen, cflags, loc);
   xfree(wregex);
 #else /* !TRE_WCHAR */
-  ret = tre_compile(preg, (const tre_char_t *)regex, n, cflags);
+  FIX_LOCALE(loc);
+  ret = tre_compile(preg, (const tre_char_t *)regex, n, cflags, loc);
 #endif /* !TRE_WCHAR */
 
   return ret;
 }
 
 int
-tre_regcomp(regex_t *preg, const char *regex, int cflags)
+tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
 {
-  return tre_regncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+  return tre_regncomp_l(preg, regex, n, cflags, __get_locale());
 }
 
+int
+tre_regcomp_l(regex_t *preg, const char *regex, int cflags, locale_t loc)
+{
+  size_t len;
+
+  if (cflags & REG_PEND)
+    {
+      if ((const char *)(preg->re_endp) < regex)
+       return REG_INVARG;
+      len = (const char *)(preg->re_endp) - regex;
+    }
+  else
+    len = strlen(regex);
+  return tre_regncomp_l(preg, regex, len, cflags, loc);
+}
+
+int
+tre_regcomp(regex_t *preg, const char *regex, int cflags)
+{
+  return tre_regcomp_l(preg, regex, cflags, __get_locale());
+}
 
 #ifdef TRE_WCHAR
+int
+tre_regwncomp_l(regex_t *preg, const wchar_t *regex, size_t n, int cflags,
+    locale_t loc)
+{
+  FIX_LOCALE(loc);
+  return tre_compile(preg, regex, n, cflags, loc);
+}
+
 int
 tre_regwncomp(regex_t *preg, const wchar_t *regex, size_t n, int cflags)
 {
-  return tre_compile(preg, regex, n, cflags);
+  return tre_compile(preg, regex, n, cflags, __get_locale());
+}
+
+int
+tre_regwcomp_l(regex_t *preg, const wchar_t *regex, int cflags, locale_t loc)
+{
+  FIX_LOCALE(loc);
+  return tre_compile(preg, regex, wcslen(regex), cflags, loc);
 }
 
 int
 tre_regwcomp(regex_t *preg, const wchar_t *regex, int cflags)
 {
-  return tre_compile(preg, regex, regex ? wcslen(regex) : 0, cflags);
+  return tre_regwncomp(preg, regex, wcslen(regex), cflags);
 }
 #endif /* TRE_WCHAR */
 
index 701f701..298bf46 100644 (file)
@@ -47,7 +47,10 @@ static const char *tre_error_messages[] =
     gettext_noop("Invalid contents of {}"),             /* REG_BADBR */
     gettext_noop("Invalid character range"),            /* REG_ERANGE */
     gettext_noop("Out of memory"),                      /* REG_ESPACE */
-    gettext_noop("Invalid use of repetition operators")         /* REG_BADRPT */
+    gettext_noop("Invalid use of repetition operators"), /* REG_BADRPT */
+    gettext_noop("Empty (sub)expression"),              /* REG_EMPTY */
+    gettext_noop("Invalid argument to regex routine"),  /* REG_INVARG */
+    gettext_noop("Illegal byte sequence")               /* REG_ILLSEQ */
   };
 
 size_t
index 54779ef..a72bd96 100644 (file)
@@ -45,23 +45,136 @@ char *alloca ();
 #include <limits.h>
 
 #include "tre-internal.h"
+#include "tre-match-utils.h"
 #include "tre.h"
 #include "xmalloc.h"
 
 
+/* For each tre_last_matched_t in the lm array, find the last matched branch by
+   comparing the touch value of the cmp_tag's.  For all other branches, reset
+   the corresponding tags.  If reset_all is non-zero, reset all tags in all
+   branches.  Recurse into the nested last matched structures, clearing tags as
+   apprpriate. */
+static void
+tre_reset_last_matched_branches(tre_tag_t *tags, const tre_last_matched_t *lm,
+                               int n, int start_tag, int reset_all)
+{
+  int max, i, reset;
+  tre_last_matched_branch_t *b;
+
+  DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n",
+         n, start_tag, reset_all));
+  for (; n-- > 0; lm++)
+    {
+      if (lm->n_branches == 1)
+       {
+         b = lm->branches;
+         if (start_tag > 0)
+           {
+             DPRINT(("  b->cmp_tag=%d %d <? %d\n", b->cmp_tag,
+                     tre_tag_touch_get(tags, b->cmp_tag),
+                     tre_tag_touch_get(tags, start_tag)));
+             reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) <
+                      tre_tag_touch_get(tags, start_tag));
+           }
+         else
+           reset = 0;
+
+         if (reset)
+           {
+             int *t;
+
+             for (i = b->n_tags, t = b->tags; i > 0; i--, t++)
+               {
+                 DPRINT(("  Resetting t%d\n", *t));
+                 tre_tag_reset(tags, *t);
+               }
+           }
+         if (b->n_last_matched > 0)
+           tre_reset_last_matched_branches(tags, b->last_matched,
+                                               b->n_last_matched, 
+                                               lm->start_tag, reset);
+       }
+      else
+       {
+         if (!reset_all)
+           {
+#ifdef TRE_DEBUG
+             int last;
+#endif /* TRE_DEBUG */
+             max = 0;
+             for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++)
+               {
+                 int t = b->cmp_tag;
+                 int touch = tre_tag_touch_get(tags, t);
+                 if (touch > max)
+                   {
+                     max = touch;
+#ifdef TRE_DEBUG
+                     last = t;
+#endif /* TRE_DEBUG */
+                   }
+               }
+             DPRINT(("  Last touched end tag t%d=%d\n", last, max));
+           }
+
+         for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++)
+           {
+             reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < max);
+             if (reset)
+               {
+                 int j;
+                 int *t;
+
+                 for (j = b->n_tags, t = b->tags; j > 0; j--, t++)
+                   {
+                     DPRINT(("  Resetting t%d\n", *t));
+                     tre_tag_reset(tags, *t);
+                   }
+               }
+             if (b->n_last_matched > 0)
+               tre_reset_last_matched_branches(tags, b->last_matched,
+                                               b->n_last_matched,
+                                               lm->start_tag, reset);
+           }
+       }
+    }
+}
+
+
 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
    endpoint values. */
-void
+reg_errcode_t
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-               const tre_tnfa_t *tnfa, int *tags, int match_eo)
+               const tre_tnfa_t *tnfa, const tre_tag_t *intags, int match_eo)
 {
-  tre_submatch_data_t *submatch_data;
-  unsigned int i, j;
-  int *parents;
+  unsigned int i;
+
+  if (cflags & REG_NOSUB) return REG_OK;
 
   i = 0;
-  if (match_eo >= 0 && !(cflags & REG_NOSUB))
+  if (match_eo >= 0 && intags)
     {
+      const tre_tag_t *tags = intags;
+      tre_submatch_data_t *submatch_data;
+
+      if (tnfa->last_matched_branch &&
+         tnfa->last_matched_branch->n_last_matched > 0)
+       {
+         tre_tag_t *t;
+#ifdef TRE_USE_ALLOCA
+         t = alloca(sizeof(*t) * tnfa->num_tags);
+#else /* !TRE_USE_ALLOCA */
+         t = xmalloc(sizeof(*t) * tnfa->num_tags);
+#endif /* !TRE_USE_ALLOCA */
+         if (!t) return REG_ESPACE;
+         memcpy(t, intags, tnfa->num_tags * sizeof(tre_tag_t));
+         tre_reset_last_matched_branches(t,
+                                   tnfa->last_matched_branch->last_matched,
+                                   tnfa->last_matched_branch->n_last_matched,
+                                   0, 0);
+         tags = t;
+       }
       /* Construct submatch offsets from the tags. */
       DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
       submatch_data = tnfa->submatch_data;
@@ -70,43 +183,29 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
          if (submatch_data[i].so_tag == tnfa->end_tag)
            pmatch[i].rm_so = match_eo;
          else
-           pmatch[i].rm_so = tags[submatch_data[i].so_tag];
+           pmatch[i].rm_so = tre_tag_get(tags, submatch_data[i].so_tag);
 
          if (submatch_data[i].eo_tag == tnfa->end_tag)
            pmatch[i].rm_eo = match_eo;
          else
-           pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
+           pmatch[i].rm_eo = tre_tag_get(tags, submatch_data[i].eo_tag);
 
          /* If either of the endpoints were not used, this submatch
             was not part of the match. */
          if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
            pmatch[i].rm_so = pmatch[i].rm_eo = -1;
 
-         DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
+         DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i,
                  submatch_data[i].so_tag, pmatch[i].rm_so,
                  submatch_data[i].eo_tag, pmatch[i].rm_eo));
          i++;
        }
-      /* Reset all submatches that are not within all of their parent
-        submatches. */
-      i = 0;
-      while (i < tnfa->num_submatches && i < nmatch)
-       {
-         if (pmatch[i].rm_eo == -1)
-           assert(pmatch[i].rm_so == -1);
-         assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
-
-         parents = submatch_data[i].parents;
-         if (parents != NULL)
-           for (j = 0; parents[j] >= 0; j++)
-             {
-               DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
-               if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
-                   || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
-                 pmatch[i].rm_so = pmatch[i].rm_eo = -1;
-             }
-         i++;
-       }
+#ifndef TRE_USE_ALLOCA
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
+       if (tags != intags) xfree(tags);
+#pragma GCC diagnostic pop
+#endif /* !TRE_USE_ALLOCA */
     }
 
   while (i < nmatch)
@@ -115,6 +214,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
       pmatch[i].rm_eo = -1;
       i++;
     }
+
+  return REG_OK;
 }
 
 
@@ -129,12 +230,14 @@ tre_have_backrefs(const regex_t *preg)
   return tnfa->have_backrefs;
 }
 
+#ifdef TRE_APPROX
 int
 tre_have_approx(const regex_t *preg)
 {
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   return tnfa->have_approx;
 }
+#endif /* TRE_APPROX */
 
 static int
 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
@@ -142,7 +245,9 @@ tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
          int eflags)
 {
   reg_errcode_t status;
-  int *tags = NULL, eo;
+  tre_tag_t *tags = NULL;
+  int eo;
+  size_t offset = 0, count = 0;
   if (tnfa->num_tags > 0 && nmatch > 0)
     {
 #ifdef TRE_USE_ALLOCA
@@ -154,19 +259,26 @@ tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
        return REG_ESPACE;
     }
 
+  if (
+      (eflags & REG_STARTEND) && pmatch)
+    {
+      if (pmatch->rm_so < 0)
+       return REG_INVARG;
+      if (len == (size_t)-1)
+       {
+         if (pmatch->rm_eo < 0 || pmatch->rm_so > pmatch->rm_eo)
+           return REG_INVARG;
+         len = pmatch->rm_eo - pmatch->rm_so;
+       }
+      count = offset = pmatch->rm_so;
+      if (type == STR_WIDE) offset *= sizeof(wchar_t);
+    }
+
   /* Dispatch to the appropriate matcher. */
   if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
     {
       /* The regex has back references, use the backtracking matcher. */
-      if (type == STR_USER)
-       {
-         const tre_str_source *source = string;
-         if (source->rewind == NULL || source->compare == NULL)
-           /* The backtracking matcher requires rewind and compare
-              capabilities from the input stream. */
-           return REG_BADPAT;
-       }
-      status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type,
+      status = tre_tnfa_run_backtrack(tnfa, string + offset, (int)len, type,
                                      tags, eflags, &eo);
     }
 #ifdef TRE_APPROX
@@ -178,20 +290,36 @@ tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
       tre_regaparams_default(&params);
       params.max_err = 0;
       params.max_cost = 0;
-      status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
+      status = tre_tnfa_run_approx(tnfa, string + offset, (int)len, type, tags,
                                   &match, params, eflags, &eo);
     }
 #endif /* TRE_APPROX */
   else
     {
       /* Exact matching, no back references, use the parallel matcher. */
-      status = tre_tnfa_run_parallel(tnfa, string, (int)len, type,
+      status = tre_tnfa_run_parallel(tnfa, string + offset, (int)len, type,
                                     tags, eflags, &eo);
     }
 
   if (status == REG_OK)
-    /* A match was found, so fill the submatch registers. */
-    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+    {
+      /* A match was found, so fill the submatch registers. */
+      status = tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+      /* If doing REG_STARTEND, adjust the pmatch array (we can't build
+         this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
+        tre_fill_pmatch itself). */
+      if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) &&
+         (eflags & REG_STARTEND) && pmatch && nmatch > 0)
+       {
+         size_t i;
+         regmatch_t *p;
+         for (i = nmatch, p = pmatch; i > 0; p++, i--)
+           {
+             if (p->rm_so >= 0) p->rm_so += count;
+             if (p->rm_eo >= 0) p->rm_eo += count;
+           }
+       }
+    }
 #ifndef TRE_USE_ALLOCA
   if (tags)
     xfree(tags);
@@ -204,7 +332,11 @@ tre_regnexec(const regex_t *preg, const char *str, size_t len,
         size_t nmatch, regmatch_t pmatch[], int eflags)
 {
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
-  tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
+  tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS;
+
+#ifdef TRE_USE_SYSTEM_REGEX_H
+  if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
+#endif /* TRE_USE_SYSTEM_REGEX_H */
 
   return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
 }
@@ -213,7 +345,7 @@ int
 tre_regexec(const regex_t *preg, const char *str,
        size_t nmatch, regmatch_t pmatch[], int eflags)
 {
-  return tre_regnexec(preg, str, (unsigned)-1, nmatch, pmatch, eflags);
+  return tre_regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
 }
 
 
@@ -224,6 +356,11 @@ tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
          size_t nmatch, regmatch_t pmatch[], int eflags)
 {
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
+
+#ifdef TRE_USE_SYSTEM_REGEX_H
+  if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
+#endif /* TRE_USE_SYSTEM_REGEX_H */
+
   return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
 }
 
@@ -231,20 +368,11 @@ int
 tre_regwexec(const regex_t *preg, const wchar_t *str,
         size_t nmatch, regmatch_t pmatch[], int eflags)
 {
-  return tre_regwnexec(preg, str, (unsigned)-1, nmatch, pmatch, eflags);
+  return tre_regwnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
 }
 
 #endif /* TRE_WCHAR */
 
-int
-tre_reguexec(const regex_t *preg, const tre_str_source *str,
-        size_t nmatch, regmatch_t pmatch[], int eflags)
-{
-  tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
-  return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags);
-}
-
-
 #ifdef TRE_APPROX
 
 /*
@@ -257,7 +385,9 @@ tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
                 int eflags)
 {
   reg_errcode_t status;
-  int *tags = NULL, eo;
+  tre_tag_t *tags = NULL;
+  int eo;
+  size_t offset = 0, count = 0;
 
   /* If the regexp does not use approximate matching features, the
      maximum cost is zero, and the approximate matcher isn't forced,
@@ -281,10 +411,44 @@ tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
       if (tags == NULL)
        return REG_ESPACE;
     }
+
+  if (
+      (eflags & REG_STARTEND) && match->pmatch)
+    {
+      if (match->pmatch->rm_so < 0)
+       return REG_INVARG;
+      if (len == (size_t)-1)
+       {
+         if (match->pmatch->rm_eo < 0 || match->pmatch->rm_so >
+             match->pmatch->rm_eo)
+           return REG_INVARG;
+         len = match->pmatch->rm_eo - match->pmatch->rm_so;
+       }
+      count = offset = match->pmatch->rm_so;
+      if (type == STR_WIDE) offset *= sizeof(wchar_t);
+    }
+
   status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
                               match, params, eflags, &eo);
   if (status == REG_OK)
-    tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo);
+    {
+      status = tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags,
+                              tnfa, tags, eo);
+      /* If doing REG_STARTEND, adjust the pmatch array (we can't build
+         this into tre_fill_pmatch, because tre_tnfa_run_backtrack call
+        tre_fill_pmatch itself). */
+      if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) &&
+         (eflags & REG_STARTEND) && match->pmatch && match->nmatch > 0)
+       {
+         size_t i;
+         regmatch_t *p;
+         for (i = match->nmatch, p = match->pmatch; i > 0; p++, i--)
+           {
+             if (p->rm_so >= 0) p->rm_so += count;
+             if (p->rm_eo >= 0) p->rm_eo += count;
+           }
+       }
+    }
 #ifndef TRE_USE_ALLOCA
   if (tags)
     xfree(tags);
@@ -297,7 +461,11 @@ tre_reganexec(const regex_t *preg, const char *str, size_t len,
          regamatch_t *match, regaparams_t params, int eflags)
 {
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
-  tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
+  tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS;
+
+#ifdef TRE_USE_SYSTEM_REGEX_H
+  if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
+#endif /* TRE_USE_SYSTEM_REGEX_H */
 
   return tre_match_approx(tnfa, str, len, type, match, params, eflags);
 }
@@ -306,7 +474,7 @@ int
 tre_regaexec(const regex_t *preg, const char *str,
         regamatch_t *match, regaparams_t params, int eflags)
 {
-  return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags);
+  return tre_reganexec(preg, str, (size_t)-1, match, params, eflags);
 }
 
 #ifdef TRE_WCHAR
@@ -316,6 +484,11 @@ tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
           regamatch_t *match, regaparams_t params, int eflags)
 {
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
+
+#ifdef TRE_USE_SYSTEM_REGEX_H
+  if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
+#endif /* TRE_USE_SYSTEM_REGEX_H */
+
   return tre_match_approx(tnfa, str, len, STR_WIDE,
                          match, params, eflags);
 }
@@ -324,7 +497,7 @@ int
 tre_regawexec(const regex_t *preg, const wchar_t *str,
          regamatch_t *match, regaparams_t params, int eflags)
 {
-  return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags);
+  return tre_regawnexec(preg, str, (size_t)-1, match, params, eflags);
 }
 
 #endif /* TRE_WCHAR */
index acb387a..12f8cdc 100644 (file)
@@ -154,8 +154,8 @@ tre_do_print(FILE *stream, tre_ast_node_t *ast, int indent)
       else if (IS_ASSERTION(lit))
        {
          int i;
-         char *assertions[] = { "bol", "eol", "ctype", "!ctype",
-                                "bow", "eow", "wb", "!wb" };
+         char *assertions[] = { "bol", "eol", "bracket",
+                                "bow", "eow", "wb", "!wb", "backref" };
          if (code_max >= ASSERT_LAST << 1)
            assert(0);
          fprintf(stream, "assertions: ");
index d9af376..3804eff 100644 (file)
@@ -10,6 +10,8 @@
 #ifndef TRE_AST_H
 #define TRE_AST_H 1
 
+#include <limits.h>
+
 #include "tre-mem.h"
 #include "tre-internal.h"
 #include "tre-compile.h"
@@ -36,18 +38,26 @@ typedef enum {
 #define IS_BACKREF(x)  ((x)->code_min == BACKREF)
 #define IS_PARAMETER(x) ((x)->code_min == PARAMETER)
 
+#define SUBMATCH_ID_INVISIBLE_START    (INT_MAX / 2 + 1)
+
 
 /* A generic AST node.  All AST nodes consist of this node on the top
    level with `obj' pointing to the actual content. */
-typedef struct {
-  tre_ast_type_t type;   /* Type of the node. */
+typedef struct _tre_ast_node {
   void *obj;             /* Pointer to actual node. */
-  int nullable;
+  tre_last_matched_branch_pre_t *last_matched_branch;
+  tre_last_matched_pre_t *last_matched_in_progress;
+  tre_pos_and_tags_t *firstpos;
+  tre_pos_and_tags_t *lastpos;
+  /* The original pointer is used to point to the original node, when a
+   * CATENATION and tag are added.  If NULL, this is node is the original */
+  struct _tre_ast_node *original;
+  tre_ast_type_t type;   /* Type of the node. */
   int submatch_id;
   int num_submatches;
   int num_tags;
-  tre_pos_and_tags_t *firstpos;
-  tre_pos_and_tags_t *lastpos;
+  short nullable;
+  short make_branches;
 } tre_ast_node_t;
 
 
@@ -55,14 +65,13 @@ typedef struct {
    tags, matching parameter settings, and all expressions that match one
    character. */
 typedef struct {
-  long code_min;
-  long code_max;
+  tre_cint_t code_min;
+  tre_cint_t code_max;
   int position;
   union {
-    tre_ctype_t class;
+    tre_bracket_match_list_t *bracket_match_list;
     int *params;
   } u;
-  tre_ctype_t *neg_classes;
 } tre_literal_t;
 
 /* A "catenation" node.         These are created when two regexps are concatenated.
@@ -95,6 +104,9 @@ typedef struct {
 typedef struct {
   tre_ast_node_t *left;
   tre_ast_node_t *right;
+  /* The left end right end tags if non-zero */
+  int left_tag;
+  int right_tag;
 } tre_union_t;
 
 tre_ast_node_t *
index 948cc46..6c4187e 100644 (file)
@@ -19,6 +19,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include <string.h>
+#include <limits.h>
 
 #include "tre-internal.h"
 #include "tre-mem.h"
 #include "tre.h"
 #include "xmalloc.h"
 
+/*
+  The bit_ffs() macro in bitstring.h is flawed.  Replace it with a working one.
+*/
+#undef bit_ffs
+#define        bit_ffs(name, nbits, value) { \
+       register bitstr_t *_name = name; \
+       register int _byte, _nbits = nbits; \
+       register int _stopbyte = _bit_byte(_nbits), _value = -1; \
+       for (_byte = 0; _byte <= _stopbyte; ++_byte) \
+               if (_name[_byte]) { \
+                       _value = _byte << 3; \
+                       for (_stopbyte = _name[_byte]; !(_stopbyte&0x1); \
+                           ++_value, _stopbyte >>= 1); \
+                       break; \
+               } \
+       *(value) = _value; \
+}
+
 /*
   Algorithms to setup tags so that submatch addressing can be done.
 */
 
 
+#ifdef TRE_DEBUG
+static const char *tag_dir_str[] = {
+  "minimize",
+  "maximize",
+  "left-maximize"
+};
+
+static const char _indent[] = "  ";
+
+static void
+print_indent(int indent)
+{
+  while (indent-- > 0)
+    DPRINT((_indent));
+}
+
+static void print_last_matched_pre(tre_last_matched_pre_t *lm, int indent,
+                                  int num_tags);
+static void
+print_last_match_branch_pre(tre_last_matched_branch_pre_t *branch, int indent,
+                           int num_tags)
+{
+  tre_last_matched_pre_t *u = branch->last_matched;
+  int n_last_matched = 0;
+
+  while (u)
+    {
+      n_last_matched++;
+      u = u->next;
+    }
+
+  print_indent(indent);
+  DPRINT(("BRANCH: tot_branches=%d tot_last_matched=%d tot_tags=%d\n",
+         branch->tot_branches, branch->tot_last_matched, branch->tot_tags));
+  print_indent(indent);
+  DPRINT(("..n_last_matched=%d last_matched=%d\n", branch->n_last_matched,
+         n_last_matched));
+  if (branch->n_last_matched != n_last_matched)
+    DPRINT(("*** mismatch between n_last_matched and unions ***\n"));
+  if (branch->cmp_tag > 0)
+    {
+      int i;
+      const char *sep = " tags=";
+      print_indent(indent);
+      DPRINT(("..cmp_tag=%d n_tags=%d", branch->cmp_tag, branch->n_tags));
+      for (i = 0; i < num_tags; i++)
+       if (bit_test(branch->tags, i))
+         {
+           DPRINT(("%s%d", sep, i));
+           sep = ",";
+         }
+      DPRINT(("\n"));
+    }
+
+  u = branch->last_matched;
+  indent++;
+  while (u)
+    {
+      print_last_matched_pre(u, indent, num_tags);
+      u = u->next;
+    }
+}
+
+static void
+print_last_matched_pre(tre_last_matched_pre_t *lm, int indent, int num_tags)
+{
+  tre_last_matched_branch_pre_t *b = lm->branches;
+  int n_branches = 0;
+
+  while (b)
+    {
+      n_branches++;
+      b = b->next;
+    }
+
+  print_indent(indent);
+  DPRINT(("LAST_MATCHED: tot_branches=%d tot_last_matched=%d tot_tags=%d\n",
+         lm->tot_branches, lm->tot_last_matched, lm->tot_tags));
+  print_indent(indent);
+  DPRINT(("..start_tag=%d n_branches=%d branches=%d\n", lm->start_tag,
+         lm->n_branches, n_branches));
+  if (lm->n_branches != n_branches)
+    DPRINT(("*** mismatch between n and branches ***\n"));
+
+  b = lm->branches;
+  indent++;
+  while (b)
+    {
+      print_last_match_branch_pre(b, indent, num_tags);
+      b = b->next;
+    }
+}
+
+static void print_last_matched(tre_last_matched_t *lm, int indent);
+static void
+print_last_match_branch(tre_last_matched_branch_t *branch, int indent)
+{
+  tre_last_matched_t *u;
+  int i;
+
+  print_indent(indent);
+  DPRINT(("BRANCH: n_last_matched=%d\n", branch->n_last_matched));
+  if (branch->cmp_tag > 0)
+    {
+      print_indent(indent);
+      DPRINT(("..cmp_tag=%d n_tags=%d", branch->cmp_tag, branch->n_tags));
+      if (branch->n_tags > 0)
+       {
+         const char *sep = " tags=";
+         for (i = 0; i < branch->n_tags; i++)
+           {
+             DPRINT(("%s%d", sep, branch->tags[i]));
+             sep = ",";
+           }
+       }
+      DPRINT(("\n"));
+    }
+
+  u = branch->last_matched;
+  indent++;
+  for (i = branch->n_last_matched; i > 0; i--, u++)
+    print_last_matched(u, indent);
+}
+
+static void
+print_last_matched(tre_last_matched_t *lm, int indent)
+{
+  int i;
+  tre_last_matched_branch_t *b;
+
+  print_indent(indent);
+  DPRINT(("LAST_MATCHED: n_branches=%d start_tag=%d\n", lm->n_branches,
+         lm->start_tag));
+
+  b = lm->branches;
+  indent++;
+  for (i = lm->n_branches; i > 0; i--, b++)
+    print_last_match_branch(b, indent);
+}
+#endif /* TRE_DEBUG */
+
+
+/* Merge the tre_last_matched_branch_pre_t of src into dst, creating a new
+   one if needed.  If tag_id > 0, add that tag as well (a negative tag_id will
+   create an unset tre_last_matched_branch_pre_t. */
+static reg_errcode_t
+tre_merge_branches(tre_mem_t mem, tre_ast_node_t *dst, tre_ast_node_t *src,
+                  int tag_id, int num_tags)
+{
+  tre_last_matched_branch_pre_t *db = dst->last_matched_branch;
+  tre_last_matched_branch_pre_t *sb = (src ? src->last_matched_branch : NULL);
+
+  if (db)
+    {
+      if (sb)
+       {
+         bitstr_t *l = db->tags;
+         bitstr_t *r = sb->tags;
+         int i = bitstr_size(num_tags);
+
+         while(i-- > 0)
+           *l++ |= *r++;
+         /* db and sb are the info from two parallel sub-trees, so the tags
+            must be mutually exclusive, and we can just add their numbers */
+         db->n_tags += sb->n_tags;
+         db->tot_tags += sb->tot_tags;
+         if (db->last_matched)
+           {
+             if (sb->last_matched)
+               {
+                 tre_last_matched_pre_t *u = db->last_matched;
+
+                 while(u->next)
+                   u = u->next;
+                 u->next = sb->last_matched;
+                 db->n_last_matched += sb->n_last_matched;
+                 db->tot_branches += sb->tot_branches;
+                 db->tot_last_matched += sb->tot_last_matched;
+               }
+           }
+           else if (sb->last_matched)
+             {
+               db->last_matched = sb->last_matched;
+               db->n_last_matched = sb->n_last_matched;
+               db->tot_branches = sb->tot_branches;
+               db->tot_last_matched = sb->tot_last_matched;
+             }
+       }
+    }
+  else
+    db = sb;
+
+  if (tag_id != 0)
+    {
+      if (!db)
+       {
+         db = tre_mem_calloc(mem, sizeof(tre_last_matched_branch_pre_t)
+                             + bitstr_size(num_tags));
+         if (db == NULL)
+           return REG_ESPACE;
+         db->tot_branches = 1;
+       }
+      if (tag_id > 0)
+       {
+         /* tag_id is a new tag, and shouldn't exist in db's tags,
+            so we can always increment n_tags */
+         bit_set(db->tags, tag_id);
+         db->n_tags++;
+         db->tot_tags++;
+       }
+    }
+  dst->last_matched_branch = db;
+  return REG_OK;
+}
+
+
 /* Inserts a catenation node to the root of the tree given in `node'.
    As the left child a new tag with number `tag_id' to `node' is added,
    and the right child is the old root. */
@@ -50,19 +285,18 @@ tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
   if (c->left == NULL)
     return REG_ESPACE;
-  c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
+  c->right = tre_mem_calloc(mem, sizeof(tre_ast_node_t));
   if (c->right == NULL)
     return REG_ESPACE;
 
   c->right->obj = node->obj;
   c->right->type = node->type;
+  c->right->last_matched_branch = node->last_matched_branch;
   c->right->nullable = -1;
   c->right->submatch_id = -1;
-  c->right->firstpos = NULL;
-  c->right->lastpos = NULL;
-  c->right->num_tags = 0;
   node->obj = c;
   node->type = CATENATION;
+  node->original = c->right;
   return REG_OK;
 }
 
@@ -82,50 +316,58 @@ tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
   if (c->right == NULL)
     return REG_ESPACE;
-  c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
+  c->left = tre_mem_calloc(mem, sizeof(tre_ast_node_t));
   if (c->left == NULL)
     return REG_ESPACE;
 
   c->left->obj = node->obj;
   c->left->type = node->type;
+  c->left->last_matched_branch = node->last_matched_branch;
   c->left->nullable = -1;
   c->left->submatch_id = -1;
-  c->left->firstpos = NULL;
-  c->left->lastpos = NULL;
-  c->left->num_tags = 0;
   node->obj = c;
   node->type = CATENATION;
+  node->original = c->left;
   return REG_OK;
 }
 
 typedef enum {
   ADDTAGS_RECURSE,
+  ADDTAGS_RECURSE_NOT_TOP_UNION,
   ADDTAGS_AFTER_ITERATION,
   ADDTAGS_AFTER_UNION_LEFT,
   ADDTAGS_AFTER_UNION_RIGHT,
   ADDTAGS_AFTER_CAT_LEFT,
   ADDTAGS_AFTER_CAT_RIGHT,
-  ADDTAGS_SET_SUBMATCH_END
+  ADDTAGS_SET_SUBMATCH_END,
+  ADDTAGS_UNION_RECURSE,
+  ADDTAGS_UNION_RIGHT_RECURSE,
+  ADDTAGS_AFTER_UNION_TOP,
 } tre_addtags_symbol_t;
 
+enum {
+  COPY_LAST_MATCHED_BRANCH,
+  COPY_LAST_MATCHED_BRANCH_NEXT,
+  COPY_LAST_MATCHED,
+  COPY_LAST_MATCHED_NEXT,
+};
 
-typedef struct {
-  int tag;
-  int next_tag;
-} tre_tag_states_t;
 
+#define REGSET_UNSET           ((unsigned)-1)
 
 /* Go through `regset' and set submatch data for submatches that are
    using this tag. */
 static void
-tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
+tre_purge_regset(unsigned *regset, tre_tnfa_t *tnfa, int tag)
 {
   int i;
 
-  for (i = 0; regset[i] >= 0; i++)
+  for (i = 0; regset[i] != REGSET_UNSET; i++)
     {
       int id = regset[i] / 2;
       int start = !(regset[i] % 2);
+      if (id >= SUBMATCH_ID_INVISIBLE_START)
+       continue;
       DPRINT(("  Using tag %d for %s offset of "
              "submatch %d\n", tag,
              start ? "start" : "end", id));
@@ -138,6 +380,10 @@ tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
 }
 
 
+#define REGSET_HAS_STARTS      0x1
+#define REGSET_HAS_ENDS                0x2
+
+
 /* Adds tags to appropriate locations in the parse tree in `tree', so that
    subexpressions marked for submatch addressing can be traced. */
 static reg_errcode_t
@@ -150,49 +396,53 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
   int bottom = tre_stack_num_objects(stack);
   /* True for first pass (counting number of needed tags) */
   int first_pass = (mem == NULL || tnfa == NULL);
-  int *regset, *orig_regset;
+  unsigned *regset, *orig_regset;
+  int regset_contains = 0;
   int num_tags = 0; /* Total number of tags. */
   int num_minimals = 0;         /* Number of special minimal tags. */
   int tag = 0;     /* The tag that is to be added next. */
   int next_tag = 1; /* Next tag to use after this one. */
-  int *parents;            /* Stack of submatches the current submatch is
-                      contained in. */
   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
-  tre_tag_states_t *saved_states;
-
-  tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
+  int *reorder_tags = NULL; /* Tag reorder array: a pair for each reorder,
+                            * the first is the tag to reorder, the second
+                            * is the tag after which the first is reordered */
+  int *rtp;                /* Pointer used to fill in reorder_tags and
+                            * tag_order */
+  int *to_reorder;          /* Transform array converting sequential order to
+                            * that specified by reorder_tags */
+  int id;
+
+  tre_tag_direction_t direction = TRE_TAG_LEFT_MAXIMIZE;
   if (!first_pass)
     {
+      DPRINT(("Initializing direction to %s\n", tag_dir_str[direction]));
       tnfa->end_tag = 0;
       tnfa->minimal_tags[0] = -1;
     }
 
-  regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
+  regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches
+                  + tnfa->num_submatches_invisible + 1) * 2));
   if (regset == NULL)
-    return REG_ESPACE;
-  regset[0] = -1;
-  orig_regset = regset;
-
-  parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
-  if (parents == NULL)
     {
-      xfree(regset);
-      return REG_ESPACE;
+      status = REG_ESPACE;
+      goto error_regset;
     }
-  parents[0] = -1;
+  regset[0] = REGSET_UNSET;
+  orig_regset = regset;
 
-  saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
-  if (saved_states == NULL)
-    {
-      xfree(regset);
-      xfree(parents);
-      return REG_ESPACE;
-    }
-  else
+  if (!first_pass)
     {
-      unsigned int i;
-      for (i = 0; i <= tnfa->num_submatches; i++)
-       saved_states[i].tag = -1;
+      /* Allocate all memory for reorder_tags, tag_order, to_seq_order and
+       * to_reorder in one batch (assuming all are the same type) */
+      rtp = reorder_tags = xmalloc(sizeof(*reorder_tags) *
+                                  ((2 * tnfa->num_reorder_tags + 1) +
+                                  tnfa->num_tags));
+      if (reorder_tags == NULL)
+       {
+         status = REG_ESPACE;
+         goto error_reorder_tags;
+       }
+      to_reorder = reorder_tags + (2 * tnfa->num_reorder_tags + 1);
     }
 
   STACK_PUSH(stack, voidptr, node);
@@ -206,60 +456,102 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
       switch (symbol)
        {
+         int top_union;
 
        case ADDTAGS_SET_SUBMATCH_END:
          {
-           int id = tre_stack_pop_int(stack);
            int i;
 
+           id = tre_stack_pop_int(stack);
+           node = tre_stack_pop_voidptr(stack);
            /* Add end of this submatch to regset. */
-           for (i = 0; regset[i] >= 0; i++);
+           for (i = 0; regset[i] != REGSET_UNSET; i++);
            regset[i] = id * 2 + 1;
            regset[i + 1] = -1;
+           regset_contains |= REGSET_HAS_ENDS;
 
-           /* Pop this submatch from the parents stack. */
-           for (i = 0; parents[i] >= 0; i++);
-           parents[i - 1] = -1;
+           /* Always put a tag after a minimal iterator. */
+           if (minimal_tag >= 0)
+             {
+               if (first_pass)
+                 {
+                   node->num_tags++;
+                   DPRINT(("  ADDTAGS_SET_SUBMATCH_END: node->num_tags = %d\n",
+                           node->num_tags));
+                 }
+               else
+                 {
+                   int i;
+                   status = tre_merge_branches(mem, node, NULL, tag,
+                                               tnfa->num_tags);
+                   if (status != REG_OK)
+                     break;
+                   status = tre_add_tag_right(mem, node, tag);
+                   if (status != REG_OK)
+                     break;
+                   tnfa->tag_directions[tag] = TRE_TAG_MINIMIZE;
+                   DPRINT(("Setting t%d direction to %s\n", tag,
+                           tag_dir_str[tnfa->tag_directions[tag]]));
+                   DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
+                   for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
+                   tnfa->minimal_tags[i] = tag;
+                   tnfa->minimal_tags[i + 1] = minimal_tag;
+                   tnfa->minimal_tags[i + 2] = -1;
+
+                   DPRINT(("  Minimal end: t%d reordered to "
+                           "after t%d\n", tag, minimal_tag));
+                   /* Append to tag_order, move "tag" after
+                    * "minimal_tag" */
+                   *rtp++ = tag;
+                   *rtp++ = minimal_tag;
+
+                   num_minimals++;
+                   tre_purge_regset(regset, tnfa, tag);
+                 }
+
+               minimal_tag = -1;
+               DPRINT(("  ADDTAGS_SET_SUBMATCH_END num_tags++ tag=%d\n", tag));
+               regset[0] = REGSET_UNSET;
+               regset_contains = 0;
+               tag = next_tag;
+               num_tags++;
+               next_tag++;
+             }
            break;
          }
 
+       case ADDTAGS_RECURSE_NOT_TOP_UNION:
+         /* Like ADDTAGS_RECURSE, except that top_union is set to zero,
+          * indicating that if a union is being processed, it is not the
+          * top-most of a series */
+         top_union = 0;
+         goto do_addtags_recurse;
+
        case ADDTAGS_RECURSE:
+         /* Setting top_union to 1 means that if a union is begin processed,
+          * it is the top-most of a series, and should recurse through the
+          * series to set the left_tag and right_tag values */
+         top_union = 1;
+
+do_addtags_recurse:
          node = tre_stack_pop_voidptr(stack);
 
-         if (node->submatch_id >= 0)
+         id = node->submatch_id;
+         if (id >= 0)
            {
-             int id = node->submatch_id;
              int i;
 
 
              /* Add start of this submatch to regset. */
-             for (i = 0; regset[i] >= 0; i++);
+             for (i = 0; regset[i] != REGSET_UNSET; i++);
              regset[i] = id * 2;
              regset[i + 1] = -1;
-
-             if (!first_pass)
-               {
-                 for (i = 0; parents[i] >= 0; i++);
-                 tnfa->submatch_data[id].parents = NULL;
-                 if (i > 0)
-                   {
-                     int *p = xmalloc(sizeof(*p) * (i + 1));
-                     if (p == NULL)
-                       {
-                         status = REG_ESPACE;
-                         break;
-                       }
-                     assert(tnfa->submatch_data[id].parents == NULL);
-                     tnfa->submatch_data[id].parents = p;
-                     for (i = 0; parents[i] >= 0; i++)
-                       p[i] = parents[i];
-                     p[i] = -1;
-                   }
-               }
+             regset_contains |= REGSET_HAS_STARTS;
 
              /* Add end of this submatch to regset after processing this
                 node. */
-             STACK_PUSHX(stack, int, node->submatch_id);
+             STACK_PUSH(stack, voidptr, node);
+             STACK_PUSHX(stack, int, id);
              STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
            }
 
@@ -269,39 +561,72 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
              {
                tre_literal_t *lit = node->obj;
 
-               if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
+               if (!IS_SPECIAL(lit) || IS_BACKREF(lit) || IS_EMPTY(lit) || IS_ASSERTION(lit))
                  {
-                   int i;
                    DPRINT(("Literal %d-%d\n",
                            (int)lit->code_min, (int)lit->code_max));
-                   if (regset[0] >= 0)
+                   if (regset_contains)
                      {
                        /* Regset is not empty, so add a tag before the
                           literal or backref. */
-                       if (!first_pass)
+                       if (first_pass)
                          {
-                           status = tre_add_tag_left(mem, node, tag);
-                           tnfa->tag_directions[tag] = direction;
-                           if (minimal_tag >= 0)
-                             {
-                               DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
-                               for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
-                               tnfa->minimal_tags[i] = tag;
-                               tnfa->minimal_tags[i + 1] = minimal_tag;
-                               tnfa->minimal_tags[i + 2] = -1;
-                               minimal_tag = -1;
-                               num_minimals++;
-                             }
-                           tre_purge_regset(regset, tnfa, tag);
+                           DPRINT(("  ADDTAGS_RECURSE:LITERAL node->num_tags = 1\n"));
+                           node->num_tags = 1;
                          }
                        else
                          {
-                           DPRINT(("  num_tags = 1\n"));
-                           node->num_tags = 1;
+                           status = tre_merge_branches(mem, node, NULL, tag,
+                                                       tnfa->num_tags);
+                           if (status != REG_OK)
+                             break;
+                           status = tre_add_tag_left(mem, node, tag);
+                           if (status != REG_OK)
+                             break;
+                           if (regset_contains == REGSET_HAS_STARTS)
+                             tnfa->tag_directions[tag] = TRE_TAG_LEFT_MAXIMIZE;
+                           else
+                             tnfa->tag_directions[tag] = direction;
+                           DPRINT(("Setting t%d direction to %s\n", tag,
+                                   tag_dir_str[tnfa->tag_directions[tag]]));
+                           tre_purge_regset(regset, tnfa, tag);
+
+                           if (IS_BACKREF(lit))
+                             {
+                               int b = lit->code_max;
+                               int t = tnfa->submatch_data[b].so_tag;
+                               /* Fail if the referenced submatch hasn't been
+                                * completed yet */
+                               if (tnfa->submatch_data[b].eo_tag < 0)
+                                 {
+                                   status = REG_ESUBREG;
+                                   break;
+                                 }
+                               if (t < tag)
+                                 {
+                                   DPRINT(("  Backref %d start: "
+                                           "t%d reordered to before t%d\n",
+                                           b, tag, t));
+                                   if(t > 0)
+                                     t--;
+                                   /* Append to tag_order, move "tag" after
+                                    * "t" */
+                                   *rtp++ = tag;
+                                   *rtp++ = t;
+                                 }
+#if TRE_DEBUG
+                               else
+                                 DPRINT(("  Backref %d start: "
+                                         "(t%d already before t%d)\n",
+                                           b, tag, t));
+#endif /* TRE_DEBUG */
+                             }
                          }
 
-                       DPRINT(("  num_tags++\n"));
-                       regset[0] = -1;
+                       DPRINT(("  ADDTAGS_RECURSE:LITERAL num_tags++ tag=%d\n",
+                               tag));
+                       regset[0] = REGSET_UNSET;
+                       regset_contains = 0;
                        tag = next_tag;
                        num_tags++;
                        next_tag++;
@@ -337,7 +662,8 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
                if (left->num_tags > 0 && right->num_tags > 0)
                  {
                    /* Reserve the next tag to the right child. */
-                   DPRINT(("  Reserving next_tag %d to right child\n",
+                   DPRINT(("  ADDTAGS_RECURSE:CATENATION num_tags++ "
+                           "Reserving next_tag %d to right child\n",
                            next_tag));
                    reserved_tag = next_tag;
                    next_tag++;
@@ -357,174 +683,309 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
                DPRINT(("Iteration\n"));
 
                if (first_pass)
-                 {
-                   STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
-                 }
-               else
-                 {
-                   STACK_PUSHX(stack, int, tag);
-                   STACK_PUSHX(stack, int, iter->minimal);
-                 }
+                 STACK_PUSHX(stack, int, regset_contains != 0);
+               STACK_PUSHX(stack, int, tag);
                STACK_PUSHX(stack, voidptr, node);
                STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
 
                STACK_PUSHX(stack, voidptr, iter->arg);
                STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
 
-               /* Regset is not empty, so add a tag here. */
-               if (regset[0] >= 0 || iter->minimal)
+               /* Regset is not empty, so add a tag here (this always happens
+                  because iterators always get submatch id, even if in the
+                  invisible range) */
+               if (regset_contains)
                  {
                    if (!first_pass)
                      {
-                       int i;
+                       status = tre_merge_branches(mem, node, NULL, tag,
+                                                   tnfa->num_tags);
+                       if (status != REG_OK)
+                         break;
                        status = tre_add_tag_left(mem, node, tag);
-                       if (iter->minimal)
-                         tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
+                       if (status != REG_OK)
+                         break;
+                       if (regset_contains == REGSET_HAS_STARTS && tag != 0)
+                         tnfa->tag_directions[tag] = iter->minimal ?
+                                                     TRE_TAG_MINIMIZE :
+                                                     TRE_TAG_LEFT_MAXIMIZE;
                        else
                          tnfa->tag_directions[tag] = direction;
-                       if (minimal_tag >= 0)
-                         {
-                           DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
-                           for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
-                           tnfa->minimal_tags[i] = tag;
-                           tnfa->minimal_tags[i + 1] = minimal_tag;
-                           tnfa->minimal_tags[i + 2] = -1;
-                           minimal_tag = -1;
-                           num_minimals++;
-                         }
+                       DPRINT(("Setting t%d direction to %s\n", tag,
+                               tag_dir_str[tnfa->tag_directions[tag]]));
                        tre_purge_regset(regset, tnfa, tag);
                      }
 
-                   DPRINT(("  num_tags++\n"));
-                   regset[0] = -1;
+                   DPRINT(("  ADDTAGS_RECURSE:ITERATION num_tags++ tag=%d\n",
+                           tag));
+                   regset[0] = REGSET_UNSET;
+                   regset_contains = 0;
                    tag = next_tag;
                    num_tags++;
                    next_tag++;
                  }
-               direction = TRE_TAG_MINIMIZE;
+               direction = TRE_TAG_LEFT_MAXIMIZE;
+               DPRINT(("  Setting direction to %s\n", tag_dir_str[direction]));
              }
              break;
            case UNION:
              {
-               tre_union_t *uni = node->obj;
-               tre_ast_node_t *left = uni->left;
-               tre_ast_node_t *right = uni->right;
-               int left_tag;
-               int right_tag;
+               tre_union_t *uni;
+               tre_ast_node_t *left;
+               tre_ast_node_t *right;
+               int front_tag = -1;
+
+               DPRINT(("Union\n"));
 
-               if (regset[0] >= 0)
+               if (regset_contains)
                  {
-                   left_tag = next_tag;
-                   right_tag = next_tag + 1;
+                   DPRINT(("  UNION num_tags++ tag=%d\n", tag));
+                   front_tag = tag;
+                   tag = next_tag;
+                   num_tags++;
+                   next_tag++;
                  }
-               else
+
+               /* For the top union, walk the tree of consecutive unions,
+                * setting the left_tag and right_tag values in increasing
+                * order (left to right priority) */
+               if (top_union &&
+                   (node->num_submatches -
+                   (node->submatch_id >= 0 &&
+                   node->submatch_id < SUBMATCH_ID_INVISIBLE_START)) > 0)
                  {
-                   left_tag = tag;
-                   right_tag = next_tag;
-                 }
+                   tre_ast_node_t *n;
+                   int last = tre_stack_num_objects(stack);
 
-               DPRINT(("Union\n"));
+                   STACK_PUSH(stack, voidptr, node);
+                   STACK_PUSH(stack, int, ADDTAGS_UNION_RECURSE);
+
+                   while (tre_stack_num_objects(stack) > last)
+                     {
+                       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
+                       switch (symbol)
+                         {
+                         case ADDTAGS_UNION_RECURSE:
+                           n = tre_stack_pop_voidptr(stack);
+                           uni = n->obj;
+                           left = uni->left;
+
+                           /* Since the top union has num_submatches > 0,
+                            * we set all the consecutive union's
+                            * make_branches to 1 to force the generation
+                            * of end tags for each union branch. */
+                           n->make_branches = 1;
+
+                           STACK_PUSH(stack, voidptr, n);
+                           STACK_PUSH(stack, int,
+                                      ADDTAGS_UNION_RIGHT_RECURSE);
+
+                           if (left->type == UNION)
+                             {
+                               STACK_PUSH(stack, voidptr, left);
+                               STACK_PUSH(stack, int,
+                                          ADDTAGS_UNION_RECURSE);
+                             }
+                           else
+                             {
+                               DPRINT(("  ADDTAGS_UNION_RECURSE "
+                                       "num_tags++ tag=%d\n", tag));
+                               uni->left_tag = tag;
+                               tag = next_tag;
+                               num_tags++;
+                               next_tag++;
+                             }
+                           break;
+
+                         case ADDTAGS_UNION_RIGHT_RECURSE:
+                           n = tre_stack_pop_voidptr(stack);
+                           uni = n->obj;
+                           right = uni->right;
+
+                           if (right->type == UNION)
+                             {
+                               STACK_PUSH(stack, voidptr, right);
+                               STACK_PUSH(stack, int,
+                                          ADDTAGS_UNION_RECURSE);
+                             }
+                           else
+                             {
+                               DPRINT(("  ADDTAGS_UNION_RIGHT_RECURSE "
+                                       "num_tags++ tag=%d\n", tag));
+                               uni->right_tag = tag;
+                               tag = next_tag;
+                               num_tags++;
+                               next_tag++;
+                             }
+
+                           break;
+
+                         default:
+                           assert(0);
+                           break;
+
+                         } /* end switch(symbol) */
+                     } /* end while(tre_stack_num_objects(stack) > last */
+                   if (!first_pass)
+                     {
+                       STACK_PUSHX(stack, int, front_tag);
+                       STACK_PUSHX(stack, voidptr, node);
+                       STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_TOP);
+                     }
+                 } /* end if (top_union && ...) */
+
+               uni = node->obj;
+               left = uni->left;
+               right = uni->right;
 
                /* After processing right child. */
-               STACK_PUSHX(stack, int, right_tag);
-               STACK_PUSHX(stack, int, left_tag);
                STACK_PUSHX(stack, voidptr, regset);
-               STACK_PUSHX(stack, int, regset[0] >= 0);
+               STACK_PUSHX(stack, int, regset_contains != 0);
                STACK_PUSHX(stack, voidptr, node);
-               STACK_PUSHX(stack, voidptr, right);
-               STACK_PUSHX(stack, voidptr, left);
                STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
 
                /* Process right child. */
                STACK_PUSHX(stack, voidptr, right);
-               STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
+               STACK_PUSHX(stack, int, ADDTAGS_RECURSE_NOT_TOP_UNION);
 
                /* After processing left child. */
                STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
 
                /* Process left child. */
                STACK_PUSHX(stack, voidptr, left);
-               STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
+               STACK_PUSHX(stack, int, ADDTAGS_RECURSE_NOT_TOP_UNION);
 
                /* Regset is not empty, so add a tag here. */
-               if (regset[0] >= 0)
+               if (regset_contains)
                  {
                    if (!first_pass)
                      {
-                       int i;
-                       status = tre_add_tag_left(mem, node, tag);
-                       tnfa->tag_directions[tag] = direction;
-                       if (minimal_tag >= 0)
-                         {
-                           DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
-                           for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
-                           tnfa->minimal_tags[i] = tag;
-                           tnfa->minimal_tags[i + 1] = minimal_tag;
-                           tnfa->minimal_tags[i + 2] = -1;
-                           minimal_tag = -1;
-                           num_minimals++;
-                         }
-                       tre_purge_regset(regset, tnfa, tag);
+                       status = tre_merge_branches(mem, node, NULL, front_tag,
+                                                   tnfa->num_tags);
+                       if (status != REG_OK)
+                         break;
+                       status = tre_add_tag_left(mem, node, front_tag);
+                       if (status != REG_OK)
+                         break;
+                       if (regset_contains == REGSET_HAS_STARTS)
+                         tnfa->tag_directions[front_tag] = TRE_TAG_LEFT_MAXIMIZE;
+                       else
+                         tnfa->tag_directions[front_tag] = direction;
+                       DPRINT(("Setting t%d direction to %s\n", front_tag,
+                               tag_dir_str[tnfa->tag_directions[front_tag]]));
+                       tre_purge_regset(regset, tnfa, front_tag);
                      }
 
-                   DPRINT(("  num_tags++\n"));
-                   regset[0] = -1;
-                   tag = next_tag;
-                   num_tags++;
-                   next_tag++;
-                 }
-
-               if (node->num_submatches > 0)
-                 {
-                   /* The next two tags are reserved for markers. */
-                   next_tag++;
-                   tag = next_tag;
-                   next_tag++;
+                   regset[0] = REGSET_UNSET;
+                   regset_contains = 0;
                  }
 
                break;
              }
-           }
-
-         if (node->submatch_id >= 0)
-           {
-             int i;
-             /* Push this submatch on the parents stack. */
-             for (i = 0; parents[i] >= 0; i++);
-             parents[i] = node->submatch_id;
-             parents[i + 1] = -1;
-           }
+           } /* end switch (node->type) */
 
          break; /* end case: ADDTAGS_RECURSE */
 
        case ADDTAGS_AFTER_ITERATION:
          {
-           int minimal = 0;
+           tre_iteration_t *iter;
+           tre_ast_node_t *orig;
            int enter_tag;
+
            node = tre_stack_pop_voidptr(stack);
+           orig = node->original ? node->original : node;
+           iter = (tre_iteration_t *)orig->obj;
+           enter_tag = tre_stack_pop_int(stack);
+           if (iter->minimal)
+             minimal_tag = enter_tag;
+
+           DPRINT(("After iteration\n"));
            if (first_pass)
              {
-               node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
-                 + tre_stack_pop_int(stack);
-               minimal_tag = -1;
+               node->num_tags = iter->arg->num_tags + tre_stack_pop_int(stack);
+               DPRINT(("  ADDTAGS_AFTER_ITERATION: node->num_tags = %d\n",
+                       node->num_tags));
              }
            else
              {
-               minimal = tre_stack_pop_int(stack);
-               enter_tag = tre_stack_pop_int(stack);
-               if (minimal)
-                 minimal_tag = enter_tag;
-             }
+               /* node->last_matched_branch will have the start tag (the tag
+                  just *before* the iteration).  iter->arg->last_matched_branch
+                  will have the tag(s) inside the iteration, the ones that
+                  may need to be reset if the iteration doesn't match.  So
+                  before we merge iter->arg into node, we need to set up
+                  a new tre_last_matched_t and tre_last_matched_branch_t,
+                  using any of the inside tags as cmp_tag (we choose the first
+                  tag found by bit_ffs).  If there are no inside tags, we
+                  don't bother creating the extra structures. */
+               tre_last_matched_branch_pre_t *b =
+                                               iter->arg->last_matched_branch;
+
+               if (b && b->n_tags > 0)
+                 {
+                   tre_last_matched_pre_t *u;
 
-           DPRINT(("After iteration\n"));
-           if (!first_pass)
-             {
-               DPRINT(("  Setting direction to %s\n",
-                       minimal ? "minimize" : "maximize"));
-               if (minimal)
-                 direction = TRE_TAG_MINIMIZE;
+                   bit_ffs(b->tags, num_tags, &b->cmp_tag);
+                   DPRINT(("  ADDTAGS_AFTER_ITERATION: n_tags=%d "
+                           "cmp_tag = %d\n", b->n_tags, b->cmp_tag));
+
+                   u = tre_mem_calloc(mem, sizeof(tre_last_matched_pre_t) +
+                                      sizeof(tre_last_matched_branch_pre_t)
+                                      + bitstr_size(tnfa->num_tags));
+                   if (!u)
+                     {
+                       status = REG_ESPACE;
+                       break;
+                     }
+                   u->branches = b;
+                   u->n_branches = 1;
+                   u->start_tag = b->cmp_tag;
+                   u->tot_branches = b->tot_branches;
+                   u->tot_last_matched = 1 + b->tot_last_matched;
+                   u->tot_tags = b->tot_tags;
+
+                   b = (tre_last_matched_branch_pre_t *)(u + 1);
+                   b->last_matched = u;
+                   b->n_last_matched = 1;
+                   b->tot_branches = 1 + u->tot_branches;
+                   b->tot_last_matched = u->tot_last_matched;
+                   b->tot_tags = u->tot_tags;
+
+                   iter->arg->last_matched_branch = b;
+                 }
+               status = tre_merge_branches(mem, node, iter->arg, 0,
+                                           tnfa->num_tags);
+               if (status != REG_OK)
+                 break;
+
+               if (iter->minimal)
+                 {
+                   /* Add a union with a left EMPTY literal and the right
+                      being iter->arg.  This should force the tags inside
+                      the minimal iteration to prefer being unset */
+                   if (iter->min == 0 && iter->max <= 1)
+                     {
+                       tre_ast_node_t *u, *e;
+
+                       e = tre_ast_new_literal(mem, EMPTY, -1, -1);
+                       if (e == NULL)
+                         {
+                           status = REG_ESPACE;
+                           break;
+                         }
+                       u = tre_ast_new_union(mem, e, iter->arg);
+                       if (u == NULL)
+                         {
+                           status = REG_ESPACE;
+                           break;
+                         }
+                       iter->arg = u;
+                     }
+
+                   direction = TRE_TAG_MINIMIZE;
+                 }
                else
                  direction = TRE_TAG_MAXIMIZE;
+               DPRINT(("  Setting direction to %s\n", tag_dir_str[direction]));
              }
            break;
          }
@@ -544,12 +1005,29 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
          }
 
        case ADDTAGS_AFTER_CAT_RIGHT:
-         DPRINT(("After cat right\n"));
-         node = tre_stack_pop_voidptr(stack);
-         if (first_pass)
-           node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
-             + ((tre_catenation_t *)node->obj)->right->num_tags;
-         break;
+         {
+           tre_catenation_t *cat;
+
+           DPRINT(("After cat right\n"));
+           node = tre_stack_pop_voidptr(stack);
+           cat = node->obj;
+           if (first_pass)
+             {
+               node->num_tags = cat->left->num_tags + cat->right->num_tags;
+               DPRINT(("  ADDTAGS_AFTER_CAT_RIGHT: node->num_tags = %d\n",
+                       node->num_tags));
+             }
+           else
+             {
+               status = tre_merge_branches(mem, cat->left, cat->right, 0,
+                                           tnfa->num_tags);
+               if (status != REG_OK)
+                 break;
+               status = tre_merge_branches(mem, node, cat->left, 0,
+                                           tnfa->num_tags);
+             }
+           break;
+         }
 
        case ADDTAGS_AFTER_UNION_LEFT:
          DPRINT(("After union left\n"));
@@ -557,53 +1035,223 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
             the right operand the items currently in the array are
             invisible.  The original bottom was saved at ADDTAGS_UNION and
             will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
-         while (*regset >= 0)
+         while (*regset != REGSET_UNSET)
            regset++;
+         regset_contains = 0;
          break;
 
        case ADDTAGS_AFTER_UNION_RIGHT:
          {
-           int added_tags, tag_left, tag_right;
-           tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
-           tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
+           int added_tags;
+           tre_ast_node_t *orig;
+           tre_union_t *uni;
+           /* Note: node may not be a UNION, but a CATENATION with a left
+            * tag.  So that is why we pass the original node->obj on the
+            * stack, to get the union's true values. */
+
            DPRINT(("After union right\n"));
            node = tre_stack_pop_voidptr(stack);
+           orig = node->original ? node->original : node;
+           uni = (tre_union_t *)orig->obj;
            added_tags = tre_stack_pop_int(stack);
            if (first_pass)
              {
-               node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
-                 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
-                 + ((node->num_submatches > 0) ? 2 : 0);
+               node->num_tags = uni->left->num_tags + uni->right->num_tags
+                                + added_tags;
+               if (uni->left_tag > 0)
+                 node->num_tags++;
+               if (uni->right_tag > 0)
+                 node->num_tags++;
+               DPRINT(("  ADDTAGS_AFTER_UNION_RIGHT: node->num_tags = %d\n",
+                       node->num_tags));
              }
            regset = tre_stack_pop_voidptr(stack);
-           tag_left = tre_stack_pop_int(stack);
-           tag_right = tre_stack_pop_int(stack);
 
            /* Add tags after both children, the left child gets a smaller
               tag than the right child.  This guarantees that we prefer
               the left child over the right child. */
            /* XXX - This is not always necessary (if the children have
               tags which must be seen for every match of that child). */
-           /* XXX - Check if this is the only place where tre_add_tag_right
-              is used.  If so, use tre_add_tag_left (putting the tag before
-              the child as opposed after the child) and throw away
-              tre_add_tag_right. */
-           if (node->num_submatches > 0)
+           if (!first_pass && node->make_branches)
              {
-               if (!first_pass)
+               tre_last_matched_branch_pre_t *lb =
+                 uni->left->last_matched_branch;
+               tre_last_matched_branch_pre_t *rb =
+                 uni->right->last_matched_branch;
+               tre_last_matched_pre_t *lu =
+                 uni->left->last_matched_in_progress;
+               tre_last_matched_pre_t *ru =
+                 uni->right->last_matched_in_progress;
+               tre_last_matched_pre_t *u;
+               /* We don't need to call tre_merge_branches because these
+                * tags don't participate in submatch ranges, so don't need
+                * to be recorded.  But we do set the cmp_tag entry of the
+                * tre_last_matched_branch_pre_t, so we might call
+                * tre_merge_branches if we need to create an empty
+                * tre_last_matched_branch_pre_t. */
+               if (uni->left_tag > 0)
+                 {
+                   DPRINT(("Setting t%d direction to maximize\n",
+                           uni->left_tag));
+                   status = tre_add_tag_right(mem, uni->left, uni->left_tag);
+                   if (status != REG_OK)
+                     break;
+                   tnfa->tag_directions[uni->left_tag] = TRE_TAG_MAXIMIZE;
+                   if (!lb)
+                     {
+                       status = tre_merge_branches(mem, uni->left, NULL, -1,
+                                                   tnfa->num_tags);
+                       if (status != REG_OK)
+                         break;
+                       lb = uni->left->last_matched_branch;
+                     }
+                   lb->cmp_tag = uni->left_tag;
+                 }
+               if (uni->right_tag > 0)
                  {
-                   status = tre_add_tag_right(mem, left, tag_left);
-                   tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
-                   status = tre_add_tag_right(mem, right, tag_right);
-                   tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
+                   DPRINT(("Setting t%d direction to maximize\n",
+                           uni->right_tag));
+                   status = tre_add_tag_right(mem, uni->right, uni->right_tag);
+                   if (status != REG_OK)
+                     break;
+                   tnfa->tag_directions[uni->right_tag] = TRE_TAG_MAXIMIZE;
+                   if (!rb)
+                     {
+                       status = tre_merge_branches(mem, uni->right, NULL, -1,
+                                                   tnfa->num_tags);
+                       if (status != REG_OK)
+                         break;
+                       rb = uni->right->last_matched_branch;
+                     }
+                   rb->cmp_tag = uni->right_tag;
                  }
-               DPRINT(("  num_tags += 2\n"));
-               num_tags += 2;
+               /* Now merge the tre_last_matched_branch_pre_t into a
+                  tre_last_matched_pre_t */
+               if (lu == NULL)
+                 {
+                   if (ru == NULL)
+                     {
+                       /* Create a new tre_last_matched_pre_t */
+                       u = tre_mem_calloc(mem, sizeof(tre_last_matched_pre_t));
+                       if (!u)
+                         {
+                           status = REG_ESPACE;
+                           break;
+                         }
+                       u->tot_last_matched = 1;
+
+                       if (lb)
+                         {
+                           u->branches = lb;
+                           u->n_branches = 1;
+                           u->tot_branches += lb->tot_branches;
+                           u->tot_last_matched += lb->tot_last_matched;
+                           u->tot_tags += lb->tot_tags;
+                           if (rb)
+                             {
+                               lb->next = rb;
+                               u->n_branches++;
+                               u->tot_branches += rb->tot_branches;
+                               u->tot_last_matched += rb->tot_last_matched;
+                               u->tot_tags += rb->tot_tags;
+                             }
+                         }
+                       else if (rb)
+                         {
+                           u->branches = rb;
+                           u->n_branches = 1;
+                           u->tot_branches += rb->tot_branches;
+                           u->tot_last_matched += rb->tot_last_matched;
+                           u->tot_tags += rb->tot_tags;
+                         }
+                     }
+                   else
+                     {
+                       /* Use ru, and add lb */
+                       u = ru;
+                       if (lb)
+                         {
+                           lb->next = u->branches;
+                           u->branches = lb;
+                           u->n_branches++;
+                           u->tot_branches += lb->tot_branches;
+                           u->tot_last_matched += lb->tot_last_matched;
+                           u->tot_tags += lb->tot_tags;
+                         }
+                     }
+                 }
+               else if (ru == NULL)
+                 {
+                   /* Use lu, and add rb */
+                   u = lu;
+                   if (rb)
+                     {
+                       rb->next = u->branches;
+                       u->branches = rb;
+                       u->n_branches++;
+                       u->tot_branches += rb->tot_branches;
+                       u->tot_last_matched += rb->tot_last_matched;
+                       u->tot_tags += rb->tot_tags;
+                     }
+                 }
+               else
+                 {
+                   /* Merge lu and ru into lu */
+                   if (lu->branches)
+                     {
+                       if (ru->branches)
+                         {
+                           tre_last_matched_branch_pre_t *b = lu->branches;
+                           while (b->next) b = b->next;
+                           b->next = ru->branches;
+                           lu->n_branches += ru->n_branches;
+                         }
+                     }
+                   else if (ru->branches)
+                     {
+                       lu->branches = ru->branches;
+                       lu->n_branches = ru->n_branches;
+                     }
+                   lu->tot_branches += ru->tot_branches;
+                   lu->tot_last_matched += ru->tot_last_matched - 1;
+                   lu->tot_tags += ru->tot_tags;
+                   u = lu;
+                 }
+               node->last_matched_in_progress = u;
              }
            direction = TRE_TAG_MAXIMIZE;
            break;
          }
 
+       case ADDTAGS_AFTER_UNION_TOP: /* only called when not first_pass */
+         {
+           tre_last_matched_branch_pre_t *b;
+           tre_last_matched_pre_t *u;
+           int start_tag;
+
+           DPRINT(("After union top\n"));
+           node = tre_stack_pop_voidptr(stack);
+           start_tag = tre_stack_pop_int(stack);
+           b = tre_mem_calloc(mem, sizeof(tre_last_matched_branch_pre_t)
+                              + bitstr_size(tnfa->num_tags));
+           if (!b)
+             {
+               status = REG_ESPACE;
+               break;
+             }
+
+           u = node->last_matched_in_progress;
+           u->start_tag = start_tag;
+           b->tot_branches = 1 + u->tot_branches;
+           b->tot_last_matched = u->tot_last_matched;
+           b->tot_tags = u->tot_tags;
+           b->last_matched = u;
+           b->n_last_matched = 1;
+           node->last_matched_branch = b;
+           node->last_matched_in_progress = NULL;
+           break;
+         }
+
        default:
          assert(0);
          break;
@@ -611,31 +1259,384 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
        } /* end switch(symbol) */
     } /* end while(tre_stack_num_objects(stack) > bottom) */
 
+  if (status != REG_OK)
+    {
+      DPRINT(("Error during %s pass\n", first_pass ? "first" : "second"));
+      goto error_post_compile;
+    }
+
   if (!first_pass)
-    tre_purge_regset(regset, tnfa, tag);
+    {
+      int i;
+      if (num_tags != tnfa->num_tags)
+       {
+         DPRINT(("num_tags(%d) != tnfa->num_tags(%d)\n", num_tags,
+                 tnfa->num_tags));
+         status = REG_BADPAT;
+         goto error_post_compile;
+       }
 
-  if (!first_pass && minimal_tag >= 0)
+      tre_purge_regset(regset, tnfa, tag);
+      DPRINT(("Setting t%d to %s\n", num_tags,
+             tag_dir_str[direction]));
+      tnfa->tag_directions[num_tags] = direction;
+
+      if (rtp > reorder_tags + 2 * tnfa->num_reorder_tags)
+       {
+         DPRINT(("Processed %d reorder tags instead of %d\n",
+                 (int)(rtp - reorder_tags) / 2, tnfa->num_reorder_tags));
+         status = REG_BADPAT;
+         goto error_post_compile;
+       }
+    *rtp = -1;
+#if TRE_DEBUG
+      if (reorder_tags[0] >= 0)
+       {
+         DPRINT(("reorder_tags:\n"));
+         for (rtp = reorder_tags; *rtp >= 0;)
+           {
+             DPRINT(("%d after ", *rtp++));
+             DPRINT(("%d\n", *rtp++));
+           }
+       }
+       else
+         DPRINT(("No reorder_tags\n"));
+#endif /* TRE_DEBUG */
+
+      /* Initialize to_reorder */
+      for (i = 0; i < num_tags; i++)
+       to_reorder[i] = i;
+      /* Use to_seq_order to convert reorder_tags values, and use those to
+       * reorder to_reorder */
+      for (rtp = reorder_tags; *rtp >= 0;)
+       {
+         int j, high, low;
+         int ti = *rtp++;
+
+         /* Skip reordering the final tag */
+         if (ti >= num_tags)
+           {
+             DPRINT(("Skipping reorder of %d\n", ti));
+             rtp++;
+             continue;
+           }
+         /* The number of the tag to reorder */
+         high = to_reorder[ti];
+         /* Reorder after this tag */
+         low = to_reorder[*rtp++];
+
+         DPRINT(("ti=%d high=%d low=%d\n", ti, high, low));
+         if (low > high)
+           {
+             DPRINT(("Tag %d already before %d\n", high, low));
+             continue;
+           }
+         for (j = 0; j < num_tags; j++)
+           if (to_reorder[j] > low && to_reorder[j] < high)
+             to_reorder[j]++;
+         to_reorder[ti] = low + 1;
+#ifdef TRE_DEBUG
+         DPRINT(("to_reorder=("));
+         for (j = 0; j < num_tags; j++)
+           {
+             DPRINT(("%d", to_reorder[j]));
+             if (j < num_tags - 1)
+               DPRINT((","));
+           }
+         DPRINT((")\n"));
+#endif /* TRE_DEBUG */
+       }
+      /* Determine if reordering in really necessary */
+      {
+       int need_reorder = 0;
+       for (i = 0; i < num_tags; i++)
+         if(to_reorder[i] != i)
+           {
+             need_reorder = 1;
+             break;
+           }
+       /* If need_reorder is not set, free reorder_tags, and set to NULL,
+        * indicating no reordering is needed */
+       if (!need_reorder)
+         {
+           DPRINT(("Don't need to reorder\n"));
+           xfree(reorder_tags);
+           reorder_tags = NULL;
+         }
+      }
+    }
+
+  if (reorder_tags)
     {
       int i;
-      DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
-      for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
-      tnfa->minimal_tags[i] = tag;
-      tnfa->minimal_tags[i + 1] = minimal_tag;
-      tnfa->minimal_tags[i + 2] = -1;
-      minimal_tag = -1;
-      num_minimals++;
+      tre_tag_direction_t *new_tag_directions;
+#if TRE_DEBUG
+      DPRINT(("to_reorder:"));
+      for (i = 0; i < num_tags; i++)
+       DPRINT((" %d->%d", i, to_reorder[i]));
+      DPRINT(("\n"));
+#endif /* TRE_DEBUG */
+
+      DPRINT(("Reordering submatch_data\n"));
+      for (i = 0; i < tnfa->num_submatches; i++)
+       {
+#if TRE_DEBUG
+         int so = tnfa->submatch_data[i].so_tag;
+         int eo = tnfa->submatch_data[i].eo_tag;
+#endif /* TRE_DEBUG */
+         tnfa->submatch_data[i].so_tag =
+           to_reorder[tnfa->submatch_data[i].so_tag];
+         tnfa->submatch_data[i].eo_tag =
+           tnfa->submatch_data[i].eo_tag < num_tags ?
+           to_reorder[tnfa->submatch_data[i].eo_tag] :
+           tnfa->submatch_data[i].eo_tag;
+         DPRINT(("pmatch[%d]: {%d, %d}->{%d, %d}\n", i, so, eo,
+                 tnfa->submatch_data[i].so_tag,
+                 tnfa->submatch_data[i].eo_tag));
+       }
+
+      DPRINT(("Reordering tag_directions\n"));
+      /* We only allocate num_tags directions and reorder them.  The
+       * num_tags-th direction (end tag) is left unchanged. */
+      new_tag_directions = xmalloc(sizeof(*new_tag_directions) * num_tags);
+      if (new_tag_directions == NULL)
+       {
+         status = REG_ESPACE;
+         goto error_post_compile;
+       }
+      for (i = 0; i < num_tags; i++)
+       {
+         new_tag_directions[to_reorder[i]] = tnfa->tag_directions[i];
+       }
+#if TRE_DEBUG
+      for (i = 0; i < num_tags; i++)
+       {
+         DPRINT(("t%d %s->%s\n", i,
+                 tag_dir_str[tnfa->tag_directions[i]],
+                 tag_dir_str[new_tag_directions[i]]));
+       }
+       DPRINT(("t%d %s->%s\n", num_tags,
+               tag_dir_str[tnfa->tag_directions[num_tags]],
+               tag_dir_str[tnfa->tag_directions[num_tags]]));
+#endif /* TRE_DEBUG */
+      memcpy(tnfa->tag_directions, new_tag_directions, sizeof(*new_tag_directions) * num_tags);
+      xfree(new_tag_directions);
+
+      DPRINT(("Reordering minimal_tags\n"));
+      for (i = 0; tnfa->minimal_tags[i] >= 0; i++)
+       tnfa->minimal_tags[i] = tnfa->minimal_tags[i] < num_tags ?
+                               to_reorder[tnfa->minimal_tags[i]] :
+                               tnfa->minimal_tags[i];
+
+      DPRINT(("Reordering AST tags\n"));
+      STACK_PUSH(stack, voidptr, tree);
+      while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
+       {
+         node = tre_stack_pop_voidptr(stack);
+
+         switch (node->type)
+           {
+           case LITERAL:
+             {
+               tre_literal_t *lit = (tre_literal_t *)node->obj;
+               if (IS_TAG(lit))
+                 lit->code_max = to_reorder[lit->code_max];
+               break;
+             }
+
+           case UNION:
+             {
+               tre_union_t *uni = (tre_union_t *)node->obj;
+               STACK_PUSHX(stack, voidptr, uni->right);
+               STACK_PUSHX(stack, voidptr, uni->left);
+               break;
+             }
+
+           case CATENATION:
+             {
+               tre_catenation_t *cat = (tre_catenation_t *)node->obj;
+               STACK_PUSHX(stack, voidptr, cat->right);
+               STACK_PUSHX(stack, voidptr, cat->left);
+               break;
+             }
+
+           case ITERATION:
+             {
+               tre_iteration_t *iter = (tre_iteration_t *)node->obj;
+               STACK_PUSHX(stack, voidptr, iter->arg);
+               break;
+             }
+
+           default:
+             assert(0);
+             break;
+           }
+       }
+       if (status != REG_OK)
+         {
+           DPRINT(("Error while reordering tags\n"));
+           goto error_post_compile;
+         }
+    }
+
+
+  if (!first_pass)
+    {
+      if (tree->last_matched_branch)
+       {
+         tre_last_matched_branch_t *buf, *b, *bb;
+         tre_last_matched_branch_pre_t *bp;
+         tre_last_matched_t *u, *uu;
+         tre_last_matched_pre_t *up;
+         int *t;
+         int i;
+#ifdef TRE_DEBUG
+         tre_last_matched_branch_t *_b;
+         tre_last_matched_t *_u;
+         int *_t;
+
+         DPRINT(("last_match_branch_pre:\n"));
+         print_last_match_branch_pre(tree->last_matched_branch, 0, num_tags);
+#endif /* TRE_DEBUG */
+         buf = (tre_last_matched_branch_t *)xcalloc(1,
+                                    tree->last_matched_branch->tot_branches
+                                    * sizeof(tre_last_matched_branch_t) +
+                                    tree->last_matched_branch->tot_last_matched
+                                    * sizeof(tre_last_matched_t) +
+                                    tree->last_matched_branch->tot_tags *
+                                    sizeof(int));
+         if (!buf)
+           {
+             status = REG_ESPACE;
+             goto error_post_compile;
+           }
+
+         b = buf;
+         u = (tre_last_matched_t *)(b +
+             tree->last_matched_branch->tot_branches);
+         t = (int *)(u + tree->last_matched_branch->tot_last_matched);
+#ifdef TRE_DEBUG
+         _b = b;
+         _u = u;
+         _t = t;
+#endif /* TRE_DEBUG */
+         DPRINT(("Copying info_pre to info\n"));
+         STACK_PUSH(stack, voidptr, tree->last_matched_branch);
+         STACK_PUSH(stack, int, 1);
+         STACK_PUSH(stack, int, COPY_LAST_MATCHED_BRANCH);
+
+         while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
+           {
+             switch (tre_stack_pop_int(stack))
+               {
+               case COPY_LAST_MATCHED_BRANCH:
+                 i = tre_stack_pop_int(stack);
+                 /* The tre_last_matched_branch_pre_t * is still on the
+                    stack */
+                 STACK_PUSHX(stack, voidptr, b);
+                 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH_NEXT);
+                 b += i;
+                 break;
+
+               case COPY_LAST_MATCHED_BRANCH_NEXT:
+                 bb = tre_stack_pop_voidptr(stack);
+                 bp = tre_stack_pop_voidptr(stack);
+                 bb->n_last_matched = bp->n_last_matched;
+                 bb->cmp_tag = bp->cmp_tag;
+                 if (bp->n_tags > 0)
+                   {
+                     int n;
+                     n = bb->n_tags = bp->n_tags;
+                     bb->tags = t;
+                     for (i = 0; i < num_tags; i++)
+                       if (bit_test(bp->tags, i))
+                         {
+                           *t++ = i;
+                           if (--n <= 0)
+                             break;
+                         }
+                   }
+                 if (bp->next)
+                   {
+                     STACK_PUSHX(stack, voidptr, bp->next);
+                     STACK_PUSHX(stack, voidptr, bb + 1);
+                     STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH_NEXT);
+                   }
+                 if (bp->n_last_matched > 0)
+                   {
+                     bb->last_matched = u;
+                     STACK_PUSHX(stack, voidptr, bp->last_matched);
+                     STACK_PUSHX(stack, int, bp->n_last_matched);
+                     STACK_PUSHX(stack, int, COPY_LAST_MATCHED);
+                   }
+                 break;
+
+               case COPY_LAST_MATCHED:
+                 i = tre_stack_pop_int(stack);
+                 /* The tre_last_matched_pre_t * is still on the stack */
+                 STACK_PUSHX(stack, voidptr, u);
+                 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_NEXT);
+                 u += i;
+                 break;
+
+               case COPY_LAST_MATCHED_NEXT:
+                 uu = tre_stack_pop_voidptr(stack);
+                 up = tre_stack_pop_voidptr(stack);
+                 uu->n_branches = up->n_branches;
+                 uu->branches = b;
+                 uu->start_tag = up->start_tag;
+                 if (up->next)
+                   {
+                     STACK_PUSHX(stack, voidptr, up->next);
+                     STACK_PUSHX(stack, voidptr, uu + 1);
+                     STACK_PUSHX(stack, int, COPY_LAST_MATCHED_NEXT);
+                   }
+                 STACK_PUSHX(stack, voidptr, up->branches);
+                 STACK_PUSHX(stack, int, up->n_branches);
+                 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH);
+                 break;
+               }
+           }
+         if (status != REG_OK)
+           goto error_post_compile;
+#ifdef TRE_DEBUG
+         DPRINT(("last_matched_branch:\n"));
+         print_last_match_branch(buf, 0);
+         if (b != _b + tree->last_matched_branch->tot_branches)
+           DPRINT(("b/%p != _b + tree->last_matched_branch->tot_branches/%p\n",
+                   b, _b + tree->last_matched_branch->tot_branches));
+         if (u != _u + tree->last_matched_branch->tot_last_matched)
+           DPRINT(("u/%p != _u + "
+                   "tree->last_matched_branch->tot_last_matched/%p\n",
+                   u, _u + tree->last_matched_branch->tot_last_matched));
+         if (t != _t + tree->last_matched_branch->tot_tags)
+           DPRINT(("t/%p != _t + tree->last_matched_branch->tot_tags/%p\n",
+                   t, _t + tree->last_matched_branch->tot_tags));
+#endif /* TRE_DEBUG */
+         tnfa->last_matched_branch = buf;
+       }
+#ifdef TRE_DEBUG
+      else
+       DPRINT(("No last_match_branch_pre\n"));
+#endif /* TRE_DEBUG */
     }
 
   DPRINT(("tre_add_tags: %s complete.  Number of tags %d.\n",
          first_pass? "First pass" : "Second pass", num_tags));
-
+#ifdef TRE_DEBUG
+  tre_ast_print(tree);
+#endif /* TRE_DEBUG */
+  DPRINT(("tre_add_tags: tree->num_tags=%d num_tags=%d\n", tree->num_tags,
+         num_tags));
   assert(tree->num_tags == num_tags);
   tnfa->end_tag = num_tags;
   tnfa->num_tags = num_tags;
   tnfa->num_minimals = num_minimals;
+error_post_compile:
+  xfree(reorder_tags);
+error_reorder_tags:
   xfree(orig_regset);
-  xfree(parents);
-  xfree(saved_states);
+error_regset:
   return status;
 }
 
@@ -691,6 +1692,9 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
                int pos = lit->position;
                int min = lit->code_min;
                int max = lit->code_max;
+               tre_bracket_match_list_t *list = !IS_SPECIAL(lit) ?
+                                                 lit->u.bracket_match_list :
+                                                 NULL;
                if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
                  {
                    /* XXX - e.g. [ab] has only one position but two
@@ -709,7 +1713,8 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
                         && first_tag)
                  {
                    /* Maximize the first tag. */
-                   tag_directions[max] = TRE_TAG_MAXIMIZE;
+                   if (tag_directions[max] == TRE_TAG_LEFT_MAXIMIZE)
+                     tag_directions[max] = TRE_TAG_MAXIMIZE;
                    first_tag = 0;
                  }
                *result = tre_ast_new_literal(mem, min, max, pos);
@@ -718,6 +1723,10 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
 
                if (pos > *max_pos)
                  *max_pos = pos;
+
+               if (!IS_SPECIAL(lit))
+                 ((tre_literal_t *)(*result)->obj)->u.bracket_match_list
+                     = list;
                break;
              }
            case UNION:
@@ -807,15 +1816,21 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
   int pos_add = 0;
   int pos_add_total = 0;
   int max_pos = 0;
+#ifdef TRE_APPROX
   /* Current approximate matching parameters. */
   int params[TRE_PARAM_LAST];
   /* Approximate parameter nesting level. */
   int params_depth = 0;
+#endif /* TRE_APPROX */
   int iter_depth = 0;
+#ifdef TRE_APPROX
   int i;
+#endif /* TRE_APPROX */
 
+#ifdef TRE_APPROX
   for (i = 0; i < TRE_PARAM_LAST; i++)
     params[i] = TRE_PARAM_DEFAULT;
+#endif /* TRE_APPROX */
 
   STACK_PUSHR(stack, voidptr, ast);
   STACK_PUSHR(stack, int, EXPAND_RECURSE);
@@ -893,7 +1908,20 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
            int pos_add_last;
            pos_add = tre_stack_pop_int(stack);
            pos_add_last = pos_add;
-           if (iter->min > 1 || iter->max > 1)
+           /* Originally (in tre_parse_bound), if min == 0 && max == 0, we
+              immediate replace the whole iteration with EMPTY.  This
+              unfortunately drops any submatches, and messes up setting the
+              pmatch values (we can get tags of -1, and tag values in the
+              billions).  So we left it there and replace with EMPTY here. */
+           if (iter->min == 0 && iter->max == 0)
+             {
+               tre_ast_node_t *empty = tre_ast_new_literal(mem, EMPTY, -1, -1);
+               if (empty == NULL)
+                 return REG_ESPACE;
+               node->obj = empty->obj;
+               node->type = empty->type;
+             }
+           else if (iter->min > 1 || iter->max > 1)
              {
                tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
                int j;
@@ -975,6 +2003,7 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
            if (iter_depth == 0)
              pos_add = pos_add_total;
 
+#ifdef TRE_APPROX
            /* If approximate parameters are specified, surround the result
               with two parameter setting nodes.  The one on the left sets
               the specified parameters, and the one on the right restores
@@ -1021,6 +2050,7 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
                if (params_depth > *max_depth)
                  *max_depth = params_depth;
              }
+#endif /* TRE_APPROX */
            break;
          }
        default:
@@ -1065,7 +2095,7 @@ tre_set_empty(tre_mem_t mem)
 
 static tre_pos_and_tags_t *
 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
-           tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
+           tre_bracket_match_list_t *bracket_match_list, int backref)
 {
   tre_pos_and_tags_t *new_set;
 
@@ -1076,8 +2106,7 @@ tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
   new_set[0].position = position;
   new_set[0].code_min = code_min;
   new_set[0].code_max = code_max;
-  new_set[0].class = class;
-  new_set[0].neg_classes = neg_classes;
+  new_set[0].bracket_match_list = bracket_match_list;
   new_set[0].backref = backref;
   new_set[1].position = -1;
   new_set[1].code_min = -1;
@@ -1108,8 +2137,7 @@ tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
       new_set[s1].code_min = set1[s1].code_min;
       new_set[s1].code_max = set1[s1].code_max;
       new_set[s1].assertions = set1[s1].assertions | assertions;
-      new_set[s1].class = set1[s1].class;
-      new_set[s1].neg_classes = set1[s1].neg_classes;
+      new_set[s1].bracket_match_list = set1[s1].bracket_match_list;
       new_set[s1].backref = set1[s1].backref;
       if (set1[s1].tags == NULL && tags == NULL)
        new_set[s1].tags = NULL;
@@ -1153,8 +2181,7 @@ tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
       new_set[s1 + s2].code_max = set2[s2].code_max;
       /* XXX - why not | assertions here as well? */
       new_set[s1 + s2].assertions = set2[s2].assertions;
-      new_set[s1 + s2].class = set2[s2].class;
-      new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
+      new_set[s1 + s2].bracket_match_list = set2[s2].bracket_match_list;
       new_set[s1 + s2].backref = set2[s2].backref;
       if (set2[s2].tags == NULL)
        new_set[s1 + s2].tags = NULL;
@@ -1344,11 +2371,11 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
                       lastpos = {i}. */
                    node->nullable = 0;
                    node->firstpos = tre_set_one(mem, lit->position, 0,
-                                            TRE_CHAR_MAX, 0, NULL, -1);
+                                            TRE_CHAR_MAX, NULL, -1);
                    if (!node->firstpos)
                      return REG_ESPACE;
                    node->lastpos = tre_set_one(mem, lit->position, 0,
-                                               TRE_CHAR_MAX, 0, NULL,
+                                               TRE_CHAR_MAX, NULL,
                                                (int)lit->code_max);
                    if (!node->lastpos)
                      return REG_ESPACE;
@@ -1372,13 +2399,13 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
                    node->nullable = 0;
                    node->firstpos =
                      tre_set_one(mem, lit->position, (int)lit->code_min,
-                                 (int)lit->code_max, 0, NULL, -1);
+                                 (int)lit->code_max, NULL, -1);
                    if (!node->firstpos)
                      return REG_ESPACE;
                    node->lastpos = tre_set_one(mem, lit->position,
                                                (int)lit->code_min,
                                                (int)lit->code_max,
-                                               lit->u.class, lit->neg_classes,
+                                               lit->u.bracket_match_list,
                                                -1);
                    if (!node->lastpos)
                      return REG_ESPACE;
@@ -1436,14 +2463,89 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
 
        case NFL_POST_ITERATION:
          {
+           int num_tags, *tags, assertions, params_seen;
+           int *params;
+           reg_errcode_t status;
            tre_iteration_t *iter = (tre_iteration_t *)node->obj;
 
+           /* From Ville Laurikari's original 2001 Master's thesis, the
+              firstpos(n) and lastpos(n) of an iteration is just the
+              corresponding values of the iteration's argument.  Unfortunately,
+              this isn't sufficient for the following BRE:
+
+                  \(a*\)*b\(\1\)    matched against    ab
+
+              The backreference wants to force the first subexpression to
+              be the empty string, but there is no transition for this.  So
+              we need to modify the lastpos(n) of an iteration to be the
+              equivalent of that of catentation.  Using the same notation as
+              in the thesis, lastpos(n) is redefined as:
+
+                  if nullable(c1) then
+                      lastpos(c1) U
+                      addtags(lastpos(c1),
+                              emptymatch(c1))
+                  else
+                      lastpos(c1)
+
+              where c1 is the argument node.  firstpos(n) remains the same. */
+
+           /* Compute lastpos. */
            if (iter->min == 0 || iter->arg->nullable)
-             node->nullable = 1;
+             {
+               node->nullable = 1;
+               if (iter->arg->nullable)
+                 {
+                   /* The arg matches the empty string.  Make a first pass
+                      with tre_match_empty() to get the number of tags and
+                      parameters. */
+                   status = tre_match_empty(stack, iter->arg,
+                                            NULL, NULL, NULL, &num_tags,
+                                            &params_seen);
+                   if (status != REG_OK)
+                     return status;
+                   /* Allocate arrays for the tags and parameters. */
+                   tags = xmalloc(sizeof(int) * (num_tags + 1));
+                   if (!tags)
+                     return REG_ESPACE;
+                   tags[0] = -1;
+                   assertions = 0;
+                   params = NULL;
+                   if (params_seen)
+                     {
+                       params = tre_mem_alloc(mem, sizeof(*params)
+                                              * TRE_PARAM_LAST);
+                       if (!params)
+                         {
+                           xfree(tags);
+                           return REG_ESPACE;
+                         }
+                     }
+                   /* Second pass with tre_mach_empty() to get the list of
+                      tags and parameters. */
+                   status = tre_match_empty(stack, iter->arg, tags,
+                                            &assertions, params, NULL, NULL);
+                   if (status != REG_OK)
+                     {
+                       xfree(tags);
+                       return status;
+                     }
+                   node->lastpos =
+                     tre_set_union(mem, iter->arg->lastpos, iter->arg->lastpos,
+                                   tags, assertions, params);
+                   xfree(tags);
+                   if (!node->lastpos)
+                     return REG_ESPACE;
+                 }
+               else
+                 node->lastpos = iter->arg->lastpos;
+             }
            else
-             node->nullable = 0;
+             {
+               node->nullable = 0;
+               node->lastpos = iter->arg->lastpos;
+             }
            node->firstpos = iter->arg->firstpos;
-           node->lastpos = iter->arg->lastpos;
            break;
          }
 
@@ -1624,30 +2726,23 @@ tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
            trans->state = transitions + offs[p2->position];
            trans->state_id = p2->position;
            trans->assertions = p1->assertions | p2->assertions
-             | (p1->class ? ASSERT_CHAR_CLASS : 0)
-             | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
+             | (p1->bracket_match_list != NULL ? ASSERT_BRACKET_MATCH : 0);
            if (p1->backref >= 0)
              {
-               assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
+               assert((trans->assertions & ASSERT_BRACKET_MATCH) == 0);
                assert(p2->backref < 0);
                trans->u.backref = p1->backref;
                trans->assertions |= ASSERT_BACKREF;
              }
-           else
-             trans->u.class = p1->class;
-           if (p1->neg_classes != NULL)
+           if (p1->bracket_match_list != NULL)
              {
-               for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
-               trans->neg_classes =
-                 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
-               if (trans->neg_classes == NULL)
+               trans->u.bracket_match_list =
+                 xmalloc(SIZEOF_BRACKET_MATCH_LIST(p1->bracket_match_list));
+               if (trans->u.bracket_match_list == NULL)
                  return REG_ESPACE;
-               for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
-                 trans->neg_classes[i] = p1->neg_classes[i];
-               trans->neg_classes[i] = (tre_ctype_t)0;
+               memcpy(trans->u.bracket_match_list, p1->bracket_match_list,
+                      SIZEOF_BRACKET_MATCH_LIST(p1->bracket_match_list));
              }
-           else
-             trans->neg_classes = NULL;
 
            /* Find out how many tags this transition has. */
            i = 0;
@@ -1748,10 +2843,9 @@ tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
                DPRINT((", assert %d", trans->assertions));
              if (trans->assertions & ASSERT_BACKREF)
                DPRINT((", backref %d", trans->u.backref));
-             else if (trans->u.class)
-               DPRINT((", class %ld", (long)trans->u.class));
-             if (trans->neg_classes)
-               DPRINT((", neg_classes %p", trans->neg_classes));
+             else if (trans->assertions & ASSERT_BRACKET_MATCH)
+               DPRINT((", bracket_match_list %p",
+                       trans->u.bracket_match_list));
              if (trans->params)
                {
                  DPRINT((", "));
@@ -1852,7 +2946,8 @@ tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
 
 
 int
-tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
+tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags,
+           locale_t loc)
 {
   tre_stack_t *stack;
   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
@@ -1861,7 +2956,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
   int i, add = 0;
   tre_tnfa_transition_t *transitions, *initial;
   tre_tnfa_t *tnfa = NULL;
-  tre_submatch_data_t *submatch_data;
+  tre_submatch_data_t *submatch_data = NULL;
   tre_tag_direction_t *tag_directions = NULL;
   reg_errcode_t errcode;
   tre_mem_t mem;
@@ -1888,8 +2983,15 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
   parse_ctx.stack = stack;
   parse_ctx.re = regex;
   parse_ctx.len = n;
+  /* Only allow REG_UNGREEDY to be set if both REG_ENHANCED and REG_EXTENDED
+     are also set */
+  if ((cflags & (REG_ENHANCED | REG_EXTENDED)) != (REG_ENHANCED | REG_EXTENDED))
+    cflags &= ~REG_UNGREEDY;
   parse_ctx.cflags = cflags;
   parse_ctx.max_backref = -1;
+  parse_ctx.loc = loc;
+  parse_ctx.submatch_id_invisible = SUBMATCH_ID_INVISIBLE_START;
+
   DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
   errcode = tre_parse(&parse_ctx);
   if (errcode != REG_OK)
@@ -1917,10 +3019,14 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
   tnfa->have_approx = parse_ctx.have_approx;
   tnfa->num_submatches = parse_ctx.submatch_id;
+  tnfa->num_submatches_invisible = parse_ctx.submatch_id_invisible
+                                  - SUBMATCH_ID_INVISIBLE_START;
+  tnfa->num_reorder_tags = parse_ctx.num_reorder_tags;
+  tnfa->loc = parse_ctx.loc;
 
   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
      regexp does not have back references, this can be skipped. */
-  if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
+  if (tnfa->num_reorder_tags > 0 || !(cflags & REG_NOSUB))
     {
       DPRINT(("tre_compile: setting up tags\n"));
 
@@ -1942,7 +3048,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
          memset(tag_directions, -1,
                 sizeof(*tag_directions) * (tnfa->num_tags + 1));
        }
-      tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
+      tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 3,
                                   sizeof(tnfa->minimal_tags));
       if (tnfa->minimal_tags == NULL)
        ERROR_EXIT(REG_ESPACE);
@@ -1951,6 +3057,12 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
                              sizeof(*submatch_data));
       if (submatch_data == NULL)
        ERROR_EXIT(REG_ESPACE);
+      /* Set the eo_tag value to -1 to indicate that that corresponding
+       * submatch has not be completed yet */
+      for (i = 0; i < parse_ctx.submatch_id; i++)
+       {
+         submatch_data[i].eo_tag = -1;
+       }
       tnfa->submatch_data = submatch_data;
 
       errcode = tre_add_tags(mem, stack, tree, tnfa);
@@ -1961,10 +3073,8 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
       for (i = 0; i < parse_ctx.submatch_id; i++)
        DPRINT(("pmatch[%d] = {t%d, t%d}\n",
                i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
-      for (i = 0; i < tnfa->num_tags; i++)
-       DPRINT(("t%d is %s\n", i,
-               tag_directions[i] == TRE_TAG_MINIMIZE ?
-               "minimized" : "maximized"));
+      for (i = 0; i <= tnfa->num_tags; i++)
+       DPRINT(("t%d is %s\n", i, tag_dir_str[tag_directions[i]]));
 #endif /* TRE_DEBUG */
     }
 
@@ -1990,6 +3100,13 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 #ifdef TRE_DEBUG
   tre_ast_print(tree);
   DPRINT(("Number of states: %d\n", parse_ctx.position));
+  if (submatch_data)
+    for (i = 0; i < parse_ctx.submatch_id; i++)
+      DPRINT(("pmatch[%d] = {t%d, t%d}\n",
+             i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
+  if (tag_directions)
+    for (i = 0; i <= tnfa->num_tags; i++)
+      DPRINT(("t%d is %s\n", i, tag_dir_str[tag_directions[i]]));
 #endif /* TRE_DEBUG */
 
   errcode = tre_compute_nfl(mem, stack, tree);
@@ -2026,52 +3143,36 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
   if (errcode != REG_OK)
     ERROR_EXIT(errcode);
 
-  /* If in eight bit mode, compute a table of characters that can be the
-     first character of a match. */
+
+  /* Set first_char only if there is only one character that can be the
+     first character of a match */
   tnfa->first_char = -1;
-  if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable)
+  if (!tmp_ast_l->nullable)
     {
-      int count = 0;
-      tre_cint_t k;
-      DPRINT(("Characters that can start a match:"));
-      tnfa->firstpos_chars = xcalloc(256, sizeof(char));
-      if (tnfa->firstpos_chars == NULL)
-       ERROR_EXIT(REG_ESPACE);
-      for (p = tree->firstpos; p->position >= 0; p++)
+      int scanning = 1;
+      for (p = tree->firstpos; scanning && p->position >= 0; p++)
        {
          tre_tnfa_transition_t *j = transitions + offs[p->position];
          while (j->state != NULL)
            {
-             for (k = j->code_min; k <= j->code_max && k < 256; k++)
+             if (j->code_min <= j->code_max)
                {
-                 DPRINT((" %d", k));
-                 tnfa->firstpos_chars[k] = 1;
-                 count++;
+                 if (j->code_max != j->code_min || j->code_min == -1 || tnfa->first_char != -1)
+                   {
+                     tnfa->first_char = -1;
+                     scanning = 0;
+                     break;
+                   }
+                 tnfa->first_char = j->code_min;
                }
              j++;
            }
        }
-      DPRINT(("\n"));
-#define TRE_OPTIMIZE_FIRST_CHAR 1
-#if TRE_OPTIMIZE_FIRST_CHAR
-      if (count == 1)
-       {
-         for (k = 0; k < 256; k++)
-           if (tnfa->firstpos_chars[k])
-             {
-               DPRINT(("first char must be %d\n", k));
-               tnfa->first_char = k;
-               xfree(tnfa->firstpos_chars);
-               tnfa->firstpos_chars = NULL;
-               break;
-             }
-       }
-#endif
-
+#ifdef TRE_DEBUG
+      if (tnfa->first_char >= 0)
+       DPRINT(("first char must be %d\n", tnfa->first_char));
+#endif /* TRE_DEBUG */
     }
-  else
-    tnfa->firstpos_chars = NULL;
-
 
   p = tree->firstpos;
   i = 0;
@@ -2150,14 +3251,19 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
   tnfa->num_states = parse_ctx.position;
   tnfa->cflags = cflags;
 
-  DPRINT(("final state %p\n", (void *)tnfa->final));
+  DPRINT(("final state %d (%p)\n", tree->lastpos[0].position,
+         (void *)tnfa->final));
 
   tre_mem_destroy(mem);
   tre_stack_destroy(stack);
   xfree(counts);
   xfree(offs);
 
+#ifdef TRE_USE_SYSTEM_REGEX_H
+  preg->re_magic = RE_MAGIC;
+#endif /* TRE_USE_SYSTEM_REGEX_H */
   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
+  xlocale_retain(tnfa->loc);
   return REG_OK;
 
  error_exit:
@@ -2169,7 +3275,12 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
     xfree(counts);
   if (offs != NULL)
     xfree(offs);
+
+  /* Set tnfa into preg, so that calling tre_free() will free the contents
+     of tnfa.  NULL out the loc field since we never retained the locale. */
   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
+  if(tnfa) tnfa->loc = NULL;
+
   tre_free(preg);
   return errcode;
 }
@@ -2184,17 +3295,21 @@ tre_free(regex_t *preg)
   unsigned int i;
   tre_tnfa_transition_t *trans;
 
+#ifdef TRE_USE_SYSTEM_REGEX_H
+  preg->re_magic = 0;
+#endif /* TRE_USE_SYSTEM_REGEX_H */
   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   if (!tnfa)
     return;
+  preg->TRE_REGEX_T_FIELD = NULL;
 
   for (i = 0; i < tnfa->num_transitions; i++)
     if (tnfa->transitions[i].state)
       {
        if (tnfa->transitions[i].tags)
          xfree(tnfa->transitions[i].tags);
-       if (tnfa->transitions[i].neg_classes)
-         xfree(tnfa->transitions[i].neg_classes);
+       if (tnfa->transitions[i].assertions & ASSERT_BRACKET_MATCH)
+         xfree(tnfa->transitions[i].u.bracket_match_list);
        if (tnfa->transitions[i].params)
          xfree(tnfa->transitions[i].params);
       }
@@ -2215,18 +3330,20 @@ tre_free(regex_t *preg)
 
   if (tnfa->submatch_data)
     {
-      for (i = 0; i < tnfa->num_submatches; i++)
-       if (tnfa->submatch_data[i].parents)
-         xfree(tnfa->submatch_data[i].parents);
       xfree(tnfa->submatch_data);
     }
 
   if (tnfa->tag_directions)
     xfree(tnfa->tag_directions);
-  if (tnfa->firstpos_chars)
-    xfree(tnfa->firstpos_chars);
   if (tnfa->minimal_tags)
     xfree(tnfa->minimal_tags);
+
+  if (tnfa->loc)
+    xlocale_release(tnfa->loc);
+
+  if (tnfa->last_matched_branch)
+    xfree(tnfa->last_matched_branch);
+
   xfree(tnfa);
 }
 
index 51d5ac9..355ec4d 100644 (file)
@@ -16,10 +16,10 @@ typedef struct {
   int code_max;
   int *tags;
   int assertions;
-  tre_ctype_t class;
-  tre_ctype_t *neg_classes;
+  tre_bracket_match_list_t *bracket_match_list;
   int backref;
   int *params;
+  locale_t loc;
 } tre_pos_and_tags_t;
 
 #endif /* TRE_COMPILE_H */
index dacde8c..1dec1c2 100644 (file)
@@ -18,7 +18,9 @@
 #endif /* !HAVE_WCTYPE_H */
 
 #include <ctype.h>
+#include <bitstring.h>
 #include "tre.h"
+#include "xlocale_private.h"
 
 #ifdef TRE_DEBUG
 #include <stdio.h>
@@ -31,6 +33,7 @@
 
 #ifdef HAVE_MBRTOWC
 #define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps)))
+#define tre_mbrtowc_l(pwc,s,n,ps,l) (mbrtowc_l((pwc), (s), (n), (ps), (l)))
 #else /* !HAVE_MBRTOWC */
 #ifdef HAVE_MBTOWC
 #define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n)))
@@ -52,6 +55,7 @@ typedef wint_t tre_cint_t;
 
 #ifdef TRE_MULTIBYTE
 #define TRE_MB_CUR_MAX MB_CUR_MAX
+#define TRE_MB_CUR_MAX_L MB_CUR_MAX_L
 #else /* !TRE_MULTIBYTE */
 #define TRE_MB_CUR_MAX 1
 #endif /* !TRE_MULTIBYTE */
@@ -75,6 +79,14 @@ typedef wint_t tre_cint_t;
 #define tre_toupper towupper
 #define tre_strlen  wcslen
 
+#define tre_isalnum_l iswalnum_l
+#define tre_isdigit_l iswdigit_l
+#define tre_islower_l iswlower_l
+#define tre_isupper_l iswupper_l
+#define tre_isxdigit_l iswxdigit_l
+#define tre_tolower_l towlower_l
+#define tre_toupper_l towupper_l
+
 #else /* !TRE_WCHAR */
 
 /* 8 bit characters. */
@@ -115,6 +127,8 @@ typedef short tre_cint_t;
 typedef wctype_t tre_ctype_t;
 #define tre_isctype iswctype
 #define tre_ctype   wctype
+#define tre_isctype_l iswctype_l
+#define tre_ctype_l   wctype_l
 #else /* !TRE_USE_SYSTEM_WCTYPE */
 /* Define our own versions of iswctype() and wctype(). */
 typedef int (*tre_ctype_t)(tre_cint_t);
@@ -122,7 +136,7 @@ typedef int (*tre_ctype_t)(tre_cint_t);
 tre_ctype_t tre_ctype(const char *name);
 #endif /* !TRE_USE_SYSTEM_WCTYPE */
 
-typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t;
+typedef enum { STR_WIDE, STR_BYTE, STR_MBS } tre_str_type_t;
 
 /* Returns number of bytes to add to (char *)ptr to make it
    properly aligned for the type. */
@@ -143,6 +157,47 @@ typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t;
 #define STRF "s"
 #endif /* !TRE_WCHAR */
 
+/* Types to handle bracket expressions. */
+typedef enum {
+  TRE_BRACKET_MATCH_TYPE_UNUSED = 0,
+  TRE_BRACKET_MATCH_TYPE_CHAR,         /* Single character value */
+  TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,  /* Collation range begin */
+  TRE_BRACKET_MATCH_TYPE_RANGE_END,    /* Collation range end */
+  TRE_BRACKET_MATCH_TYPE_CLASS,                /* Character class */
+  TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,  /* Collation equivalence value */
+} tre_bracket_match_type_t;
+
+typedef struct {
+  tre_bracket_match_type_t type;
+  tre_cint_t value;
+} tre_bracket_match_t;
+
+#define TRE_BRACKET_MATCH_FLAG_NEGATE  1
+
+typedef struct {
+  int num_bracket_matches;
+  int flags;
+  tre_bracket_match_t bracket_matches[0];
+} tre_bracket_match_list_t;
+
+#define SIZEOF_BRACKET_MATCH_LIST_N(n) (sizeof(tre_bracket_match_list_t) + \
+                                        sizeof(tre_bracket_match_t) * (n))
+#define SIZEOF_BRACKET_MATCH_LIST(l)   SIZEOF_BRACKET_MATCH_LIST_N( \
+                                        (l)->num_bracket_matches)
+
+/* The "count" field is the number of time the tag was set, initially zero.
+   The "first" field contains the first set value (when "count" equals 1).
+   The "value" field contains the current value of the tag, if "count" is
+   greater than zero (the tag's current value is -1 if "count" is zero).
+   The "touch" field is the touch value, a montonically increasing value
+   (maintained by the caller) set each time the tag itself is set. */
+typedef struct {
+  int count;
+  int first;
+  int value;
+  int touch;
+} tre_tag_t;
+
 /* TNFA transition type. A TNFA state is an array of transitions,
    the terminator is a transition with NULL `state'. */
 typedef struct tnfa_transition tre_tnfa_transition_t;
@@ -163,32 +218,30 @@ struct tnfa_transition {
   int assertions;
   /* Assertion parameters. */
   union {
-    /* Character class assertion. */
-    tre_ctype_t class;
+    /* Bracket matches. */
+    tre_bracket_match_list_t *bracket_match_list;
     /* Back reference assertion. */
     int backref;
   } u;
-  /* Negative character class assertions. */
-  tre_ctype_t *neg_classes;
 };
 
 
 /* Assertions. */
 #define ASSERT_AT_BOL            1   /* Beginning of line. */
 #define ASSERT_AT_EOL            2   /* End of line. */
-#define ASSERT_CHAR_CLASS        4   /* Character class in `class'. */
-#define ASSERT_CHAR_CLASS_NEG    8   /* Character classes in `neg_classes'. */
-#define ASSERT_AT_BOW           16   /* Beginning of word. */
-#define ASSERT_AT_EOW           32   /* End of word. */
-#define ASSERT_AT_WB            64   /* Word boundary. */
-#define ASSERT_AT_WB_NEG       128   /* Not a word boundary. */
-#define ASSERT_BACKREF         256   /* A back reference in `backref'. */
-#define ASSERT_LAST            256
+#define ASSERT_BRACKET_MATCH     4   /* Matches in `bracket_match_list'. */
+#define ASSERT_AT_BOW            8   /* Beginning of word. */
+#define ASSERT_AT_EOW           16   /* End of word. */
+#define ASSERT_AT_WB            32   /* Word boundary. */
+#define ASSERT_AT_WB_NEG        64   /* Not a word boundary. */
+#define ASSERT_BACKREF         128   /* A back reference in `backref'. */
+#define ASSERT_LAST            128
 
 /* Tag directions. */
 typedef enum {
   TRE_TAG_MINIMIZE = 0,
-  TRE_TAG_MAXIMIZE = 1
+  TRE_TAG_MAXIMIZE,
+  TRE_TAG_LEFT_MAXIMIZE,
 } tre_tag_direction_t;
 
 /* Parameters that can be changed dynamically while matching. */
@@ -218,66 +271,95 @@ struct tre_submatch_data {
   int so_tag;
   /* Tag that gives the value for rm_eo (submatch end offset). */
   int eo_tag;
-  /* List of submatches this submatch is contained in. */
-  int *parents;
 };
 
 typedef struct tre_submatch_data tre_submatch_data_t;
 
 
+/* LAST MATCHED structures */
+struct __previous_type;
+struct __previous_branch_type;
+struct __current_type;
+struct __current_branch_type;
+
+typedef struct __previous_branch_type {
+  struct __previous_branch_type *next;
+  struct __previous_type *last_matched; int n_last_matched; int cmp_tag; int n_tags;
+  int tot_branches;
+  int tot_last_matched;
+  int tot_tags;
+  bitstr_t tags[0];
+} tre_last_matched_branch_pre_t;
+
+typedef struct __previous_type {
+  struct __previous_type *next;
+  tre_last_matched_branch_pre_t *branches; int n_branches; int start_tag;
+  int tot_branches;
+  int tot_last_matched;
+  int tot_tags;
+} tre_last_matched_pre_t;
+
+typedef struct __current_branch_type {
+  int *tags;
+  struct __current_type *last_matched; int n_last_matched; int cmp_tag; int n_tags;
+} tre_last_matched_branch_t;
+
+typedef struct __current_type {
+  tre_last_matched_branch_t *branches; int n_branches; int start_tag;
+} tre_last_matched_t;
+
 /* TNFA definition. */
 typedef struct tnfa tre_tnfa_t;
 
 struct tnfa {
   tre_tnfa_transition_t *transitions;
-  unsigned int num_transitions;
   tre_tnfa_transition_t *initial;
   tre_tnfa_transition_t *final;
   tre_submatch_data_t *submatch_data;
-  char *firstpos_chars;
-  int first_char;
-  unsigned int num_submatches;
   tre_tag_direction_t *tag_directions;
   int *minimal_tags;
+  tre_last_matched_branch_t *last_matched_branch;
+  locale_t loc;
+  unsigned int num_transitions;
+  int first_char;
+  unsigned int num_submatches;
+  unsigned int num_submatches_invisible;
   int num_tags;
   int num_minimals;
   int end_tag;
   int num_states;
   int cflags;
   int have_backrefs;
+  int num_reorder_tags;
   int have_approx;
   int params_depth;
 };
 
 int
-tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags);
+tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags,
+           locale_t loc);
 
 void
 tre_free(regex_t *preg);
 
-void
-tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-               const tre_tnfa_t *tnfa, int *tags, int match_eo);
-
 reg_errcode_t
-tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
-                     tre_str_type_t type, int *match_tags, int eflags,
-                     int *match_end_ofs);
+tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
+               const tre_tnfa_t *tnfa, const tre_tag_t *tags, int match_eo);
 
 reg_errcode_t
 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
-                     tre_str_type_t type, int *match_tags, int eflags,
+                     tre_str_type_t type, tre_tag_t *match_tags, int eflags,
                      int *match_end_ofs);
 
 reg_errcode_t
 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
-                      int len, tre_str_type_t type, int *match_tags,
+                      int len, tre_str_type_t type, tre_tag_t *match_tags,
                       int eflags, int *match_end_ofs);
 
 #ifdef TRE_APPROX
 reg_errcode_t
 tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
-                   tre_str_type_t type, int *match_tags,
+                   tre_str_type_t type, tre_tag_t *match_tags,
                    regamatch_t *match, regaparams_t params,
                    int eflags, int *match_end_ofs);
 #endif /* TRE_APPROX */
index eab13a7..726c4c0 100644 (file)
 #include <config.h>
 #endif /* HAVE_CONFIG_H */
 
+/* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
+   info while running */
+#undef TRE_USE_ALLOCA
+
 #ifdef TRE_USE_ALLOCA
 /* AIX requires this to be the first thing in the file.         */
 #ifndef __GNUC__
@@ -75,6 +79,7 @@ char *alloca ();
 
 typedef struct {
   int pos;
+  unsigned int pos_add_next;
   const char *str_byte;
 #ifdef TRE_WCHAR
   const wchar_t *str_wide;
@@ -82,7 +87,7 @@ typedef struct {
   tre_tnfa_transition_t *state;
   int state_id;
   int next_c;
-  int *tags;
+  tre_tag_t *tags;
 #ifdef TRE_MBSTATE
   mbstate_t mbstate;
 #endif /* TRE_MBSTATE */
@@ -122,10 +127,9 @@ typedef struct tre_backtrack_struct {
 #endif /* !TRE_USE_ALLOCA */
 
 
-#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
+#define BT_STACK_PUSH(_pos, _pos_add_next, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
   do                                                                         \
     {                                                                        \
-      int i;                                                                 \
       if (!stack->next)                                                              \
        {                                                                     \
          tre_backtrack_t s;                                                  \
@@ -144,7 +148,7 @@ typedef struct tre_backtrack_struct {
          s->prev = stack;                                                    \
          s->next = NULL;                                                     \
          s->item.tags = tre_bt_mem_alloc(mem,                                \
-                                         sizeof(*tags) * tnfa->num_tags);    \
+                                         num_tags * sizeof(*tags));          \
          if (!s->item.tags)                                                  \
            {                                                                 \
              tre_bt_mem_destroy(mem);                                        \
@@ -162,13 +166,13 @@ typedef struct tre_backtrack_struct {
       else                                                                   \
        stack = stack->next;                                                  \
       stack->item.pos = (_pos);                                                      \
+      stack->item.pos_add_next = (_pos_add_next);                            \
       stack->item.str_byte = (_str_byte);                                    \
       BT_STACK_WIDE_IN(_str_wide);                                           \
       stack->item.state = (_state);                                          \
       stack->item.state_id = (_state_id);                                    \
       stack->item.next_c = (_next_c);                                        \
-      for (i = 0; i < tnfa->num_tags; i++)                                   \
-       stack->item.tags[i] = (_tags)[i];                                     \
+      memcpy(stack->item.tags, (_tags), num_tags * sizeof(*(_tags)));         \
       BT_STACK_MBSTATE_IN;                                                   \
     }                                                                        \
   while (/*CONSTCOND*/0)
@@ -176,17 +180,14 @@ typedef struct tre_backtrack_struct {
 #define BT_STACK_POP()                                                       \
   do                                                                         \
     {                                                                        \
-      int i;                                                                 \
       assert(stack->prev);                                                   \
       pos = stack->item.pos;                                                 \
-      if (type == STR_USER)                                                   \
-        str_source->rewind(pos + pos_add_next, str_source->context);          \
+      pos_add_next = stack->item.pos_add_next;                               \
       str_byte = stack->item.str_byte;                                       \
       BT_STACK_WIDE_OUT;                                                     \
       state = stack->item.state;                                             \
       next_c = stack->item.next_c;                                           \
-      for (i = 0; i < tnfa->num_tags; i++)                                   \
-       tags[i] = stack->item.tags[i];                                        \
+      memcpy(tags, stack->item.tags, num_tags * sizeof(*tags));               \
       BT_STACK_MBSTATE_OUT;                                                  \
       stack = stack->prev;                                                   \
     }                                                                        \
@@ -197,7 +198,7 @@ typedef struct tre_backtrack_struct {
 
 reg_errcode_t
 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
-                      int len, tre_str_type_t type, int *match_tags,
+                      int len, tre_str_type_t type, tre_tag_t *match_tags,
                       int eflags, int *match_end_ofs)
 {
   /* State variables required by GET_NEXT_WCHAR. */
@@ -214,7 +215,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
   int reg_notbol = eflags & REG_NOTBOL;
   int reg_noteol = eflags & REG_NOTEOL;
   int reg_newline = tnfa->cflags & REG_NEWLINE;
-  int str_user_end = 0;
+  int i;
 
   /* These are used to remember the necessary values of the above
      variables to return to the position where the current search
@@ -232,7 +233,8 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
   /* End offset of best match so far, or -1 if no match found yet. */
   int match_eo = -1;
   /* Tag arrays. */
-  int *next_tags, *tags = NULL;
+  int *next_tags;
+  tre_tag_t *tags = NULL;
   /* Current TNFA state. */
   tre_tnfa_transition_t *state;
   int *states_seen = NULL;
@@ -245,7 +247,12 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 
   tre_tnfa_transition_t *trans_i;
   regmatch_t *pmatch = NULL;
-  int ret;
+  reg_errcode_t ret;
+
+  int num_tags = tnfa->num_tags;
+  int touch = 1;
+  char *buf;
+  int tbytes;
 
 #ifdef TRE_MBSTATE
   memset(&mbstate, '\0', sizeof(mbstate));
@@ -265,57 +272,48 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
   DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
   DPRINT(("len = %d\n", len));
 
+  {
+    int pbytes, sbytes, total_bytes;
+    char *tmp_buf;
+    /* Compute the length of the block we need. */
+    tbytes = sizeof(*tags) * num_tags;
+    pbytes = sizeof(*pmatch) * tnfa->num_submatches;
+    sbytes = sizeof(*states_seen) * tnfa->num_states;
+    total_bytes =
+      (sizeof(long) - 1) * 2 /* for alignment paddings */
+      + tbytes + pbytes + sbytes;
+
+    DPRINT(("tre_tnfa_run_backtrack, allocate %d bytes\n", total_bytes));
+    /* Allocate the memory. */
 #ifdef TRE_USE_ALLOCA
-  tags = alloca(sizeof(*tags) * tnfa->num_tags);
-  pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
-  states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
+    buf = alloca(total_bytes);
 #else /* !TRE_USE_ALLOCA */
-  if (tnfa->num_tags)
-    {
-      tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
-      if (!tags)
-       {
-         ret = REG_ESPACE;
-         goto error_exit;
-       }
-    }
-  if (tnfa->num_submatches)
-    {
-      pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
-      if (!pmatch)
-       {
-         ret = REG_ESPACE;
-         goto error_exit;
-       }
-    }
-  if (tnfa->num_states)
-    {
-      states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
-      if (!states_seen)
-       {
-         ret = REG_ESPACE;
-         goto error_exit;
-       }
-    }
+    buf = xmalloc((unsigned)total_bytes);
 #endif /* !TRE_USE_ALLOCA */
+    if (buf == NULL)
+      return REG_ESPACE;
+
+    /* Get the various pointers within tmp_buf (properly aligned). */
+    tags = (void *)buf;
+    tmp_buf = buf + tbytes;
+    tmp_buf += ALIGN(tmp_buf, long);
+    pmatch = (void *)tmp_buf;
+    tmp_buf += pbytes;
+    tmp_buf += ALIGN(tmp_buf, long);
+    states_seen = (void *)tmp_buf;
+  }
 
  retry:
   {
-    int i;
-    for (i = 0; i < tnfa->num_tags; i++)
-      {
-       tags[i] = -1;
-       if (match_tags)
-         match_tags[i] = -1;
-      }
+    memset(tags, 0, num_tags * sizeof(*tags));
+    if (match_tags)
+      memset(match_tags, 0, num_tags * sizeof(*match_tags));
     for (i = 0; i < tnfa->num_states; i++)
       states_seen[i] = 0;
   }
 
   state = NULL;
   pos = pos_start;
-  if (type == STR_USER)
-    str_source->rewind(pos + pos_add_next, str_source->context);
   GET_NEXT_WCHAR();
   pos_start = pos;
   next_c_start = next_c;
@@ -347,20 +345,26 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
        {
          /* Backtrack to this state. */
          DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
-         BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
+         BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, trans_i->state,
                        trans_i->state_id, next_c, tags, mbstate);
          {
            int *tmp = trans_i->tags;
            if (tmp)
-             while (*tmp >= 0)
-               stack->item.tags[*tmp++] = pos;
+             {
+               while (*tmp >= 0)
+                 tre_tag_set(stack->item.tags, *tmp++, pos, touch);
+               touch++;
+             }
          }
        }
     }
 
   if (next_tags)
-    for (; *next_tags >= 0; next_tags++)
-      tags[*next_tags] = pos;
+    {
+      for (; *next_tags >= 0; next_tags++)
+       tre_tag_set(tags, *next_tags, pos, touch);
+      touch++;
+    }
 
 
   DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
@@ -376,22 +380,103 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
       int empty_br_match;
 
       DPRINT(("start loop\n"));
+
+      if (match_eo >= 0 && tnfa->num_minimals)
+       {
+         int skip = 0;
+#ifdef TRE_DEBUG
+         DPRINT(("Checking minimal conditions: match_eo=%d match_tags=",
+                 match_eo));
+         tre_print_tags(match_tags, tnfa->num_tags);
+         DPRINT(("\n"));
+#endif /* TRE_DEBUG */
+         for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
+           {
+             int end = tnfa->minimal_tags[i];
+             int start = tnfa->minimal_tags[i + 1];
+             DPRINT(("  Minimal start %d, end %d\n", start, end));
+             if (tre_minimal_tag_order(start, end, match_tags, tags) > 0)
+               {
+                 skip = 1;
+                 break;
+               }
+           }
+         if (!skip)
+           {
+#ifdef TRE_DEBUG
+             DPRINT(("  Keeping tags="));
+             tre_print_tags(tags, tnfa->num_tags);
+             DPRINT(("\n"));
+#endif /* TRE_DEBUG */
+           }
+         else
+           {
+#ifdef TRE_DEBUG
+             DPRINT(("  Throwing out tags="));
+             tre_print_tags(tags, tnfa->num_tags);
+             DPRINT(("\n"));
+#endif /* TRE_DEBUG */
+             goto backtrack;
+           }
+       }
+
       if (state == tnfa->final)
        {
-         DPRINT(("  match found, %d %d\n", match_eo, pos));
+         DPRINT(("  match found, match_eo=%d pos=%d\n", match_eo, pos));
+
+         if (match_eo >= 0 && tnfa->num_minimals)
+           {
+             int compare = 0;
+#ifdef TRE_DEBUG
+             DPRINT(("Checking minimal conditions: match_eo=%d "
+                     "match_tags=", match_eo));
+             tre_print_tags(match_tags, tnfa->num_tags);
+             DPRINT(("\n"));
+#endif /* TRE_DEBUG */
+             for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
+               {
+                 int end = tnfa->minimal_tags[i];
+                 int start = tnfa->minimal_tags[i + 1];
+                 DPRINT(("  Minimal start %d, end %d\n", start, end));
+                 if ((compare = tre_minimal_tag_order(start, end,
+                                          match_tags, tags)) != 0)
+                   break;
+               }
+             if (compare > 0)
+               {
+#ifdef TRE_DEBUG
+                 DPRINT(("      Throwing out new match, tags="));
+                 tre_print_tags(tags, tnfa->num_tags);
+                 DPRINT(("\n"));
+#endif /* TRE_DEBUG */
+                 goto backtrack;
+               }
+             else if (compare < 0)
+               {
+#ifdef TRE_DEBUG
+                 DPRINT(("      Throwing out old match, tags="));
+                 tre_print_tags(match_tags, tnfa->num_tags);
+                 DPRINT(("\n"));
+#endif /* TRE_DEBUG */
+                 match_eo = -1;
+               }
+           }
+
          if (match_eo < pos
              || (match_eo == pos
                  && match_tags
                  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
                                   tags, match_tags)))
            {
-             int i;
              /* This match wins the previous match. */
-             DPRINT(("  win previous\n"));
+#ifdef TRE_DEBUG
+             DPRINT(("  win previous tags="));
+             tre_print_tags(tags, tnfa->num_tags);
+             DPRINT(("\n"));
+#endif /* TRE_DEBUG */
              match_eo = pos;
              if (match_tags)
-               for (i = 0; i < tnfa->num_tags; i++)
-                 match_tags[i] = tags[i];
+               memcpy(match_tags, tags, num_tags * sizeof(*tags));
            }
          /* Our TNFAs never have transitions leaving from the final state,
             so we jump right to backtracking. */
@@ -401,12 +486,8 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 #ifdef TRE_DEBUG
       DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
              state));
-      {
-       int i;
-       for (i = 0; i < tnfa->num_tags; i++)
-         DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
-       DPRINT(("\n"));
-      }
+      tre_print_tags(tags, tnfa->num_tags);
+      DPRINT(("\n"));
 #endif /* TRE_DEBUG */
 
       /* Go to the next character in the input string. */
@@ -424,8 +505,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
          DPRINT(("  should match back reference %d\n", bt));
          /* Get the substring we need to match against.  Remember to
             turn off REG_NOSUB temporarily. */
-         tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & /*LINTED*/!REG_NOSUB,
+         ret = tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
                          tnfa, tags, pos);
+         if (ret != REG_OK) goto error_exit;
          so = pmatch[bt].rm_so;
          eo = pmatch[bt].rm_eo;
          bt_len = eo - so;
@@ -440,14 +522,14 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 
            if (type == STR_BYTE)
              {
-               DPRINT(("  substring (len %d) is [%d, %d[: '%.*s'\n",
+               DPRINT(("  substring (len %d) is [%d, %d]: '%.*s'\n",
                        bt_len, so, eo, bt_len, (char*)string + so));
                DPRINT(("  current string is '%.*s'\n", slen, str_byte - 1));
              }
 #ifdef TRE_WCHAR
            else if (type == STR_WIDE)
              {
-               DPRINT(("  substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
+               DPRINT(("  substring (len %d) is [%d, %d]: '%.*" STRF "'\n",
                        bt_len, so, eo, bt_len, (wchar_t*)string + so));
                DPRINT(("  current string is '%.*" STRF "'\n",
                        slen, str_wide - 1));
@@ -456,19 +538,19 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
          }
 #endif
 
-         if (len < 0)
+         if (so < 0)
+           {
+             result = 1; /* Back reference of nomatch doesn't match */
+           }
+         else if (len < 0)
            {
-             if (type == STR_USER)
-               result = str_source->compare((unsigned)so, (unsigned)pos,
-                                            (unsigned)bt_len,
-                                            str_source->context);
 #ifdef TRE_WCHAR
-             else if (type == STR_WIDE)
+             if (type == STR_WIDE)
                result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
                                 (size_t)bt_len);
-#endif /* TRE_WCHAR */
              else
-               result = strncmp((const char*)string + so, str_byte - 1,
+#endif /* TRE_WCHAR */
+             result = strncmp((const char*)string + so, str_byte - 1,
                                 (size_t)bt_len);
            }
          else if (len - pos < bt_len)
@@ -517,12 +599,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
          /* Check for end of string. */
          if (len < 0)
            {
-             if (type == STR_USER)
-               {
-                 if (str_user_end)
-                   goto backtrack;
-               }
-             else if (next_c == L'\0')
+             if (next_c == L'\0')
                goto backtrack;
            }
          else
@@ -568,12 +645,14 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
                     jump back here if needed. */
                  DPRINT(("  saving state %d for backtracking\n",
                          trans_i->state_id));
-                 BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
-                               trans_i->state_id, next_c, tags, mbstate);
+                 BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide,
+                               trans_i->state, trans_i->state_id, next_c,
+                               tags, mbstate);
                  {
                    int *tmp;
                    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
-                     stack->item.tags[*tmp] = pos;
+                     tre_tag_set(stack->item.tags, *tmp, pos, touch);
+                   touch++;
                  }
 #if 0 /* XXX - it's important not to look at all transitions here to keep
         the stack small! */
@@ -590,8 +669,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 
          /* Update the tag values. */
          if (next_tags)
-           while (*next_tags >= 0)
-             tags[*next_tags++] = pos;
+           {
+             while (*next_tags >= 0)
+               tre_tag_set(tags, *next_tags++, pos, touch);
+             touch++;
+           }
        }
       else
        {
@@ -600,7 +682,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
          if (stack->prev)
            {
              DPRINT(("  backtracking\n"));
-             if (stack->item.state->assertions && ASSERT_BACKREF)
+             if (stack->item.state->assertions & ASSERT_BACKREF)
                {
                  DPRINT(("  states_seen[%d] = 0\n",
                          stack->item.state_id));
@@ -613,20 +695,23 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
            {
              /* Try starting from a later position in the input string. */
              /* Check for end of string. */
-             if (len < 0)
+             if (pos == pos_start)
                {
-                 if (next_c == L'\0')
+                 if (len < 0)
                    {
-                     DPRINT(("end of string.\n"));
-                     break;
+                     if (next_c == L'\0')
+                       {
+                         DPRINT(("end of string.\n"));
+                         break;
+                       }
                    }
-               }
-             else
-               {
-                 if (pos >= len)
+                 else
                    {
-                     DPRINT(("end of string.\n"));
-                     break;
+                     if (pos >= len)
+                       {
+                         DPRINT(("end of string.\n"));
+                         break;
+                       }
                    }
                }
              DPRINT(("restarting from next start position\n"));
@@ -654,12 +739,8 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
  error_exit:
   tre_bt_mem_destroy(mem);
 #ifndef TRE_USE_ALLOCA
-  if (tags)
-    xfree(tags);
-  if (pmatch)
-    xfree(pmatch);
-  if (states_seen)
-    xfree(states_seen);
+  if (buf)
+    xfree(buf);
 #endif /* !TRE_USE_ALLOCA */
 
   return ret;
index 0d6ec17..7e30ebd 100644 (file)
 #include <config.h>
 #endif /* HAVE_CONFIG_H */
 
+/* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
+   info while running */
+#undef TRE_USE_ALLOCA
+
 #ifdef TRE_USE_ALLOCA
 /* AIX requires this to be the first thing in the file.         */
 #ifndef __GNUC__
@@ -69,34 +73,33 @@ char *alloca ();
 
 typedef struct {
   tre_tnfa_transition_t *state;
-  int *tags;
+  tre_tag_t *tags;
 } tre_tnfa_reach_t;
 
 typedef struct {
   int pos;
-  int **tags;
+  tre_tag_t **tags;
 } tre_reach_pos_t;
 
 
 #ifdef TRE_DEBUG
 static void
-tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
+tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags)
 {
-  int i;
+  DPRINT((" %p", (void *)state));
+  if (num_tags > 0)
+    {
+      DPRINT(("/"));
+      tre_print_tags(tags, num_tags);
+    }
+}
 
+static void
+tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
+{
   while (reach->state != NULL)
     {
-      DPRINT((" %p", (void *)reach->state));
-      if (num_tags > 0)
-       {
-         DPRINT(("/"));
-         for (i = 0; i < num_tags; i++)
-           {
-             DPRINT(("%d:%d", i, reach->tags[i]));
-             if (i < (num_tags-1))
-               DPRINT((","));
-           }
-       }
+      tre_print_reach1(reach->state, reach->tags, num_tags);
       reach++;
     }
   DPRINT(("\n"));
@@ -106,7 +109,7 @@ tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
 
 reg_errcode_t
 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
-                     tre_str_type_t type, int *match_tags, int eflags,
+                     tre_str_type_t type, tre_tag_t *match_tags, int eflags,
                      int *match_end_ofs)
 {
   /* State variables required by GET_NEXT_WCHAR. */
@@ -123,7 +126,6 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
   int reg_notbol = eflags & REG_NOTBOL;
   int reg_noteol = eflags & REG_NOTEOL;
   int reg_newline = tnfa->cflags & REG_NEWLINE;
-  int str_user_end = 0;
 
   char *buf;
   tre_tnfa_transition_t *trans_i;
@@ -133,9 +135,13 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
   int num_tags, i;
 
   int match_eo = -1;      /* end offset of match (-1 if no match found yet) */
-  int new_match = 0;
-  int *tmp_tags = NULL;
-  int *tmp_iptr;
+#ifdef TRE_DEBUG
+  int once;
+#endif /* TRE_DEBUG */
+  tre_tag_t *tmp_tags = NULL;
+  tre_tag_t *tmp_iptr;
+  int tbytes;
+  int touch = 1;
 
 #ifdef TRE_MBSTATE
   memset(&mbstate, '\0', sizeof(mbstate));
@@ -153,17 +159,17 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
      everything in a single large block from the stack frame using alloca()
      or with malloc() if alloca is unavailable. */
   {
-    int tbytes, rbytes, pbytes, xbytes, total_bytes;
+    int rbytes, pbytes, total_bytes;
     char *tmp_buf;
     /* Compute the length of the block we need. */
     tbytes = sizeof(*tmp_tags) * num_tags;
     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
     pbytes = sizeof(*reach_pos) * tnfa->num_states;
-    xbytes = sizeof(int) * num_tags;
     total_bytes =
       (sizeof(long) - 1) * 4 /* for alignment paddings */
-      + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
+      + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes;
 
+    DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes));
     /* Allocate the memory. */
 #ifdef TRE_USE_ALLOCA
     buf = alloca(total_bytes);
@@ -190,9 +196,9 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
     for (i = 0; i < tnfa->num_states; i++)
       {
        reach[i].tags = (void *)tmp_buf;
-       tmp_buf += xbytes;
+       tmp_buf += tbytes;
        reach_next[i].tags = (void *)tmp_buf;
-       tmp_buf += xbytes;
+       tmp_buf += tbytes;
       }
   }
 
@@ -200,15 +206,103 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
     reach_pos[i].pos = -1;
 
   /* If only one character can start a match, find it first. */
-  if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
+  if (tnfa->first_char >= 0 && str_byte)
     {
       const char *orig_str = str_byte;
       int first = tnfa->first_char;
+      int found_high_bit = 0;
 
-      if (len >= 0)
-       str_byte = memchr(orig_str, first, (size_t)len);
-      else
-       str_byte = strchr(orig_str, first);
+
+      if (type == STR_BYTE)
+       {
+         if (len >= 0)
+           str_byte = memchr(orig_str, first, (size_t)len);
+         else
+           str_byte = strchr(orig_str, first);
+       }
+      else if (type == STR_MBS)
+       {
+         /*
+          * If the match character is ASCII, try to match the character
+          * directly, but if a high bit character is found, we stop there.
+          */
+         if (first < 0x80)
+           {
+             if (len >= 0)
+               {
+                 int i;
+                 for (i = 0; ; str_byte++, i++)
+                   {
+                     if (i >= len)
+                       {
+                         str_byte = NULL;
+                         break;
+                       }
+                     if (*str_byte == first)
+                       break;
+                     if (*str_byte & 0x80)
+                       {
+                         found_high_bit = 1;
+                         break;
+                       }
+                   }
+               }
+             else
+               {
+                 for (; ; str_byte++)
+                   {
+                     if (!*str_byte)
+                       {
+                         str_byte = NULL;
+                         break;
+                       }
+                     if (*str_byte == first)
+                       break;
+                     if (*str_byte & 0x80)
+                       {
+                         found_high_bit = 1;
+                         break;
+                       }
+                   }
+               }
+           }
+         else
+           {
+             if (len >= 0)
+               {
+                 int i;
+                 for (i = 0; ; str_byte++, i++)
+                   {
+                     if (i >= len)
+                       {
+                         str_byte = NULL;
+                         break;
+                       }
+                     if (*str_byte & 0x80)
+                       {
+                         found_high_bit = 1;
+                         break;
+                       }
+                   }
+               }
+             else
+               {
+                 for (; ; str_byte++)
+                   {
+                     if (!*str_byte)
+                       {
+                         str_byte = NULL;
+                         break;
+                       }
+                     if (*str_byte & 0x80)
+                       {
+                         found_high_bit = 1;
+                         break;
+                       }
+                   }
+               }
+           }
+       }
       if (str_byte == NULL)
        {
 #ifndef TRE_USE_ALLOCA
@@ -218,51 +312,35 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
          return REG_NOMATCH;
        }
       DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
-      if (str_byte >= orig_str + 1)
-       prev_c = (unsigned char)*(str_byte - 1);
-      next_c = (unsigned char)*str_byte;
-      pos = str_byte - orig_str;
-      if (len < 0 || pos < len)
-       str_byte++;
-    }
-  else
-    {
-      GET_NEXT_WCHAR();
-      pos = 0;
-    }
-
-#if 0
-  /* Skip over characters that cannot possibly be the first character
-     of a match. */
-  if (tnfa->firstpos_chars != NULL)
-    {
-      char *chars = tnfa->firstpos_chars;
-
-      if (len < 0)
+      if (!found_high_bit)
        {
-         const char *orig_str = str_byte;
-         /* XXX - use strpbrk() and wcspbrk() because they might be
-            optimized for the target architecture.  Try also strcspn()
-            and wcscspn() and compare the speeds. */
-         while (next_c != L'\0' && !chars[next_c])
-           {
-             next_c = *str_byte++;
-           }
-         prev_c = *(str_byte - 2);
-         pos += str_byte - orig_str;
-         DPRINT(("skipped %d chars\n", str_byte - orig_str));
+         if (str_byte >= orig_str + 1)
+           prev_c = (unsigned char)*(str_byte - 1);
+         next_c = (unsigned char)*str_byte;
+         pos = str_byte - orig_str;
+         if (len < 0 || pos < len)
+           str_byte++;
        }
       else
        {
-         while (pos <= len && !chars[next_c])
-           {
-             prev_c = next_c;
-             next_c = (unsigned char)(*str_byte++);
-             pos++;
-           }
+         if (str_byte == orig_str)
+           goto no_first_optimization;
+         /*
+          * Back up one character, fix up the position, then call
+          * GET_NEXT_WCHAR() to process the multibyte character.
+          */
+         /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */
+         next_c = (unsigned char)*(str_byte - 1);
+         pos = (str_byte - 1) - orig_str;
+         GET_NEXT_WCHAR();
        }
     }
-#endif
+  else
+    {
+no_first_optimization:
+      GET_NEXT_WCHAR();
+      pos = 0;
+    }
 
   DPRINT(("length: %d\n", len));
   DPRINT(("pos:chr/code | states and tags\n"));
@@ -290,23 +368,23 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
 
                  DPRINT((" %p", (void *)trans_i->state));
                  reach_next_i->state = trans_i->state;
-                 for (i = 0; i < num_tags; i++)
-                   reach_next_i->tags[i] = -1;
+                 memset(reach_next_i->tags, 0, tbytes);
                  tag_i = trans_i->tags;
                  if (tag_i)
-                   while (*tag_i >= 0)
-                     {
-                       if (*tag_i < num_tags)
-                         reach_next_i->tags[*tag_i] = pos;
-                       tag_i++;
-                     }
+                   {
+                     while (*tag_i >= 0)
+                       {
+                         if (*tag_i < num_tags)
+                           tre_tag_set(reach_next_i->tags, *tag_i, pos, touch);
+                         tag_i++;
+                       }
+                       touch++;
+                   }
                  if (reach_next_i->state == tnfa->final)
                    {
                      DPRINT(("  found empty match\n"));
                      match_eo = pos;
-                     new_match = 1;
-                     for (i = 0; i < num_tags; i++)
-                       match_tags[i] = reach_next_i->tags[i];
+                     memcpy(match_tags, reach_next_i->tags, tbytes);
                    }
                  reach_pos[trans_i->state_id].pos = pos;
                  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
@@ -327,12 +405,7 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
       /* Check for end of string. */
       if (len < 0)
        {
-         if (type == STR_USER)
-           {
-             if (str_user_end)
-               break;
-           }
-         else if (next_c == L'\0')
+         if (next_c == L'\0')
            break;
        }
       else
@@ -346,8 +419,6 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
 #ifdef TRE_DEBUG
       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
       tre_print_reach(tnfa, reach_next, num_tags);
-      DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
-      tre_print_reach(tnfa, reach_next, num_tags);
 #endif /* TRE_DEBUG */
 
       /* Swap `reach' and `reach_next'. */
@@ -355,51 +426,9 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
       reach = reach_next;
       reach_next = reach_i;
 
-      /* For each state in `reach', weed out states that don't fulfill the
-        minimal matching conditions. */
-      if (tnfa->num_minimals && new_match)
-       {
-         new_match = 0;
-         reach_next_i = reach_next;
-         for (reach_i = reach; reach_i->state; reach_i++)
-           {
-             int skip = 0;
-             for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
-               {
-                 int end = tnfa->minimal_tags[i];
-                 int start = tnfa->minimal_tags[i + 1];
-                 DPRINT(("  Minimal start %d, end %d\n", start, end));
-                 if (end >= num_tags)
-                   {
-                     DPRINT(("  Throwing %p out.\n", reach_i->state));
-                     skip = 1;
-                     break;
-                   }
-                 else if (reach_i->tags[start] == match_tags[start]
-                          && reach_i->tags[end] < match_tags[end])
-                   {
-                     DPRINT(("  Throwing %p out because t%d < %d\n",
-                             reach_i->state, end, match_tags[end]));
-                     skip = 1;
-                     break;
-                   }
-               }
-             if (!skip)
-               {
-                 reach_next_i->state = reach_i->state;
-                 tmp_iptr = reach_next_i->tags;
-                 reach_next_i->tags = reach_i->tags;
-                 reach_i->tags = tmp_iptr;
-                 reach_next_i++;
-               }
-           }
-         reach_next_i->state = NULL;
-
-         /* Swap `reach' and `reach_next'. */
-         reach_i = reach;
-         reach = reach_next;
-         reach_next = reach_i;
-       }
+#ifdef TRE_DEBUG
+      once = 0;
+#endif /* TRE_DEBUG */
 
       /* For each state in `reach' see if there is a transition leaving with
         the current input symbol to a state not yet in `reach_next', and
@@ -422,16 +451,57 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
                    }
 
                  /* Compute the tags after this transition. */
-                 for (i = 0; i < num_tags; i++)
-                   tmp_tags[i] = reach_i->tags[i];
+                 memcpy(tmp_tags, reach_i->tags, tbytes);
                  tag_i = trans_i->tags;
                  if (tag_i != NULL)
-                   while (*tag_i >= 0)
-                     {
-                       if (*tag_i < num_tags)
-                         tmp_tags[*tag_i] = pos;
-                       tag_i++;
-                     }
+                   {
+                     while (*tag_i >= 0)
+                       {
+                         if (*tag_i < num_tags)
+                           tre_tag_set(tmp_tags, *tag_i, pos, touch);
+                         tag_i++;
+                       }
+                       touch++;
+                   }
+
+                 /* For each new transition, weed out those that don't
+                    fulfill the minimal matching conditions. */
+                 if (tnfa->num_minimals && match_eo >= 0)
+                   {
+                     int skip = 0;
+#ifdef TRE_DEBUG
+                     if (!once)
+                       {
+                         DPRINT(("Checking minimal conditions: match_eo=%d "
+                                 "match_tags=", match_eo));
+                         tre_print_tags(match_tags, num_tags);
+                         DPRINT(("\n"));
+                         once++;
+                       }
+#endif /* TRE_DEBUG */
+                     for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
+                       {
+                          int end = tnfa->minimal_tags[i];
+                          int start = tnfa->minimal_tags[i + 1];
+                          DPRINT(("  Minimal start %d, end %d\n", start, end));
+                          if (tre_minimal_tag_order(start, end, match_tags,
+                                                    tmp_tags) > 0)
+                            {
+                               skip = 1;
+                               break;
+                            }
+                       }
+                     if (skip)
+                       {
+#ifdef TRE_DEBUG
+                          DPRINT(("     Throwing out"));
+                          tre_print_reach1(reach_i->state, tmp_tags,
+                                            num_tags);
+                          DPRINT(("\n"));
+#endif /* TRE_DEBUG */
+                          continue;
+                       }
+                   }
 
                  if (reach_pos[trans_i->state_id].pos < pos)
                    {
@@ -446,13 +516,16 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
                      if (reach_next_i->state == tnfa->final
                          && (match_eo == -1
                              || (num_tags > 0
-                                 && reach_next_i->tags[0] <= match_tags[0])))
+                                 && tre_tag_get(reach_next_i->tags, 0) <=
+                                 tre_tag_get(match_tags, 0))))
                        {
-                         DPRINT(("  found match %p\n", trans_i->state));
+#ifdef TRE_DEBUG
+                         DPRINT(("  found match"));
+                         tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags);
+                         DPRINT(("\n"));
+#endif /* TRE_DEBUG */
                          match_eo = pos;
-                         new_match = 1;
-                         for (i = 0; i < num_tags; i++)
-                           match_tags[i] = reach_next_i->tags[i];
+                         memcpy(match_tags, reach_next_i->tags, tbytes);
                        }
                      reach_next_i++;
 
@@ -472,11 +545,13 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
                          *reach_pos[trans_i->state_id].tags = tmp_tags;
                          if (trans_i->state == tnfa->final)
                            {
-                             DPRINT(("  found better match\n"));
+#ifdef TRE_DEBUG
+                             DPRINT(("  found better match"));
+                             tre_print_reach1(trans_i->state, tmp_tags, num_tags);
+                             DPRINT(("\n"));
+#endif /* TRE_DEBUG */
                              match_eo = pos;
-                             new_match = 1;
-                             for (i = 0; i < num_tags; i++)
-                               match_tags[i] = tmp_tags[i];
+                             memcpy(match_tags, tmp_tags, tbytes);
                            }
                          tmp_tags = tmp_iptr;
                        }
@@ -489,12 +564,21 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
 
   DPRINT(("match end offset = %d\n", match_eo));
 
+  *match_end_ofs = match_eo;
+#ifdef TRE_DEBUG
+  if (match_tags)
+    {
+      DPRINT(("Winning tags="));
+      tre_print_tags_all(match_tags, num_tags);
+      DPRINT((" touch=%d\n", touch));
+    }
+#endif /* TRE_DEBUG */
+
 #ifndef TRE_USE_ALLOCA
   if (buf)
     xfree(buf);
 #endif /* !TRE_USE_ALLOCA */
 
-  *match_end_ofs = match_eo;
   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
 }
 
index 70db745..34da403 100644 (file)
@@ -6,6 +6,8 @@
 
 */
 
+#include "tre-internal.h"
+
 #define str_source ((const tre_str_source*)string)
 
 #ifdef TRE_WCHAR
 
 /* Wide character and multibyte support. */
 
+/*
+ * Because all multibyte encodings are exclusively single-shift encoding,
+ * with the shift codes having the high bit set, we can make an optimization
+ * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character
+ * is detected, and just do a direct copy for ASCII characters.
+ */
 #define GET_NEXT_WCHAR()                                                     \
   do {                                                                       \
     prev_c = next_c;                                                         \
-    if (type == STR_BYTE)                                                    \
+    switch (type)                                                            \
       {                                                                              \
+      case STR_BYTE:                                                         \
        pos++;                                                                \
        if (len >= 0 && pos >= len)                                           \
          next_c = '\0';                                                      \
        else                                                                  \
          next_c = (unsigned char)(*str_byte++);                              \
-      }                                                                              \
-    else if (type == STR_WIDE)                                               \
-      {                                                                              \
+       break;                                                                \
+      case STR_WIDE:                                                         \
        pos++;                                                                \
        if (len >= 0 && pos >= len)                                           \
          next_c = L'\0';                                                     \
        else                                                                  \
          next_c = *str_wide++;                                               \
-      }                                                                              \
-    else if (type == STR_MBS)                                                \
-      {                                                                              \
-        pos += pos_add_next;                                                 \
-       if (str_byte == NULL)                                                 \
-         next_c = L'\0';                                                     \
+       break;                                                                \
+      case STR_MBS:                                                          \
+       pos += pos_add_next;                                                  \
+       if (__builtin_expect(len >= 0 && pos >= len, 0))                      \
+         {                                                                   \
+           next_c = L'\0';                                                   \
+           pos_add_next = 1;                                                 \
+         }                                                                   \
+       else if (__builtin_expect(!(*str_byte & 0x80), 1))                    \
+         {                                                                   \
+           next_c = (unsigned char)(*str_byte++);                            \
+           pos_add_next = 1;                                                 \
+         }                                                                   \
        else                                                                  \
          {                                                                   \
            size_t w;                                                         \
              max = len - pos;                                                \
            else                                                              \
              max = 32;                                                       \
-           if (max <= 0)                                                     \
+           w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate,       \
+                             tnfa->loc);                                     \
+           if (w == (size_t)-1 || w == (size_t)-2)                           \
+             return REG_ILLSEQ;                                              \
+           if (w == 0 && len >= 0)                                           \
              {                                                               \
-               next_c = L'\0';                                               \
                pos_add_next = 1;                                             \
+               next_c = 0;                                                   \
+               str_byte++;                                                   \
              }                                                               \
            else                                                              \
              {                                                               \
-               w = tre_mbrtowc(&next_c, str_byte, (size_t)max, &mbstate);    \
-               if (w == (size_t)-1 || w == (size_t)-2)                       \
-                 return REG_NOMATCH;                                         \
-               if (w == 0 && len >= 0)                                       \
-                 {                                                           \
-                   pos_add_next = 1;                                         \
-                   next_c = 0;                                               \
-                   str_byte++;                                               \
-                 }                                                           \
-               else                                                          \
-                 {                                                           \
-                   pos_add_next = w;                                         \
-                   str_byte += w;                                            \
-                 }                                                           \
+               pos_add_next = w;                                             \
+               str_byte += w;                                                \
              }                                                               \
          }                                                                   \
-      }                                                                              \
-    else if (type == STR_USER)                                               \
-      {                                                                              \
-        pos += pos_add_next;                                                 \
-       str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
-                                                 str_source->context);       \
+       break;                                                                \
       }                                                                              \
   } while(/*CONSTCOND*/0)
 
 #else /* !TRE_MULTIBYTE */
 
 /* Wide character support, no multibyte support. */
+#error TRE_MULTIBYTE undefined
 
 #define GET_NEXT_WCHAR()                                                     \
   do {                                                                       \
        else                                                                  \
          next_c = *str_wide++;                                               \
       }                                                                              \
-    else if (type == STR_USER)                                               \
-      {                                                                              \
-        pos += pos_add_next;                                                 \
-       str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
-                                                 str_source->context);       \
-      }                                                                              \
   } while(/*CONSTCOND*/0)
 
 #endif /* !TRE_MULTIBYTE */
 #else /* !TRE_WCHAR */
 
 /* No wide character or multibyte support. */
+#error TRE_WCHAR undefined
 
 #define GET_NEXT_WCHAR()                                                     \
   do {                                                                       \
        else                                                                  \
          next_c = (unsigned char)(*str_byte++);                              \
       }                                                                              \
-    else if (type == STR_USER)                                               \
-      {                                                                              \
-       pos += pos_add_next;                                                  \
-       str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
-                                                str_source->context);        \
-      }                                                                              \
   } while(/*CONSTCOND*/0)
 
 #endif /* !TRE_WCHAR */
 
 
 
-#define IS_WORD_CHAR(c)         ((c) == L'_' || tre_isalnum(c))
+/* Assumes tre_tnfa_t *tnfa in scope */
+#define IS_WORD_CHAR(c)         ((c) == L'_' || tre_isalnum_l(c, tnfa->loc))
 
 #define CHECK_ASSERTIONS(assertions)                                         \
   (((assertions & ASSERT_AT_BOL)                                             \
           || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
 
 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
-  (((trans_i->assertions & ASSERT_CHAR_CLASS)                                 \
-       && !(tnfa->cflags & REG_ICASE)                                         \
-       && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class))                 \
-    || ((trans_i->assertions & ASSERT_CHAR_CLASS)                             \
-        && (tnfa->cflags & REG_ICASE)                                         \
-        && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class)     \
-       && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class))    \
-    || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)                         \
-        && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
-                                      tnfa->cflags & REG_ICASE)))
+  ((trans_i->assertions & ASSERT_BRACKET_MATCH)                               \
+    && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c,    \
+                                      tnfa))
 
 
 
 
-/* Returns 1 if `t1' wins `t2', 0 otherwise. */
 inline static int
-tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
-             int *t1, int *t2)
+tre_tag_get(const tre_tag_t *tags, int i)
+{
+  tags += i;
+  return tags->count > 0 ? tags->value : -1;
+}
+
+inline static void
+tre_tag_set(tre_tag_t *tags, int i, int val, int touch)
+{
+  tags += i;
+  if (tags->count++ == 0)
+    tags->first = val;
+  tags->value = val;
+  tags->touch = touch;
+}
+
+inline static void
+tre_tag_reset(tre_tag_t *tags, int i)
+{
+  tags[i].count = 0;
+}
+
+inline static int
+tre_tag_touch_get(const tre_tag_t *tags, int i)
+{
+  return tags[i].touch;
+}
+
+#ifdef TRE_DEBUG
+inline static void
+tre_print_tags(const tre_tag_t *tags, int num_tags)
 {
   int i;
-  for (i = 0; i < num_tags; i++)
+  for (i = 0; i < num_tags; i++, tags++)
     {
-      if (tag_directions[i] == TRE_TAG_MINIMIZE)
+      switch(tags->count)
        {
-         if (t1[i] < t2[i])
-           return 1;
-         if (t1[i] > t2[i])
+       case 0:
+         DPRINT(("%d:(0,-1)", i));
+         break;
+       case 1:
+         DPRINT(("%d:(1,%d)", i, tags->first));
+         break;
+       default:
+         DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first,
+                 tags->value));
+         break;
+       }
+      if (i < (num_tags - 1))
+       DPRINT((" "));
+    }
+}
+
+inline static void
+tre_print_tags_all(const tre_tag_t *tags, int num_tags)
+{
+  int i;
+  for (i = 0; i < num_tags; i++, tags++)
+    {
+      switch(tags->count)
+       {
+       case 0:
+         DPRINT(("%d:(0,-1)/%d", i, tags->touch));
+         break;
+       case 1:
+         DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch));
+         break;
+       default:
+         DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first,
+                 tags->value, tags->touch));
+         break;
+       }
+      if (i < (num_tags - 1))
+       DPRINT((" "));
+    }
+}
+#endif /* TRE_DEBUG */
+
+/* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal
+ * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */
+inline static int
+tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1,
+                     const tre_tag_t *tags2)
+{
+  const tre_tag_t *t1, *t2;
+
+  t1 = tags1 + start;
+  t2 = tags2 + start;
+  /* We need both start tags to be set */
+  if (t1->count == 0 || t2->count == 0)
+    return 0;
+
+  /* The start tags must be equal */
+  if (t1->value != t2->value)
+    return 0;
+
+  t1 = tags1 + end;
+  t2 = tags2 + end;
+  /* For the end tags, we prefer set over unset, because unset means that
+   * the end tag is still growing */
+  if (t1->count == 0)
+    {
+      /* if t2 is set, t1 loses since it is unset */
+      if (t2->count != 0)
+       return -1;
+    }
+  /* if t2 not set, t1 wins since it is set */
+  else if (t2->count == 0)
+    return 1;
+
+  /* least current value wins */
+  return t2->value - t1->value;
+}
+
+/* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare
+ * (t1 loses if < 0, t1 wins if > 0) */
+inline static int
+tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1,
+               const tre_tag_t *t2)
+{
+  int diff;
+
+  t1 += i;
+  t2 += i;
+  switch (dir)
+    {
+    case TRE_TAG_MINIMIZE:
+      /* least current value wins (because tags are initialized to all zeros,
+       * unset wins over set; also, tre_minimal_tag_order() will have already
+       * been run, which checks for being unset) */
+      return t2->value - t1->value;
+
+    case TRE_TAG_MAXIMIZE:
+      /* prefer set */
+      if (t1->count == 0)
+       {
+         /* if neither t1 and t2 are set, try next tag */
+         if (t2->count == 0)
            return 0;
+         /* t2 is set, t1 loses since it is unset */
+         return -1;
        }
-      else
+      /* if t2 not set, t1 wins since it is set */
+      else if (t2->count == 0)
+       return 1;
+      /* greatest initial value wins */
+      if ((diff = t1->first - t2->first) != 0)
+       return diff;
+      /* least number of times the tag was set, wins */
+      if ((diff = t2->count - t1->count) != 0)
+       return diff;
+      /* if the tags were only set once, they only have initial values */
+      if (t1->count == 1)
+       return 0;
+      /* greatest current value wins */
+      return t1->value - t2->value;
+
+    case TRE_TAG_LEFT_MAXIMIZE:
+      /* prefer set */
+      if (t1->count == 0)
        {
-         if (t1[i] > t2[i])
-           return 1;
-         if (t1[i] < t2[i])
+         /* if neither t1 and t2 are set, try next tag */
+         if (t2->count == 0)
            return 0;
+         /* t2 is set, t1 loses since it is unset */
+         return -1;
        }
+      /* if t2 not set, t1 wins since it is set */
+      else if (t2->count == 0)
+       return 1;
+      /* least initial value wins */
+      if ((diff = t2->first - t1->first) != 0)
+       return diff;
+      /* least number of times the tag was set, wins */
+      if ((diff = t2->count - t1->count) != 0)
+       return diff;
+      /* if the tags were only set once, they only have initial values */
+      if (t1->count == 1)
+       return 0;
+      /* greatest current value wins */
+      return t1->value - t2->value;
+
+    default:
+      /* Shouldn't happen: only assert if TRE_DEBUG defined */
+      assert(0);
+      break;
+    }
+  return 0;
+}
+
+#ifdef TRE_DEBUG
+#define _MORE_DEBUGGING
+#endif /* TRE_DEBUG */
+
+/* Returns 1 if `t1' wins `t2', 0 otherwise. */
+inline static int
+#ifdef _MORE_DEBUGGING
+_tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
+             const tre_tag_t *t1, const tre_tag_t *t2)
+#else /* !_MORE_DEBUGGING */
+tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
+             const tre_tag_t *t1, const tre_tag_t *t2)
+#endif /* !_MORE_DEBUGGING */
+{
+  int i, ret;
+
+  for (i = 0; i < num_tags; i++)
+    {
+      if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0)
+       return (ret > 0);
     }
   /*  assert(0);*/
   return 0;
 }
 
+#ifdef _MORE_DEBUGGING
+inline static int
+tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
+             const tre_tag_t *t1, const tre_tag_t *t2)
+{
+  int ret = _tre_tag_order(num_tags, tag_directions, t1, t2);
+  DPRINT(("tre_tag_order: "));
+  tre_print_tags(t1, num_tags);
+  DPRINT((" %s ", ret ? "wins" : "doesn't win"));
+  tre_print_tags(t2, num_tags);
+  DPRINT(("\n"));
+  return ret;
+}
+#endif /* _MORE_DEBUGGING */
+
+int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
+
 inline static int
-tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
+tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc,
+                 const tre_tnfa_t * __restrict tnfa)
 {
-  DPRINT(("neg_char_classes_test: %p, %d, %d\n", classes, wc, icase));
-  while (*classes != (tre_ctype_t)0)
-    if ((!icase && tre_isctype(wc, *classes))
-       || (icase && (tre_isctype(tre_toupper(wc), *classes)
-                     || tre_isctype(tre_tolower(wc), *classes))))
-      return 1; /* Match. */
-    else
-      classes++;
-  return 0; /* No match. */
+  int match = 0;
+  int i;
+  tre_bracket_match_t *b;
+  tre_cint_t uc, lc;
+  int we, ue, le, got_equiv = 0;
+  int icase = ((tnfa->cflags & REG_ICASE) != 0);
+
+  DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase));
+  if (icase)
+    {
+      if (tre_islower_l(wc, tnfa->loc))
+       {
+         lc = wc;
+         uc = tre_toupper_l(wc, tnfa->loc);
+       }
+      else if (tre_isupper_l(wc, tnfa->loc))
+       {
+         uc = wc;
+         lc = tre_tolower_l(wc, tnfa->loc);
+       }
+      else
+       {
+         icase = 0;
+       }
+    }
+  for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches;
+       i++, b++)
+    {
+      switch (b->type)
+       {
+       case TRE_BRACKET_MATCH_TYPE_CHAR:
+         if (icase)
+           match = (b->value == uc || b->value == lc);
+         else
+           match = (b->value == wc);
+         break;
+       case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN:
+         {
+           tre_cint_t start = b->value, end;
+           if (++i >= list->num_bracket_matches ||
+               (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END)
+             {
+               DPRINT(("tre_bracket_match: no following range end\n"));
+               assert(0);
+               goto error;
+             }
+           end = b->value;
+           if (!got_equiv)
+             {
+               if (icase)
+                 {
+                   ue = __collate_equiv_value(tnfa->loc, &uc, 1);
+                   le = __collate_equiv_value(tnfa->loc, &lc, 1);
+                 }
+               else
+                   we = __collate_equiv_value(tnfa->loc, &wc, 1);
+               got_equiv = 1;
+             }
+           if (icase)
+             match = ((start <= ue && ue <= end) ||
+                     (start <= le && le <= end));
+           else
+             match = (start <= we && we <= end);
+           break;
+         }
+       case TRE_BRACKET_MATCH_TYPE_RANGE_END:
+         DPRINT(("tre_bracket_match: range end without preceeding start\n"));
+         assert(0);
+         break;
+       case TRE_BRACKET_MATCH_TYPE_CLASS:
+         if (icase)
+           match = (tre_isctype_l(uc, b->value, tnfa->loc) ||
+                    tre_isctype_l(lc, b->value, tnfa->loc));
+         else
+           match = (tre_isctype_l(wc, b->value, tnfa->loc));
+         break;
+       case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE:
+         if (!got_equiv)
+           {
+             if (icase)
+               {
+                 ue = __collate_equiv_value(tnfa->loc, &uc, 1);
+                 le = __collate_equiv_value(tnfa->loc, &lc, 1);
+               }
+             else
+                 we = __collate_equiv_value(tnfa->loc, &wc, 1);
+             got_equiv = 1;
+           }
+         if (icase)
+           match = (b->value == ue || b->value == le);
+         else
+           match = (b->value == we);
+         break;
+       default:
+         DPRINT(("tre_bracket_match: unknown type %d\n", b->type));
+         assert(0);
+         break;
+       }
+       if (match)
+         break;
+    }
+error:
+  if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) {
+    if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0;
+    match = !match;
+  }
+  return match;
 }
index 9282725..76e66ee 100644 (file)
@@ -19,6 +19,7 @@
 #include <string.h>
 #include <assert.h>
 #include <limits.h>
+#include <stddef.h>
 
 #include "xmalloc.h"
 #include "tre-mem.h"
 #include "tre-stack.h"
 #include "tre-parse.h"
 
+#include "xlocale_private.h"
+#include "collate.h"
+
+/* BSD compatibility:
+     Before looking up a collating symbol, check if the name matches in
+     the character names (cnames) array; if so, use the corresponding
+     character.
+
+     Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
+     the implementation choice that for ERE, a non-numeric character following
+     a left brace that would normally be a bound, causes the left brace to be
+     literal. */
+#define BSD_COMPATIBILITY
+#ifdef BSD_COMPATIBILITY
+#include "cname.h"
+#define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+#endif /* BSD_COMPATIBILITY */
 
 /* Characters with special meanings in regexp syntax. */
 #define CHAR_PIPE         L'|'
@@ -92,86 +110,34 @@ tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
 }
 
 static reg_errcode_t
-tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
-        tre_ast_node_t ***items)
+tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
+        tre_bracket_match_list_t **items)
 {
-  reg_errcode_t status;
-  tre_ast_node_t **array = *items;
+  reg_errcode_t status = REG_OK;
+  tre_bracket_match_list_t *array = *items;
+  int i = array->num_bracket_matches;
   /* Allocate more space if necessary. */
-  if (*i >= *max_i)
+  if (i >= *max_i)
     {
-      tre_ast_node_t **new_items;
-      DPRINT(("out of array space, i = %d\n", *i));
+      tre_bracket_match_list_t *new_items;
+      DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
       /* If the array is already 1024 items large, give up -- there's
         probably an error in the regexp (e.g. not a '\0' terminated
         string and missing ']') */
-      if (*max_i > 1024)
+      if (*max_i >= 1024)
        return REG_ESPACE;
       *max_i *= 2;
-      new_items = xrealloc(array, sizeof(*items) * *max_i);
+      new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
       if (new_items == NULL)
        return REG_ESPACE;
       *items = array = new_items;
     }
-  array[*i] = tre_ast_new_literal(mem, min, max, -1);
-  status = array[*i] == NULL ? REG_ESPACE : REG_OK;
-  (*i)++;
-  return status;
-}
-
-
-/* Expands a character class to character ranges. */
-static reg_errcode_t
-tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
-                int *i, int *max_i, int cflags)
-{
-  reg_errcode_t status = REG_OK;
-  tre_cint_t c;
-  int j, min = -1, max = 0;
-  assert(TRE_MB_CUR_MAX == 1);
-
-  DPRINT(("  expanding class to character ranges\n"));
-  for (j = 0; (j < 256) && (status == REG_OK); j++)
-    {
-      c = j;
-      if (tre_isctype(c, class)
-         || ((cflags & REG_ICASE)
-             && (tre_isctype(tre_tolower(c), class)
-                 || tre_isctype(tre_toupper(c), class))))
-{
-         if (min < 0)
-           min = c;
-         max = c;
-       }
-      else if (min >= 0)
-       {
-         DPRINT(("  range %c (%d) to %c (%d)\n", min, min, max, max));
-         status = tre_new_item(mem, min, max, i, max_i, items);
-         min = -1;
-       }
-    }
-  if (min >= 0 && status == REG_OK)
-    status = tre_new_item(mem, min, max, i, max_i, items);
+  array->bracket_matches[i].type = type;
+  array->bracket_matches[i].value = val;
+  array->num_bracket_matches++;
   return status;
 }
 
-
-static int
-tre_compare_items(const void *a, const void *b)
-{
-  const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
-  const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
-  tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
-  int a_min = l_a->code_min, b_min = l_b->code_min;
-
-  if (a_min < b_min)
-    return -1;
-  else if (a_min > b_min)
-    return 1;
-  else
-    return 0;
-}
-
 #ifndef TRE_USE_SYSTEM_WCTYPE
 
 /* isalnum() and the rest may be macros, so wrap them to functions. */
@@ -236,336 +202,672 @@ tre_ctype_t tre_ctype(const char *name)
 }
 #endif /* !TRE_USE_SYSTEM_WCTYPE */
 
-/* Maximum number of character classes that can occur in a negated bracket
-   expression. */
-#define MAX_NEG_CLASSES 64
+#define REST(re) (int)(ctx->re_end - (re)), (re)
 
-/* Maximum length of character class names. */
-#define MAX_CLASS_NAME
+#define START_COLLATING_SYMBOLS                16
+#define MAX_COLLATING_SYMBOL_LEN       4
 
-#define REST(re) (int)(ctx->re_end - (re)), (re)
+typedef struct {
+  const tre_char_t *start;
+  int len;
+} tre_collating_symbol;
+
+#ifdef BSD_COMPATIBILITY
+static wchar_t
+tre_search_cnames(const wchar_t *name, size_t len)
+{
+  size_t low = 0;
+  size_t high = NCNAMES - 1;
+  size_t cur;
+  int cmp;
 
+  while(low <= high)
+    {
+      cur = (low + high) / 2;
+      cmp = wcsncmp(name, cnames[cur].name, len);
+      if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
+      if (cmp > 0) low = cur + 1;
+      else high = cur - 1;
+    }
+  return (wchar_t)-1;
+}
+#endif /* BSD_COMPATIBILITY */
+
+/* Scan the contents of a bracket expression, and create a
+ * tre_bracket_match_list_t encoding the bracket expression.  If during
+ * the scan, multi-character collating symbols are detected, switch
+ * into a mode to collect those MCCSs into a tre_collating_symbol
+ * list and pass them back.  tre_parse_bracket will use that to
+ * create a new string composed of a union of the bracket expression
+ * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
+ * call tre_parse (recursive) to parse that new string (which will
+ * call tre_parse_bracket and tre_parse_bracket_items again. */
 static reg_errcode_t
-tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
-                       tre_ctype_t neg_classes[], int *num_neg_classes,
-                       tre_ast_node_t ***items, int *num_items,
-                       int *items_size)
+tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
+                       int *items_size, tre_collating_symbol **result)
 {
   const tre_char_t *re = ctx->re;
-  reg_errcode_t status = REG_OK;
-  tre_ctype_t class = (tre_ctype_t)0;
-  int i = *num_items;
+  const tre_char_t *re_end = ctx->re_end;
+  tre_collating_symbol *col_syms = NULL;
+  tre_collating_symbol *cp = NULL;
+  int n_col_syms = 0;
+  reg_errcode_t status;
   int max_i = *items_size;
-  int skip;
+  int other = 0;  /* contains content other than multi-character collating
+                  * symbols */
+  int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
+  tre_cint_t min, c;
+  int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
+  int collect_MCCS = 0;
+  const tre_char_t *start;
 
-  /* Build an array of the items in the bracket expression. */
-  while (status == REG_OK)
+  for ( ;re < re_end; re++)
     {
-      skip = 0;
-      if (re == ctx->re_end)
-       {
-         status = REG_EBRACK;
-       }
-      else if (*re == CHAR_RBRACKET && re > ctx->re)
+      switch (*re)
        {
-         DPRINT(("tre_parse_bracket:   done: '%.*" STRF "'\n", REST(re)));
-         re++;
+       case CHAR_MINUS:
+         /* A first hyphen */
+         if (re == ctx->re)
+           {
+             DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
+             min = CHAR_MINUS;
+             other++;
+             range = 0;
+             break;
+           }
+         /* The hyphen is the end range */
+         if (range > 0)
+           {
+             DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
+             c = CHAR_MINUS;
+             goto process_end_range;
+           }
+         if (re + 1 >= re_end)
+           {
+             status = REG_EBRACK;
+             goto error;
+           }
+         /* The hyphen is at the end */
+         if (re[1] == CHAR_RBRACKET)
+           {
+             DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
+             c = CHAR_MINUS;
+             goto process_begin_range;
+           }
+         /* Two ranges are not allowed to share an endpoint, or begin
+          * range is illegal. */
+         if (range < 0)
+           {
+             status = REG_ERANGE;
+             goto error;
+           }
+         range = 1; /* Expect end range */
+         DPRINT(("tre_parse_bracket:   range: '%.*" STRF "'\n", REST(re)));
          break;
-       }
-      else
-       {
-         tre_cint_t min = 0, max = 0;
 
-         class = (tre_ctype_t)0;
-         if (re + 2 < ctx->re_end
-             && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
+       case CHAR_LBRACKET:
+         if (re + 1 >= re_end)
            {
-             DPRINT(("tre_parse_bracket:  range: '%.*" STRF "'\n", REST(re)));
-             min = *re;
-             max = *(re + 2);
-             re += 3;
-             /* XXX - Should use collation order instead of encoding values
-                in character ranges. */
-             if (min > max)
-               status = REG_ERANGE;
+             status = REG_EBRACK;
+             goto error;
            }
-         else if (re + 1 < ctx->re_end
-                  && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
-           status = REG_ECOLLATE;
-         else if (re + 1 < ctx->re_end
-                  && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
-           status = REG_ECOLLATE;
-         else if (re + 1 < ctx->re_end
-                  && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
+         switch (re[1])
            {
-             char tmp_str[64];
-             const tre_char_t *endptr = re + 2;
-             int len;
-             DPRINT(("tre_parse_bracket:  class: '%.*" STRF "'\n", REST(re)));
-             while (endptr < ctx->re_end && *endptr != CHAR_COLON)
-               endptr++;
-             if (endptr != ctx->re_end)
-               {
-                 len = MIN(endptr - re - 2, 63);
-#ifdef TRE_WCHAR
+           case CHAR_PERIOD:
+             {
+               re += 2;
+               start = re;
+               for (;; re++)
+                 {
+                   if (re >= re_end)
+                     {
+                       status = REG_ECOLLATE;
+                       goto error;
+                     }
+                   if (*re == CHAR_PERIOD)
+                     {
+                       if (re + 1 >= re_end)
+                         {
+                           status = REG_ECOLLATE;
+                           goto error;
+                         }
+                       /* Found end */
+                       if (re[1] == CHAR_RBRACKET)
+                         {
+                           DPRINT(("tre_parse_bracket:   collating "
+                                   "symbol: '%.*" STRF "'\n",
+                                   REST(start - 2)));
+                           /* Empty name */
+                           if (re == start)
+                             {
+                               status = REG_ECOLLATE;
+                               goto error;
+                             }
+#ifdef BSD_COMPATIBILITY
+                           /* Check if the name is in cnames; if so, use
+                              the corresponding code */
+                           c = tre_search_cnames(start, re - start);
+                           if (c != (wchar_t)-1)
+                             {
+                               re++;
+                               goto process_single_character;
+                             }
+#endif /* BSD_COMPATIBILITY */
+                           /* Verify this is a known sequence */
+                           if (__collate_equiv_value(ctx->loc, start,
+                                                         re - start) <= 0)
+                             {
+                               status = REG_ECOLLATE;
+                               goto error;
+                             }
+                           /* Process single character collating symbols */
+                           if (re - start == 1)
+                             {
+                               c = *start;
+                               re++;
+                               goto process_single_character;
+                             }
+                           /* Inverted MCCSs are undefined */
+                           if (invert)
+                             {
+                               status = REG_ECOLLATE;
+                               goto error;
+                             }
+                           /* Can't have MCCSs as an endpoint to a range */
+                           if (range > 0)
+                             {
+                               status = REG_ERANGE;
+                               goto error;
+                             }
+                           range = -1;
+                           /* Switch into MCCS collection mode (if not
+                            * already there */
+#if TRE_DEBUG
+                           if (!collect_MCCS)
+                             {
+                               collect_MCCS = 1;
+                               DPRINT(("tre_parse_bracket: Detected MCCS\n"));
+                             }
+#else /* !TRE_DEBUG */
+                           collect_MCCS = 1;
+#endif /* !TRE_DEBUG */
+                           /* Allocate a memory block the first time */
+                           if (!cp)
+                             {
+                               if ((col_syms = xmalloc(sizeof(*col_syms) *
+                                           (START_COLLATING_SYMBOLS + 2)))
+                                           == NULL)
+                                 return REG_ESPACE;
+                               cp = col_syms + 1;
+                               n_col_syms = START_COLLATING_SYMBOLS;
+                             }
+                           /* Enlarge the memory block is more is needed */
+                           if ((cp - col_syms) - 1 >= n_col_syms)
+                             {
+                               int i = n_col_syms;
+                               tre_collating_symbol *tmp =
+                                   xrealloc(col_syms, sizeof(*col_syms) *
+                                            ((n_col_syms *= 2) + 2));
+                               if (tmp == NULL)
+                                 {
+                                   xfree(col_syms);
+                                   return REG_ESPACE;
+                                 }
+                               DPRINT(("tre_list_collating_symbols: "
+                                       "Enlarging col_syms to %d\n",
+                                       n_col_syms));
+                               col_syms = tmp;
+                               cp = col_syms + i + 1;
+                             }
+                           cp->start = start;
+                           cp->len = re - start;
+                           cp++;
+                           re++;
+                           break;
+                         }
+                     }
+                 }
+               break;
+             }
+
+           case CHAR_EQUAL:
+           case CHAR_COLON:
+             {
+               /* Process equivalence and character classes */
+               tre_char_t kind = re[1];
+
+               /* Can't have a class as an endpoint to a range */
+               if (range > 0)
+                 {
+                   status = REG_ERANGE;
+                   goto error;
+                 }
+               if (!collect_MCCS && range == 0)
+                 {
+                   status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
+                                         min, &max_i, items);
+                   if (status != REG_OK)
+                     goto error;
+                 }
+               range = -1;
+               re += 2;
+               start = re;
+               for (;; re++)
                  {
-                   tre_char_t tmp_wcs[64];
-                   wcsncpy(tmp_wcs, re + 2, (size_t)len);
-                   tmp_wcs[len] = L'\0';
+                   if (re >= re_end)
+                     {
+                       status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
+                       goto error;
+                     }
+                   if (*re == kind)
+                     {
+                       if (re + 1 >= re_end)
+                         {
+                           status = kind == CHAR_EQUAL ? REG_ECOLLATE :
+                                                         REG_ECTYPE;
+                           goto error;
+                         }
+                       /* Found end */
+                       if (re[1] == CHAR_RBRACKET)
+                         {
+                           if (re == start)
+                             {
+                               /* Empty class name */
+                               status = kind == CHAR_EQUAL ? REG_ECOLLATE :
+                                                             REG_ECTYPE;
+                               goto error;
+                             }
+                           /* Process equivalence class */
+                           if (kind == CHAR_EQUAL)
+                             {
+                               int equiv;
+
+                               DPRINT(("tre_parse_bracket:   equivalence: '%.*"
+                                       STRF "'\n", REST(start - 2)));
+
+                               /* While we find the collation value even for
+                                  multi-character collating elements , we
+                                  don't (yet) match any collation values
+                                  against multi-character sequences.  We'd have
+                                  to enumerate those multi-character sequences
+                                  and like multi-character collating symbols,
+                                  create a union of those sequences with the
+                                  rest of the bracket expression.  While
+                                  doable, a bracket expression matching
+                                  multiple characters, that doesn't explicitly
+                                  contain multi-character sequences, might
+                                  be unexpected, so we punt for now. */
+                               if ((equiv = __collate_equiv_value(ctx->loc,
+                                            start, re - start)) <= 0)
+                                 {
+                                   /* The standard says that if no collating
+                                      element if found, we use the collating
+                                      symbol itself.  But __collate_equiv_value
+                                      doesn't make a distinction between
+                                      an element that is in a equvalence
+                                      class with others, or is the only member,
+                                      so we already know there is no collating
+                                      symbol.  (Note that in the case of a
+                                      collating element whose collation value
+                                      is unique, matching against the
+                                      collating element itself, or against
+                                      its collation value, is equivalent.) */
+#ifdef BSD_COMPATIBILITY
+                                   /* Check if the name is in cnames; if so,
+                                      use the corresponding code */
+                                   c = tre_search_cnames(start, re - start);
+                                   if (c != (wchar_t)-1)
+                                     {
+                                       re++;
+                                       goto process_single_character;
+                                     }
+#endif /* BSD_COMPATIBILITY */
+                                   status = REG_ECOLLATE;
+                                   goto error;
+                                 }
+                               if (!collect_MCCS)
+                                 {
+                                   status = tre_new_item(ctx->mem,
+                                            TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
+                                            equiv, &max_i, items);
+                                   if (status != REG_OK)
+                                     goto error;
+                                 }
+                             }
+                           else
+                             {
+                               /* Process character class */
+                               DPRINT(("tre_parse_bracket:  class: '%.*" STRF
+                                       "'\n", REST(start - 2)));
+                               if (!collect_MCCS)
+                                 {
+                                   char tmp_str[64];
+                                   tre_ctype_t class;
+                                   int len = MIN(re - start, 63);
+#ifdef TRE_WCHAR
+                                   {
+                                     tre_char_t tmp_wcs[64];
+                                     wcsncpy(tmp_wcs, start, (size_t)len);
+                                     tmp_wcs[len] = L'\0';
 #if defined HAVE_WCSRTOMBS
-                   {
-                     mbstate_t state;
-                     const tre_char_t *src = tmp_wcs;
-                     memset(&state, '\0', sizeof(state));
-                     len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
-                   }
+                                     {
+                                       mbstate_t state;
+                                       const tre_char_t *src = tmp_wcs;
+                                       memset(&state, '\0', sizeof(state));
+                                       len = wcsrtombs_l(tmp_str, &src,
+                                                     sizeof(tmp_str), &state,
+                                                     ctx->loc);
+                                     }
 #elif defined HAVE_WCSTOMBS
-                   len = wcstombs(tmp_str, tmp_wcs, 63);
+                                     len = wcstombs(tmp_str, tmp_wcs, 63);
 #endif /* defined HAVE_WCSTOMBS */
-                 }
+                                   }
 #else /* !TRE_WCHAR */
-                 strncpy(tmp_str, (const char*)re + 2, len);
+                                   strncpy(tmp_str, (const char*)start, len);
 #endif /* !TRE_WCHAR */
-                 tmp_str[len] = '\0';
-                 DPRINT(("  class name: %s\n", tmp_str));
-                 class = tre_ctype(tmp_str);
-                 if (!class)
-                   status = REG_ECTYPE;
-                 /* Optimize character classes for 8 bit character sets. */
-                 if (status == REG_OK && TRE_MB_CUR_MAX == 1)
-                   {
-                     status = tre_expand_ctype(ctx->mem, class, items,
-                                               &i, &max_i, ctx->cflags);
-                     class = (tre_ctype_t)0;
-                     skip = 1;
-                   }
-                 re = endptr + 2;
-               }
-             else
-               status = REG_ECTYPE;
-             min = 0;
-             max = TRE_CHAR_MAX;
+                                   tmp_str[len] = '\0';
+                                   DPRINT(("  class name: %s\n", tmp_str));
+                                   class = tre_ctype_l(tmp_str, ctx->loc);
+                                   if (!class)
+                                     {
+                                       status = REG_ECTYPE;
+                                       goto error;
+                                     }
+                                   status = tre_new_item(ctx->mem,
+                                            TRE_BRACKET_MATCH_TYPE_CLASS,
+                                            class, &max_i, items);
+                                   if (status != REG_OK)
+                                     goto error;
+                                 }
+                             }
+                           re++;
+                           break;
+                         }
+                     }
+                 }
+               other++;
+               break;
+             }
+
+           default:
+             DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
+             c = CHAR_LBRACKET;
+             goto process_single_character;
+             break;
            }
-         else
+         break;
+
+       case CHAR_RBRACKET:
+         /* A first right bracket */
+         if (re == ctx->re)
            {
              DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
-             if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
-                 && ctx->re != re)
-               /* Two ranges are not allowed to share and endpoint. */
-               status = REG_ERANGE;
-             min = max = *re++;
+             min = CHAR_RBRACKET;
+             range = 0;
+             other++;
+             break;
            }
-
-         if (status != REG_OK)
-           break;
-
-         if (class && negate)
-           if (*num_neg_classes >= MAX_NEG_CLASSES)
-             status = REG_ESPACE;
-           else
-             neg_classes[(*num_neg_classes)++] = class;
-         else if (!skip)
+         /* Done */
+         if (collect_MCCS)
+           {
+             DPRINT(("tre_parse_bracket:       done: '%.*" STRF "'\n",
+                     REST(re)));
+             if (col_syms)
+               {
+                 /* Mark the character following the right bracket.  Set len
+                  * to whether there are other things besides the
+                  * multi-character collating symbols */
+                 col_syms->start = re + 1;
+                 col_syms->len = other;
+                 /* Mark the end of the list */
+                 cp->start = NULL;
+               }
+             *result = col_syms;
+             return REG_OK;
+           }
+         /* range > 0 is not possible, since we did a lookahead after the
+          * hyphen */
+         if (range == 0)
            {
-             status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
+             status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
+                                   min, &max_i, items);
              if (status != REG_OK)
-               break;
-             ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
+               goto error;
            }
+         DPRINT(("tre_parse_bracket:   done: '%.*" STRF "'\n", REST(re)));
+         *items_size = max_i;
+         ctx->re = re + 1;
+         return REG_OK;
 
-         /* Add opposite-case counterpoints if REG_ICASE is present.
-            This is broken if there are more than two "same" characters. */
-         if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
+       default:
+         DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
+         c = *re;
+process_single_character:
+         /* Process single character */
+         if (range > 0)
            {
-             tre_cint_t cmin, ccurr;
+             int mine, maxe;
 
-             DPRINT(("adding opposite-case counterpoints\n"));
-             while (min <= max)
+process_end_range:
+             /* Get collation equivalence values */
+             mine = __collate_equiv_value(ctx->loc, &min, 1);
+             maxe = __collate_equiv_value(ctx->loc, &c, 1);
+             if (maxe < mine)
                {
-                 if (tre_islower(min))
-                   {
-                     cmin = ccurr = tre_toupper(min++);
-                     while (tre_islower(min) && tre_toupper(min) == ccurr + 1
-                            && min <= max)
-                       ccurr = tre_toupper(min++);
-                     status = tre_new_item(ctx->mem, cmin, ccurr,
-                                           &i, &max_i, items);
-                   }
-                 else if (tre_isupper(min))
+                 status = REG_ERANGE;
+                 goto error;
+               }
+             if (!collect_MCCS)
+               {
+                 status = tre_new_item(ctx->mem,
+                                       TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
+                                       mine, &max_i, items);
+                 if (status != REG_OK)
+                   goto error;
+                 status = tre_new_item(ctx->mem,
+                                       TRE_BRACKET_MATCH_TYPE_RANGE_END,
+                                       maxe, &max_i, items);
+                 if (status != REG_OK)
+                   goto error;
+               }
+             range = -1;
+           }
+         else
+           {
+process_begin_range:
+             if (!collect_MCCS)
+               {
+                 if (range == 0)
                    {
-                     cmin = ccurr = tre_tolower(min++);
-                     while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
-                            && min <= max)
-                       ccurr = tre_tolower(min++);
-                     status = tre_new_item(ctx->mem, cmin, ccurr,
-                                           &i, &max_i, items);
+                     status = tre_new_item(ctx->mem,
+                                           TRE_BRACKET_MATCH_TYPE_CHAR,
+                                           min, &max_i, items);
+                     if (status != REG_OK)
+                       goto error;
                    }
-                 else min++;
-                 if (status != REG_OK)
-                   break;
+                 min = c;
                }
-             if (status != REG_OK)
-               break;
+             range = 0;
            }
+         other++;
+         break;
        }
     }
-  *num_items = i;
-  *items_size = max_i;
-  ctx->re = re;
+  status = REG_EBRACK;
+error:
+  DPRINT(("tre_parse_bracket:  error: '%.*" STRF "', status=%d\n",
+         REST(re), status));
+  if (col_syms)
+    xfree(col_syms);
   return status;
 }
 
+#ifdef TRE_DEBUG
+static const char *bracket_match_type_str[] = {
+  "unused",
+  "char",
+  "range begin",
+  "range end",
+  "class",
+  "equivalence value",
+};
+#endif /* TRE_DEBUG */
+
 static reg_errcode_t
 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 {
-  tre_ast_node_t *node = NULL;
-  int negate = 0;
+  tre_ast_node_t *node;
   reg_errcode_t status = REG_OK;
-  tre_ast_node_t **items, *u, *n;
-  int i = 0, j, max_i = 32, curr_max, curr_min;
-  tre_ctype_t neg_classes[MAX_NEG_CLASSES];
-  int num_neg_classes = 0;
+  tre_bracket_match_list_t *items;
+  int max_i = 32;
+  tre_collating_symbol *col_syms = NULL;
+
+  /* Handle special cases [[:<:]] and [[:>:]] */
+  if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
+      && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
+      && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
+      && ctx->re[5] == CHAR_RBRACKET)
+    {
+      *result = tre_ast_new_literal(ctx->mem, ASSERTION,
+                     (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
+                     -1);
+      DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
+             "[[:<:]]" : "[[:>:]]"));
+      ctx->re += 6;
+      return *result ? REG_OK : REG_ESPACE;
+    }
 
   /* Start off with an array of `max_i' elements. */
-  items = xmalloc(sizeof(*items) * max_i);
+  items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
   if (items == NULL)
     return REG_ESPACE;
 
   if (*ctx->re == CHAR_CARET)
     {
       DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
-      negate = 1;
+      items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
       ctx->re++;
     }
 
-  status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
-                                  &items, &i, &max_i);
+  status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
 
   if (status != REG_OK)
     goto parse_bracket_done;
 
-  /* Sort the array if we need to negate it. */
-  if (negate)
-    qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
-
-  curr_max = curr_min = 0;
-  /* Build a union of the items in the array, negated if necessary. */
-  for (j = 0; j < i && status == REG_OK; j++)
+  /* If there are collating symbols, split off the multi-character ones
+   * into a union of the bracket expression (without the collating symbols)
+   * and the multiple-character sequences.  We create an equivalent input
+   * string and run tre_parse() recursively */
+  if (col_syms)
     {
-      int min, max;
-      tre_literal_t *l = items[j]->obj;
-      min = l->code_min;
-      max = l->code_max;
-
-      DPRINT(("item: %d - %d, class %ld, curr_max = %d\n",
-             (int)l->code_min, (int)l->code_max, (long)l->u.class, curr_max));
-
-      if (negate)
+      tre_char_t *str, *sp;
+      tre_collating_symbol *cp;
+      tre_parse_ctx_t subctx;
+
+      /* Allocate a new string.  We start with the size of the original
+       * bracket expression (minus 1) and add 2 (for a leading "[" and
+       * a trailing nil; don't need a "^", since it is illegal to have
+       * inverted MCCSs).  Since a multi-character collating symbols
+       * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
+       * need to worry about the new string getting too long. */
+      xfree(items);
+      str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
+      if (str == NULL)
        {
-         if (min < curr_max)
-           {
-             /* Overlap. */
-             curr_max = MAX(max + 1, curr_max);
-             DPRINT(("overlap, curr_max = %d\n", curr_max));
-             l = NULL;
-           }
-         else
-           {
-             /* No overlap. */
-             curr_max = min - 1;
-             if (curr_max >= curr_min)
-               {
-                 DPRINT(("no overlap\n"));
-                 l->code_min = curr_min;
-                 l->code_max = curr_max;
-               }
-             else
-               {
-                 DPRINT(("no overlap, zero room\n"));
-                 l = NULL;
-               }
-             curr_min = curr_max = max + 1;
-           }
+         xfree(col_syms);
+         return REG_ESPACE;
        }
-
-      if (l != NULL)
+      sp = str;
+      if (col_syms->len > 0)
        {
-         int k;
-         DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
-         l->position = ctx->position;
-         if (num_neg_classes > 0)
+         /* There are other items in the bracket expression besides the
+          * multi-character collating symbols, so create a new bracket
+          * expression with only those other itmes. */
+         const tre_char_t *re;
+         ptrdiff_t i;
+
+         *sp++ = '[';
+         re = ctx->re;
+         for (cp = col_syms + 1; cp->start; cp++)
            {
-             l->neg_classes = tre_mem_alloc(ctx->mem,
-                                            (sizeof(l->neg_classes)
-                                             * (num_neg_classes + 1)));
-             if (l->neg_classes == NULL)
+             /* The "- 2" is to account for the "[." */
+             if ((i = ((cp->start - re) - 2)) > 0)
                {
-                 status = REG_ESPACE;
-                 break;
+                 memcpy(sp, re, sizeof(*sp) * i);
+                 sp += i;
                }
-             for (k = 0; k < num_neg_classes; k++)
-               l->neg_classes[k] = neg_classes[k];
-             l->neg_classes[k] = (tre_ctype_t)0;
-           }
-         else
-           l->neg_classes = NULL;
-         if (node == NULL)
-           node = items[j];
-         else
-           {
-             u = tre_ast_new_union(ctx->mem, node, items[j]);
-             if (u == NULL)
-               status = REG_ESPACE;
-             node = u;
+             /* The "+ 2" is to account for the ".]" */
+             re = cp->start + cp->len + 2;
            }
+           i = col_syms->start - re; /* Includes the trailing right bracket */
+           memcpy(sp, re, sizeof(*sp) * i);
+           sp += i;
+           *sp++ = '|';
+       }
+      for (cp = col_syms + 1; cp->start; cp++)
+       {
+         memcpy(sp, cp->start, sizeof(*sp) * cp->len);
+         sp += cp->len;
+         if (cp[1].start)
+           *sp++ = '|';
        }
+      *sp = 0;
+      DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
+             str));
+
+      memcpy(&subctx, ctx, sizeof(subctx));
+      subctx.re = str;
+      subctx.len = sp - str;
+      subctx.nofirstsub = 1;
+      subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
+      status = tre_parse(&subctx);
+      xfree(str);
+      if (status != REG_OK)
+       {
+         xfree(col_syms);
+         return status;
+       }
+      ctx->re = col_syms->start;
+      ctx->position = subctx.position;
+      xfree(col_syms);
+      *result = subctx.result;
+      DPRINT(("tre_parse_bracket: Returning to original string\n"));
+      return REG_OK;
     }
 
-  if (status != REG_OK)
-    goto parse_bracket_done;
-
-  if (negate)
+  DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
+  node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
+  if (node == NULL)
+    {
+      status = REG_ESPACE;
+      goto parse_bracket_done;
+    }
+  else
     {
-      int k;
-      DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
-      n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
-      if (n == NULL)
-       status = REG_ESPACE;
-      else
+      tre_literal_t *l = node->obj;
+      l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
+                                        SIZEOF_BRACKET_MATCH_LIST(items));
+      if (l->u.bracket_match_list == NULL)
        {
-         tre_literal_t *l = n->obj;
-         if (num_neg_classes > 0)
-           {
-             l->neg_classes = tre_mem_alloc(ctx->mem,
-                                            (sizeof(l->neg_classes)
-                                             * (num_neg_classes + 1)));
-             if (l->neg_classes == NULL)
-               {
-                 status = REG_ESPACE;
-                 goto parse_bracket_done;
-               }
-             for (k = 0; k < num_neg_classes; k++)
-               l->neg_classes[k] = neg_classes[k];
-             l->neg_classes[k] = (tre_ctype_t)0;
-           }
-         else
-           l->neg_classes = NULL;
-         if (node == NULL)
-           node = n;
-         else
-           {
-             u = tre_ast_new_union(ctx->mem, node, n);
-             if (u == NULL)
-               status = REG_ESPACE;
-             node = u;
-           }
+         status = REG_ESPACE;
+         goto parse_bracket_done;
        }
+      memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
     }
 
-  if (status != REG_OK)
-    goto parse_bracket_done;
-
 #ifdef TRE_DEBUG
-  tre_ast_print(node);
+  {
+    int i;
+    tre_bracket_match_t *b;
+    DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
+           items->num_bracket_matches, items->flags));
+    for (i = 0, b = items->bracket_matches;
+        i < items->num_bracket_matches; i++, b++)
+      {
+       DPRINT(("   %d: %s %d\n", i, bracket_match_type_str[b->type],
+               b->value));
+      }
+  }
 #endif /* TRE_DEBUG */
 
  parse_bracket_done:
@@ -598,25 +900,46 @@ tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
 static reg_errcode_t
 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 {
-  int min, max, i;
+  int min, max;
+#ifdef TRE_APPROX
+  int i;
   int cost_ins, cost_del, cost_subst, cost_max;
   int limit_ins, limit_del, limit_subst, limit_err;
-  const tre_char_t *r = ctx->re;
   const tre_char_t *start;
+#endif /* TRE_APPROX */
+  const tre_char_t *r = ctx->re;
   int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
+#ifdef TRE_APPROX
   int approx = 0;
   int costs_set = 0;
   int counts_set = 0;
 
   cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
   limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
+#endif /* TRE_APPROX */
 
   /* Parse number (minimum repetition count). */
   min = -1;
-  if (r < ctx->re_end && *r >= L'0' && *r <= L'9') {
+  if (r >= ctx->re_end)
+#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+    return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
+#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+    return REG_EBRACE;
+#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+  if (*r >= L'0' && *r <= L'9') {
     DPRINT(("tre_parse:          min count: '%.*" STRF "'\n", REST(r)));
     min = tre_parse_int(&r, ctx->re_end);
   }
+#ifndef TRE_APPROX
+  else
+#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+      /* For ERE, return REG_NOMATCH to signal that the lbrace should
+         be treated as a literal */
+      return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
+#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+      return REG_BADBR;
+#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+#endif /* !TRE_APPROX */
 
   /* Parse comma and second number (maximum repetition count). */
   max = min;
@@ -628,10 +951,11 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
     }
 
   /* Check that the repeat counts are sane. */
-  if ((max >= 0 && min > max) || max > RE_DUP_MAX)
+  if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
     return REG_BADBR;
 
 
+#ifdef TRE_APPROX
   /*
    '{'
      optionally followed immediately by a number == minimum repcount
@@ -785,8 +1109,9 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
            }
        }
   } while (start != r);
+#endif /* TRE_APPROX */
 
-  /* Missing }. */
+  /*{*//* Missing }. */
   if (r >= ctx->re_end)
     return REG_EBRACE;
 
@@ -800,6 +1125,27 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
       if (r >= ctx->re_end || *r != CHAR_RBRACE)
        return REG_BADBR;
       r++;
+      /* Parse trailing '?' marking minimal repetition. */
+      if (r < ctx->re_end)
+       {
+         if (*r == CHAR_QUESTIONMARK)
+           {
+             /* Process the question mark only in enhanced mode.
+                Otherwise, the question mark is an error in ERE
+                or a literal in BRE */
+             if (ctx->cflags & REG_ENHANCED)
+               {
+                 minimal = !(ctx->cflags & REG_UNGREEDY);
+                 r++;
+               }
+             else return REG_BADRPT;
+           }
+         else if (*r == CHAR_STAR || *r == CHAR_PLUS)
+           {
+             /* These are reserved for future extensions. */
+             return REG_BADRPT;
+           }
+       }
     }
   else
     {
@@ -808,108 +1154,118 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
          || *(r + 1) != CHAR_RBRACE)
        return REG_BADBR;
       r += 2;
-    }
-
-
-  /* Parse trailing '?' marking minimal repetition. */
-  if (r < ctx->re_end)
-    {
-      if (*r == CHAR_QUESTIONMARK)
-       {
-         minimal = !(ctx->cflags & REG_UNGREEDY);
-         r++;
-       }
-      else if (*r == CHAR_STAR || *r == CHAR_PLUS)
+      if (r < ctx->re_end && *r == CHAR_STAR)
        {
-         /* These are reserved for future extensions. */
+         /* This is reserved for future extensions. */
          return REG_BADRPT;
        }
     }
 
+  if (minimal)
+    ctx->num_reorder_tags++;
+
+  if (!result) goto parse_bound_exit;
   /* Create the AST node(s). */
-  if (min == 0 && max == 0)
-    {
-      *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-      if (*result == NULL)
-       return REG_ESPACE;
-    }
-  else
-    {
-      if (min < 0 && max < 0)
-       /* Only approximate parameters set, no repetitions. */
-       min = max = 1;
+  /* Originally, if min == 0 && max == 0, we immediately replace the whole
+     iteration with EMPTY.  This unfortunately drops any submatches, and
+     messes up setting the pmatch values (we can get tags of -1, and
+     tag values in the billions).  So we leave it and process this case as
+     usual, and wait until tre_expand_ast() to replace with EMPTY */
+#ifdef TRE_APPROX
+  if (min < 0 && max < 0)
+    /* Only approximate parameters set, no repetitions. */
+    min = max = 1;
+#endif /* TRE_APPROX */
+
+  *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
+  if (!*result)
+    return REG_ESPACE;
 
-      *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
-      if (!*result)
-       return REG_ESPACE;
+#ifdef TRE_APPROX
+  /* If approximate matching parameters are set, add them to the
+     iteration node. */
+  if (approx || costs_set || counts_set)
+    {
+      int *params;
+      tre_iteration_t *iter = (*result)->obj;
 
-      /* If approximate matching parameters are set, add them to the
-        iteration node. */
-      if (approx || costs_set || counts_set)
+      if (costs_set || counts_set)
        {
-         int *params;
-         tre_iteration_t *iter = (*result)->obj;
-
-         if (costs_set || counts_set)
+         if (limit_ins == TRE_PARAM_UNSET)
            {
-             if (limit_ins == TRE_PARAM_UNSET)
-               {
-                 if (cost_ins == TRE_PARAM_UNSET)
-                   limit_ins = 0;
-                 else
-                   limit_ins = INT_MAX;
-               }
-
-             if (limit_del == TRE_PARAM_UNSET)
-               {
-                 if (cost_del == TRE_PARAM_UNSET)
-                   limit_del = 0;
-                 else
-                   limit_del = INT_MAX;
-               }
+             if (cost_ins == TRE_PARAM_UNSET)
+               limit_ins = 0;
+             else
+               limit_ins = INT_MAX;
+           }
 
-             if (limit_subst == TRE_PARAM_UNSET)
-               {
-                 if (cost_subst == TRE_PARAM_UNSET)
-                   limit_subst = 0;
-                 else
-                   limit_subst = INT_MAX;
-               }
+         if (limit_del == TRE_PARAM_UNSET)
+           {
+             if (cost_del == TRE_PARAM_UNSET)
+               limit_del = 0;
+             else
+               limit_del = INT_MAX;
            }
 
-         if (cost_max == TRE_PARAM_UNSET)
-           cost_max = INT_MAX;
-         if (limit_err == TRE_PARAM_UNSET)
-           limit_err = INT_MAX;
-
-         ctx->have_approx = 1;
-         params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
-         if (!params)
-           return REG_ESPACE;
-         for (i = 0; i < TRE_PARAM_LAST; i++)
-           params[i] = TRE_PARAM_UNSET;
-         params[TRE_PARAM_COST_INS] = cost_ins;
-         params[TRE_PARAM_COST_DEL] = cost_del;
-         params[TRE_PARAM_COST_SUBST] = cost_subst;
-         params[TRE_PARAM_COST_MAX] = cost_max;
-         params[TRE_PARAM_MAX_INS] = limit_ins;
-         params[TRE_PARAM_MAX_DEL] = limit_del;
-         params[TRE_PARAM_MAX_SUBST] = limit_subst;
-         params[TRE_PARAM_MAX_ERR] = limit_err;
-         iter->params = params;
+         if (limit_subst == TRE_PARAM_UNSET)
+           {
+             if (cost_subst == TRE_PARAM_UNSET)
+               limit_subst = 0;
+             else
+               limit_subst = INT_MAX;
+           }
        }
+
+      if (cost_max == TRE_PARAM_UNSET)
+       cost_max = INT_MAX;
+      if (limit_err == TRE_PARAM_UNSET)
+       limit_err = INT_MAX;
+
+      ctx->have_approx = 1;
+      params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
+      if (!params)
+       return REG_ESPACE;
+      for (i = 0; i < TRE_PARAM_LAST; i++)
+       params[i] = TRE_PARAM_UNSET;
+      params[TRE_PARAM_COST_INS] = cost_ins;
+      params[TRE_PARAM_COST_DEL] = cost_del;
+      params[TRE_PARAM_COST_SUBST] = cost_subst;
+      params[TRE_PARAM_COST_MAX] = cost_max;
+      params[TRE_PARAM_MAX_INS] = limit_ins;
+      params[TRE_PARAM_MAX_DEL] = limit_del;
+      params[TRE_PARAM_MAX_SUBST] = limit_subst;
+      params[TRE_PARAM_MAX_ERR] = limit_err;
+      iter->params = params;
     }
+#endif /* TRE_APPROX */
 
+parse_bound_exit:
+#ifdef TRE_APPROX
   DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
          "limits [%d,%d,%d, total %d]\n",
          min, max, cost_ins, cost_del, cost_subst, cost_max,
          limit_ins, limit_del, limit_subst, limit_err));
+#else /* !TRE_APPROX */
+  DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
+#endif /* !TRE_APPROX */
 
 
   ctx->re = r;
   return REG_OK;
 }
 
+/* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
+   non-self-contained options, like (?i), this causes ((?i)fu)bar to be
+   treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
+   Because we now set up tags for even non-capturing parenthesized
+   subexpressions, we always call PARSE_MARK_FOR_SUBMATCH.  So if we
+   pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
+   have it restore cflags after the subexpression, we don't need to have
+   a separate PARSE_RESTORE_CFLAGS, and then after processing the
+   non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
+   This has the side-benefit of now matching the perl behavior: the RE
+   foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
+   of foo AND (?i) (bar OR zap). */
 typedef enum {
   PARSE_RE = 0,
   PARSE_ATOM,
@@ -921,7 +1277,6 @@ typedef enum {
   PARSE_UNION,
   PARSE_POST_UNION,
   PARSE_POSTFIX,
-  PARSE_RESTORE_CFLAGS
 } tre_parse_re_stack_symbol_t;
 
 
@@ -935,16 +1290,23 @@ tre_parse(tre_parse_ctx_t *ctx)
   int bottom = tre_stack_num_objects(stack);
   int depth = 0;
   int temporary_cflags = 0;
+  int bre_branch_begin;
+#ifdef TRE_DEBUG
+  const tre_char_t *tmp_re;
+#endif
 
-  DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
-         ctx->len, ctx->re, ctx->len));
+  DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
+         ctx->len, ctx->re, ctx->len, ctx->cflags));
 
+  if (ctx->len <= 0) return REG_EMPTY;
   if (!ctx->nofirstsub)
     {
+      STACK_PUSH(stack, int, ctx->cflags);
       STACK_PUSH(stack, int, ctx->submatch_id);
       STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
       ctx->submatch_id++;
     }
+  STACK_PUSH(stack, int, 0); // bre_branch_begin
   STACK_PUSH(stack, int, PARSE_RE);
   ctx->re_start = ctx->re;
   ctx->re_end = ctx->re + ctx->len;
@@ -954,38 +1316,40 @@ tre_parse(tre_parse_ctx_t *ctx)
      an explicit stack instead of recursive functions mostly because of
      two reasons: compatibility with systems which have an overflowable
      call stack, and efficiency (both in lines of code and speed).  */
-  while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
+  while (tre_stack_num_objects(stack) > bottom)
     {
-      if (status != REG_OK)
-       break;
       symbol = tre_stack_pop_int(stack);
       switch (symbol)
        {
        case PARSE_RE:
          /* Parse a full regexp.  A regexp is one or more branches,
             separated by the union operator `|'. */
+         bre_branch_begin = tre_stack_pop_int(stack);
+         if (
 #ifdef REG_LITERAL
-         if (!(ctx->cflags & REG_LITERAL)
-             && ctx->cflags & REG_EXTENDED)
+             !(ctx->cflags & REG_LITERAL) &&
 #endif /* REG_LITERAL */
+             ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
            STACK_PUSHX(stack, int, PARSE_UNION);
+         STACK_PUSHX(stack, int, bre_branch_begin);
          STACK_PUSHX(stack, int, PARSE_BRANCH);
          break;
 
        case PARSE_BRANCH:
          /* Parse a branch.  A branch is one or more pieces, concatenated.
             A piece is an atom possibly followed by a postfix operator. */
+         bre_branch_begin = tre_stack_pop_int(stack);
          STACK_PUSHX(stack, int, PARSE_CATENATION);
+         STACK_PUSHX(stack, int, bre_branch_begin);
          STACK_PUSHX(stack, int, PARSE_PIECE);
          break;
 
        case PARSE_PIECE:
          /* Parse a piece.  A piece is an atom possibly followed by one
             or more postfix operators. */
-#ifdef REG_LITERAL
-         if (!(ctx->cflags & REG_LITERAL))
-#endif /* REG_LITERAL */
-           STACK_PUSHX(stack, int, PARSE_POSTFIX);
+         bre_branch_begin = tre_stack_pop_int(stack);
+         STACK_PUSHX(stack, int, PARSE_POSTFIX);
+         STACK_PUSHX(stack, int, bre_branch_begin);
          STACK_PUSHX(stack, int, PARSE_ATOM);
          break;
 
@@ -1000,20 +1364,23 @@ tre_parse(tre_parse_ctx_t *ctx)
            if (!(ctx->cflags & REG_LITERAL))
              {
 #endif /* REG_LITERAL */
-               if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
+               if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
+                   ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
+                   && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
+                   *(ctx->re + 1) == CHAR_PIPE))
                  break;
                if ((ctx->cflags & REG_EXTENDED
                     && c == CHAR_RPAREN && depth > 0)
                    || (!(ctx->cflags & REG_EXTENDED)
-                       && (c == CHAR_BACKSLASH
-                           && *(ctx->re + 1) == CHAR_RPAREN)))
+                       && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
+                           && *(ctx->re + 1) == CHAR_RPAREN))
                  {
                    if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
-                     status = REG_EPAREN;
+                     return REG_EPAREN;
                    DPRINT(("tre_parse:   group end: '%.*" STRF "'\n",
                            REST(ctx->re)));
                    depth--;
-                   if (!(ctx->cflags & REG_EXTENDED))
+                   if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
                      ctx->re += 2;
                    break;
                  }
@@ -1021,22 +1388,24 @@ tre_parse(tre_parse_ctx_t *ctx)
              }
 #endif /* REG_LITERAL */
 
-#ifdef REG_RIGHT_ASSOC
-           if (ctx->cflags & REG_RIGHT_ASSOC)
+#ifdef REG_LEFT_ASSOC
+           if (ctx->cflags & REG_LEFT_ASSOC)
              {
-               /* Right associative concatenation. */
+               /* Left associative concatenation. */
+               STACK_PUSHX(stack, int, PARSE_CATENATION);
                STACK_PUSHX(stack, voidptr, result);
                STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
-               STACK_PUSHX(stack, int, PARSE_CATENATION);
+               STACK_PUSHX(stack, int, 0); // bre_branch_begin
                STACK_PUSHX(stack, int, PARSE_PIECE);
              }
            else
-#endif /* REG_RIGHT_ASSOC */
+#endif /* REG_LEFT_ASSOC */
              {
-               /* Default case, left associative concatenation. */
-               STACK_PUSHX(stack, int, PARSE_CATENATION);
+               /* Default case, right associative concatenation. */
                STACK_PUSHX(stack, voidptr, result);
                STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
+               STACK_PUSHX(stack, int, PARSE_CATENATION);
+               STACK_PUSHX(stack, int, 0); // bre_branch_begin
                STACK_PUSHX(stack, int, PARSE_PIECE);
              }
            break;
@@ -1060,14 +1429,24 @@ tre_parse(tre_parse_ctx_t *ctx)
          if (ctx->cflags & REG_LITERAL)
            break;
 #endif /* REG_LITERAL */
+         if (!(ctx->cflags & REG_EXTENDED))
+           {
+             if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
+               break;
+             ctx->re++;
+           }
          switch (*ctx->re)
            {
            case CHAR_PIPE:
              DPRINT(("tre_parse:       union: '%.*" STRF "'\n",
                      REST(ctx->re)));
              STACK_PUSHX(stack, int, PARSE_UNION);
+             STACK_PUSHX(stack, voidptr, (void *)ctx->re);
              STACK_PUSHX(stack, voidptr, result);
              STACK_PUSHX(stack, int, PARSE_POST_UNION);
+             /* We need to pass a boolean (eventually) to PARSE_ATOM to
+                indicate if this is the beginning of a BRE extended branch. */
+             STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
              STACK_PUSHX(stack, int, PARSE_BRANCH);
              ctx->re++;
              break;
@@ -1077,6 +1456,8 @@ tre_parse(tre_parse_ctx_t *ctx)
              break;
 
            default:
+             if (!(ctx->cflags & REG_EXTENDED))
+               ctx->re--;
              break;
            }
          break;
@@ -1085,6 +1466,12 @@ tre_parse(tre_parse_ctx_t *ctx)
          {
            tre_ast_node_t *tmp_node;
            tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
+           const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
+           /* error on empty expression at end of union */
+           if (pipechar == ctx->re - 1)
+             {
+               return REG_EMPTY;
+             }
            tmp_node = tre_ast_new_union(ctx->mem, tree, result);
            if (!tmp_node)
              return REG_ESPACE;
@@ -1100,6 +1487,12 @@ tre_parse(tre_parse_ctx_t *ctx)
          if (ctx->cflags & REG_LITERAL)
            break;
 #endif /* REG_LITERAL */
+         int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
+         int rep_min = 0;
+         int rep_max = -1;
+#ifdef TRE_DEBUG
+         int lbrace_off;
+#endif
          switch (*ctx->re)
            {
            case CHAR_PLUS:
@@ -1110,598 +1503,803 @@ tre_parse(tre_parse_ctx_t *ctx)
            case CHAR_STAR:
              {
                tre_ast_node_t *tmp_node;
-               int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
-               int rep_min = 0;
-               int rep_max = -1;
 #ifdef TRE_DEBUG
-               const tre_char_t *tmp_re;
+               const char *tstr = "star";
+               tmp_re = ctx->re;
 #endif
 
+       handle_plus_or_question:
+               /* error on iteration of raw assertion (not in subexpression) */
+               if (result->type == LITERAL && result->submatch_id < 0 &&
+                   IS_ASSERTION((tre_literal_t *)result->obj))
+                 {
+                   if (!(ctx->cflags & REG_EXTENDED)) break;
+                   return REG_BADRPT;
+                 }
                if (*ctx->re == CHAR_PLUS)
-                 rep_min = 1;
+                 {
+                   rep_min = 1;
+#ifdef TRE_DEBUG
+                   tstr = "plus";
+#endif
+                 }
                if (*ctx->re == CHAR_QUESTIONMARK)
-                 rep_max = 1;
+                 {
+                   rep_max = 1;
 #ifdef TRE_DEBUG
-               tmp_re = ctx->re;
+                   tstr = "questionmark";
 #endif
+                 }
 
-               if (ctx->re + 1 < ctx->re_end)
+               if (ctx->cflags & REG_EXTENDED)
                  {
-                   if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
+                   if (ctx->re + 1 < ctx->re_end)
                      {
-                       minimal = !(ctx->cflags & REG_UNGREEDY);
-                       ctx->re++;
+                       if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
+                         {
+                           /* Process the question mark only in enhanced mode.
+                              Otherwise, the question mark is an error in ERE */
+                           if (ctx->cflags & REG_ENHANCED)
+                             {
+                               minimal = !(ctx->cflags & REG_UNGREEDY);
+                               ctx->re++;
+                             }
+                           else return REG_BADRPT;
+                         }
+                       else if (*(ctx->re + 1) == CHAR_STAR
+                                || *(ctx->re + 1) == CHAR_PLUS)
+                         {
+                           /* These are reserved for future extensions. */
+                           return REG_BADRPT;
+                         }
                      }
-                   else if (*(ctx->re + 1) == CHAR_STAR
-                            || *(ctx->re + 1) == CHAR_PLUS)
+                 }
+               else
+                 {
+                   if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
                      {
-                       /* These are reserved for future extensions. */
+                       /* This is reserved for future extensions. */
                        return REG_BADRPT;
                      }
-                 }
+                   if (ctx->re + 2 < ctx->re_end)
+                     {
+                       if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_QUESTIONMARK)
+                         {
+                           /* Process the question mark only in enhanced mode.
+                              Otherwise, the question mark is a literal in BRE */
+                           if (ctx->cflags & REG_ENHANCED)
+                             {
+                               minimal = !(ctx->cflags & REG_UNGREEDY);
+                               ctx->re += 2;
+                             }
+                         }
+                       else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
+                         {
+                           /* This is reserved for future extensions. */
+                           return REG_BADRPT;
+                         }
+                     }
+                 }
 
-               DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
-                       minimal ? "  minimal" : "greedy", REST(tmp_re)));
+               if (minimal)
+                 ctx->num_reorder_tags++;
+
+               DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
+                       minimal ? "  minimal" : "greedy", tstr, REST(tmp_re)));
+               if (result == NULL)
+                 {
+                   if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
+                   else goto parse_literal;
+                 }
                ctx->re++;
                tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
                                            minimal);
                if (tmp_node == NULL)
                  return REG_ESPACE;
                result = tmp_node;
+
+               /* Set the iterator with a submatch id in the invisible range
+                * (which will be overridden if a real submatch is needed) */
+               result->submatch_id = ctx->submatch_id_invisible++;
+
+#if 0
+               /* We don't allow multiple postfixes, but this might be needed
+                  to support approximate matching */
                STACK_PUSHX(stack, int, PARSE_POSTFIX);
+#endif
              }
              break;
 
            case CHAR_BACKSLASH:
              /* "\{" is special without REG_EXTENDED */
+             /* "\+" and "\?" are special with REG_ENHANCED for BRE */
              if (!(ctx->cflags & REG_EXTENDED)
-                 && ctx->re + 1 < ctx->re_end
-                 && *(ctx->re + 1) == CHAR_LBRACE)
+                 && ctx->re + 1 < ctx->re_end)
                {
-                 ctx->re++;
-                 goto parse_brace;
+                 switch (*(ctx->re + 1))
+                   {
+                   case CHAR_LBRACE:
+                     ctx->re++;
+#ifdef TRE_DEBUG
+                     lbrace_off = 2;
+#endif
+                     goto parse_brace;
+                   case CHAR_PLUS:
+                   case CHAR_QUESTIONMARK:
+                     if (ctx->cflags & REG_ENHANCED)
+                       {
+#ifdef TRE_DEBUG
+                         tmp_re = ctx->re;
+#endif
+                         ctx->re++;
+                         goto handle_plus_or_question;
+                       }
+                     break;
+                   }
+                 break;
                }
              else
                break;
 
            case CHAR_LBRACE:
-             /* "{" is literal without REG_EXTENDED */
-             if (!(ctx->cflags & REG_EXTENDED))
-               break;
+             {
+               int raw_assertion;
+
+               /* "{" is literal without REG_EXTENDED */
+               if (!(ctx->cflags & REG_EXTENDED))
+                 break;
+#ifdef TRE_DEBUG
+               lbrace_off = 1;
+#endif
 
            parse_brace:
-             DPRINT(("tre_parse:       bound: '%.*" STRF "'\n",
-                     REST(ctx->re)));
-             ctx->re++;
+               /* error on iteration of raw assertion (not in subexpression),
+                  but wait until after parsing bounds */
+               raw_assertion = (result->type == LITERAL
+                                && result->submatch_id < 0
+                                && IS_ASSERTION((tre_literal_t *)result->obj));
+               ctx->re++;
 
-             status = tre_parse_bound(ctx, &result);
-             if (status != REG_OK)
-               return status;
-             STACK_PUSHX(stack, int, PARSE_POSTFIX);
-             break;
+               status = tre_parse_bound(ctx, &result);
+#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
+               /* For ERE, if status is REG_NOMATCH, this mean the lbrace
+                  is to be treated as a literal. */
+               if (status == REG_NOMATCH)
+                 {
+                   ctx->re--;
+                   break;
+                 }
+#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
+               DPRINT(("tre_parse:     bound: '%.*" STRF "'\n",
+                       REST(ctx->re - lbrace_off)));
+               if (status != REG_OK)
+                 return status;
+               if (raw_assertion) return REG_BADRPT;
+
+               /* Set the iterator with a submatch id in the invisible range
+                * (which will be overridden if a real submatch is needed) */
+               if (result->type == ITERATION)
+                 result->submatch_id = ctx->submatch_id_invisible++;
+
+#if 0
+               /* We don't allow multiple postfixes, but this might be needed
+                  to support approximate matching */
+               STACK_PUSHX(stack, int, PARSE_POSTFIX);
+#endif
+               break;
+             }
            }
          break;
 
        case PARSE_ATOM:
-         /* Parse an atom.  An atom is a regular expression enclosed in `()',
-            an empty set of `()', a bracket expression, `.', `^', `$',
-            a `\' followed by a character, or a single character. */
+         {
+           /* Parse an atom.  An atom is a regular expression enclosed in `()',
+              an empty set of `()', a bracket expression, `.', `^', `$',
+              a `\' followed by a character, or a single character. */
 
-         /* End of regexp? (empty string). */
-         if (ctx->re >= ctx->re_end)
-           goto parse_literal;
+           /* The stack contains a boolean value, whether PARSE_ATOM is
+              being called just after the start of a group (left paren)
+              in a BRE */
+           bre_branch_begin = tre_stack_pop_int(stack);
+
+           /* End of regexp? (empty string). */
+           if (ctx->re >= ctx->re_end)
+             goto parse_literal;
 
 #ifdef REG_LITERAL
-         if (ctx->cflags & REG_LITERAL)
-           goto parse_literal;
+           if (ctx->cflags & REG_LITERAL)
+             goto parse_literal;
 #endif /* REG_LITERAL */
 
-         switch (*ctx->re)
-           {
-           case CHAR_LPAREN:  /* parenthesized subexpression */
+           switch (*ctx->re)
+             {
+             case CHAR_LPAREN:  /* parenthesized subexpression */
 
-             /* Handle "(?...)" extensions.  They work in a way similar
-                to Perls corresponding extensions. */
-             if (ctx->cflags & REG_EXTENDED
-                 && *(ctx->re + 1) == CHAR_QUESTIONMARK)
-               {
-                 int new_cflags = ctx->cflags;
-                 int bit = 1;
-                 DPRINT(("tre_parse:   extension: '%.*" STRF "\n",
-                         REST(ctx->re)));
-                 ctx->re += 2;
-                 while (/*CONSTCOND*/1)
-                   {
-                     if (*ctx->re == L'i')
-                       {
-                         DPRINT(("tre_parse:       icase: '%.*" STRF "\n",
-                                 REST(ctx->re)));
-                         if (bit)
-                           new_cflags |= REG_ICASE;
-                         else
-                           new_cflags &= ~REG_ICASE;
-                         ctx->re++;
-                       }
-                     else if (*ctx->re == L'n')
-                       {
-                         DPRINT(("tre_parse:     newline: '%.*" STRF "\n",
-                                 REST(ctx->re)));
-                         if (bit)
-                           new_cflags |= REG_NEWLINE;
-                         else
-                           new_cflags &= ~REG_NEWLINE;
-                         ctx->re++;
-                       }
-#ifdef REG_RIGHT_ASSOC
-                     else if (*ctx->re == L'r')
-                       {
-                         DPRINT(("tre_parse: right assoc: '%.*" STRF "\n",
-                                 REST(ctx->re)));
-                         if (bit)
-                           new_cflags |= REG_RIGHT_ASSOC;
-                         else
-                           new_cflags &= ~REG_RIGHT_ASSOC;
-                         ctx->re++;
-                       }
-#endif /* REG_RIGHT_ASSOC */
+               /* Handle "(?...)" extensions.  They work in a way similar
+                  to Perls corresponding extensions. */
+               if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
+                   (REG_EXTENDED|REG_ENHANCED)
+                   && *(ctx->re + 1) == CHAR_QUESTIONMARK)
+                 {
+                   int new_cflags = ctx->cflags;
+                   int bit = 1;
+                   int invisible_submatch = 0;
+                   DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
+                           REST(ctx->re)));
+                   ctx->re += 2;
+                   while (/*CONSTCOND*/1)
+                     {
+                       if (*ctx->re == L'i')
+                         {
+                           DPRINT(("tre_parse:     icase: '%.*" STRF "'\n",
+                                   REST(ctx->re)));
+                           if (bit)
+                             new_cflags |= REG_ICASE;
+                           else
+                             new_cflags &= ~REG_ICASE;
+                           ctx->re++;
+                         }
+                       else if (*ctx->re == L'n')
+                         {
+                           DPRINT(("tre_parse:   newline: '%.*" STRF "'\n",
+                                   REST(ctx->re)));
+                           if (bit)
+                             new_cflags |= REG_NEWLINE;
+                           else
+                             new_cflags &= ~REG_NEWLINE;
+                           ctx->re++;
+                         }
+#ifdef REG_LEFT_ASSOC
+                       else if (*ctx->re == L'l')
+                         {
+                           DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
+                                   REST(ctx->re)));
+                           if (bit)
+                             new_cflags |= REG_LEFT_ASSOC;
+                           else
+                             new_cflags &= ~REG_LEFT_ASSOC;
+                           ctx->re++;
+                         }
+#endif /* REG_LEFT_ASSOC */
 #ifdef REG_UNGREEDY
-                     else if (*ctx->re == L'U')
-                       {
-                         DPRINT(("tre_parse:    ungreedy: '%.*" STRF "\n",
-                                 REST(ctx->re)));
-                         if (bit)
-                           new_cflags |= REG_UNGREEDY;
-                         else
-                           new_cflags &= ~REG_UNGREEDY;
-                         ctx->re++;
-                       }
+                       else if (*ctx->re == L'U')
+                         {
+                           DPRINT(("tre_parse:    ungreedy: '%.*" STRF "'\n",
+                                   REST(ctx->re)));
+                           if (bit)
+                             new_cflags |= REG_UNGREEDY;
+                           else
+                             new_cflags &= ~REG_UNGREEDY;
+                           ctx->re++;
+                         }
 #endif /* REG_UNGREEDY */
-                     else if (*ctx->re == CHAR_MINUS)
-                       {
-                         DPRINT(("tre_parse:    turn off: '%.*" STRF "\n",
-                                 REST(ctx->re)));
-                         ctx->re++;
-                         bit = 0;
-                       }
-                     else if (*ctx->re == CHAR_COLON)
-                       {
-                         DPRINT(("tre_parse:    no group: '%.*" STRF "\n",
-                                 REST(ctx->re)));
-                         ctx->re++;
-                         depth++;
-                         break;
-                       }
-                     else if (*ctx->re == CHAR_HASH)
-                       {
-                         DPRINT(("tre_parse:    comment: '%.*" STRF "\n",
-                                 REST(ctx->re)));
-                         /* A comment can contain any character except a
-                            right parenthesis */
-                         while (*ctx->re != CHAR_RPAREN
-                                && ctx->re < ctx->re_end)
+                       else if (*ctx->re == CHAR_MINUS)
+                         {
+                           DPRINT(("tre_parse:  turn off: '%.*" STRF "'\n",
+                                   REST(ctx->re)));
                            ctx->re++;
-                         if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
-                           {
+                           bit = 0;
+                         }
+                       else if (*ctx->re == CHAR_COLON)
+                         {
+                           DPRINT(("tre_parse:  no group: '%.*" STRF
+                                   "', (invisible submatch %d)\n",
+                                   REST(ctx->re), ctx->submatch_id_invisible));
+                           ctx->re++;
+                           depth++;
+                           invisible_submatch = 1;
+                           break;
+                     &nbs