From: John Marino Date: Thu, 6 Aug 2015 20:58:06 +0000 (+0200) Subject: TRE: Add local modifications to extend functionality X-Git-Tag: v4.5.0~961 X-Git-Url: https://gitweb.dragonflybsd.org/~tuxillo/dragonfly.git/commitdiff_plain/d5f8dde1e2bae450329b3ee8c25ef109b5713f07 TRE: Add local modifications to extend functionality The stock TRE regex library is very good, but it is missing three key functionalities: 1) collation support 2) equivalence classes 3) legacy support for character-name table Luckily, TRE was imported in Apple's Libc and they solved these issues. This commit represents the modifications Apple made (under the same 2-clause license the author Ville Laurikari issued) minus differences in xlocale support and the exclusion of "if 0" equivalent blocks. --- diff --git a/contrib/tre/lib/regcomp.c b/contrib/tre/lib/regcomp.c index 281b38e81f..fefd429bcd 100644 --- a/contrib/tre/lib/regcomp.c +++ b/contrib/tre/lib/regcomp.c @@ -19,7 +19,8 @@ #include "xmalloc.h" int -tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags) +tre_regncomp_l(regex_t *preg, const char *regex, size_t n, int cflags, + locale_t loc) { int ret; #if TRE_WCHAR @@ -30,13 +31,15 @@ tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags) if (wregex == NULL) return REG_ESPACE; + FIX_LOCALE(loc); + /* If the current locale uses the standard single byte encoding of characters, we don't do a multibyte string conversion. If we did, many applications which use the default locale would break since the default "C" locale uses the 7-bit ASCII character set, and all characters with the eighth bit set would be considered invalid. */ #if TRE_MULTIBYTE - if (TRE_MB_CUR_MAX == 1) + if (TRE_MB_CUR_MAX_L(loc) == 1) #endif /* TRE_MULTIBYTE */ { unsigned int i; @@ -50,7 +53,7 @@ tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags) #if TRE_MULTIBYTE else { - int consumed; + size_t consumed; tre_char_t *wcptr = wregex; #ifdef HAVE_MBSTATE_T mbstate_t state; @@ -58,7 +61,7 @@ tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags) #endif /* HAVE_MBSTATE_T */ while (n > 0) { - consumed = tre_mbrtowc(wcptr, regex, n, &state); + consumed = tre_mbrtowc_l(wcptr, regex, n, &state, loc); switch (consumed) { @@ -71,15 +74,11 @@ tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags) return REG_BADPAT; } break; - case -1: + case (size_t)-1: + case (size_t)-2: DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno))); xfree(wregex); - return REG_BADPAT; - case -2: - /* The last character wasn't complete. Let's not call it a - fatal error. */ - consumed = n; - break; + return REG_ILLSEQ; } regex += consumed; n -= consumed; @@ -90,33 +89,70 @@ tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags) #endif /* TRE_MULTIBYTE */ wregex[wlen] = L'\0'; - ret = tre_compile(preg, wregex, wlen, cflags); + ret = tre_compile(preg, wregex, wlen, cflags, loc); xfree(wregex); #else /* !TRE_WCHAR */ - ret = tre_compile(preg, (const tre_char_t *)regex, n, cflags); + FIX_LOCALE(loc); + ret = tre_compile(preg, (const tre_char_t *)regex, n, cflags, loc); #endif /* !TRE_WCHAR */ return ret; } int -tre_regcomp(regex_t *preg, const char *regex, int cflags) +tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags) { - return tre_regncomp(preg, regex, regex ? strlen(regex) : 0, cflags); + return tre_regncomp_l(preg, regex, n, cflags, __get_locale()); } +int +tre_regcomp_l(regex_t *preg, const char *regex, int cflags, locale_t loc) +{ + size_t len; + + if (cflags & REG_PEND) + { + if ((const char *)(preg->re_endp) < regex) + return REG_INVARG; + len = (const char *)(preg->re_endp) - regex; + } + else + len = strlen(regex); + return tre_regncomp_l(preg, regex, len, cflags, loc); +} + +int +tre_regcomp(regex_t *preg, const char *regex, int cflags) +{ + return tre_regcomp_l(preg, regex, cflags, __get_locale()); +} #ifdef TRE_WCHAR +int +tre_regwncomp_l(regex_t *preg, const wchar_t *regex, size_t n, int cflags, + locale_t loc) +{ + FIX_LOCALE(loc); + return tre_compile(preg, regex, n, cflags, loc); +} + int tre_regwncomp(regex_t *preg, const wchar_t *regex, size_t n, int cflags) { - return tre_compile(preg, regex, n, cflags); + return tre_compile(preg, regex, n, cflags, __get_locale()); +} + +int +tre_regwcomp_l(regex_t *preg, const wchar_t *regex, int cflags, locale_t loc) +{ + FIX_LOCALE(loc); + return tre_compile(preg, regex, wcslen(regex), cflags, loc); } int tre_regwcomp(regex_t *preg, const wchar_t *regex, int cflags) { - return tre_compile(preg, regex, regex ? wcslen(regex) : 0, cflags); + return tre_regwncomp(preg, regex, wcslen(regex), cflags); } #endif /* TRE_WCHAR */ diff --git a/contrib/tre/lib/regerror.c b/contrib/tre/lib/regerror.c index 701f701ed6..298bf46056 100644 --- a/contrib/tre/lib/regerror.c +++ b/contrib/tre/lib/regerror.c @@ -47,7 +47,10 @@ static const char *tre_error_messages[] = gettext_noop("Invalid contents of {}"), /* REG_BADBR */ gettext_noop("Invalid character range"), /* REG_ERANGE */ gettext_noop("Out of memory"), /* REG_ESPACE */ - gettext_noop("Invalid use of repetition operators") /* REG_BADRPT */ + gettext_noop("Invalid use of repetition operators"), /* REG_BADRPT */ + gettext_noop("Empty (sub)expression"), /* REG_EMPTY */ + gettext_noop("Invalid argument to regex routine"), /* REG_INVARG */ + gettext_noop("Illegal byte sequence") /* REG_ILLSEQ */ }; size_t diff --git a/contrib/tre/lib/regexec.c b/contrib/tre/lib/regexec.c index 54779ef459..a72bd9680a 100644 --- a/contrib/tre/lib/regexec.c +++ b/contrib/tre/lib/regexec.c @@ -45,23 +45,136 @@ char *alloca (); #include #include "tre-internal.h" +#include "tre-match-utils.h" #include "tre.h" #include "xmalloc.h" +/* For each tre_last_matched_t in the lm array, find the last matched branch by + comparing the touch value of the cmp_tag's. For all other branches, reset + the corresponding tags. If reset_all is non-zero, reset all tags in all + branches. Recurse into the nested last matched structures, clearing tags as + apprpriate. */ +static void +tre_reset_last_matched_branches(tre_tag_t *tags, const tre_last_matched_t *lm, + int n, int start_tag, int reset_all) +{ + int max, i, reset; + tre_last_matched_branch_t *b; + + DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n", + n, start_tag, reset_all)); + for (; n-- > 0; lm++) + { + if (lm->n_branches == 1) + { + b = lm->branches; + if (start_tag > 0) + { + DPRINT((" b->cmp_tag=%d %d cmp_tag, + tre_tag_touch_get(tags, b->cmp_tag), + tre_tag_touch_get(tags, start_tag))); + reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < + tre_tag_touch_get(tags, start_tag)); + } + else + reset = 0; + + if (reset) + { + int *t; + + for (i = b->n_tags, t = b->tags; i > 0; i--, t++) + { + DPRINT((" Resetting t%d\n", *t)); + tre_tag_reset(tags, *t); + } + } + if (b->n_last_matched > 0) + tre_reset_last_matched_branches(tags, b->last_matched, + b->n_last_matched, + lm->start_tag, reset); + } + else + { + if (!reset_all) + { +#ifdef TRE_DEBUG + int last; +#endif /* TRE_DEBUG */ + max = 0; + for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++) + { + int t = b->cmp_tag; + int touch = tre_tag_touch_get(tags, t); + if (touch > max) + { + max = touch; +#ifdef TRE_DEBUG + last = t; +#endif /* TRE_DEBUG */ + } + } + DPRINT((" Last touched end tag t%d=%d\n", last, max)); + } + + for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++) + { + reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < max); + if (reset) + { + int j; + int *t; + + for (j = b->n_tags, t = b->tags; j > 0; j--, t++) + { + DPRINT((" Resetting t%d\n", *t)); + tre_tag_reset(tags, *t); + } + } + if (b->n_last_matched > 0) + tre_reset_last_matched_branches(tags, b->last_matched, + b->n_last_matched, + lm->start_tag, reset); + } + } + } +} + + /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match endpoint values. */ -void +reg_errcode_t tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, - const tre_tnfa_t *tnfa, int *tags, int match_eo) + const tre_tnfa_t *tnfa, const tre_tag_t *intags, int match_eo) { - tre_submatch_data_t *submatch_data; - unsigned int i, j; - int *parents; + unsigned int i; + + if (cflags & REG_NOSUB) return REG_OK; i = 0; - if (match_eo >= 0 && !(cflags & REG_NOSUB)) + if (match_eo >= 0 && intags) { + const tre_tag_t *tags = intags; + tre_submatch_data_t *submatch_data; + + if (tnfa->last_matched_branch && + tnfa->last_matched_branch->n_last_matched > 0) + { + tre_tag_t *t; +#ifdef TRE_USE_ALLOCA + t = alloca(sizeof(*t) * tnfa->num_tags); +#else /* !TRE_USE_ALLOCA */ + t = xmalloc(sizeof(*t) * tnfa->num_tags); +#endif /* !TRE_USE_ALLOCA */ + if (!t) return REG_ESPACE; + memcpy(t, intags, tnfa->num_tags * sizeof(tre_tag_t)); + tre_reset_last_matched_branches(t, + tnfa->last_matched_branch->last_matched, + tnfa->last_matched_branch->n_last_matched, + 0, 0); + tags = t; + } /* Construct submatch offsets from the tags. */ DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); submatch_data = tnfa->submatch_data; @@ -70,43 +183,29 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, if (submatch_data[i].so_tag == tnfa->end_tag) pmatch[i].rm_so = match_eo; else - pmatch[i].rm_so = tags[submatch_data[i].so_tag]; + pmatch[i].rm_so = tre_tag_get(tags, submatch_data[i].so_tag); if (submatch_data[i].eo_tag == tnfa->end_tag) pmatch[i].rm_eo = match_eo; else - pmatch[i].rm_eo = tags[submatch_data[i].eo_tag]; + pmatch[i].rm_eo = tre_tag_get(tags, submatch_data[i].eo_tag); /* If either of the endpoints were not used, this submatch was not part of the match. */ if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) pmatch[i].rm_so = pmatch[i].rm_eo = -1; - DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i, + DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i, submatch_data[i].so_tag, pmatch[i].rm_so, submatch_data[i].eo_tag, pmatch[i].rm_eo)); i++; } - /* Reset all submatches that are not within all of their parent - submatches. */ - i = 0; - while (i < tnfa->num_submatches && i < nmatch) - { - if (pmatch[i].rm_eo == -1) - assert(pmatch[i].rm_so == -1); - assert(pmatch[i].rm_so <= pmatch[i].rm_eo); - - parents = submatch_data[i].parents; - if (parents != NULL) - for (j = 0; parents[j] >= 0; j++) - { - DPRINT(("pmatch[%d] parent %d\n", i, parents[j])); - if (pmatch[i].rm_so < pmatch[parents[j]].rm_so - || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo) - pmatch[i].rm_so = pmatch[i].rm_eo = -1; - } - i++; - } +#ifndef TRE_USE_ALLOCA +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers" + if (tags != intags) xfree(tags); +#pragma GCC diagnostic pop +#endif /* !TRE_USE_ALLOCA */ } while (i < nmatch) @@ -115,6 +214,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, pmatch[i].rm_eo = -1; i++; } + + return REG_OK; } @@ -129,12 +230,14 @@ tre_have_backrefs(const regex_t *preg) return tnfa->have_backrefs; } +#ifdef TRE_APPROX int tre_have_approx(const regex_t *preg) { tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; return tnfa->have_approx; } +#endif /* TRE_APPROX */ static int tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, @@ -142,7 +245,9 @@ tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, int eflags) { reg_errcode_t status; - int *tags = NULL, eo; + tre_tag_t *tags = NULL; + int eo; + size_t offset = 0, count = 0; if (tnfa->num_tags > 0 && nmatch > 0) { #ifdef TRE_USE_ALLOCA @@ -154,19 +259,26 @@ tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, return REG_ESPACE; } + if ( + (eflags & REG_STARTEND) && pmatch) + { + if (pmatch->rm_so < 0) + return REG_INVARG; + if (len == (size_t)-1) + { + if (pmatch->rm_eo < 0 || pmatch->rm_so > pmatch->rm_eo) + return REG_INVARG; + len = pmatch->rm_eo - pmatch->rm_so; + } + count = offset = pmatch->rm_so; + if (type == STR_WIDE) offset *= sizeof(wchar_t); + } + /* Dispatch to the appropriate matcher. */ if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER) { /* The regex has back references, use the backtracking matcher. */ - if (type == STR_USER) - { - const tre_str_source *source = string; - if (source->rewind == NULL || source->compare == NULL) - /* The backtracking matcher requires rewind and compare - capabilities from the input stream. */ - return REG_BADPAT; - } - status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type, + status = tre_tnfa_run_backtrack(tnfa, string + offset, (int)len, type, tags, eflags, &eo); } #ifdef TRE_APPROX @@ -178,20 +290,36 @@ tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, tre_regaparams_default(¶ms); params.max_err = 0; params.max_cost = 0; - status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, + status = tre_tnfa_run_approx(tnfa, string + offset, (int)len, type, tags, &match, params, eflags, &eo); } #endif /* TRE_APPROX */ else { /* Exact matching, no back references, use the parallel matcher. */ - status = tre_tnfa_run_parallel(tnfa, string, (int)len, type, + status = tre_tnfa_run_parallel(tnfa, string + offset, (int)len, type, tags, eflags, &eo); } if (status == REG_OK) - /* A match was found, so fill the submatch registers. */ - tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); + { + /* A match was found, so fill the submatch registers. */ + status = tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); + /* If doing REG_STARTEND, adjust the pmatch array (we can't build + this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls + tre_fill_pmatch itself). */ + if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) && + (eflags & REG_STARTEND) && pmatch && nmatch > 0) + { + size_t i; + regmatch_t *p; + for (i = nmatch, p = pmatch; i > 0; p++, i--) + { + if (p->rm_so >= 0) p->rm_so += count; + if (p->rm_eo >= 0) p->rm_eo += count; + } + } + } #ifndef TRE_USE_ALLOCA if (tags) xfree(tags); @@ -204,7 +332,11 @@ tre_regnexec(const regex_t *preg, const char *str, size_t len, size_t nmatch, regmatch_t pmatch[], int eflags) { tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; - tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; + tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS; + +#ifdef TRE_USE_SYSTEM_REGEX_H + if (preg->re_magic != RE_MAGIC) return REG_BADPAT; +#endif /* TRE_USE_SYSTEM_REGEX_H */ return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags); } @@ -213,7 +345,7 @@ int tre_regexec(const regex_t *preg, const char *str, size_t nmatch, regmatch_t pmatch[], int eflags) { - return tre_regnexec(preg, str, (unsigned)-1, nmatch, pmatch, eflags); + return tre_regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags); } @@ -224,6 +356,11 @@ tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len, size_t nmatch, regmatch_t pmatch[], int eflags) { tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + +#ifdef TRE_USE_SYSTEM_REGEX_H + if (preg->re_magic != RE_MAGIC) return REG_BADPAT; +#endif /* TRE_USE_SYSTEM_REGEX_H */ + return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags); } @@ -231,20 +368,11 @@ int tre_regwexec(const regex_t *preg, const wchar_t *str, size_t nmatch, regmatch_t pmatch[], int eflags) { - return tre_regwnexec(preg, str, (unsigned)-1, nmatch, pmatch, eflags); + return tre_regwnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags); } #endif /* TRE_WCHAR */ -int -tre_reguexec(const regex_t *preg, const tre_str_source *str, - size_t nmatch, regmatch_t pmatch[], int eflags) -{ - tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; - return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags); -} - - #ifdef TRE_APPROX /* @@ -257,7 +385,9 @@ tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len, int eflags) { reg_errcode_t status; - int *tags = NULL, eo; + tre_tag_t *tags = NULL; + int eo; + size_t offset = 0, count = 0; /* If the regexp does not use approximate matching features, the maximum cost is zero, and the approximate matcher isn't forced, @@ -281,10 +411,44 @@ tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len, if (tags == NULL) return REG_ESPACE; } + + if ( + (eflags & REG_STARTEND) && match->pmatch) + { + if (match->pmatch->rm_so < 0) + return REG_INVARG; + if (len == (size_t)-1) + { + if (match->pmatch->rm_eo < 0 || match->pmatch->rm_so > + match->pmatch->rm_eo) + return REG_INVARG; + len = match->pmatch->rm_eo - match->pmatch->rm_so; + } + count = offset = match->pmatch->rm_so; + if (type == STR_WIDE) offset *= sizeof(wchar_t); + } + status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, match, params, eflags, &eo); if (status == REG_OK) - tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo); + { + status = tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, + tnfa, tags, eo); + /* If doing REG_STARTEND, adjust the pmatch array (we can't build + this into tre_fill_pmatch, because tre_tnfa_run_backtrack call + tre_fill_pmatch itself). */ + if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) && + (eflags & REG_STARTEND) && match->pmatch && match->nmatch > 0) + { + size_t i; + regmatch_t *p; + for (i = match->nmatch, p = match->pmatch; i > 0; p++, i--) + { + if (p->rm_so >= 0) p->rm_so += count; + if (p->rm_eo >= 0) p->rm_eo += count; + } + } + } #ifndef TRE_USE_ALLOCA if (tags) xfree(tags); @@ -297,7 +461,11 @@ tre_reganexec(const regex_t *preg, const char *str, size_t len, regamatch_t *match, regaparams_t params, int eflags) { tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; - tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; + tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS; + +#ifdef TRE_USE_SYSTEM_REGEX_H + if (preg->re_magic != RE_MAGIC) return REG_BADPAT; +#endif /* TRE_USE_SYSTEM_REGEX_H */ return tre_match_approx(tnfa, str, len, type, match, params, eflags); } @@ -306,7 +474,7 @@ int tre_regaexec(const regex_t *preg, const char *str, regamatch_t *match, regaparams_t params, int eflags) { - return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags); + return tre_reganexec(preg, str, (size_t)-1, match, params, eflags); } #ifdef TRE_WCHAR @@ -316,6 +484,11 @@ tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len, regamatch_t *match, regaparams_t params, int eflags) { tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + +#ifdef TRE_USE_SYSTEM_REGEX_H + if (preg->re_magic != RE_MAGIC) return REG_BADPAT; +#endif /* TRE_USE_SYSTEM_REGEX_H */ + return tre_match_approx(tnfa, str, len, STR_WIDE, match, params, eflags); } @@ -324,7 +497,7 @@ int tre_regawexec(const regex_t *preg, const wchar_t *str, regamatch_t *match, regaparams_t params, int eflags) { - return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags); + return tre_regawnexec(preg, str, (size_t)-1, match, params, eflags); } #endif /* TRE_WCHAR */ diff --git a/contrib/tre/lib/tre-ast.c b/contrib/tre/lib/tre-ast.c index acb387acce..12f8cdc6ba 100644 --- a/contrib/tre/lib/tre-ast.c +++ b/contrib/tre/lib/tre-ast.c @@ -154,8 +154,8 @@ tre_do_print(FILE *stream, tre_ast_node_t *ast, int indent) else if (IS_ASSERTION(lit)) { int i; - char *assertions[] = { "bol", "eol", "ctype", "!ctype", - "bow", "eow", "wb", "!wb" }; + char *assertions[] = { "bol", "eol", "bracket", + "bow", "eow", "wb", "!wb", "backref" }; if (code_max >= ASSERT_LAST << 1) assert(0); fprintf(stream, "assertions: "); diff --git a/contrib/tre/lib/tre-ast.h b/contrib/tre/lib/tre-ast.h index d9af3762f9..3804eff934 100644 --- a/contrib/tre/lib/tre-ast.h +++ b/contrib/tre/lib/tre-ast.h @@ -10,6 +10,8 @@ #ifndef TRE_AST_H #define TRE_AST_H 1 +#include + #include "tre-mem.h" #include "tre-internal.h" #include "tre-compile.h" @@ -36,18 +38,26 @@ typedef enum { #define IS_BACKREF(x) ((x)->code_min == BACKREF) #define IS_PARAMETER(x) ((x)->code_min == PARAMETER) +#define SUBMATCH_ID_INVISIBLE_START (INT_MAX / 2 + 1) + /* A generic AST node. All AST nodes consist of this node on the top level with `obj' pointing to the actual content. */ -typedef struct { - tre_ast_type_t type; /* Type of the node. */ +typedef struct _tre_ast_node { void *obj; /* Pointer to actual node. */ - int nullable; + tre_last_matched_branch_pre_t *last_matched_branch; + tre_last_matched_pre_t *last_matched_in_progress; + tre_pos_and_tags_t *firstpos; + tre_pos_and_tags_t *lastpos; + /* The original pointer is used to point to the original node, when a + * CATENATION and tag are added. If NULL, this is node is the original */ + struct _tre_ast_node *original; + tre_ast_type_t type; /* Type of the node. */ int submatch_id; int num_submatches; int num_tags; - tre_pos_and_tags_t *firstpos; - tre_pos_and_tags_t *lastpos; + short nullable; + short make_branches; } tre_ast_node_t; @@ -55,14 +65,13 @@ typedef struct { tags, matching parameter settings, and all expressions that match one character. */ typedef struct { - long code_min; - long code_max; + tre_cint_t code_min; + tre_cint_t code_max; int position; union { - tre_ctype_t class; + tre_bracket_match_list_t *bracket_match_list; int *params; } u; - tre_ctype_t *neg_classes; } tre_literal_t; /* A "catenation" node. These are created when two regexps are concatenated. @@ -95,6 +104,9 @@ typedef struct { typedef struct { tre_ast_node_t *left; tre_ast_node_t *right; + /* The left end right end tags if non-zero */ + int left_tag; + int right_tag; } tre_union_t; tre_ast_node_t * diff --git a/contrib/tre/lib/tre-compile.c b/contrib/tre/lib/tre-compile.c index 948cc46989..6c4187e042 100644 --- a/contrib/tre/lib/tre-compile.c +++ b/contrib/tre/lib/tre-compile.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "tre-internal.h" #include "tre-mem.h" @@ -29,11 +30,245 @@ #include "tre.h" #include "xmalloc.h" +/* + The bit_ffs() macro in bitstring.h is flawed. Replace it with a working one. +*/ +#undef bit_ffs +#define bit_ffs(name, nbits, value) { \ + register bitstr_t *_name = name; \ + register int _byte, _nbits = nbits; \ + register int _stopbyte = _bit_byte(_nbits), _value = -1; \ + for (_byte = 0; _byte <= _stopbyte; ++_byte) \ + if (_name[_byte]) { \ + _value = _byte << 3; \ + for (_stopbyte = _name[_byte]; !(_stopbyte&0x1); \ + ++_value, _stopbyte >>= 1); \ + break; \ + } \ + *(value) = _value; \ +} + /* Algorithms to setup tags so that submatch addressing can be done. */ +#ifdef TRE_DEBUG +static const char *tag_dir_str[] = { + "minimize", + "maximize", + "left-maximize" +}; + +static const char _indent[] = " "; + +static void +print_indent(int indent) +{ + while (indent-- > 0) + DPRINT((_indent)); +} + +static void print_last_matched_pre(tre_last_matched_pre_t *lm, int indent, + int num_tags); +static void +print_last_match_branch_pre(tre_last_matched_branch_pre_t *branch, int indent, + int num_tags) +{ + tre_last_matched_pre_t *u = branch->last_matched; + int n_last_matched = 0; + + while (u) + { + n_last_matched++; + u = u->next; + } + + print_indent(indent); + DPRINT(("BRANCH: tot_branches=%d tot_last_matched=%d tot_tags=%d\n", + branch->tot_branches, branch->tot_last_matched, branch->tot_tags)); + print_indent(indent); + DPRINT(("..n_last_matched=%d last_matched=%d\n", branch->n_last_matched, + n_last_matched)); + if (branch->n_last_matched != n_last_matched) + DPRINT(("*** mismatch between n_last_matched and unions ***\n")); + if (branch->cmp_tag > 0) + { + int i; + const char *sep = " tags="; + print_indent(indent); + DPRINT(("..cmp_tag=%d n_tags=%d", branch->cmp_tag, branch->n_tags)); + for (i = 0; i < num_tags; i++) + if (bit_test(branch->tags, i)) + { + DPRINT(("%s%d", sep, i)); + sep = ","; + } + DPRINT(("\n")); + } + + u = branch->last_matched; + indent++; + while (u) + { + print_last_matched_pre(u, indent, num_tags); + u = u->next; + } +} + +static void +print_last_matched_pre(tre_last_matched_pre_t *lm, int indent, int num_tags) +{ + tre_last_matched_branch_pre_t *b = lm->branches; + int n_branches = 0; + + while (b) + { + n_branches++; + b = b->next; + } + + print_indent(indent); + DPRINT(("LAST_MATCHED: tot_branches=%d tot_last_matched=%d tot_tags=%d\n", + lm->tot_branches, lm->tot_last_matched, lm->tot_tags)); + print_indent(indent); + DPRINT(("..start_tag=%d n_branches=%d branches=%d\n", lm->start_tag, + lm->n_branches, n_branches)); + if (lm->n_branches != n_branches) + DPRINT(("*** mismatch between n and branches ***\n")); + + b = lm->branches; + indent++; + while (b) + { + print_last_match_branch_pre(b, indent, num_tags); + b = b->next; + } +} + +static void print_last_matched(tre_last_matched_t *lm, int indent); +static void +print_last_match_branch(tre_last_matched_branch_t *branch, int indent) +{ + tre_last_matched_t *u; + int i; + + print_indent(indent); + DPRINT(("BRANCH: n_last_matched=%d\n", branch->n_last_matched)); + if (branch->cmp_tag > 0) + { + print_indent(indent); + DPRINT(("..cmp_tag=%d n_tags=%d", branch->cmp_tag, branch->n_tags)); + if (branch->n_tags > 0) + { + const char *sep = " tags="; + for (i = 0; i < branch->n_tags; i++) + { + DPRINT(("%s%d", sep, branch->tags[i])); + sep = ","; + } + } + DPRINT(("\n")); + } + + u = branch->last_matched; + indent++; + for (i = branch->n_last_matched; i > 0; i--, u++) + print_last_matched(u, indent); +} + +static void +print_last_matched(tre_last_matched_t *lm, int indent) +{ + int i; + tre_last_matched_branch_t *b; + + print_indent(indent); + DPRINT(("LAST_MATCHED: n_branches=%d start_tag=%d\n", lm->n_branches, + lm->start_tag)); + + b = lm->branches; + indent++; + for (i = lm->n_branches; i > 0; i--, b++) + print_last_match_branch(b, indent); +} +#endif /* TRE_DEBUG */ + + +/* Merge the tre_last_matched_branch_pre_t of src into dst, creating a new + one if needed. If tag_id > 0, add that tag as well (a negative tag_id will + create an unset tre_last_matched_branch_pre_t. */ +static reg_errcode_t +tre_merge_branches(tre_mem_t mem, tre_ast_node_t *dst, tre_ast_node_t *src, + int tag_id, int num_tags) +{ + tre_last_matched_branch_pre_t *db = dst->last_matched_branch; + tre_last_matched_branch_pre_t *sb = (src ? src->last_matched_branch : NULL); + + if (db) + { + if (sb) + { + bitstr_t *l = db->tags; + bitstr_t *r = sb->tags; + int i = bitstr_size(num_tags); + + while(i-- > 0) + *l++ |= *r++; + /* db and sb are the info from two parallel sub-trees, so the tags + must be mutually exclusive, and we can just add their numbers */ + db->n_tags += sb->n_tags; + db->tot_tags += sb->tot_tags; + if (db->last_matched) + { + if (sb->last_matched) + { + tre_last_matched_pre_t *u = db->last_matched; + + while(u->next) + u = u->next; + u->next = sb->last_matched; + db->n_last_matched += sb->n_last_matched; + db->tot_branches += sb->tot_branches; + db->tot_last_matched += sb->tot_last_matched; + } + } + else if (sb->last_matched) + { + db->last_matched = sb->last_matched; + db->n_last_matched = sb->n_last_matched; + db->tot_branches = sb->tot_branches; + db->tot_last_matched = sb->tot_last_matched; + } + } + } + else + db = sb; + + if (tag_id != 0) + { + if (!db) + { + db = tre_mem_calloc(mem, sizeof(tre_last_matched_branch_pre_t) + + bitstr_size(num_tags)); + if (db == NULL) + return REG_ESPACE; + db->tot_branches = 1; + } + if (tag_id > 0) + { + /* tag_id is a new tag, and shouldn't exist in db's tags, + so we can always increment n_tags */ + bit_set(db->tags, tag_id); + db->n_tags++; + db->tot_tags++; + } + } + dst->last_matched_branch = db; + return REG_OK; +} + + /* Inserts a catenation node to the root of the tree given in `node'. As the left child a new tag with number `tag_id' to `node' is added, and the right child is the old root. */ @@ -50,19 +285,18 @@ tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id) c->left = tre_ast_new_literal(mem, TAG, tag_id, -1); if (c->left == NULL) return REG_ESPACE; - c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); + c->right = tre_mem_calloc(mem, sizeof(tre_ast_node_t)); if (c->right == NULL) return REG_ESPACE; c->right->obj = node->obj; c->right->type = node->type; + c->right->last_matched_branch = node->last_matched_branch; c->right->nullable = -1; c->right->submatch_id = -1; - c->right->firstpos = NULL; - c->right->lastpos = NULL; - c->right->num_tags = 0; node->obj = c; node->type = CATENATION; + node->original = c->right; return REG_OK; } @@ -82,50 +316,58 @@ tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id) c->right = tre_ast_new_literal(mem, TAG, tag_id, -1); if (c->right == NULL) return REG_ESPACE; - c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); + c->left = tre_mem_calloc(mem, sizeof(tre_ast_node_t)); if (c->left == NULL) return REG_ESPACE; c->left->obj = node->obj; c->left->type = node->type; + c->left->last_matched_branch = node->last_matched_branch; c->left->nullable = -1; c->left->submatch_id = -1; - c->left->firstpos = NULL; - c->left->lastpos = NULL; - c->left->num_tags = 0; node->obj = c; node->type = CATENATION; + node->original = c->left; return REG_OK; } typedef enum { ADDTAGS_RECURSE, + ADDTAGS_RECURSE_NOT_TOP_UNION, ADDTAGS_AFTER_ITERATION, ADDTAGS_AFTER_UNION_LEFT, ADDTAGS_AFTER_UNION_RIGHT, ADDTAGS_AFTER_CAT_LEFT, ADDTAGS_AFTER_CAT_RIGHT, - ADDTAGS_SET_SUBMATCH_END + ADDTAGS_SET_SUBMATCH_END, + ADDTAGS_UNION_RECURSE, + ADDTAGS_UNION_RIGHT_RECURSE, + ADDTAGS_AFTER_UNION_TOP, } tre_addtags_symbol_t; +enum { + COPY_LAST_MATCHED_BRANCH, + COPY_LAST_MATCHED_BRANCH_NEXT, + COPY_LAST_MATCHED, + COPY_LAST_MATCHED_NEXT, +}; -typedef struct { - int tag; - int next_tag; -} tre_tag_states_t; +#define REGSET_UNSET ((unsigned)-1) /* Go through `regset' and set submatch data for submatches that are using this tag. */ static void -tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag) +tre_purge_regset(unsigned *regset, tre_tnfa_t *tnfa, int tag) { int i; - for (i = 0; regset[i] >= 0; i++) + for (i = 0; regset[i] != REGSET_UNSET; i++) { int id = regset[i] / 2; int start = !(regset[i] % 2); + if (id >= SUBMATCH_ID_INVISIBLE_START) + continue; DPRINT((" Using tag %d for %s offset of " "submatch %d\n", tag, start ? "start" : "end", id)); @@ -138,6 +380,10 @@ tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag) } +#define REGSET_HAS_STARTS 0x1 +#define REGSET_HAS_ENDS 0x2 + + /* Adds tags to appropriate locations in the parse tree in `tree', so that subexpressions marked for submatch addressing can be traced. */ static reg_errcode_t @@ -150,49 +396,53 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, int bottom = tre_stack_num_objects(stack); /* True for first pass (counting number of needed tags) */ int first_pass = (mem == NULL || tnfa == NULL); - int *regset, *orig_regset; + unsigned *regset, *orig_regset; + int regset_contains = 0; int num_tags = 0; /* Total number of tags. */ int num_minimals = 0; /* Number of special minimal tags. */ int tag = 0; /* The tag that is to be added next. */ int next_tag = 1; /* Next tag to use after this one. */ - int *parents; /* Stack of submatches the current submatch is - contained in. */ int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */ - tre_tag_states_t *saved_states; - - tre_tag_direction_t direction = TRE_TAG_MINIMIZE; + int *reorder_tags = NULL; /* Tag reorder array: a pair for each reorder, + * the first is the tag to reorder, the second + * is the tag after which the first is reordered */ + int *rtp; /* Pointer used to fill in reorder_tags and + * tag_order */ + int *to_reorder; /* Transform array converting sequential order to + * that specified by reorder_tags */ + int id; + + tre_tag_direction_t direction = TRE_TAG_LEFT_MAXIMIZE; if (!first_pass) { + DPRINT(("Initializing direction to %s\n", tag_dir_str[direction])); tnfa->end_tag = 0; tnfa->minimal_tags[0] = -1; } - regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2)); + regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + + tnfa->num_submatches_invisible + 1) * 2)); if (regset == NULL) - return REG_ESPACE; - regset[0] = -1; - orig_regset = regset; - - parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1)); - if (parents == NULL) { - xfree(regset); - return REG_ESPACE; + status = REG_ESPACE; + goto error_regset; } - parents[0] = -1; + regset[0] = REGSET_UNSET; + orig_regset = regset; - saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1)); - if (saved_states == NULL) - { - xfree(regset); - xfree(parents); - return REG_ESPACE; - } - else + if (!first_pass) { - unsigned int i; - for (i = 0; i <= tnfa->num_submatches; i++) - saved_states[i].tag = -1; + /* Allocate all memory for reorder_tags, tag_order, to_seq_order and + * to_reorder in one batch (assuming all are the same type) */ + rtp = reorder_tags = xmalloc(sizeof(*reorder_tags) * + ((2 * tnfa->num_reorder_tags + 1) + + tnfa->num_tags)); + if (reorder_tags == NULL) + { + status = REG_ESPACE; + goto error_reorder_tags; + } + to_reorder = reorder_tags + (2 * tnfa->num_reorder_tags + 1); } STACK_PUSH(stack, voidptr, node); @@ -206,60 +456,102 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack); switch (symbol) { + int top_union; case ADDTAGS_SET_SUBMATCH_END: { - int id = tre_stack_pop_int(stack); int i; + id = tre_stack_pop_int(stack); + node = tre_stack_pop_voidptr(stack); /* Add end of this submatch to regset. */ - for (i = 0; regset[i] >= 0; i++); + for (i = 0; regset[i] != REGSET_UNSET; i++); regset[i] = id * 2 + 1; regset[i + 1] = -1; + regset_contains |= REGSET_HAS_ENDS; - /* Pop this submatch from the parents stack. */ - for (i = 0; parents[i] >= 0; i++); - parents[i - 1] = -1; + /* Always put a tag after a minimal iterator. */ + if (minimal_tag >= 0) + { + if (first_pass) + { + node->num_tags++; + DPRINT((" ADDTAGS_SET_SUBMATCH_END: node->num_tags = %d\n", + node->num_tags)); + } + else + { + int i; + status = tre_merge_branches(mem, node, NULL, tag, + tnfa->num_tags); + if (status != REG_OK) + break; + status = tre_add_tag_right(mem, node, tag); + if (status != REG_OK) + break; + tnfa->tag_directions[tag] = TRE_TAG_MINIMIZE; + DPRINT(("Setting t%d direction to %s\n", tag, + tag_dir_str[tnfa->tag_directions[tag]])); + DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + + DPRINT((" Minimal end: t%d reordered to " + "after t%d\n", tag, minimal_tag)); + /* Append to tag_order, move "tag" after + * "minimal_tag" */ + *rtp++ = tag; + *rtp++ = minimal_tag; + + num_minimals++; + tre_purge_regset(regset, tnfa, tag); + } + + minimal_tag = -1; + DPRINT((" ADDTAGS_SET_SUBMATCH_END num_tags++ tag=%d\n", tag)); + regset[0] = REGSET_UNSET; + regset_contains = 0; + tag = next_tag; + num_tags++; + next_tag++; + } break; } + case ADDTAGS_RECURSE_NOT_TOP_UNION: + /* Like ADDTAGS_RECURSE, except that top_union is set to zero, + * indicating that if a union is being processed, it is not the + * top-most of a series */ + top_union = 0; + goto do_addtags_recurse; + case ADDTAGS_RECURSE: + /* Setting top_union to 1 means that if a union is begin processed, + * it is the top-most of a series, and should recurse through the + * series to set the left_tag and right_tag values */ + top_union = 1; + +do_addtags_recurse: node = tre_stack_pop_voidptr(stack); - if (node->submatch_id >= 0) + id = node->submatch_id; + if (id >= 0) { - int id = node->submatch_id; int i; /* Add start of this submatch to regset. */ - for (i = 0; regset[i] >= 0; i++); + for (i = 0; regset[i] != REGSET_UNSET; i++); regset[i] = id * 2; regset[i + 1] = -1; - - if (!first_pass) - { - for (i = 0; parents[i] >= 0; i++); - tnfa->submatch_data[id].parents = NULL; - if (i > 0) - { - int *p = xmalloc(sizeof(*p) * (i + 1)); - if (p == NULL) - { - status = REG_ESPACE; - break; - } - assert(tnfa->submatch_data[id].parents == NULL); - tnfa->submatch_data[id].parents = p; - for (i = 0; parents[i] >= 0; i++) - p[i] = parents[i]; - p[i] = -1; - } - } + regset_contains |= REGSET_HAS_STARTS; /* Add end of this submatch to regset after processing this node. */ - STACK_PUSHX(stack, int, node->submatch_id); + STACK_PUSH(stack, voidptr, node); + STACK_PUSHX(stack, int, id); STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END); } @@ -269,39 +561,72 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, { tre_literal_t *lit = node->obj; - if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) + if (!IS_SPECIAL(lit) || IS_BACKREF(lit) || IS_EMPTY(lit) || IS_ASSERTION(lit)) { - int i; DPRINT(("Literal %d-%d\n", (int)lit->code_min, (int)lit->code_max)); - if (regset[0] >= 0) + if (regset_contains) { /* Regset is not empty, so add a tag before the literal or backref. */ - if (!first_pass) + if (first_pass) { - status = tre_add_tag_left(mem, node, tag); - tnfa->tag_directions[tag] = direction; - if (minimal_tag >= 0) - { - DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); - for (i = 0; tnfa->minimal_tags[i] >= 0; i++); - tnfa->minimal_tags[i] = tag; - tnfa->minimal_tags[i + 1] = minimal_tag; - tnfa->minimal_tags[i + 2] = -1; - minimal_tag = -1; - num_minimals++; - } - tre_purge_regset(regset, tnfa, tag); + DPRINT((" ADDTAGS_RECURSE:LITERAL node->num_tags = 1\n")); + node->num_tags = 1; } else { - DPRINT((" num_tags = 1\n")); - node->num_tags = 1; + status = tre_merge_branches(mem, node, NULL, tag, + tnfa->num_tags); + if (status != REG_OK) + break; + status = tre_add_tag_left(mem, node, tag); + if (status != REG_OK) + break; + if (regset_contains == REGSET_HAS_STARTS) + tnfa->tag_directions[tag] = TRE_TAG_LEFT_MAXIMIZE; + else + tnfa->tag_directions[tag] = direction; + DPRINT(("Setting t%d direction to %s\n", tag, + tag_dir_str[tnfa->tag_directions[tag]])); + tre_purge_regset(regset, tnfa, tag); + + if (IS_BACKREF(lit)) + { + int b = lit->code_max; + int t = tnfa->submatch_data[b].so_tag; + /* Fail if the referenced submatch hasn't been + * completed yet */ + if (tnfa->submatch_data[b].eo_tag < 0) + { + status = REG_ESUBREG; + break; + } + if (t < tag) + { + DPRINT((" Backref %d start: " + "t%d reordered to before t%d\n", + b, tag, t)); + if(t > 0) + t--; + /* Append to tag_order, move "tag" after + * "t" */ + *rtp++ = tag; + *rtp++ = t; + } +#if TRE_DEBUG + else + DPRINT((" Backref %d start: " + "(t%d already before t%d)\n", + b, tag, t)); +#endif /* TRE_DEBUG */ + } } - DPRINT((" num_tags++\n")); - regset[0] = -1; + DPRINT((" ADDTAGS_RECURSE:LITERAL num_tags++ tag=%d\n", + tag)); + regset[0] = REGSET_UNSET; + regset_contains = 0; tag = next_tag; num_tags++; next_tag++; @@ -337,7 +662,8 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, if (left->num_tags > 0 && right->num_tags > 0) { /* Reserve the next tag to the right child. */ - DPRINT((" Reserving next_tag %d to right child\n", + DPRINT((" ADDTAGS_RECURSE:CATENATION num_tags++ " + "Reserving next_tag %d to right child\n", next_tag)); reserved_tag = next_tag; next_tag++; @@ -357,174 +683,309 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, DPRINT(("Iteration\n")); if (first_pass) - { - STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal); - } - else - { - STACK_PUSHX(stack, int, tag); - STACK_PUSHX(stack, int, iter->minimal); - } + STACK_PUSHX(stack, int, regset_contains != 0); + STACK_PUSHX(stack, int, tag); STACK_PUSHX(stack, voidptr, node); STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION); STACK_PUSHX(stack, voidptr, iter->arg); STACK_PUSHX(stack, int, ADDTAGS_RECURSE); - /* Regset is not empty, so add a tag here. */ - if (regset[0] >= 0 || iter->minimal) + /* Regset is not empty, so add a tag here (this always happens + because iterators always get submatch id, even if in the + invisible range) */ + if (regset_contains) { if (!first_pass) { - int i; + status = tre_merge_branches(mem, node, NULL, tag, + tnfa->num_tags); + if (status != REG_OK) + break; status = tre_add_tag_left(mem, node, tag); - if (iter->minimal) - tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; + if (status != REG_OK) + break; + if (regset_contains == REGSET_HAS_STARTS && tag != 0) + tnfa->tag_directions[tag] = iter->minimal ? + TRE_TAG_MINIMIZE : + TRE_TAG_LEFT_MAXIMIZE; else tnfa->tag_directions[tag] = direction; - if (minimal_tag >= 0) - { - DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); - for (i = 0; tnfa->minimal_tags[i] >= 0; i++); - tnfa->minimal_tags[i] = tag; - tnfa->minimal_tags[i + 1] = minimal_tag; - tnfa->minimal_tags[i + 2] = -1; - minimal_tag = -1; - num_minimals++; - } + DPRINT(("Setting t%d direction to %s\n", tag, + tag_dir_str[tnfa->tag_directions[tag]])); tre_purge_regset(regset, tnfa, tag); } - DPRINT((" num_tags++\n")); - regset[0] = -1; + DPRINT((" ADDTAGS_RECURSE:ITERATION num_tags++ tag=%d\n", + tag)); + regset[0] = REGSET_UNSET; + regset_contains = 0; tag = next_tag; num_tags++; next_tag++; } - direction = TRE_TAG_MINIMIZE; + direction = TRE_TAG_LEFT_MAXIMIZE; + DPRINT((" Setting direction to %s\n", tag_dir_str[direction])); } break; case UNION: { - tre_union_t *uni = node->obj; - tre_ast_node_t *left = uni->left; - tre_ast_node_t *right = uni->right; - int left_tag; - int right_tag; + tre_union_t *uni; + tre_ast_node_t *left; + tre_ast_node_t *right; + int front_tag = -1; + + DPRINT(("Union\n")); - if (regset[0] >= 0) + if (regset_contains) { - left_tag = next_tag; - right_tag = next_tag + 1; + DPRINT((" UNION num_tags++ tag=%d\n", tag)); + front_tag = tag; + tag = next_tag; + num_tags++; + next_tag++; } - else + + /* For the top union, walk the tree of consecutive unions, + * setting the left_tag and right_tag values in increasing + * order (left to right priority) */ + if (top_union && + (node->num_submatches - + (node->submatch_id >= 0 && + node->submatch_id < SUBMATCH_ID_INVISIBLE_START)) > 0) { - left_tag = tag; - right_tag = next_tag; - } + tre_ast_node_t *n; + int last = tre_stack_num_objects(stack); - DPRINT(("Union\n")); + STACK_PUSH(stack, voidptr, node); + STACK_PUSH(stack, int, ADDTAGS_UNION_RECURSE); + + while (tre_stack_num_objects(stack) > last) + { + symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack); + switch (symbol) + { + case ADDTAGS_UNION_RECURSE: + n = tre_stack_pop_voidptr(stack); + uni = n->obj; + left = uni->left; + + /* Since the top union has num_submatches > 0, + * we set all the consecutive union's + * make_branches to 1 to force the generation + * of end tags for each union branch. */ + n->make_branches = 1; + + STACK_PUSH(stack, voidptr, n); + STACK_PUSH(stack, int, + ADDTAGS_UNION_RIGHT_RECURSE); + + if (left->type == UNION) + { + STACK_PUSH(stack, voidptr, left); + STACK_PUSH(stack, int, + ADDTAGS_UNION_RECURSE); + } + else + { + DPRINT((" ADDTAGS_UNION_RECURSE " + "num_tags++ tag=%d\n", tag)); + uni->left_tag = tag; + tag = next_tag; + num_tags++; + next_tag++; + } + break; + + case ADDTAGS_UNION_RIGHT_RECURSE: + n = tre_stack_pop_voidptr(stack); + uni = n->obj; + right = uni->right; + + if (right->type == UNION) + { + STACK_PUSH(stack, voidptr, right); + STACK_PUSH(stack, int, + ADDTAGS_UNION_RECURSE); + } + else + { + DPRINT((" ADDTAGS_UNION_RIGHT_RECURSE " + "num_tags++ tag=%d\n", tag)); + uni->right_tag = tag; + tag = next_tag; + num_tags++; + next_tag++; + } + + break; + + default: + assert(0); + break; + + } /* end switch(symbol) */ + } /* end while(tre_stack_num_objects(stack) > last */ + if (!first_pass) + { + STACK_PUSHX(stack, int, front_tag); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_TOP); + } + } /* end if (top_union && ...) */ + + uni = node->obj; + left = uni->left; + right = uni->right; /* After processing right child. */ - STACK_PUSHX(stack, int, right_tag); - STACK_PUSHX(stack, int, left_tag); STACK_PUSHX(stack, voidptr, regset); - STACK_PUSHX(stack, int, regset[0] >= 0); + STACK_PUSHX(stack, int, regset_contains != 0); STACK_PUSHX(stack, voidptr, node); - STACK_PUSHX(stack, voidptr, right); - STACK_PUSHX(stack, voidptr, left); STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT); /* Process right child. */ STACK_PUSHX(stack, voidptr, right); - STACK_PUSHX(stack, int, ADDTAGS_RECURSE); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE_NOT_TOP_UNION); /* After processing left child. */ STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT); /* Process left child. */ STACK_PUSHX(stack, voidptr, left); - STACK_PUSHX(stack, int, ADDTAGS_RECURSE); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE_NOT_TOP_UNION); /* Regset is not empty, so add a tag here. */ - if (regset[0] >= 0) + if (regset_contains) { if (!first_pass) { - int i; - status = tre_add_tag_left(mem, node, tag); - tnfa->tag_directions[tag] = direction; - if (minimal_tag >= 0) - { - DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); - for (i = 0; tnfa->minimal_tags[i] >= 0; i++); - tnfa->minimal_tags[i] = tag; - tnfa->minimal_tags[i + 1] = minimal_tag; - tnfa->minimal_tags[i + 2] = -1; - minimal_tag = -1; - num_minimals++; - } - tre_purge_regset(regset, tnfa, tag); + status = tre_merge_branches(mem, node, NULL, front_tag, + tnfa->num_tags); + if (status != REG_OK) + break; + status = tre_add_tag_left(mem, node, front_tag); + if (status != REG_OK) + break; + if (regset_contains == REGSET_HAS_STARTS) + tnfa->tag_directions[front_tag] = TRE_TAG_LEFT_MAXIMIZE; + else + tnfa->tag_directions[front_tag] = direction; + DPRINT(("Setting t%d direction to %s\n", front_tag, + tag_dir_str[tnfa->tag_directions[front_tag]])); + tre_purge_regset(regset, tnfa, front_tag); } - DPRINT((" num_tags++\n")); - regset[0] = -1; - tag = next_tag; - num_tags++; - next_tag++; - } - - if (node->num_submatches > 0) - { - /* The next two tags are reserved for markers. */ - next_tag++; - tag = next_tag; - next_tag++; + regset[0] = REGSET_UNSET; + regset_contains = 0; } break; } - } - - if (node->submatch_id >= 0) - { - int i; - /* Push this submatch on the parents stack. */ - for (i = 0; parents[i] >= 0; i++); - parents[i] = node->submatch_id; - parents[i + 1] = -1; - } + } /* end switch (node->type) */ break; /* end case: ADDTAGS_RECURSE */ case ADDTAGS_AFTER_ITERATION: { - int minimal = 0; + tre_iteration_t *iter; + tre_ast_node_t *orig; int enter_tag; + node = tre_stack_pop_voidptr(stack); + orig = node->original ? node->original : node; + iter = (tre_iteration_t *)orig->obj; + enter_tag = tre_stack_pop_int(stack); + if (iter->minimal) + minimal_tag = enter_tag; + + DPRINT(("After iteration\n")); if (first_pass) { - node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags - + tre_stack_pop_int(stack); - minimal_tag = -1; + node->num_tags = iter->arg->num_tags + tre_stack_pop_int(stack); + DPRINT((" ADDTAGS_AFTER_ITERATION: node->num_tags = %d\n", + node->num_tags)); } else { - minimal = tre_stack_pop_int(stack); - enter_tag = tre_stack_pop_int(stack); - if (minimal) - minimal_tag = enter_tag; - } + /* node->last_matched_branch will have the start tag (the tag + just *before* the iteration). iter->arg->last_matched_branch + will have the tag(s) inside the iteration, the ones that + may need to be reset if the iteration doesn't match. So + before we merge iter->arg into node, we need to set up + a new tre_last_matched_t and tre_last_matched_branch_t, + using any of the inside tags as cmp_tag (we choose the first + tag found by bit_ffs). If there are no inside tags, we + don't bother creating the extra structures. */ + tre_last_matched_branch_pre_t *b = + iter->arg->last_matched_branch; + + if (b && b->n_tags > 0) + { + tre_last_matched_pre_t *u; - DPRINT(("After iteration\n")); - if (!first_pass) - { - DPRINT((" Setting direction to %s\n", - minimal ? "minimize" : "maximize")); - if (minimal) - direction = TRE_TAG_MINIMIZE; + bit_ffs(b->tags, num_tags, &b->cmp_tag); + DPRINT((" ADDTAGS_AFTER_ITERATION: n_tags=%d " + "cmp_tag = %d\n", b->n_tags, b->cmp_tag)); + + u = tre_mem_calloc(mem, sizeof(tre_last_matched_pre_t) + + sizeof(tre_last_matched_branch_pre_t) + + bitstr_size(tnfa->num_tags)); + if (!u) + { + status = REG_ESPACE; + break; + } + u->branches = b; + u->n_branches = 1; + u->start_tag = b->cmp_tag; + u->tot_branches = b->tot_branches; + u->tot_last_matched = 1 + b->tot_last_matched; + u->tot_tags = b->tot_tags; + + b = (tre_last_matched_branch_pre_t *)(u + 1); + b->last_matched = u; + b->n_last_matched = 1; + b->tot_branches = 1 + u->tot_branches; + b->tot_last_matched = u->tot_last_matched; + b->tot_tags = u->tot_tags; + + iter->arg->last_matched_branch = b; + } + status = tre_merge_branches(mem, node, iter->arg, 0, + tnfa->num_tags); + if (status != REG_OK) + break; + + if (iter->minimal) + { + /* Add a union with a left EMPTY literal and the right + being iter->arg. This should force the tags inside + the minimal iteration to prefer being unset */ + if (iter->min == 0 && iter->max <= 1) + { + tre_ast_node_t *u, *e; + + e = tre_ast_new_literal(mem, EMPTY, -1, -1); + if (e == NULL) + { + status = REG_ESPACE; + break; + } + u = tre_ast_new_union(mem, e, iter->arg); + if (u == NULL) + { + status = REG_ESPACE; + break; + } + iter->arg = u; + } + + direction = TRE_TAG_MINIMIZE; + } else direction = TRE_TAG_MAXIMIZE; + DPRINT((" Setting direction to %s\n", tag_dir_str[direction])); } break; } @@ -544,12 +1005,29 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, } case ADDTAGS_AFTER_CAT_RIGHT: - DPRINT(("After cat right\n")); - node = tre_stack_pop_voidptr(stack); - if (first_pass) - node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags - + ((tre_catenation_t *)node->obj)->right->num_tags; - break; + { + tre_catenation_t *cat; + + DPRINT(("After cat right\n")); + node = tre_stack_pop_voidptr(stack); + cat = node->obj; + if (first_pass) + { + node->num_tags = cat->left->num_tags + cat->right->num_tags; + DPRINT((" ADDTAGS_AFTER_CAT_RIGHT: node->num_tags = %d\n", + node->num_tags)); + } + else + { + status = tre_merge_branches(mem, cat->left, cat->right, 0, + tnfa->num_tags); + if (status != REG_OK) + break; + status = tre_merge_branches(mem, node, cat->left, 0, + tnfa->num_tags); + } + break; + } case ADDTAGS_AFTER_UNION_LEFT: DPRINT(("After union left\n")); @@ -557,53 +1035,223 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, the right operand the items currently in the array are invisible. The original bottom was saved at ADDTAGS_UNION and will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */ - while (*regset >= 0) + while (*regset != REGSET_UNSET) regset++; + regset_contains = 0; break; case ADDTAGS_AFTER_UNION_RIGHT: { - int added_tags, tag_left, tag_right; - tre_ast_node_t *left = tre_stack_pop_voidptr(stack); - tre_ast_node_t *right = tre_stack_pop_voidptr(stack); + int added_tags; + tre_ast_node_t *orig; + tre_union_t *uni; + /* Note: node may not be a UNION, but a CATENATION with a left + * tag. So that is why we pass the original node->obj on the + * stack, to get the union's true values. */ + DPRINT(("After union right\n")); node = tre_stack_pop_voidptr(stack); + orig = node->original ? node->original : node; + uni = (tre_union_t *)orig->obj; added_tags = tre_stack_pop_int(stack); if (first_pass) { - node->num_tags = ((tre_union_t *)node->obj)->left->num_tags - + ((tre_union_t *)node->obj)->right->num_tags + added_tags - + ((node->num_submatches > 0) ? 2 : 0); + node->num_tags = uni->left->num_tags + uni->right->num_tags + + added_tags; + if (uni->left_tag > 0) + node->num_tags++; + if (uni->right_tag > 0) + node->num_tags++; + DPRINT((" ADDTAGS_AFTER_UNION_RIGHT: node->num_tags = %d\n", + node->num_tags)); } regset = tre_stack_pop_voidptr(stack); - tag_left = tre_stack_pop_int(stack); - tag_right = tre_stack_pop_int(stack); /* Add tags after both children, the left child gets a smaller tag than the right child. This guarantees that we prefer the left child over the right child. */ /* XXX - This is not always necessary (if the children have tags which must be seen for every match of that child). */ - /* XXX - Check if this is the only place where tre_add_tag_right - is used. If so, use tre_add_tag_left (putting the tag before - the child as opposed after the child) and throw away - tre_add_tag_right. */ - if (node->num_submatches > 0) + if (!first_pass && node->make_branches) { - if (!first_pass) + tre_last_matched_branch_pre_t *lb = + uni->left->last_matched_branch; + tre_last_matched_branch_pre_t *rb = + uni->right->last_matched_branch; + tre_last_matched_pre_t *lu = + uni->left->last_matched_in_progress; + tre_last_matched_pre_t *ru = + uni->right->last_matched_in_progress; + tre_last_matched_pre_t *u; + /* We don't need to call tre_merge_branches because these + * tags don't participate in submatch ranges, so don't need + * to be recorded. But we do set the cmp_tag entry of the + * tre_last_matched_branch_pre_t, so we might call + * tre_merge_branches if we need to create an empty + * tre_last_matched_branch_pre_t. */ + if (uni->left_tag > 0) + { + DPRINT(("Setting t%d direction to maximize\n", + uni->left_tag)); + status = tre_add_tag_right(mem, uni->left, uni->left_tag); + if (status != REG_OK) + break; + tnfa->tag_directions[uni->left_tag] = TRE_TAG_MAXIMIZE; + if (!lb) + { + status = tre_merge_branches(mem, uni->left, NULL, -1, + tnfa->num_tags); + if (status != REG_OK) + break; + lb = uni->left->last_matched_branch; + } + lb->cmp_tag = uni->left_tag; + } + if (uni->right_tag > 0) { - status = tre_add_tag_right(mem, left, tag_left); - tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE; - status = tre_add_tag_right(mem, right, tag_right); - tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE; + DPRINT(("Setting t%d direction to maximize\n", + uni->right_tag)); + status = tre_add_tag_right(mem, uni->right, uni->right_tag); + if (status != REG_OK) + break; + tnfa->tag_directions[uni->right_tag] = TRE_TAG_MAXIMIZE; + if (!rb) + { + status = tre_merge_branches(mem, uni->right, NULL, -1, + tnfa->num_tags); + if (status != REG_OK) + break; + rb = uni->right->last_matched_branch; + } + rb->cmp_tag = uni->right_tag; } - DPRINT((" num_tags += 2\n")); - num_tags += 2; + /* Now merge the tre_last_matched_branch_pre_t into a + tre_last_matched_pre_t */ + if (lu == NULL) + { + if (ru == NULL) + { + /* Create a new tre_last_matched_pre_t */ + u = tre_mem_calloc(mem, sizeof(tre_last_matched_pre_t)); + if (!u) + { + status = REG_ESPACE; + break; + } + u->tot_last_matched = 1; + + if (lb) + { + u->branches = lb; + u->n_branches = 1; + u->tot_branches += lb->tot_branches; + u->tot_last_matched += lb->tot_last_matched; + u->tot_tags += lb->tot_tags; + if (rb) + { + lb->next = rb; + u->n_branches++; + u->tot_branches += rb->tot_branches; + u->tot_last_matched += rb->tot_last_matched; + u->tot_tags += rb->tot_tags; + } + } + else if (rb) + { + u->branches = rb; + u->n_branches = 1; + u->tot_branches += rb->tot_branches; + u->tot_last_matched += rb->tot_last_matched; + u->tot_tags += rb->tot_tags; + } + } + else + { + /* Use ru, and add lb */ + u = ru; + if (lb) + { + lb->next = u->branches; + u->branches = lb; + u->n_branches++; + u->tot_branches += lb->tot_branches; + u->tot_last_matched += lb->tot_last_matched; + u->tot_tags += lb->tot_tags; + } + } + } + else if (ru == NULL) + { + /* Use lu, and add rb */ + u = lu; + if (rb) + { + rb->next = u->branches; + u->branches = rb; + u->n_branches++; + u->tot_branches += rb->tot_branches; + u->tot_last_matched += rb->tot_last_matched; + u->tot_tags += rb->tot_tags; + } + } + else + { + /* Merge lu and ru into lu */ + if (lu->branches) + { + if (ru->branches) + { + tre_last_matched_branch_pre_t *b = lu->branches; + while (b->next) b = b->next; + b->next = ru->branches; + lu->n_branches += ru->n_branches; + } + } + else if (ru->branches) + { + lu->branches = ru->branches; + lu->n_branches = ru->n_branches; + } + lu->tot_branches += ru->tot_branches; + lu->tot_last_matched += ru->tot_last_matched - 1; + lu->tot_tags += ru->tot_tags; + u = lu; + } + node->last_matched_in_progress = u; } direction = TRE_TAG_MAXIMIZE; break; } + case ADDTAGS_AFTER_UNION_TOP: /* only called when not first_pass */ + { + tre_last_matched_branch_pre_t *b; + tre_last_matched_pre_t *u; + int start_tag; + + DPRINT(("After union top\n")); + node = tre_stack_pop_voidptr(stack); + start_tag = tre_stack_pop_int(stack); + b = tre_mem_calloc(mem, sizeof(tre_last_matched_branch_pre_t) + + bitstr_size(tnfa->num_tags)); + if (!b) + { + status = REG_ESPACE; + break; + } + + u = node->last_matched_in_progress; + u->start_tag = start_tag; + b->tot_branches = 1 + u->tot_branches; + b->tot_last_matched = u->tot_last_matched; + b->tot_tags = u->tot_tags; + b->last_matched = u; + b->n_last_matched = 1; + node->last_matched_branch = b; + node->last_matched_in_progress = NULL; + break; + } + default: assert(0); break; @@ -611,31 +1259,384 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, } /* end switch(symbol) */ } /* end while(tre_stack_num_objects(stack) > bottom) */ + if (status != REG_OK) + { + DPRINT(("Error during %s pass\n", first_pass ? "first" : "second")); + goto error_post_compile; + } + if (!first_pass) - tre_purge_regset(regset, tnfa, tag); + { + int i; + if (num_tags != tnfa->num_tags) + { + DPRINT(("num_tags(%d) != tnfa->num_tags(%d)\n", num_tags, + tnfa->num_tags)); + status = REG_BADPAT; + goto error_post_compile; + } - if (!first_pass && minimal_tag >= 0) + tre_purge_regset(regset, tnfa, tag); + DPRINT(("Setting t%d to %s\n", num_tags, + tag_dir_str[direction])); + tnfa->tag_directions[num_tags] = direction; + + if (rtp > reorder_tags + 2 * tnfa->num_reorder_tags) + { + DPRINT(("Processed %d reorder tags instead of %d\n", + (int)(rtp - reorder_tags) / 2, tnfa->num_reorder_tags)); + status = REG_BADPAT; + goto error_post_compile; + } + *rtp = -1; +#if TRE_DEBUG + if (reorder_tags[0] >= 0) + { + DPRINT(("reorder_tags:\n")); + for (rtp = reorder_tags; *rtp >= 0;) + { + DPRINT(("%d after ", *rtp++)); + DPRINT(("%d\n", *rtp++)); + } + } + else + DPRINT(("No reorder_tags\n")); +#endif /* TRE_DEBUG */ + + /* Initialize to_reorder */ + for (i = 0; i < num_tags; i++) + to_reorder[i] = i; + /* Use to_seq_order to convert reorder_tags values, and use those to + * reorder to_reorder */ + for (rtp = reorder_tags; *rtp >= 0;) + { + int j, high, low; + int ti = *rtp++; + + /* Skip reordering the final tag */ + if (ti >= num_tags) + { + DPRINT(("Skipping reorder of %d\n", ti)); + rtp++; + continue; + } + /* The number of the tag to reorder */ + high = to_reorder[ti]; + /* Reorder after this tag */ + low = to_reorder[*rtp++]; + + DPRINT(("ti=%d high=%d low=%d\n", ti, high, low)); + if (low > high) + { + DPRINT(("Tag %d already before %d\n", high, low)); + continue; + } + for (j = 0; j < num_tags; j++) + if (to_reorder[j] > low && to_reorder[j] < high) + to_reorder[j]++; + to_reorder[ti] = low + 1; +#ifdef TRE_DEBUG + DPRINT(("to_reorder=(")); + for (j = 0; j < num_tags; j++) + { + DPRINT(("%d", to_reorder[j])); + if (j < num_tags - 1) + DPRINT((",")); + } + DPRINT((")\n")); +#endif /* TRE_DEBUG */ + } + /* Determine if reordering in really necessary */ + { + int need_reorder = 0; + for (i = 0; i < num_tags; i++) + if(to_reorder[i] != i) + { + need_reorder = 1; + break; + } + /* If need_reorder is not set, free reorder_tags, and set to NULL, + * indicating no reordering is needed */ + if (!need_reorder) + { + DPRINT(("Don't need to reorder\n")); + xfree(reorder_tags); + reorder_tags = NULL; + } + } + } + + if (reorder_tags) { int i; - DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); - for (i = 0; tnfa->minimal_tags[i] >= 0; i++); - tnfa->minimal_tags[i] = tag; - tnfa->minimal_tags[i + 1] = minimal_tag; - tnfa->minimal_tags[i + 2] = -1; - minimal_tag = -1; - num_minimals++; + tre_tag_direction_t *new_tag_directions; +#if TRE_DEBUG + DPRINT(("to_reorder:")); + for (i = 0; i < num_tags; i++) + DPRINT((" %d->%d", i, to_reorder[i])); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ + + DPRINT(("Reordering submatch_data\n")); + for (i = 0; i < tnfa->num_submatches; i++) + { +#if TRE_DEBUG + int so = tnfa->submatch_data[i].so_tag; + int eo = tnfa->submatch_data[i].eo_tag; +#endif /* TRE_DEBUG */ + tnfa->submatch_data[i].so_tag = + to_reorder[tnfa->submatch_data[i].so_tag]; + tnfa->submatch_data[i].eo_tag = + tnfa->submatch_data[i].eo_tag < num_tags ? + to_reorder[tnfa->submatch_data[i].eo_tag] : + tnfa->submatch_data[i].eo_tag; + DPRINT(("pmatch[%d]: {%d, %d}->{%d, %d}\n", i, so, eo, + tnfa->submatch_data[i].so_tag, + tnfa->submatch_data[i].eo_tag)); + } + + DPRINT(("Reordering tag_directions\n")); + /* We only allocate num_tags directions and reorder them. The + * num_tags-th direction (end tag) is left unchanged. */ + new_tag_directions = xmalloc(sizeof(*new_tag_directions) * num_tags); + if (new_tag_directions == NULL) + { + status = REG_ESPACE; + goto error_post_compile; + } + for (i = 0; i < num_tags; i++) + { + new_tag_directions[to_reorder[i]] = tnfa->tag_directions[i]; + } +#if TRE_DEBUG + for (i = 0; i < num_tags; i++) + { + DPRINT(("t%d %s->%s\n", i, + tag_dir_str[tnfa->tag_directions[i]], + tag_dir_str[new_tag_directions[i]])); + } + DPRINT(("t%d %s->%s\n", num_tags, + tag_dir_str[tnfa->tag_directions[num_tags]], + tag_dir_str[tnfa->tag_directions[num_tags]])); +#endif /* TRE_DEBUG */ + memcpy(tnfa->tag_directions, new_tag_directions, sizeof(*new_tag_directions) * num_tags); + xfree(new_tag_directions); + + DPRINT(("Reordering minimal_tags\n")); + for (i = 0; tnfa->minimal_tags[i] >= 0; i++) + tnfa->minimal_tags[i] = tnfa->minimal_tags[i] < num_tags ? + to_reorder[tnfa->minimal_tags[i]] : + tnfa->minimal_tags[i]; + + DPRINT(("Reordering AST tags\n")); + STACK_PUSH(stack, voidptr, tree); + while (status == REG_OK && tre_stack_num_objects(stack) > bottom) + { + node = tre_stack_pop_voidptr(stack); + + switch (node->type) + { + case LITERAL: + { + tre_literal_t *lit = (tre_literal_t *)node->obj; + if (IS_TAG(lit)) + lit->code_max = to_reorder[lit->code_max]; + break; + } + + case UNION: + { + tre_union_t *uni = (tre_union_t *)node->obj; + STACK_PUSHX(stack, voidptr, uni->right); + STACK_PUSHX(stack, voidptr, uni->left); + break; + } + + case CATENATION: + { + tre_catenation_t *cat = (tre_catenation_t *)node->obj; + STACK_PUSHX(stack, voidptr, cat->right); + STACK_PUSHX(stack, voidptr, cat->left); + break; + } + + case ITERATION: + { + tre_iteration_t *iter = (tre_iteration_t *)node->obj; + STACK_PUSHX(stack, voidptr, iter->arg); + break; + } + + default: + assert(0); + break; + } + } + if (status != REG_OK) + { + DPRINT(("Error while reordering tags\n")); + goto error_post_compile; + } + } + + + if (!first_pass) + { + if (tree->last_matched_branch) + { + tre_last_matched_branch_t *buf, *b, *bb; + tre_last_matched_branch_pre_t *bp; + tre_last_matched_t *u, *uu; + tre_last_matched_pre_t *up; + int *t; + int i; +#ifdef TRE_DEBUG + tre_last_matched_branch_t *_b; + tre_last_matched_t *_u; + int *_t; + + DPRINT(("last_match_branch_pre:\n")); + print_last_match_branch_pre(tree->last_matched_branch, 0, num_tags); +#endif /* TRE_DEBUG */ + buf = (tre_last_matched_branch_t *)xcalloc(1, + tree->last_matched_branch->tot_branches + * sizeof(tre_last_matched_branch_t) + + tree->last_matched_branch->tot_last_matched + * sizeof(tre_last_matched_t) + + tree->last_matched_branch->tot_tags * + sizeof(int)); + if (!buf) + { + status = REG_ESPACE; + goto error_post_compile; + } + + b = buf; + u = (tre_last_matched_t *)(b + + tree->last_matched_branch->tot_branches); + t = (int *)(u + tree->last_matched_branch->tot_last_matched); +#ifdef TRE_DEBUG + _b = b; + _u = u; + _t = t; +#endif /* TRE_DEBUG */ + DPRINT(("Copying info_pre to info\n")); + STACK_PUSH(stack, voidptr, tree->last_matched_branch); + STACK_PUSH(stack, int, 1); + STACK_PUSH(stack, int, COPY_LAST_MATCHED_BRANCH); + + while (status == REG_OK && tre_stack_num_objects(stack) > bottom) + { + switch (tre_stack_pop_int(stack)) + { + case COPY_LAST_MATCHED_BRANCH: + i = tre_stack_pop_int(stack); + /* The tre_last_matched_branch_pre_t * is still on the + stack */ + STACK_PUSHX(stack, voidptr, b); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH_NEXT); + b += i; + break; + + case COPY_LAST_MATCHED_BRANCH_NEXT: + bb = tre_stack_pop_voidptr(stack); + bp = tre_stack_pop_voidptr(stack); + bb->n_last_matched = bp->n_last_matched; + bb->cmp_tag = bp->cmp_tag; + if (bp->n_tags > 0) + { + int n; + n = bb->n_tags = bp->n_tags; + bb->tags = t; + for (i = 0; i < num_tags; i++) + if (bit_test(bp->tags, i)) + { + *t++ = i; + if (--n <= 0) + break; + } + } + if (bp->next) + { + STACK_PUSHX(stack, voidptr, bp->next); + STACK_PUSHX(stack, voidptr, bb + 1); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH_NEXT); + } + if (bp->n_last_matched > 0) + { + bb->last_matched = u; + STACK_PUSHX(stack, voidptr, bp->last_matched); + STACK_PUSHX(stack, int, bp->n_last_matched); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED); + } + break; + + case COPY_LAST_MATCHED: + i = tre_stack_pop_int(stack); + /* The tre_last_matched_pre_t * is still on the stack */ + STACK_PUSHX(stack, voidptr, u); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED_NEXT); + u += i; + break; + + case COPY_LAST_MATCHED_NEXT: + uu = tre_stack_pop_voidptr(stack); + up = tre_stack_pop_voidptr(stack); + uu->n_branches = up->n_branches; + uu->branches = b; + uu->start_tag = up->start_tag; + if (up->next) + { + STACK_PUSHX(stack, voidptr, up->next); + STACK_PUSHX(stack, voidptr, uu + 1); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED_NEXT); + } + STACK_PUSHX(stack, voidptr, up->branches); + STACK_PUSHX(stack, int, up->n_branches); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH); + break; + } + } + if (status != REG_OK) + goto error_post_compile; +#ifdef TRE_DEBUG + DPRINT(("last_matched_branch:\n")); + print_last_match_branch(buf, 0); + if (b != _b + tree->last_matched_branch->tot_branches) + DPRINT(("b/%p != _b + tree->last_matched_branch->tot_branches/%p\n", + b, _b + tree->last_matched_branch->tot_branches)); + if (u != _u + tree->last_matched_branch->tot_last_matched) + DPRINT(("u/%p != _u + " + "tree->last_matched_branch->tot_last_matched/%p\n", + u, _u + tree->last_matched_branch->tot_last_matched)); + if (t != _t + tree->last_matched_branch->tot_tags) + DPRINT(("t/%p != _t + tree->last_matched_branch->tot_tags/%p\n", + t, _t + tree->last_matched_branch->tot_tags)); +#endif /* TRE_DEBUG */ + tnfa->last_matched_branch = buf; + } +#ifdef TRE_DEBUG + else + DPRINT(("No last_match_branch_pre\n")); +#endif /* TRE_DEBUG */ } DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n", first_pass? "First pass" : "Second pass", num_tags)); - +#ifdef TRE_DEBUG + tre_ast_print(tree); +#endif /* TRE_DEBUG */ + DPRINT(("tre_add_tags: tree->num_tags=%d num_tags=%d\n", tree->num_tags, + num_tags)); assert(tree->num_tags == num_tags); tnfa->end_tag = num_tags; tnfa->num_tags = num_tags; tnfa->num_minimals = num_minimals; +error_post_compile: + xfree(reorder_tags); +error_reorder_tags: xfree(orig_regset); - xfree(parents); - xfree(saved_states); +error_regset: return status; } @@ -691,6 +1692,9 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, int pos = lit->position; int min = lit->code_min; int max = lit->code_max; + tre_bracket_match_list_t *list = !IS_SPECIAL(lit) ? + lit->u.bracket_match_list : + NULL; if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) { /* XXX - e.g. [ab] has only one position but two @@ -709,7 +1713,8 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, && first_tag) { /* Maximize the first tag. */ - tag_directions[max] = TRE_TAG_MAXIMIZE; + if (tag_directions[max] == TRE_TAG_LEFT_MAXIMIZE) + tag_directions[max] = TRE_TAG_MAXIMIZE; first_tag = 0; } *result = tre_ast_new_literal(mem, min, max, pos); @@ -718,6 +1723,10 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, if (pos > *max_pos) *max_pos = pos; + + if (!IS_SPECIAL(lit)) + ((tre_literal_t *)(*result)->obj)->u.bracket_match_list + = list; break; } case UNION: @@ -807,15 +1816,21 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, int pos_add = 0; int pos_add_total = 0; int max_pos = 0; +#ifdef TRE_APPROX /* Current approximate matching parameters. */ int params[TRE_PARAM_LAST]; /* Approximate parameter nesting level. */ int params_depth = 0; +#endif /* TRE_APPROX */ int iter_depth = 0; +#ifdef TRE_APPROX int i; +#endif /* TRE_APPROX */ +#ifdef TRE_APPROX for (i = 0; i < TRE_PARAM_LAST; i++) params[i] = TRE_PARAM_DEFAULT; +#endif /* TRE_APPROX */ STACK_PUSHR(stack, voidptr, ast); STACK_PUSHR(stack, int, EXPAND_RECURSE); @@ -893,7 +1908,20 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, int pos_add_last; pos_add = tre_stack_pop_int(stack); pos_add_last = pos_add; - if (iter->min > 1 || iter->max > 1) + /* Originally (in tre_parse_bound), if min == 0 && max == 0, we + immediate replace the whole iteration with EMPTY. This + unfortunately drops any submatches, and messes up setting the + pmatch values (we can get tags of -1, and tag values in the + billions). So we left it there and replace with EMPTY here. */ + if (iter->min == 0 && iter->max == 0) + { + tre_ast_node_t *empty = tre_ast_new_literal(mem, EMPTY, -1, -1); + if (empty == NULL) + return REG_ESPACE; + node->obj = empty->obj; + node->type = empty->type; + } + else if (iter->min > 1 || iter->max > 1) { tre_ast_node_t *seq1 = NULL, *seq2 = NULL; int j; @@ -975,6 +2003,7 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, if (iter_depth == 0) pos_add = pos_add_total; +#ifdef TRE_APPROX /* If approximate parameters are specified, surround the result with two parameter setting nodes. The one on the left sets the specified parameters, and the one on the right restores @@ -1021,6 +2050,7 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, if (params_depth > *max_depth) *max_depth = params_depth; } +#endif /* TRE_APPROX */ break; } default: @@ -1065,7 +2095,7 @@ tre_set_empty(tre_mem_t mem) static tre_pos_and_tags_t * tre_set_one(tre_mem_t mem, int position, int code_min, int code_max, - tre_ctype_t class, tre_ctype_t *neg_classes, int backref) + tre_bracket_match_list_t *bracket_match_list, int backref) { tre_pos_and_tags_t *new_set; @@ -1076,8 +2106,7 @@ tre_set_one(tre_mem_t mem, int position, int code_min, int code_max, new_set[0].position = position; new_set[0].code_min = code_min; new_set[0].code_max = code_max; - new_set[0].class = class; - new_set[0].neg_classes = neg_classes; + new_set[0].bracket_match_list = bracket_match_list; new_set[0].backref = backref; new_set[1].position = -1; new_set[1].code_min = -1; @@ -1108,8 +2137,7 @@ tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2, new_set[s1].code_min = set1[s1].code_min; new_set[s1].code_max = set1[s1].code_max; new_set[s1].assertions = set1[s1].assertions | assertions; - new_set[s1].class = set1[s1].class; - new_set[s1].neg_classes = set1[s1].neg_classes; + new_set[s1].bracket_match_list = set1[s1].bracket_match_list; new_set[s1].backref = set1[s1].backref; if (set1[s1].tags == NULL && tags == NULL) new_set[s1].tags = NULL; @@ -1153,8 +2181,7 @@ tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2, new_set[s1 + s2].code_max = set2[s2].code_max; /* XXX - why not | assertions here as well? */ new_set[s1 + s2].assertions = set2[s2].assertions; - new_set[s1 + s2].class = set2[s2].class; - new_set[s1 + s2].neg_classes = set2[s2].neg_classes; + new_set[s1 + s2].bracket_match_list = set2[s2].bracket_match_list; new_set[s1 + s2].backref = set2[s2].backref; if (set2[s2].tags == NULL) new_set[s1 + s2].tags = NULL; @@ -1344,11 +2371,11 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) lastpos = {i}. */ node->nullable = 0; node->firstpos = tre_set_one(mem, lit->position, 0, - TRE_CHAR_MAX, 0, NULL, -1); + TRE_CHAR_MAX, NULL, -1); if (!node->firstpos) return REG_ESPACE; node->lastpos = tre_set_one(mem, lit->position, 0, - TRE_CHAR_MAX, 0, NULL, + TRE_CHAR_MAX, NULL, (int)lit->code_max); if (!node->lastpos) return REG_ESPACE; @@ -1372,13 +2399,13 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) node->nullable = 0; node->firstpos = tre_set_one(mem, lit->position, (int)lit->code_min, - (int)lit->code_max, 0, NULL, -1); + (int)lit->code_max, NULL, -1); if (!node->firstpos) return REG_ESPACE; node->lastpos = tre_set_one(mem, lit->position, (int)lit->code_min, (int)lit->code_max, - lit->u.class, lit->neg_classes, + lit->u.bracket_match_list, -1); if (!node->lastpos) return REG_ESPACE; @@ -1436,14 +2463,89 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) case NFL_POST_ITERATION: { + int num_tags, *tags, assertions, params_seen; + int *params; + reg_errcode_t status; tre_iteration_t *iter = (tre_iteration_t *)node->obj; + /* From Ville Laurikari's original 2001 Master's thesis, the + firstpos(n) and lastpos(n) of an iteration is just the + corresponding values of the iteration's argument. Unfortunately, + this isn't sufficient for the following BRE: + + \(a*\)*b\(\1\) matched against ab + + The backreference wants to force the first subexpression to + be the empty string, but there is no transition for this. So + we need to modify the lastpos(n) of an iteration to be the + equivalent of that of catentation. Using the same notation as + in the thesis, lastpos(n) is redefined as: + + if nullable(c1) then + lastpos(c1) U + addtags(lastpos(c1), + emptymatch(c1)) + else + lastpos(c1) + + where c1 is the argument node. firstpos(n) remains the same. */ + + /* Compute lastpos. */ if (iter->min == 0 || iter->arg->nullable) - node->nullable = 1; + { + node->nullable = 1; + if (iter->arg->nullable) + { + /* The arg matches the empty string. Make a first pass + with tre_match_empty() to get the number of tags and + parameters. */ + status = tre_match_empty(stack, iter->arg, + NULL, NULL, NULL, &num_tags, + ¶ms_seen); + if (status != REG_OK) + return status; + /* Allocate arrays for the tags and parameters. */ + tags = xmalloc(sizeof(int) * (num_tags + 1)); + if (!tags) + return REG_ESPACE; + tags[0] = -1; + assertions = 0; + params = NULL; + if (params_seen) + { + params = tre_mem_alloc(mem, sizeof(*params) + * TRE_PARAM_LAST); + if (!params) + { + xfree(tags); + return REG_ESPACE; + } + } + /* Second pass with tre_mach_empty() to get the list of + tags and parameters. */ + status = tre_match_empty(stack, iter->arg, tags, + &assertions, params, NULL, NULL); + if (status != REG_OK) + { + xfree(tags); + return status; + } + node->lastpos = + tre_set_union(mem, iter->arg->lastpos, iter->arg->lastpos, + tags, assertions, params); + xfree(tags); + if (!node->lastpos) + return REG_ESPACE; + } + else + node->lastpos = iter->arg->lastpos; + } else - node->nullable = 0; + { + node->nullable = 0; + node->lastpos = iter->arg->lastpos; + } node->firstpos = iter->arg->firstpos; - node->lastpos = iter->arg->lastpos; break; } @@ -1624,30 +2726,23 @@ tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, trans->state = transitions + offs[p2->position]; trans->state_id = p2->position; trans->assertions = p1->assertions | p2->assertions - | (p1->class ? ASSERT_CHAR_CLASS : 0) - | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0); + | (p1->bracket_match_list != NULL ? ASSERT_BRACKET_MATCH : 0); if (p1->backref >= 0) { - assert((trans->assertions & ASSERT_CHAR_CLASS) == 0); + assert((trans->assertions & ASSERT_BRACKET_MATCH) == 0); assert(p2->backref < 0); trans->u.backref = p1->backref; trans->assertions |= ASSERT_BACKREF; } - else - trans->u.class = p1->class; - if (p1->neg_classes != NULL) + if (p1->bracket_match_list != NULL) { - for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++); - trans->neg_classes = - xmalloc(sizeof(*trans->neg_classes) * (i + 1)); - if (trans->neg_classes == NULL) + trans->u.bracket_match_list = + xmalloc(SIZEOF_BRACKET_MATCH_LIST(p1->bracket_match_list)); + if (trans->u.bracket_match_list == NULL) return REG_ESPACE; - for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++) - trans->neg_classes[i] = p1->neg_classes[i]; - trans->neg_classes[i] = (tre_ctype_t)0; + memcpy(trans->u.bracket_match_list, p1->bracket_match_list, + SIZEOF_BRACKET_MATCH_LIST(p1->bracket_match_list)); } - else - trans->neg_classes = NULL; /* Find out how many tags this transition has. */ i = 0; @@ -1748,10 +2843,9 @@ tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, DPRINT((", assert %d", trans->assertions)); if (trans->assertions & ASSERT_BACKREF) DPRINT((", backref %d", trans->u.backref)); - else if (trans->u.class) - DPRINT((", class %ld", (long)trans->u.class)); - if (trans->neg_classes) - DPRINT((", neg_classes %p", trans->neg_classes)); + else if (trans->assertions & ASSERT_BRACKET_MATCH) + DPRINT((", bracket_match_list %p", + trans->u.bracket_match_list)); if (trans->params) { DPRINT((", ")); @@ -1852,7 +2946,8 @@ tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions, int -tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) +tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags, + locale_t loc) { tre_stack_t *stack; tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r; @@ -1861,7 +2956,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) int i, add = 0; tre_tnfa_transition_t *transitions, *initial; tre_tnfa_t *tnfa = NULL; - tre_submatch_data_t *submatch_data; + tre_submatch_data_t *submatch_data = NULL; tre_tag_direction_t *tag_directions = NULL; reg_errcode_t errcode; tre_mem_t mem; @@ -1888,8 +2983,15 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) parse_ctx.stack = stack; parse_ctx.re = regex; parse_ctx.len = n; + /* Only allow REG_UNGREEDY to be set if both REG_ENHANCED and REG_EXTENDED + are also set */ + if ((cflags & (REG_ENHANCED | REG_EXTENDED)) != (REG_ENHANCED | REG_EXTENDED)) + cflags &= ~REG_UNGREEDY; parse_ctx.cflags = cflags; parse_ctx.max_backref = -1; + parse_ctx.loc = loc; + parse_ctx.submatch_id_invisible = SUBMATCH_ID_INVISIBLE_START; + DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex)); errcode = tre_parse(&parse_ctx); if (errcode != REG_OK) @@ -1917,10 +3019,14 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) tnfa->have_backrefs = parse_ctx.max_backref >= 0; tnfa->have_approx = parse_ctx.have_approx; tnfa->num_submatches = parse_ctx.submatch_id; + tnfa->num_submatches_invisible = parse_ctx.submatch_id_invisible + - SUBMATCH_ID_INVISIBLE_START; + tnfa->num_reorder_tags = parse_ctx.num_reorder_tags; + tnfa->loc = parse_ctx.loc; /* Set up tags for submatch addressing. If REG_NOSUB is set and the regexp does not have back references, this can be skipped. */ - if (tnfa->have_backrefs || !(cflags & REG_NOSUB)) + if (tnfa->num_reorder_tags > 0 || !(cflags & REG_NOSUB)) { DPRINT(("tre_compile: setting up tags\n")); @@ -1942,7 +3048,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) memset(tag_directions, -1, sizeof(*tag_directions) * (tnfa->num_tags + 1)); } - tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1, + tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 3, sizeof(tnfa->minimal_tags)); if (tnfa->minimal_tags == NULL) ERROR_EXIT(REG_ESPACE); @@ -1951,6 +3057,12 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) sizeof(*submatch_data)); if (submatch_data == NULL) ERROR_EXIT(REG_ESPACE); + /* Set the eo_tag value to -1 to indicate that that corresponding + * submatch has not be completed yet */ + for (i = 0; i < parse_ctx.submatch_id; i++) + { + submatch_data[i].eo_tag = -1; + } tnfa->submatch_data = submatch_data; errcode = tre_add_tags(mem, stack, tree, tnfa); @@ -1961,10 +3073,8 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) for (i = 0; i < parse_ctx.submatch_id; i++) DPRINT(("pmatch[%d] = {t%d, t%d}\n", i, submatch_data[i].so_tag, submatch_data[i].eo_tag)); - for (i = 0; i < tnfa->num_tags; i++) - DPRINT(("t%d is %s\n", i, - tag_directions[i] == TRE_TAG_MINIMIZE ? - "minimized" : "maximized")); + for (i = 0; i <= tnfa->num_tags; i++) + DPRINT(("t%d is %s\n", i, tag_dir_str[tag_directions[i]])); #endif /* TRE_DEBUG */ } @@ -1990,6 +3100,13 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) #ifdef TRE_DEBUG tre_ast_print(tree); DPRINT(("Number of states: %d\n", parse_ctx.position)); + if (submatch_data) + for (i = 0; i < parse_ctx.submatch_id; i++) + DPRINT(("pmatch[%d] = {t%d, t%d}\n", + i, submatch_data[i].so_tag, submatch_data[i].eo_tag)); + if (tag_directions) + for (i = 0; i <= tnfa->num_tags; i++) + DPRINT(("t%d is %s\n", i, tag_dir_str[tag_directions[i]])); #endif /* TRE_DEBUG */ errcode = tre_compute_nfl(mem, stack, tree); @@ -2026,52 +3143,36 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) if (errcode != REG_OK) ERROR_EXIT(errcode); - /* If in eight bit mode, compute a table of characters that can be the - first character of a match. */ + + /* Set first_char only if there is only one character that can be the + first character of a match */ tnfa->first_char = -1; - if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable) + if (!tmp_ast_l->nullable) { - int count = 0; - tre_cint_t k; - DPRINT(("Characters that can start a match:")); - tnfa->firstpos_chars = xcalloc(256, sizeof(char)); - if (tnfa->firstpos_chars == NULL) - ERROR_EXIT(REG_ESPACE); - for (p = tree->firstpos; p->position >= 0; p++) + int scanning = 1; + for (p = tree->firstpos; scanning && p->position >= 0; p++) { tre_tnfa_transition_t *j = transitions + offs[p->position]; while (j->state != NULL) { - for (k = j->code_min; k <= j->code_max && k < 256; k++) + if (j->code_min <= j->code_max) { - DPRINT((" %d", k)); - tnfa->firstpos_chars[k] = 1; - count++; + if (j->code_max != j->code_min || j->code_min == -1 || tnfa->first_char != -1) + { + tnfa->first_char = -1; + scanning = 0; + break; + } + tnfa->first_char = j->code_min; } j++; } } - DPRINT(("\n")); -#define TRE_OPTIMIZE_FIRST_CHAR 1 -#if TRE_OPTIMIZE_FIRST_CHAR - if (count == 1) - { - for (k = 0; k < 256; k++) - if (tnfa->firstpos_chars[k]) - { - DPRINT(("first char must be %d\n", k)); - tnfa->first_char = k; - xfree(tnfa->firstpos_chars); - tnfa->firstpos_chars = NULL; - break; - } - } -#endif - +#ifdef TRE_DEBUG + if (tnfa->first_char >= 0) + DPRINT(("first char must be %d\n", tnfa->first_char)); +#endif /* TRE_DEBUG */ } - else - tnfa->firstpos_chars = NULL; - p = tree->firstpos; i = 0; @@ -2150,14 +3251,19 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) tnfa->num_states = parse_ctx.position; tnfa->cflags = cflags; - DPRINT(("final state %p\n", (void *)tnfa->final)); + DPRINT(("final state %d (%p)\n", tree->lastpos[0].position, + (void *)tnfa->final)); tre_mem_destroy(mem); tre_stack_destroy(stack); xfree(counts); xfree(offs); +#ifdef TRE_USE_SYSTEM_REGEX_H + preg->re_magic = RE_MAGIC; +#endif /* TRE_USE_SYSTEM_REGEX_H */ preg->TRE_REGEX_T_FIELD = (void *)tnfa; + xlocale_retain(tnfa->loc); return REG_OK; error_exit: @@ -2169,7 +3275,12 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) xfree(counts); if (offs != NULL) xfree(offs); + + /* Set tnfa into preg, so that calling tre_free() will free the contents + of tnfa. NULL out the loc field since we never retained the locale. */ preg->TRE_REGEX_T_FIELD = (void *)tnfa; + if(tnfa) tnfa->loc = NULL; + tre_free(preg); return errcode; } @@ -2184,17 +3295,21 @@ tre_free(regex_t *preg) unsigned int i; tre_tnfa_transition_t *trans; +#ifdef TRE_USE_SYSTEM_REGEX_H + preg->re_magic = 0; +#endif /* TRE_USE_SYSTEM_REGEX_H */ tnfa = (void *)preg->TRE_REGEX_T_FIELD; if (!tnfa) return; + preg->TRE_REGEX_T_FIELD = NULL; for (i = 0; i < tnfa->num_transitions; i++) if (tnfa->transitions[i].state) { if (tnfa->transitions[i].tags) xfree(tnfa->transitions[i].tags); - if (tnfa->transitions[i].neg_classes) - xfree(tnfa->transitions[i].neg_classes); + if (tnfa->transitions[i].assertions & ASSERT_BRACKET_MATCH) + xfree(tnfa->transitions[i].u.bracket_match_list); if (tnfa->transitions[i].params) xfree(tnfa->transitions[i].params); } @@ -2215,18 +3330,20 @@ tre_free(regex_t *preg) if (tnfa->submatch_data) { - for (i = 0; i < tnfa->num_submatches; i++) - if (tnfa->submatch_data[i].parents) - xfree(tnfa->submatch_data[i].parents); xfree(tnfa->submatch_data); } if (tnfa->tag_directions) xfree(tnfa->tag_directions); - if (tnfa->firstpos_chars) - xfree(tnfa->firstpos_chars); if (tnfa->minimal_tags) xfree(tnfa->minimal_tags); + + if (tnfa->loc) + xlocale_release(tnfa->loc); + + if (tnfa->last_matched_branch) + xfree(tnfa->last_matched_branch); + xfree(tnfa); } diff --git a/contrib/tre/lib/tre-compile.h b/contrib/tre/lib/tre-compile.h index 51d5ac94ac..355ec4d68c 100644 --- a/contrib/tre/lib/tre-compile.h +++ b/contrib/tre/lib/tre-compile.h @@ -16,10 +16,10 @@ typedef struct { int code_max; int *tags; int assertions; - tre_ctype_t class; - tre_ctype_t *neg_classes; + tre_bracket_match_list_t *bracket_match_list; int backref; int *params; + locale_t loc; } tre_pos_and_tags_t; #endif /* TRE_COMPILE_H */ diff --git a/contrib/tre/lib/tre-internal.h b/contrib/tre/lib/tre-internal.h index dacde8c872..1dec1c2ba4 100644 --- a/contrib/tre/lib/tre-internal.h +++ b/contrib/tre/lib/tre-internal.h @@ -18,7 +18,9 @@ #endif /* !HAVE_WCTYPE_H */ #include +#include #include "tre.h" +#include "xlocale_private.h" #ifdef TRE_DEBUG #include @@ -31,6 +33,7 @@ #ifdef HAVE_MBRTOWC #define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps))) +#define tre_mbrtowc_l(pwc,s,n,ps,l) (mbrtowc_l((pwc), (s), (n), (ps), (l))) #else /* !HAVE_MBRTOWC */ #ifdef HAVE_MBTOWC #define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n))) @@ -52,6 +55,7 @@ typedef wint_t tre_cint_t; #ifdef TRE_MULTIBYTE #define TRE_MB_CUR_MAX MB_CUR_MAX +#define TRE_MB_CUR_MAX_L MB_CUR_MAX_L #else /* !TRE_MULTIBYTE */ #define TRE_MB_CUR_MAX 1 #endif /* !TRE_MULTIBYTE */ @@ -75,6 +79,14 @@ typedef wint_t tre_cint_t; #define tre_toupper towupper #define tre_strlen wcslen +#define tre_isalnum_l iswalnum_l +#define tre_isdigit_l iswdigit_l +#define tre_islower_l iswlower_l +#define tre_isupper_l iswupper_l +#define tre_isxdigit_l iswxdigit_l +#define tre_tolower_l towlower_l +#define tre_toupper_l towupper_l + #else /* !TRE_WCHAR */ /* 8 bit characters. */ @@ -115,6 +127,8 @@ typedef short tre_cint_t; typedef wctype_t tre_ctype_t; #define tre_isctype iswctype #define tre_ctype wctype +#define tre_isctype_l iswctype_l +#define tre_ctype_l wctype_l #else /* !TRE_USE_SYSTEM_WCTYPE */ /* Define our own versions of iswctype() and wctype(). */ typedef int (*tre_ctype_t)(tre_cint_t); @@ -122,7 +136,7 @@ typedef int (*tre_ctype_t)(tre_cint_t); tre_ctype_t tre_ctype(const char *name); #endif /* !TRE_USE_SYSTEM_WCTYPE */ -typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t; +typedef enum { STR_WIDE, STR_BYTE, STR_MBS } tre_str_type_t; /* Returns number of bytes to add to (char *)ptr to make it properly aligned for the type. */ @@ -143,6 +157,47 @@ typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t; #define STRF "s" #endif /* !TRE_WCHAR */ +/* Types to handle bracket expressions. */ +typedef enum { + TRE_BRACKET_MATCH_TYPE_UNUSED = 0, + TRE_BRACKET_MATCH_TYPE_CHAR, /* Single character value */ + TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN, /* Collation range begin */ + TRE_BRACKET_MATCH_TYPE_RANGE_END, /* Collation range end */ + TRE_BRACKET_MATCH_TYPE_CLASS, /* Character class */ + TRE_BRACKET_MATCH_TYPE_EQUIVALENCE, /* Collation equivalence value */ +} tre_bracket_match_type_t; + +typedef struct { + tre_bracket_match_type_t type; + tre_cint_t value; +} tre_bracket_match_t; + +#define TRE_BRACKET_MATCH_FLAG_NEGATE 1 + +typedef struct { + int num_bracket_matches; + int flags; + tre_bracket_match_t bracket_matches[0]; +} tre_bracket_match_list_t; + +#define SIZEOF_BRACKET_MATCH_LIST_N(n) (sizeof(tre_bracket_match_list_t) + \ + sizeof(tre_bracket_match_t) * (n)) +#define SIZEOF_BRACKET_MATCH_LIST(l) SIZEOF_BRACKET_MATCH_LIST_N( \ + (l)->num_bracket_matches) + +/* The "count" field is the number of time the tag was set, initially zero. + The "first" field contains the first set value (when "count" equals 1). + The "value" field contains the current value of the tag, if "count" is + greater than zero (the tag's current value is -1 if "count" is zero). + The "touch" field is the touch value, a montonically increasing value + (maintained by the caller) set each time the tag itself is set. */ +typedef struct { + int count; + int first; + int value; + int touch; +} tre_tag_t; + /* TNFA transition type. A TNFA state is an array of transitions, the terminator is a transition with NULL `state'. */ typedef struct tnfa_transition tre_tnfa_transition_t; @@ -163,32 +218,30 @@ struct tnfa_transition { int assertions; /* Assertion parameters. */ union { - /* Character class assertion. */ - tre_ctype_t class; + /* Bracket matches. */ + tre_bracket_match_list_t *bracket_match_list; /* Back reference assertion. */ int backref; } u; - /* Negative character class assertions. */ - tre_ctype_t *neg_classes; }; /* Assertions. */ #define ASSERT_AT_BOL 1 /* Beginning of line. */ #define ASSERT_AT_EOL 2 /* End of line. */ -#define ASSERT_CHAR_CLASS 4 /* Character class in `class'. */ -#define ASSERT_CHAR_CLASS_NEG 8 /* Character classes in `neg_classes'. */ -#define ASSERT_AT_BOW 16 /* Beginning of word. */ -#define ASSERT_AT_EOW 32 /* End of word. */ -#define ASSERT_AT_WB 64 /* Word boundary. */ -#define ASSERT_AT_WB_NEG 128 /* Not a word boundary. */ -#define ASSERT_BACKREF 256 /* A back reference in `backref'. */ -#define ASSERT_LAST 256 +#define ASSERT_BRACKET_MATCH 4 /* Matches in `bracket_match_list'. */ +#define ASSERT_AT_BOW 8 /* Beginning of word. */ +#define ASSERT_AT_EOW 16 /* End of word. */ +#define ASSERT_AT_WB 32 /* Word boundary. */ +#define ASSERT_AT_WB_NEG 64 /* Not a word boundary. */ +#define ASSERT_BACKREF 128 /* A back reference in `backref'. */ +#define ASSERT_LAST 128 /* Tag directions. */ typedef enum { TRE_TAG_MINIMIZE = 0, - TRE_TAG_MAXIMIZE = 1 + TRE_TAG_MAXIMIZE, + TRE_TAG_LEFT_MAXIMIZE, } tre_tag_direction_t; /* Parameters that can be changed dynamically while matching. */ @@ -218,66 +271,95 @@ struct tre_submatch_data { int so_tag; /* Tag that gives the value for rm_eo (submatch end offset). */ int eo_tag; - /* List of submatches this submatch is contained in. */ - int *parents; }; typedef struct tre_submatch_data tre_submatch_data_t; +/* LAST MATCHED structures */ +struct __previous_type; +struct __previous_branch_type; +struct __current_type; +struct __current_branch_type; + +typedef struct __previous_branch_type { + struct __previous_branch_type *next; + struct __previous_type *last_matched; int n_last_matched; int cmp_tag; int n_tags; + int tot_branches; + int tot_last_matched; + int tot_tags; + bitstr_t tags[0]; +} tre_last_matched_branch_pre_t; + +typedef struct __previous_type { + struct __previous_type *next; + tre_last_matched_branch_pre_t *branches; int n_branches; int start_tag; + int tot_branches; + int tot_last_matched; + int tot_tags; +} tre_last_matched_pre_t; + +typedef struct __current_branch_type { + int *tags; + struct __current_type *last_matched; int n_last_matched; int cmp_tag; int n_tags; +} tre_last_matched_branch_t; + +typedef struct __current_type { + tre_last_matched_branch_t *branches; int n_branches; int start_tag; +} tre_last_matched_t; + /* TNFA definition. */ typedef struct tnfa tre_tnfa_t; struct tnfa { tre_tnfa_transition_t *transitions; - unsigned int num_transitions; tre_tnfa_transition_t *initial; tre_tnfa_transition_t *final; tre_submatch_data_t *submatch_data; - char *firstpos_chars; - int first_char; - unsigned int num_submatches; tre_tag_direction_t *tag_directions; int *minimal_tags; + tre_last_matched_branch_t *last_matched_branch; + locale_t loc; + unsigned int num_transitions; + int first_char; + unsigned int num_submatches; + unsigned int num_submatches_invisible; int num_tags; int num_minimals; int end_tag; int num_states; int cflags; int have_backrefs; + int num_reorder_tags; int have_approx; int params_depth; }; int -tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags); +tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags, + locale_t loc); void tre_free(regex_t *preg); -void -tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, - const tre_tnfa_t *tnfa, int *tags, int match_eo); - reg_errcode_t -tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, - tre_str_type_t type, int *match_tags, int eflags, - int *match_end_ofs); +tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, + const tre_tnfa_t *tnfa, const tre_tag_t *tags, int match_eo); reg_errcode_t tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, - tre_str_type_t type, int *match_tags, int eflags, + tre_str_type_t type, tre_tag_t *match_tags, int eflags, int *match_end_ofs); reg_errcode_t tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, - int len, tre_str_type_t type, int *match_tags, + int len, tre_str_type_t type, tre_tag_t *match_tags, int eflags, int *match_end_ofs); #ifdef TRE_APPROX reg_errcode_t tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len, - tre_str_type_t type, int *match_tags, + tre_str_type_t type, tre_tag_t *match_tags, regamatch_t *match, regaparams_t params, int eflags, int *match_end_ofs); #endif /* TRE_APPROX */ diff --git a/contrib/tre/lib/tre-match-backtrack.c b/contrib/tre/lib/tre-match-backtrack.c index eab13a7bc8..726c4c01fb 100644 --- a/contrib/tre/lib/tre-match-backtrack.c +++ b/contrib/tre/lib/tre-match-backtrack.c @@ -34,6 +34,10 @@ #include #endif /* HAVE_CONFIG_H */ +/* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state + info while running */ +#undef TRE_USE_ALLOCA + #ifdef TRE_USE_ALLOCA /* AIX requires this to be the first thing in the file. */ #ifndef __GNUC__ @@ -75,6 +79,7 @@ char *alloca (); typedef struct { int pos; + unsigned int pos_add_next; const char *str_byte; #ifdef TRE_WCHAR const wchar_t *str_wide; @@ -82,7 +87,7 @@ typedef struct { tre_tnfa_transition_t *state; int state_id; int next_c; - int *tags; + tre_tag_t *tags; #ifdef TRE_MBSTATE mbstate_t mbstate; #endif /* TRE_MBSTATE */ @@ -122,10 +127,9 @@ typedef struct tre_backtrack_struct { #endif /* !TRE_USE_ALLOCA */ -#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \ +#define BT_STACK_PUSH(_pos, _pos_add_next, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \ do \ { \ - int i; \ if (!stack->next) \ { \ tre_backtrack_t s; \ @@ -144,7 +148,7 @@ typedef struct tre_backtrack_struct { s->prev = stack; \ s->next = NULL; \ s->item.tags = tre_bt_mem_alloc(mem, \ - sizeof(*tags) * tnfa->num_tags); \ + num_tags * sizeof(*tags)); \ if (!s->item.tags) \ { \ tre_bt_mem_destroy(mem); \ @@ -162,13 +166,13 @@ typedef struct tre_backtrack_struct { else \ stack = stack->next; \ stack->item.pos = (_pos); \ + stack->item.pos_add_next = (_pos_add_next); \ stack->item.str_byte = (_str_byte); \ BT_STACK_WIDE_IN(_str_wide); \ stack->item.state = (_state); \ stack->item.state_id = (_state_id); \ stack->item.next_c = (_next_c); \ - for (i = 0; i < tnfa->num_tags; i++) \ - stack->item.tags[i] = (_tags)[i]; \ + memcpy(stack->item.tags, (_tags), num_tags * sizeof(*(_tags))); \ BT_STACK_MBSTATE_IN; \ } \ while (/*CONSTCOND*/0) @@ -176,17 +180,14 @@ typedef struct tre_backtrack_struct { #define BT_STACK_POP() \ do \ { \ - int i; \ assert(stack->prev); \ pos = stack->item.pos; \ - if (type == STR_USER) \ - str_source->rewind(pos + pos_add_next, str_source->context); \ + pos_add_next = stack->item.pos_add_next; \ str_byte = stack->item.str_byte; \ BT_STACK_WIDE_OUT; \ state = stack->item.state; \ next_c = stack->item.next_c; \ - for (i = 0; i < tnfa->num_tags; i++) \ - tags[i] = stack->item.tags[i]; \ + memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \ BT_STACK_MBSTATE_OUT; \ stack = stack->prev; \ } \ @@ -197,7 +198,7 @@ typedef struct tre_backtrack_struct { reg_errcode_t tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, - int len, tre_str_type_t type, int *match_tags, + int len, tre_str_type_t type, tre_tag_t *match_tags, int eflags, int *match_end_ofs) { /* State variables required by GET_NEXT_WCHAR. */ @@ -214,7 +215,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, int reg_notbol = eflags & REG_NOTBOL; int reg_noteol = eflags & REG_NOTEOL; int reg_newline = tnfa->cflags & REG_NEWLINE; - int str_user_end = 0; + int i; /* These are used to remember the necessary values of the above variables to return to the position where the current search @@ -232,7 +233,8 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, /* End offset of best match so far, or -1 if no match found yet. */ int match_eo = -1; /* Tag arrays. */ - int *next_tags, *tags = NULL; + int *next_tags; + tre_tag_t *tags = NULL; /* Current TNFA state. */ tre_tnfa_transition_t *state; int *states_seen = NULL; @@ -245,7 +247,12 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, tre_tnfa_transition_t *trans_i; regmatch_t *pmatch = NULL; - int ret; + reg_errcode_t ret; + + int num_tags = tnfa->num_tags; + int touch = 1; + char *buf; + int tbytes; #ifdef TRE_MBSTATE memset(&mbstate, '\0', sizeof(mbstate)); @@ -265,57 +272,48 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, DPRINT(("tnfa_execute_backtrack, input type %d\n", type)); DPRINT(("len = %d\n", len)); + { + int pbytes, sbytes, total_bytes; + char *tmp_buf; + /* Compute the length of the block we need. */ + tbytes = sizeof(*tags) * num_tags; + pbytes = sizeof(*pmatch) * tnfa->num_submatches; + sbytes = sizeof(*states_seen) * tnfa->num_states; + total_bytes = + (sizeof(long) - 1) * 2 /* for alignment paddings */ + + tbytes + pbytes + sbytes; + + DPRINT(("tre_tnfa_run_backtrack, allocate %d bytes\n", total_bytes)); + /* Allocate the memory. */ #ifdef TRE_USE_ALLOCA - tags = alloca(sizeof(*tags) * tnfa->num_tags); - pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches); - states_seen = alloca(sizeof(*states_seen) * tnfa->num_states); + buf = alloca(total_bytes); #else /* !TRE_USE_ALLOCA */ - if (tnfa->num_tags) - { - tags = xmalloc(sizeof(*tags) * tnfa->num_tags); - if (!tags) - { - ret = REG_ESPACE; - goto error_exit; - } - } - if (tnfa->num_submatches) - { - pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches); - if (!pmatch) - { - ret = REG_ESPACE; - goto error_exit; - } - } - if (tnfa->num_states) - { - states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states); - if (!states_seen) - { - ret = REG_ESPACE; - goto error_exit; - } - } + buf = xmalloc((unsigned)total_bytes); #endif /* !TRE_USE_ALLOCA */ + if (buf == NULL) + return REG_ESPACE; + + /* Get the various pointers within tmp_buf (properly aligned). */ + tags = (void *)buf; + tmp_buf = buf + tbytes; + tmp_buf += ALIGN(tmp_buf, long); + pmatch = (void *)tmp_buf; + tmp_buf += pbytes; + tmp_buf += ALIGN(tmp_buf, long); + states_seen = (void *)tmp_buf; + } retry: { - int i; - for (i = 0; i < tnfa->num_tags; i++) - { - tags[i] = -1; - if (match_tags) - match_tags[i] = -1; - } + memset(tags, 0, num_tags * sizeof(*tags)); + if (match_tags) + memset(match_tags, 0, num_tags * sizeof(*match_tags)); for (i = 0; i < tnfa->num_states; i++) states_seen[i] = 0; } state = NULL; pos = pos_start; - if (type == STR_USER) - str_source->rewind(pos + pos_add_next, str_source->context); GET_NEXT_WCHAR(); pos_start = pos; next_c_start = next_c; @@ -347,20 +345,26 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, { /* Backtrack to this state. */ DPRINT(("saving state %d for backtracking\n", trans_i->state_id)); - BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state, + BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, trans_i->state, trans_i->state_id, next_c, tags, mbstate); { int *tmp = trans_i->tags; if (tmp) - while (*tmp >= 0) - stack->item.tags[*tmp++] = pos; + { + while (*tmp >= 0) + tre_tag_set(stack->item.tags, *tmp++, pos, touch); + touch++; + } } } } if (next_tags) - for (; *next_tags >= 0; next_tags++) - tags[*next_tags] = pos; + { + for (; *next_tags >= 0; next_tags++) + tre_tag_set(tags, *next_tags, pos, touch); + touch++; + } DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte)); @@ -376,22 +380,103 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, int empty_br_match; DPRINT(("start loop\n")); + + if (match_eo >= 0 && tnfa->num_minimals) + { + int skip = 0; +#ifdef TRE_DEBUG + DPRINT(("Checking minimal conditions: match_eo=%d match_tags=", + match_eo)); + tre_print_tags(match_tags, tnfa->num_tags); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ + for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) + { + int end = tnfa->minimal_tags[i]; + int start = tnfa->minimal_tags[i + 1]; + DPRINT((" Minimal start %d, end %d\n", start, end)); + if (tre_minimal_tag_order(start, end, match_tags, tags) > 0) + { + skip = 1; + break; + } + } + if (!skip) + { +#ifdef TRE_DEBUG + DPRINT((" Keeping tags=")); + tre_print_tags(tags, tnfa->num_tags); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ + } + else + { +#ifdef TRE_DEBUG + DPRINT((" Throwing out tags=")); + tre_print_tags(tags, tnfa->num_tags); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ + goto backtrack; + } + } + if (state == tnfa->final) { - DPRINT((" match found, %d %d\n", match_eo, pos)); + DPRINT((" match found, match_eo=%d pos=%d\n", match_eo, pos)); + + if (match_eo >= 0 && tnfa->num_minimals) + { + int compare = 0; +#ifdef TRE_DEBUG + DPRINT(("Checking minimal conditions: match_eo=%d " + "match_tags=", match_eo)); + tre_print_tags(match_tags, tnfa->num_tags); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ + for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) + { + int end = tnfa->minimal_tags[i]; + int start = tnfa->minimal_tags[i + 1]; + DPRINT((" Minimal start %d, end %d\n", start, end)); + if ((compare = tre_minimal_tag_order(start, end, + match_tags, tags)) != 0) + break; + } + if (compare > 0) + { +#ifdef TRE_DEBUG + DPRINT((" Throwing out new match, tags=")); + tre_print_tags(tags, tnfa->num_tags); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ + goto backtrack; + } + else if (compare < 0) + { +#ifdef TRE_DEBUG + DPRINT((" Throwing out old match, tags=")); + tre_print_tags(match_tags, tnfa->num_tags); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ + match_eo = -1; + } + } + if (match_eo < pos || (match_eo == pos && match_tags && tre_tag_order(tnfa->num_tags, tnfa->tag_directions, tags, match_tags))) { - int i; /* This match wins the previous match. */ - DPRINT((" win previous\n")); +#ifdef TRE_DEBUG + DPRINT((" win previous tags=")); + tre_print_tags(tags, tnfa->num_tags); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ match_eo = pos; if (match_tags) - for (i = 0; i < tnfa->num_tags; i++) - match_tags[i] = tags[i]; + memcpy(match_tags, tags, num_tags * sizeof(*tags)); } /* Our TNFAs never have transitions leaving from the final state, so we jump right to backtracking. */ @@ -401,12 +486,8 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, #ifdef TRE_DEBUG DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c, state)); - { - int i; - for (i = 0; i < tnfa->num_tags; i++) - DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : "")); - DPRINT(("\n")); - } + tre_print_tags(tags, tnfa->num_tags); + DPRINT(("\n")); #endif /* TRE_DEBUG */ /* Go to the next character in the input string. */ @@ -424,8 +505,9 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, DPRINT((" should match back reference %d\n", bt)); /* Get the substring we need to match against. Remember to turn off REG_NOSUB temporarily. */ - tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & /*LINTED*/!REG_NOSUB, + ret = tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB, tnfa, tags, pos); + if (ret != REG_OK) goto error_exit; so = pmatch[bt].rm_so; eo = pmatch[bt].rm_eo; bt_len = eo - so; @@ -440,14 +522,14 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, if (type == STR_BYTE) { - DPRINT((" substring (len %d) is [%d, %d[: '%.*s'\n", + DPRINT((" substring (len %d) is [%d, %d]: '%.*s'\n", bt_len, so, eo, bt_len, (char*)string + so)); DPRINT((" current string is '%.*s'\n", slen, str_byte - 1)); } #ifdef TRE_WCHAR else if (type == STR_WIDE) { - DPRINT((" substring (len %d) is [%d, %d[: '%.*" STRF "'\n", + DPRINT((" substring (len %d) is [%d, %d]: '%.*" STRF "'\n", bt_len, so, eo, bt_len, (wchar_t*)string + so)); DPRINT((" current string is '%.*" STRF "'\n", slen, str_wide - 1)); @@ -456,19 +538,19 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, } #endif - if (len < 0) + if (so < 0) + { + result = 1; /* Back reference of nomatch doesn't match */ + } + else if (len < 0) { - if (type == STR_USER) - result = str_source->compare((unsigned)so, (unsigned)pos, - (unsigned)bt_len, - str_source->context); #ifdef TRE_WCHAR - else if (type == STR_WIDE) + if (type == STR_WIDE) result = wcsncmp((const wchar_t*)string + so, str_wide - 1, (size_t)bt_len); -#endif /* TRE_WCHAR */ else - result = strncmp((const char*)string + so, str_byte - 1, +#endif /* TRE_WCHAR */ + result = strncmp((const char*)string + so, str_byte - 1, (size_t)bt_len); } else if (len - pos < bt_len) @@ -517,12 +599,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, /* Check for end of string. */ if (len < 0) { - if (type == STR_USER) - { - if (str_user_end) - goto backtrack; - } - else if (next_c == L'\0') + if (next_c == L'\0') goto backtrack; } else @@ -568,12 +645,14 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, jump back here if needed. */ DPRINT((" saving state %d for backtracking\n", trans_i->state_id)); - BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state, - trans_i->state_id, next_c, tags, mbstate); + BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, + trans_i->state, trans_i->state_id, next_c, + tags, mbstate); { int *tmp; for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++) - stack->item.tags[*tmp] = pos; + tre_tag_set(stack->item.tags, *tmp, pos, touch); + touch++; } #if 0 /* XXX - it's important not to look at all transitions here to keep the stack small! */ @@ -590,8 +669,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, /* Update the tag values. */ if (next_tags) - while (*next_tags >= 0) - tags[*next_tags++] = pos; + { + while (*next_tags >= 0) + tre_tag_set(tags, *next_tags++, pos, touch); + touch++; + } } else { @@ -600,7 +682,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, if (stack->prev) { DPRINT((" backtracking\n")); - if (stack->item.state->assertions && ASSERT_BACKREF) + if (stack->item.state->assertions & ASSERT_BACKREF) { DPRINT((" states_seen[%d] = 0\n", stack->item.state_id)); @@ -613,20 +695,23 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, { /* Try starting from a later position in the input string. */ /* Check for end of string. */ - if (len < 0) + if (pos == pos_start) { - if (next_c == L'\0') + if (len < 0) { - DPRINT(("end of string.\n")); - break; + if (next_c == L'\0') + { + DPRINT(("end of string.\n")); + break; + } } - } - else - { - if (pos >= len) + else { - DPRINT(("end of string.\n")); - break; + if (pos >= len) + { + DPRINT(("end of string.\n")); + break; + } } } DPRINT(("restarting from next start position\n")); @@ -654,12 +739,8 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, error_exit: tre_bt_mem_destroy(mem); #ifndef TRE_USE_ALLOCA - if (tags) - xfree(tags); - if (pmatch) - xfree(pmatch); - if (states_seen) - xfree(states_seen); + if (buf) + xfree(buf); #endif /* !TRE_USE_ALLOCA */ return ret; diff --git a/contrib/tre/lib/tre-match-parallel.c b/contrib/tre/lib/tre-match-parallel.c index 0d6ec173c5..7e30ebdd91 100644 --- a/contrib/tre/lib/tre-match-parallel.c +++ b/contrib/tre/lib/tre-match-parallel.c @@ -27,6 +27,10 @@ #include #endif /* HAVE_CONFIG_H */ +/* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state + info while running */ +#undef TRE_USE_ALLOCA + #ifdef TRE_USE_ALLOCA /* AIX requires this to be the first thing in the file. */ #ifndef __GNUC__ @@ -69,34 +73,33 @@ char *alloca (); typedef struct { tre_tnfa_transition_t *state; - int *tags; + tre_tag_t *tags; } tre_tnfa_reach_t; typedef struct { int pos; - int **tags; + tre_tag_t **tags; } tre_reach_pos_t; #ifdef TRE_DEBUG static void -tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) +tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags) { - int i; + DPRINT((" %p", (void *)state)); + if (num_tags > 0) + { + DPRINT(("/")); + tre_print_tags(tags, num_tags); + } +} +static void +tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) +{ while (reach->state != NULL) { - DPRINT((" %p", (void *)reach->state)); - if (num_tags > 0) - { - DPRINT(("/")); - for (i = 0; i < num_tags; i++) - { - DPRINT(("%d:%d", i, reach->tags[i])); - if (i < (num_tags-1)) - DPRINT((",")); - } - } + tre_print_reach1(reach->state, reach->tags, num_tags); reach++; } DPRINT(("\n")); @@ -106,7 +109,7 @@ tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) reg_errcode_t tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, - tre_str_type_t type, int *match_tags, int eflags, + tre_str_type_t type, tre_tag_t *match_tags, int eflags, int *match_end_ofs) { /* State variables required by GET_NEXT_WCHAR. */ @@ -123,7 +126,6 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, int reg_notbol = eflags & REG_NOTBOL; int reg_noteol = eflags & REG_NOTEOL; int reg_newline = tnfa->cflags & REG_NEWLINE; - int str_user_end = 0; char *buf; tre_tnfa_transition_t *trans_i; @@ -133,9 +135,13 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, int num_tags, i; int match_eo = -1; /* end offset of match (-1 if no match found yet) */ - int new_match = 0; - int *tmp_tags = NULL; - int *tmp_iptr; +#ifdef TRE_DEBUG + int once; +#endif /* TRE_DEBUG */ + tre_tag_t *tmp_tags = NULL; + tre_tag_t *tmp_iptr; + int tbytes; + int touch = 1; #ifdef TRE_MBSTATE memset(&mbstate, '\0', sizeof(mbstate)); @@ -153,17 +159,17 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, everything in a single large block from the stack frame using alloca() or with malloc() if alloca is unavailable. */ { - int tbytes, rbytes, pbytes, xbytes, total_bytes; + int rbytes, pbytes, total_bytes; char *tmp_buf; /* Compute the length of the block we need. */ tbytes = sizeof(*tmp_tags) * num_tags; rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); pbytes = sizeof(*reach_pos) * tnfa->num_states; - xbytes = sizeof(int) * num_tags; total_bytes = (sizeof(long) - 1) * 4 /* for alignment paddings */ - + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes; + + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes; + DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes)); /* Allocate the memory. */ #ifdef TRE_USE_ALLOCA buf = alloca(total_bytes); @@ -190,9 +196,9 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, for (i = 0; i < tnfa->num_states; i++) { reach[i].tags = (void *)tmp_buf; - tmp_buf += xbytes; + tmp_buf += tbytes; reach_next[i].tags = (void *)tmp_buf; - tmp_buf += xbytes; + tmp_buf += tbytes; } } @@ -200,15 +206,103 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, reach_pos[i].pos = -1; /* If only one character can start a match, find it first. */ - if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte) + if (tnfa->first_char >= 0 && str_byte) { const char *orig_str = str_byte; int first = tnfa->first_char; + int found_high_bit = 0; - if (len >= 0) - str_byte = memchr(orig_str, first, (size_t)len); - else - str_byte = strchr(orig_str, first); + + if (type == STR_BYTE) + { + if (len >= 0) + str_byte = memchr(orig_str, first, (size_t)len); + else + str_byte = strchr(orig_str, first); + } + else if (type == STR_MBS) + { + /* + * If the match character is ASCII, try to match the character + * directly, but if a high bit character is found, we stop there. + */ + if (first < 0x80) + { + if (len >= 0) + { + int i; + for (i = 0; ; str_byte++, i++) + { + if (i >= len) + { + str_byte = NULL; + break; + } + if (*str_byte == first) + break; + if (*str_byte & 0x80) + { + found_high_bit = 1; + break; + } + } + } + else + { + for (; ; str_byte++) + { + if (!*str_byte) + { + str_byte = NULL; + break; + } + if (*str_byte == first) + break; + if (*str_byte & 0x80) + { + found_high_bit = 1; + break; + } + } + } + } + else + { + if (len >= 0) + { + int i; + for (i = 0; ; str_byte++, i++) + { + if (i >= len) + { + str_byte = NULL; + break; + } + if (*str_byte & 0x80) + { + found_high_bit = 1; + break; + } + } + } + else + { + for (; ; str_byte++) + { + if (!*str_byte) + { + str_byte = NULL; + break; + } + if (*str_byte & 0x80) + { + found_high_bit = 1; + break; + } + } + } + } + } if (str_byte == NULL) { #ifndef TRE_USE_ALLOCA @@ -218,51 +312,35 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, return REG_NOMATCH; } DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); - if (str_byte >= orig_str + 1) - prev_c = (unsigned char)*(str_byte - 1); - next_c = (unsigned char)*str_byte; - pos = str_byte - orig_str; - if (len < 0 || pos < len) - str_byte++; - } - else - { - GET_NEXT_WCHAR(); - pos = 0; - } - -#if 0 - /* Skip over characters that cannot possibly be the first character - of a match. */ - if (tnfa->firstpos_chars != NULL) - { - char *chars = tnfa->firstpos_chars; - - if (len < 0) + if (!found_high_bit) { - const char *orig_str = str_byte; - /* XXX - use strpbrk() and wcspbrk() because they might be - optimized for the target architecture. Try also strcspn() - and wcscspn() and compare the speeds. */ - while (next_c != L'\0' && !chars[next_c]) - { - next_c = *str_byte++; - } - prev_c = *(str_byte - 2); - pos += str_byte - orig_str; - DPRINT(("skipped %d chars\n", str_byte - orig_str)); + if (str_byte >= orig_str + 1) + prev_c = (unsigned char)*(str_byte - 1); + next_c = (unsigned char)*str_byte; + pos = str_byte - orig_str; + if (len < 0 || pos < len) + str_byte++; } else { - while (pos <= len && !chars[next_c]) - { - prev_c = next_c; - next_c = (unsigned char)(*str_byte++); - pos++; - } + if (str_byte == orig_str) + goto no_first_optimization; + /* + * Back up one character, fix up the position, then call + * GET_NEXT_WCHAR() to process the multibyte character. + */ + /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */ + next_c = (unsigned char)*(str_byte - 1); + pos = (str_byte - 1) - orig_str; + GET_NEXT_WCHAR(); } } -#endif + else + { +no_first_optimization: + GET_NEXT_WCHAR(); + pos = 0; + } DPRINT(("length: %d\n", len)); DPRINT(("pos:chr/code | states and tags\n")); @@ -290,23 +368,23 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, DPRINT((" %p", (void *)trans_i->state)); reach_next_i->state = trans_i->state; - for (i = 0; i < num_tags; i++) - reach_next_i->tags[i] = -1; + memset(reach_next_i->tags, 0, tbytes); tag_i = trans_i->tags; if (tag_i) - while (*tag_i >= 0) - { - if (*tag_i < num_tags) - reach_next_i->tags[*tag_i] = pos; - tag_i++; - } + { + while (*tag_i >= 0) + { + if (*tag_i < num_tags) + tre_tag_set(reach_next_i->tags, *tag_i, pos, touch); + tag_i++; + } + touch++; + } if (reach_next_i->state == tnfa->final) { DPRINT((" found empty match\n")); match_eo = pos; - new_match = 1; - for (i = 0; i < num_tags; i++) - match_tags[i] = reach_next_i->tags[i]; + memcpy(match_tags, reach_next_i->tags, tbytes); } reach_pos[trans_i->state_id].pos = pos; reach_pos[trans_i->state_id].tags = &reach_next_i->tags; @@ -327,12 +405,7 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, /* Check for end of string. */ if (len < 0) { - if (type == STR_USER) - { - if (str_user_end) - break; - } - else if (next_c == L'\0') + if (next_c == L'\0') break; } else @@ -346,8 +419,6 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, #ifdef TRE_DEBUG DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); tre_print_reach(tnfa, reach_next, num_tags); - DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c)); - tre_print_reach(tnfa, reach_next, num_tags); #endif /* TRE_DEBUG */ /* Swap `reach' and `reach_next'. */ @@ -355,51 +426,9 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, reach = reach_next; reach_next = reach_i; - /* For each state in `reach', weed out states that don't fulfill the - minimal matching conditions. */ - if (tnfa->num_minimals && new_match) - { - new_match = 0; - reach_next_i = reach_next; - for (reach_i = reach; reach_i->state; reach_i++) - { - int skip = 0; - for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) - { - int end = tnfa->minimal_tags[i]; - int start = tnfa->minimal_tags[i + 1]; - DPRINT((" Minimal start %d, end %d\n", start, end)); - if (end >= num_tags) - { - DPRINT((" Throwing %p out.\n", reach_i->state)); - skip = 1; - break; - } - else if (reach_i->tags[start] == match_tags[start] - && reach_i->tags[end] < match_tags[end]) - { - DPRINT((" Throwing %p out because t%d < %d\n", - reach_i->state, end, match_tags[end])); - skip = 1; - break; - } - } - if (!skip) - { - reach_next_i->state = reach_i->state; - tmp_iptr = reach_next_i->tags; - reach_next_i->tags = reach_i->tags; - reach_i->tags = tmp_iptr; - reach_next_i++; - } - } - reach_next_i->state = NULL; - - /* Swap `reach' and `reach_next'. */ - reach_i = reach; - reach = reach_next; - reach_next = reach_i; - } +#ifdef TRE_DEBUG + once = 0; +#endif /* TRE_DEBUG */ /* For each state in `reach' see if there is a transition leaving with the current input symbol to a state not yet in `reach_next', and @@ -422,16 +451,57 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, } /* Compute the tags after this transition. */ - for (i = 0; i < num_tags; i++) - tmp_tags[i] = reach_i->tags[i]; + memcpy(tmp_tags, reach_i->tags, tbytes); tag_i = trans_i->tags; if (tag_i != NULL) - while (*tag_i >= 0) - { - if (*tag_i < num_tags) - tmp_tags[*tag_i] = pos; - tag_i++; - } + { + while (*tag_i >= 0) + { + if (*tag_i < num_tags) + tre_tag_set(tmp_tags, *tag_i, pos, touch); + tag_i++; + } + touch++; + } + + /* For each new transition, weed out those that don't + fulfill the minimal matching conditions. */ + if (tnfa->num_minimals && match_eo >= 0) + { + int skip = 0; +#ifdef TRE_DEBUG + if (!once) + { + DPRINT(("Checking minimal conditions: match_eo=%d " + "match_tags=", match_eo)); + tre_print_tags(match_tags, num_tags); + DPRINT(("\n")); + once++; + } +#endif /* TRE_DEBUG */ + for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) + { + int end = tnfa->minimal_tags[i]; + int start = tnfa->minimal_tags[i + 1]; + DPRINT((" Minimal start %d, end %d\n", start, end)); + if (tre_minimal_tag_order(start, end, match_tags, + tmp_tags) > 0) + { + skip = 1; + break; + } + } + if (skip) + { +#ifdef TRE_DEBUG + DPRINT((" Throwing out")); + tre_print_reach1(reach_i->state, tmp_tags, + num_tags); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ + continue; + } + } if (reach_pos[trans_i->state_id].pos < pos) { @@ -446,13 +516,16 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, if (reach_next_i->state == tnfa->final && (match_eo == -1 || (num_tags > 0 - && reach_next_i->tags[0] <= match_tags[0]))) + && tre_tag_get(reach_next_i->tags, 0) <= + tre_tag_get(match_tags, 0)))) { - DPRINT((" found match %p\n", trans_i->state)); +#ifdef TRE_DEBUG + DPRINT((" found match")); + tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ match_eo = pos; - new_match = 1; - for (i = 0; i < num_tags; i++) - match_tags[i] = reach_next_i->tags[i]; + memcpy(match_tags, reach_next_i->tags, tbytes); } reach_next_i++; @@ -472,11 +545,13 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, *reach_pos[trans_i->state_id].tags = tmp_tags; if (trans_i->state == tnfa->final) { - DPRINT((" found better match\n")); +#ifdef TRE_DEBUG + DPRINT((" found better match")); + tre_print_reach1(trans_i->state, tmp_tags, num_tags); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ match_eo = pos; - new_match = 1; - for (i = 0; i < num_tags; i++) - match_tags[i] = tmp_tags[i]; + memcpy(match_tags, tmp_tags, tbytes); } tmp_tags = tmp_iptr; } @@ -489,12 +564,21 @@ tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, DPRINT(("match end offset = %d\n", match_eo)); + *match_end_ofs = match_eo; +#ifdef TRE_DEBUG + if (match_tags) + { + DPRINT(("Winning tags=")); + tre_print_tags_all(match_tags, num_tags); + DPRINT((" touch=%d\n", touch)); + } +#endif /* TRE_DEBUG */ + #ifndef TRE_USE_ALLOCA if (buf) xfree(buf); #endif /* !TRE_USE_ALLOCA */ - *match_end_ofs = match_eo; return match_eo >= 0 ? REG_OK : REG_NOMATCH; } diff --git a/contrib/tre/lib/tre-match-utils.h b/contrib/tre/lib/tre-match-utils.h index 70db745e10..34da4031d3 100644 --- a/contrib/tre/lib/tre-match-utils.h +++ b/contrib/tre/lib/tre-match-utils.h @@ -6,6 +6,8 @@ */ +#include "tre-internal.h" + #define str_source ((const tre_str_source*)string) #ifdef TRE_WCHAR @@ -14,30 +16,43 @@ /* Wide character and multibyte support. */ +/* + * Because all multibyte encodings are exclusively single-shift encoding, + * with the shift codes having the high bit set, we can make an optimization + * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character + * is detected, and just do a direct copy for ASCII characters. + */ #define GET_NEXT_WCHAR() \ do { \ prev_c = next_c; \ - if (type == STR_BYTE) \ + switch (type) \ { \ + case STR_BYTE: \ pos++; \ if (len >= 0 && pos >= len) \ next_c = '\0'; \ else \ next_c = (unsigned char)(*str_byte++); \ - } \ - else if (type == STR_WIDE) \ - { \ + break; \ + case STR_WIDE: \ pos++; \ if (len >= 0 && pos >= len) \ next_c = L'\0'; \ else \ next_c = *str_wide++; \ - } \ - else if (type == STR_MBS) \ - { \ - pos += pos_add_next; \ - if (str_byte == NULL) \ - next_c = L'\0'; \ + break; \ + case STR_MBS: \ + pos += pos_add_next; \ + if (__builtin_expect(len >= 0 && pos >= len, 0)) \ + { \ + next_c = L'\0'; \ + pos_add_next = 1; \ + } \ + else if (__builtin_expect(!(*str_byte & 0x80), 1)) \ + { \ + next_c = (unsigned char)(*str_byte++); \ + pos_add_next = 1; \ + } \ else \ { \ size_t w; \ @@ -46,41 +61,30 @@ max = len - pos; \ else \ max = 32; \ - if (max <= 0) \ + w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \ + tnfa->loc); \ + if (w == (size_t)-1 || w == (size_t)-2) \ + return REG_ILLSEQ; \ + if (w == 0 && len >= 0) \ { \ - next_c = L'\0'; \ pos_add_next = 1; \ + next_c = 0; \ + str_byte++; \ } \ else \ { \ - w = tre_mbrtowc(&next_c, str_byte, (size_t)max, &mbstate); \ - if (w == (size_t)-1 || w == (size_t)-2) \ - return REG_NOMATCH; \ - if (w == 0 && len >= 0) \ - { \ - pos_add_next = 1; \ - next_c = 0; \ - str_byte++; \ - } \ - else \ - { \ - pos_add_next = w; \ - str_byte += w; \ - } \ + pos_add_next = w; \ + str_byte += w; \ } \ } \ - } \ - else if (type == STR_USER) \ - { \ - pos += pos_add_next; \ - str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ - str_source->context); \ + break; \ } \ } while(/*CONSTCOND*/0) #else /* !TRE_MULTIBYTE */ /* Wide character support, no multibyte support. */ +#error TRE_MULTIBYTE undefined #define GET_NEXT_WCHAR() \ do { \ @@ -101,12 +105,6 @@ else \ next_c = *str_wide++; \ } \ - else if (type == STR_USER) \ - { \ - pos += pos_add_next; \ - str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ - str_source->context); \ - } \ } while(/*CONSTCOND*/0) #endif /* !TRE_MULTIBYTE */ @@ -114,6 +112,7 @@ #else /* !TRE_WCHAR */ /* No wide character or multibyte support. */ +#error TRE_WCHAR undefined #define GET_NEXT_WCHAR() \ do { \ @@ -126,19 +125,14 @@ else \ next_c = (unsigned char)(*str_byte++); \ } \ - else if (type == STR_USER) \ - { \ - pos += pos_add_next; \ - str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ - str_source->context); \ - } \ } while(/*CONSTCOND*/0) #endif /* !TRE_WCHAR */ -#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c)) +/* Assumes tre_tnfa_t *tnfa in scope */ +#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum_l(c, tnfa->loc)) #define CHECK_ASSERTIONS(assertions) \ (((assertions & ASSERT_AT_BOL) \ @@ -159,57 +153,355 @@ || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c)))) #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \ - (((trans_i->assertions & ASSERT_CHAR_CLASS) \ - && !(tnfa->cflags & REG_ICASE) \ - && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) \ - || ((trans_i->assertions & ASSERT_CHAR_CLASS) \ - && (tnfa->cflags & REG_ICASE) \ - && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \ - && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \ - || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \ - && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\ - tnfa->cflags & REG_ICASE))) + ((trans_i->assertions & ASSERT_BRACKET_MATCH) \ + && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c, \ + tnfa)) -/* Returns 1 if `t1' wins `t2', 0 otherwise. */ inline static int -tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, - int *t1, int *t2) +tre_tag_get(const tre_tag_t *tags, int i) +{ + tags += i; + return tags->count > 0 ? tags->value : -1; +} + +inline static void +tre_tag_set(tre_tag_t *tags, int i, int val, int touch) +{ + tags += i; + if (tags->count++ == 0) + tags->first = val; + tags->value = val; + tags->touch = touch; +} + +inline static void +tre_tag_reset(tre_tag_t *tags, int i) +{ + tags[i].count = 0; +} + +inline static int +tre_tag_touch_get(const tre_tag_t *tags, int i) +{ + return tags[i].touch; +} + +#ifdef TRE_DEBUG +inline static void +tre_print_tags(const tre_tag_t *tags, int num_tags) { int i; - for (i = 0; i < num_tags; i++) + for (i = 0; i < num_tags; i++, tags++) { - if (tag_directions[i] == TRE_TAG_MINIMIZE) + switch(tags->count) { - if (t1[i] < t2[i]) - return 1; - if (t1[i] > t2[i]) + case 0: + DPRINT(("%d:(0,-1)", i)); + break; + case 1: + DPRINT(("%d:(1,%d)", i, tags->first)); + break; + default: + DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first, + tags->value)); + break; + } + if (i < (num_tags - 1)) + DPRINT((" ")); + } +} + +inline static void +tre_print_tags_all(const tre_tag_t *tags, int num_tags) +{ + int i; + for (i = 0; i < num_tags; i++, tags++) + { + switch(tags->count) + { + case 0: + DPRINT(("%d:(0,-1)/%d", i, tags->touch)); + break; + case 1: + DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch)); + break; + default: + DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first, + tags->value, tags->touch)); + break; + } + if (i < (num_tags - 1)) + DPRINT((" ")); + } +} +#endif /* TRE_DEBUG */ + +/* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal + * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */ +inline static int +tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1, + const tre_tag_t *tags2) +{ + const tre_tag_t *t1, *t2; + + t1 = tags1 + start; + t2 = tags2 + start; + /* We need both start tags to be set */ + if (t1->count == 0 || t2->count == 0) + return 0; + + /* The start tags must be equal */ + if (t1->value != t2->value) + return 0; + + t1 = tags1 + end; + t2 = tags2 + end; + /* For the end tags, we prefer set over unset, because unset means that + * the end tag is still growing */ + if (t1->count == 0) + { + /* if t2 is set, t1 loses since it is unset */ + if (t2->count != 0) + return -1; + } + /* if t2 not set, t1 wins since it is set */ + else if (t2->count == 0) + return 1; + + /* least current value wins */ + return t2->value - t1->value; +} + +/* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare + * (t1 loses if < 0, t1 wins if > 0) */ +inline static int +tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1, + const tre_tag_t *t2) +{ + int diff; + + t1 += i; + t2 += i; + switch (dir) + { + case TRE_TAG_MINIMIZE: + /* least current value wins (because tags are initialized to all zeros, + * unset wins over set; also, tre_minimal_tag_order() will have already + * been run, which checks for being unset) */ + return t2->value - t1->value; + + case TRE_TAG_MAXIMIZE: + /* prefer set */ + if (t1->count == 0) + { + /* if neither t1 and t2 are set, try next tag */ + if (t2->count == 0) return 0; + /* t2 is set, t1 loses since it is unset */ + return -1; } - else + /* if t2 not set, t1 wins since it is set */ + else if (t2->count == 0) + return 1; + /* greatest initial value wins */ + if ((diff = t1->first - t2->first) != 0) + return diff; + /* least number of times the tag was set, wins */ + if ((diff = t2->count - t1->count) != 0) + return diff; + /* if the tags were only set once, they only have initial values */ + if (t1->count == 1) + return 0; + /* greatest current value wins */ + return t1->value - t2->value; + + case TRE_TAG_LEFT_MAXIMIZE: + /* prefer set */ + if (t1->count == 0) { - if (t1[i] > t2[i]) - return 1; - if (t1[i] < t2[i]) + /* if neither t1 and t2 are set, try next tag */ + if (t2->count == 0) return 0; + /* t2 is set, t1 loses since it is unset */ + return -1; } + /* if t2 not set, t1 wins since it is set */ + else if (t2->count == 0) + return 1; + /* least initial value wins */ + if ((diff = t2->first - t1->first) != 0) + return diff; + /* least number of times the tag was set, wins */ + if ((diff = t2->count - t1->count) != 0) + return diff; + /* if the tags were only set once, they only have initial values */ + if (t1->count == 1) + return 0; + /* greatest current value wins */ + return t1->value - t2->value; + + default: + /* Shouldn't happen: only assert if TRE_DEBUG defined */ + assert(0); + break; + } + return 0; +} + +#ifdef TRE_DEBUG +#define _MORE_DEBUGGING +#endif /* TRE_DEBUG */ + +/* Returns 1 if `t1' wins `t2', 0 otherwise. */ +inline static int +#ifdef _MORE_DEBUGGING +_tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, + const tre_tag_t *t1, const tre_tag_t *t2) +#else /* !_MORE_DEBUGGING */ +tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, + const tre_tag_t *t1, const tre_tag_t *t2) +#endif /* !_MORE_DEBUGGING */ +{ + int i, ret; + + for (i = 0; i < num_tags; i++) + { + if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0) + return (ret > 0); } /* assert(0);*/ return 0; } +#ifdef _MORE_DEBUGGING +inline static int +tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, + const tre_tag_t *t1, const tre_tag_t *t2) +{ + int ret = _tre_tag_order(num_tags, tag_directions, t1, t2); + DPRINT(("tre_tag_order: ")); + tre_print_tags(t1, num_tags); + DPRINT((" %s ", ret ? "wins" : "doesn't win")); + tre_print_tags(t2, num_tags); + DPRINT(("\n")); + return ret; +} +#endif /* _MORE_DEBUGGING */ + +int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len); + inline static int -tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase) +tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc, + const tre_tnfa_t * __restrict tnfa) { - DPRINT(("neg_char_classes_test: %p, %d, %d\n", classes, wc, icase)); - while (*classes != (tre_ctype_t)0) - if ((!icase && tre_isctype(wc, *classes)) - || (icase && (tre_isctype(tre_toupper(wc), *classes) - || tre_isctype(tre_tolower(wc), *classes)))) - return 1; /* Match. */ - else - classes++; - return 0; /* No match. */ + int match = 0; + int i; + tre_bracket_match_t *b; + tre_cint_t uc, lc; + int we, ue, le, got_equiv = 0; + int icase = ((tnfa->cflags & REG_ICASE) != 0); + + DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase)); + if (icase) + { + if (tre_islower_l(wc, tnfa->loc)) + { + lc = wc; + uc = tre_toupper_l(wc, tnfa->loc); + } + else if (tre_isupper_l(wc, tnfa->loc)) + { + uc = wc; + lc = tre_tolower_l(wc, tnfa->loc); + } + else + { + icase = 0; + } + } + for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches; + i++, b++) + { + switch (b->type) + { + case TRE_BRACKET_MATCH_TYPE_CHAR: + if (icase) + match = (b->value == uc || b->value == lc); + else + match = (b->value == wc); + break; + case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN: + { + tre_cint_t start = b->value, end; + if (++i >= list->num_bracket_matches || + (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END) + { + DPRINT(("tre_bracket_match: no following range end\n")); + assert(0); + goto error; + } + end = b->value; + if (!got_equiv) + { + if (icase) + { + ue = __collate_equiv_value(tnfa->loc, &uc, 1); + le = __collate_equiv_value(tnfa->loc, &lc, 1); + } + else + we = __collate_equiv_value(tnfa->loc, &wc, 1); + got_equiv = 1; + } + if (icase) + match = ((start <= ue && ue <= end) || + (start <= le && le <= end)); + else + match = (start <= we && we <= end); + break; + } + case TRE_BRACKET_MATCH_TYPE_RANGE_END: + DPRINT(("tre_bracket_match: range end without preceeding start\n")); + assert(0); + break; + case TRE_BRACKET_MATCH_TYPE_CLASS: + if (icase) + match = (tre_isctype_l(uc, b->value, tnfa->loc) || + tre_isctype_l(lc, b->value, tnfa->loc)); + else + match = (tre_isctype_l(wc, b->value, tnfa->loc)); + break; + case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE: + if (!got_equiv) + { + if (icase) + { + ue = __collate_equiv_value(tnfa->loc, &uc, 1); + le = __collate_equiv_value(tnfa->loc, &lc, 1); + } + else + we = __collate_equiv_value(tnfa->loc, &wc, 1); + got_equiv = 1; + } + if (icase) + match = (b->value == ue || b->value == le); + else + match = (b->value == we); + break; + default: + DPRINT(("tre_bracket_match: unknown type %d\n", b->type)); + assert(0); + break; + } + if (match) + break; + } +error: + if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) { + if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0; + match = !match; + } + return match; } diff --git a/contrib/tre/lib/tre-parse.c b/contrib/tre/lib/tre-parse.c index 9282725d06..76e66eeb40 100644 --- a/contrib/tre/lib/tre-parse.c +++ b/contrib/tre/lib/tre-parse.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "xmalloc.h" #include "tre-mem.h" @@ -26,6 +27,23 @@ #include "tre-stack.h" #include "tre-parse.h" +#include "xlocale_private.h" +#include "collate.h" + +/* BSD compatibility: + Before looking up a collating symbol, check if the name matches in + the character names (cnames) array; if so, use the corresponding + character. + + Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve + the implementation choice that for ERE, a non-numeric character following + a left brace that would normally be a bound, causes the left brace to be + literal. */ +#define BSD_COMPATIBILITY +#ifdef BSD_COMPATIBILITY +#include "cname.h" +#define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND +#endif /* BSD_COMPATIBILITY */ /* Characters with special meanings in regexp syntax. */ #define CHAR_PIPE L'|' @@ -92,86 +110,34 @@ tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end, } static reg_errcode_t -tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, - tre_ast_node_t ***items) +tre_new_item(tre_mem_t mem, int type, int val, int *max_i, + tre_bracket_match_list_t **items) { - reg_errcode_t status; - tre_ast_node_t **array = *items; + reg_errcode_t status = REG_OK; + tre_bracket_match_list_t *array = *items; + int i = array->num_bracket_matches; /* Allocate more space if necessary. */ - if (*i >= *max_i) + if (i >= *max_i) { - tre_ast_node_t **new_items; - DPRINT(("out of array space, i = %d\n", *i)); + tre_bracket_match_list_t *new_items; + DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i)); /* If the array is already 1024 items large, give up -- there's probably an error in the regexp (e.g. not a '\0' terminated string and missing ']') */ - if (*max_i > 1024) + if (*max_i >= 1024) return REG_ESPACE; *max_i *= 2; - new_items = xrealloc(array, sizeof(*items) * *max_i); + new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i)); if (new_items == NULL) return REG_ESPACE; *items = array = new_items; } - array[*i] = tre_ast_new_literal(mem, min, max, -1); - status = array[*i] == NULL ? REG_ESPACE : REG_OK; - (*i)++; - return status; -} - - -/* Expands a character class to character ranges. */ -static reg_errcode_t -tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items, - int *i, int *max_i, int cflags) -{ - reg_errcode_t status = REG_OK; - tre_cint_t c; - int j, min = -1, max = 0; - assert(TRE_MB_CUR_MAX == 1); - - DPRINT((" expanding class to character ranges\n")); - for (j = 0; (j < 256) && (status == REG_OK); j++) - { - c = j; - if (tre_isctype(c, class) - || ((cflags & REG_ICASE) - && (tre_isctype(tre_tolower(c), class) - || tre_isctype(tre_toupper(c), class)))) -{ - if (min < 0) - min = c; - max = c; - } - else if (min >= 0) - { - DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max)); - status = tre_new_item(mem, min, max, i, max_i, items); - min = -1; - } - } - if (min >= 0 && status == REG_OK) - status = tre_new_item(mem, min, max, i, max_i, items); + array->bracket_matches[i].type = type; + array->bracket_matches[i].value = val; + array->num_bracket_matches++; return status; } - -static int -tre_compare_items(const void *a, const void *b) -{ - const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; - const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; - tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; - int a_min = l_a->code_min, b_min = l_b->code_min; - - if (a_min < b_min) - return -1; - else if (a_min > b_min) - return 1; - else - return 0; -} - #ifndef TRE_USE_SYSTEM_WCTYPE /* isalnum() and the rest may be macros, so wrap them to functions. */ @@ -236,336 +202,672 @@ tre_ctype_t tre_ctype(const char *name) } #endif /* !TRE_USE_SYSTEM_WCTYPE */ -/* Maximum number of character classes that can occur in a negated bracket - expression. */ -#define MAX_NEG_CLASSES 64 +#define REST(re) (int)(ctx->re_end - (re)), (re) -/* Maximum length of character class names. */ -#define MAX_CLASS_NAME +#define START_COLLATING_SYMBOLS 16 +#define MAX_COLLATING_SYMBOL_LEN 4 -#define REST(re) (int)(ctx->re_end - (re)), (re) +typedef struct { + const tre_char_t *start; + int len; +} tre_collating_symbol; + +#ifdef BSD_COMPATIBILITY +static wchar_t +tre_search_cnames(const wchar_t *name, size_t len) +{ + size_t low = 0; + size_t high = NCNAMES - 1; + size_t cur; + int cmp; + while(low <= high) + { + cur = (low + high) / 2; + cmp = wcsncmp(name, cnames[cur].name, len); + if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code; + if (cmp > 0) low = cur + 1; + else high = cur - 1; + } + return (wchar_t)-1; +} +#endif /* BSD_COMPATIBILITY */ + +/* Scan the contents of a bracket expression, and create a + * tre_bracket_match_list_t encoding the bracket expression. If during + * the scan, multi-character collating symbols are detected, switch + * into a mode to collect those MCCSs into a tre_collating_symbol + * list and pass them back. tre_parse_bracket will use that to + * create a new string composed of a union of the bracket expression + * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and + * call tre_parse (recursive) to parse that new string (which will + * call tre_parse_bracket and tre_parse_bracket_items again. */ static reg_errcode_t -tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, - tre_ctype_t neg_classes[], int *num_neg_classes, - tre_ast_node_t ***items, int *num_items, - int *items_size) +tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items, + int *items_size, tre_collating_symbol **result) { const tre_char_t *re = ctx->re; - reg_errcode_t status = REG_OK; - tre_ctype_t class = (tre_ctype_t)0; - int i = *num_items; + const tre_char_t *re_end = ctx->re_end; + tre_collating_symbol *col_syms = NULL; + tre_collating_symbol *cp = NULL; + int n_col_syms = 0; + reg_errcode_t status; int max_i = *items_size; - int skip; + int other = 0; /* contains content other than multi-character collating + * symbols */ + int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */ + tre_cint_t min, c; + int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE); + int collect_MCCS = 0; + const tre_char_t *start; - /* Build an array of the items in the bracket expression. */ - while (status == REG_OK) + for ( ;re < re_end; re++) { - skip = 0; - if (re == ctx->re_end) - { - status = REG_EBRACK; - } - else if (*re == CHAR_RBRACKET && re > ctx->re) + switch (*re) { - DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); - re++; + case CHAR_MINUS: + /* A first hyphen */ + if (re == ctx->re) + { + DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); + min = CHAR_MINUS; + other++; + range = 0; + break; + } + /* The hyphen is the end range */ + if (range > 0) + { + DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); + c = CHAR_MINUS; + goto process_end_range; + } + if (re + 1 >= re_end) + { + status = REG_EBRACK; + goto error; + } + /* The hyphen is at the end */ + if (re[1] == CHAR_RBRACKET) + { + DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); + c = CHAR_MINUS; + goto process_begin_range; + } + /* Two ranges are not allowed to share an endpoint, or begin + * range is illegal. */ + if (range < 0) + { + status = REG_ERANGE; + goto error; + } + range = 1; /* Expect end range */ + DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re))); break; - } - else - { - tre_cint_t min = 0, max = 0; - class = (tre_ctype_t)0; - if (re + 2 < ctx->re_end - && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET) + case CHAR_LBRACKET: + if (re + 1 >= re_end) { - DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re))); - min = *re; - max = *(re + 2); - re += 3; - /* XXX - Should use collation order instead of encoding values - in character ranges. */ - if (min > max) - status = REG_ERANGE; + status = REG_EBRACK; + goto error; } - else if (re + 1 < ctx->re_end - && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD) - status = REG_ECOLLATE; - else if (re + 1 < ctx->re_end - && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL) - status = REG_ECOLLATE; - else if (re + 1 < ctx->re_end - && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) + switch (re[1]) { - char tmp_str[64]; - const tre_char_t *endptr = re + 2; - int len; - DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re))); - while (endptr < ctx->re_end && *endptr != CHAR_COLON) - endptr++; - if (endptr != ctx->re_end) - { - len = MIN(endptr - re - 2, 63); -#ifdef TRE_WCHAR + case CHAR_PERIOD: + { + re += 2; + start = re; + for (;; re++) + { + if (re >= re_end) + { + status = REG_ECOLLATE; + goto error; + } + if (*re == CHAR_PERIOD) + { + if (re + 1 >= re_end) + { + status = REG_ECOLLATE; + goto error; + } + /* Found end */ + if (re[1] == CHAR_RBRACKET) + { + DPRINT(("tre_parse_bracket: collating " + "symbol: '%.*" STRF "'\n", + REST(start - 2))); + /* Empty name */ + if (re == start) + { + status = REG_ECOLLATE; + goto error; + } +#ifdef BSD_COMPATIBILITY + /* Check if the name is in cnames; if so, use + the corresponding code */ + c = tre_search_cnames(start, re - start); + if (c != (wchar_t)-1) + { + re++; + goto process_single_character; + } +#endif /* BSD_COMPATIBILITY */ + /* Verify this is a known sequence */ + if (__collate_equiv_value(ctx->loc, start, + re - start) <= 0) + { + status = REG_ECOLLATE; + goto error; + } + /* Process single character collating symbols */ + if (re - start == 1) + { + c = *start; + re++; + goto process_single_character; + } + /* Inverted MCCSs are undefined */ + if (invert) + { + status = REG_ECOLLATE; + goto error; + } + /* Can't have MCCSs as an endpoint to a range */ + if (range > 0) + { + status = REG_ERANGE; + goto error; + } + range = -1; + /* Switch into MCCS collection mode (if not + * already there */ +#if TRE_DEBUG + if (!collect_MCCS) + { + collect_MCCS = 1; + DPRINT(("tre_parse_bracket: Detected MCCS\n")); + } +#else /* !TRE_DEBUG */ + collect_MCCS = 1; +#endif /* !TRE_DEBUG */ + /* Allocate a memory block the first time */ + if (!cp) + { + if ((col_syms = xmalloc(sizeof(*col_syms) * + (START_COLLATING_SYMBOLS + 2))) + == NULL) + return REG_ESPACE; + cp = col_syms + 1; + n_col_syms = START_COLLATING_SYMBOLS; + } + /* Enlarge the memory block is more is needed */ + if ((cp - col_syms) - 1 >= n_col_syms) + { + int i = n_col_syms; + tre_collating_symbol *tmp = + xrealloc(col_syms, sizeof(*col_syms) * + ((n_col_syms *= 2) + 2)); + if (tmp == NULL) + { + xfree(col_syms); + return REG_ESPACE; + } + DPRINT(("tre_list_collating_symbols: " + "Enlarging col_syms to %d\n", + n_col_syms)); + col_syms = tmp; + cp = col_syms + i + 1; + } + cp->start = start; + cp->len = re - start; + cp++; + re++; + break; + } + } + } + break; + } + + case CHAR_EQUAL: + case CHAR_COLON: + { + /* Process equivalence and character classes */ + tre_char_t kind = re[1]; + + /* Can't have a class as an endpoint to a range */ + if (range > 0) + { + status = REG_ERANGE; + goto error; + } + if (!collect_MCCS && range == 0) + { + status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, + min, &max_i, items); + if (status != REG_OK) + goto error; + } + range = -1; + re += 2; + start = re; + for (;; re++) { - tre_char_t tmp_wcs[64]; - wcsncpy(tmp_wcs, re + 2, (size_t)len); - tmp_wcs[len] = L'\0'; + if (re >= re_end) + { + status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE; + goto error; + } + if (*re == kind) + { + if (re + 1 >= re_end) + { + status = kind == CHAR_EQUAL ? REG_ECOLLATE : + REG_ECTYPE; + goto error; + } + /* Found end */ + if (re[1] == CHAR_RBRACKET) + { + if (re == start) + { + /* Empty class name */ + status = kind == CHAR_EQUAL ? REG_ECOLLATE : + REG_ECTYPE; + goto error; + } + /* Process equivalence class */ + if (kind == CHAR_EQUAL) + { + int equiv; + + DPRINT(("tre_parse_bracket: equivalence: '%.*" + STRF "'\n", REST(start - 2))); + + /* While we find the collation value even for + multi-character collating elements , we + don't (yet) match any collation values + against multi-character sequences. We'd have + to enumerate those multi-character sequences + and like multi-character collating symbols, + create a union of those sequences with the + rest of the bracket expression. While + doable, a bracket expression matching + multiple characters, that doesn't explicitly + contain multi-character sequences, might + be unexpected, so we punt for now. */ + if ((equiv = __collate_equiv_value(ctx->loc, + start, re - start)) <= 0) + { + /* The standard says that if no collating + element if found, we use the collating + symbol itself. But __collate_equiv_value + doesn't make a distinction between + an element that is in a equvalence + class with others, or is the only member, + so we already know there is no collating + symbol. (Note that in the case of a + collating element whose collation value + is unique, matching against the + collating element itself, or against + its collation value, is equivalent.) */ +#ifdef BSD_COMPATIBILITY + /* Check if the name is in cnames; if so, + use the corresponding code */ + c = tre_search_cnames(start, re - start); + if (c != (wchar_t)-1) + { + re++; + goto process_single_character; + } +#endif /* BSD_COMPATIBILITY */ + status = REG_ECOLLATE; + goto error; + } + if (!collect_MCCS) + { + status = tre_new_item(ctx->mem, + TRE_BRACKET_MATCH_TYPE_EQUIVALENCE, + equiv, &max_i, items); + if (status != REG_OK) + goto error; + } + } + else + { + /* Process character class */ + DPRINT(("tre_parse_bracket: class: '%.*" STRF + "'\n", REST(start - 2))); + if (!collect_MCCS) + { + char tmp_str[64]; + tre_ctype_t class; + int len = MIN(re - start, 63); +#ifdef TRE_WCHAR + { + tre_char_t tmp_wcs[64]; + wcsncpy(tmp_wcs, start, (size_t)len); + tmp_wcs[len] = L'\0'; #if defined HAVE_WCSRTOMBS - { - mbstate_t state; - const tre_char_t *src = tmp_wcs; - memset(&state, '\0', sizeof(state)); - len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state); - } + { + mbstate_t state; + const tre_char_t *src = tmp_wcs; + memset(&state, '\0', sizeof(state)); + len = wcsrtombs_l(tmp_str, &src, + sizeof(tmp_str), &state, + ctx->loc); + } #elif defined HAVE_WCSTOMBS - len = wcstombs(tmp_str, tmp_wcs, 63); + len = wcstombs(tmp_str, tmp_wcs, 63); #endif /* defined HAVE_WCSTOMBS */ - } + } #else /* !TRE_WCHAR */ - strncpy(tmp_str, (const char*)re + 2, len); + strncpy(tmp_str, (const char*)start, len); #endif /* !TRE_WCHAR */ - tmp_str[len] = '\0'; - DPRINT((" class name: %s\n", tmp_str)); - class = tre_ctype(tmp_str); - if (!class) - status = REG_ECTYPE; - /* Optimize character classes for 8 bit character sets. */ - if (status == REG_OK && TRE_MB_CUR_MAX == 1) - { - status = tre_expand_ctype(ctx->mem, class, items, - &i, &max_i, ctx->cflags); - class = (tre_ctype_t)0; - skip = 1; - } - re = endptr + 2; - } - else - status = REG_ECTYPE; - min = 0; - max = TRE_CHAR_MAX; + tmp_str[len] = '\0'; + DPRINT((" class name: %s\n", tmp_str)); + class = tre_ctype_l(tmp_str, ctx->loc); + if (!class) + { + status = REG_ECTYPE; + goto error; + } + status = tre_new_item(ctx->mem, + TRE_BRACKET_MATCH_TYPE_CLASS, + class, &max_i, items); + if (status != REG_OK) + goto error; + } + } + re++; + break; + } + } + } + other++; + break; + } + + default: + DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); + c = CHAR_LBRACKET; + goto process_single_character; + break; } - else + break; + + case CHAR_RBRACKET: + /* A first right bracket */ + if (re == ctx->re) { DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); - if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET - && ctx->re != re) - /* Two ranges are not allowed to share and endpoint. */ - status = REG_ERANGE; - min = max = *re++; + min = CHAR_RBRACKET; + range = 0; + other++; + break; } - - if (status != REG_OK) - break; - - if (class && negate) - if (*num_neg_classes >= MAX_NEG_CLASSES) - status = REG_ESPACE; - else - neg_classes[(*num_neg_classes)++] = class; - else if (!skip) + /* Done */ + if (collect_MCCS) + { + DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", + REST(re))); + if (col_syms) + { + /* Mark the character following the right bracket. Set len + * to whether there are other things besides the + * multi-character collating symbols */ + col_syms->start = re + 1; + col_syms->len = other; + /* Mark the end of the list */ + cp->start = NULL; + } + *result = col_syms; + return REG_OK; + } + /* range > 0 is not possible, since we did a lookahead after the + * hyphen */ + if (range == 0) { - status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); + status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, + min, &max_i, items); if (status != REG_OK) - break; - ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class; + goto error; } + DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); + *items_size = max_i; + ctx->re = re + 1; + return REG_OK; - /* Add opposite-case counterpoints if REG_ICASE is present. - This is broken if there are more than two "same" characters. */ - if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) + default: + DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); + c = *re; +process_single_character: + /* Process single character */ + if (range > 0) { - tre_cint_t cmin, ccurr; + int mine, maxe; - DPRINT(("adding opposite-case counterpoints\n")); - while (min <= max) +process_end_range: + /* Get collation equivalence values */ + mine = __collate_equiv_value(ctx->loc, &min, 1); + maxe = __collate_equiv_value(ctx->loc, &c, 1); + if (maxe < mine) { - if (tre_islower(min)) - { - cmin = ccurr = tre_toupper(min++); - while (tre_islower(min) && tre_toupper(min) == ccurr + 1 - && min <= max) - ccurr = tre_toupper(min++); - status = tre_new_item(ctx->mem, cmin, ccurr, - &i, &max_i, items); - } - else if (tre_isupper(min)) + status = REG_ERANGE; + goto error; + } + if (!collect_MCCS) + { + status = tre_new_item(ctx->mem, + TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN, + mine, &max_i, items); + if (status != REG_OK) + goto error; + status = tre_new_item(ctx->mem, + TRE_BRACKET_MATCH_TYPE_RANGE_END, + maxe, &max_i, items); + if (status != REG_OK) + goto error; + } + range = -1; + } + else + { +process_begin_range: + if (!collect_MCCS) + { + if (range == 0) { - cmin = ccurr = tre_tolower(min++); - while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 - && min <= max) - ccurr = tre_tolower(min++); - status = tre_new_item(ctx->mem, cmin, ccurr, - &i, &max_i, items); + status = tre_new_item(ctx->mem, + TRE_BRACKET_MATCH_TYPE_CHAR, + min, &max_i, items); + if (status != REG_OK) + goto error; } - else min++; - if (status != REG_OK) - break; + min = c; } - if (status != REG_OK) - break; + range = 0; } + other++; + break; } } - *num_items = i; - *items_size = max_i; - ctx->re = re; + status = REG_EBRACK; +error: + DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n", + REST(re), status)); + if (col_syms) + xfree(col_syms); return status; } +#ifdef TRE_DEBUG +static const char *bracket_match_type_str[] = { + "unused", + "char", + "range begin", + "range end", + "class", + "equivalence value", +}; +#endif /* TRE_DEBUG */ + static reg_errcode_t tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) { - tre_ast_node_t *node = NULL; - int negate = 0; + tre_ast_node_t *node; reg_errcode_t status = REG_OK; - tre_ast_node_t **items, *u, *n; - int i = 0, j, max_i = 32, curr_max, curr_min; - tre_ctype_t neg_classes[MAX_NEG_CLASSES]; - int num_neg_classes = 0; + tre_bracket_match_list_t *items; + int max_i = 32; + tre_collating_symbol *col_syms = NULL; + + /* Handle special cases [[:<:]] and [[:>:]] */ + if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET + && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>') + && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET + && ctx->re[5] == CHAR_RBRACKET) + { + *result = tre_ast_new_literal(ctx->mem, ASSERTION, + (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW, + -1); + DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ? + "[[:<:]]" : "[[:>:]]")); + ctx->re += 6; + return *result ? REG_OK : REG_ESPACE; + } /* Start off with an array of `max_i' elements. */ - items = xmalloc(sizeof(*items) * max_i); + items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i)); if (items == NULL) return REG_ESPACE; if (*ctx->re == CHAR_CARET) { DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re))); - negate = 1; + items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE; ctx->re++; } - status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes, - &items, &i, &max_i); + status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms); if (status != REG_OK) goto parse_bracket_done; - /* Sort the array if we need to negate it. */ - if (negate) - qsort(items, (unsigned)i, sizeof(*items), tre_compare_items); - - curr_max = curr_min = 0; - /* Build a union of the items in the array, negated if necessary. */ - for (j = 0; j < i && status == REG_OK; j++) + /* If there are collating symbols, split off the multi-character ones + * into a union of the bracket expression (without the collating symbols) + * and the multiple-character sequences. We create an equivalent input + * string and run tre_parse() recursively */ + if (col_syms) { - int min, max; - tre_literal_t *l = items[j]->obj; - min = l->code_min; - max = l->code_max; - - DPRINT(("item: %d - %d, class %ld, curr_max = %d\n", - (int)l->code_min, (int)l->code_max, (long)l->u.class, curr_max)); - - if (negate) + tre_char_t *str, *sp; + tre_collating_symbol *cp; + tre_parse_ctx_t subctx; + + /* Allocate a new string. We start with the size of the original + * bracket expression (minus 1) and add 2 (for a leading "[" and + * a trailing nil; don't need a "^", since it is illegal to have + * inverted MCCSs). Since a multi-character collating symbols + * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't + * need to worry about the new string getting too long. */ + xfree(items); + str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2)); + if (str == NULL) { - if (min < curr_max) - { - /* Overlap. */ - curr_max = MAX(max + 1, curr_max); - DPRINT(("overlap, curr_max = %d\n", curr_max)); - l = NULL; - } - else - { - /* No overlap. */ - curr_max = min - 1; - if (curr_max >= curr_min) - { - DPRINT(("no overlap\n")); - l->code_min = curr_min; - l->code_max = curr_max; - } - else - { - DPRINT(("no overlap, zero room\n")); - l = NULL; - } - curr_min = curr_max = max + 1; - } + xfree(col_syms); + return REG_ESPACE; } - - if (l != NULL) + sp = str; + if (col_syms->len > 0) { - int k; - DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max)); - l->position = ctx->position; - if (num_neg_classes > 0) + /* There are other items in the bracket expression besides the + * multi-character collating symbols, so create a new bracket + * expression with only those other itmes. */ + const tre_char_t *re; + ptrdiff_t i; + + *sp++ = '['; + re = ctx->re; + for (cp = col_syms + 1; cp->start; cp++) { - l->neg_classes = tre_mem_alloc(ctx->mem, - (sizeof(l->neg_classes) - * (num_neg_classes + 1))); - if (l->neg_classes == NULL) + /* The "- 2" is to account for the "[." */ + if ((i = ((cp->start - re) - 2)) > 0) { - status = REG_ESPACE; - break; + memcpy(sp, re, sizeof(*sp) * i); + sp += i; } - for (k = 0; k < num_neg_classes; k++) - l->neg_classes[k] = neg_classes[k]; - l->neg_classes[k] = (tre_ctype_t)0; - } - else - l->neg_classes = NULL; - if (node == NULL) - node = items[j]; - else - { - u = tre_ast_new_union(ctx->mem, node, items[j]); - if (u == NULL) - status = REG_ESPACE; - node = u; + /* The "+ 2" is to account for the ".]" */ + re = cp->start + cp->len + 2; } + i = col_syms->start - re; /* Includes the trailing right bracket */ + memcpy(sp, re, sizeof(*sp) * i); + sp += i; + *sp++ = '|'; + } + for (cp = col_syms + 1; cp->start; cp++) + { + memcpy(sp, cp->start, sizeof(*sp) * cp->len); + sp += cp->len; + if (cp[1].start) + *sp++ = '|'; } + *sp = 0; + DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n", + str)); + + memcpy(&subctx, ctx, sizeof(subctx)); + subctx.re = str; + subctx.len = sp - str; + subctx.nofirstsub = 1; + subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */ + status = tre_parse(&subctx); + xfree(str); + if (status != REG_OK) + { + xfree(col_syms); + return status; + } + ctx->re = col_syms->start; + ctx->position = subctx.position; + xfree(col_syms); + *result = subctx.result; + DPRINT(("tre_parse_bracket: Returning to original string\n")); + return REG_OK; } - if (status != REG_OK) - goto parse_bracket_done; - - if (negate) + DPRINT(("tre_parse_bracket: creating bracket expression literal\n")); + node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position); + if (node == NULL) + { + status = REG_ESPACE; + goto parse_bracket_done; + } + else { - int k; - DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX)); - n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); - if (n == NULL) - status = REG_ESPACE; - else + tre_literal_t *l = node->obj; + l->u.bracket_match_list = tre_mem_alloc(ctx->mem, + SIZEOF_BRACKET_MATCH_LIST(items)); + if (l->u.bracket_match_list == NULL) { - tre_literal_t *l = n->obj; - if (num_neg_classes > 0) - { - l->neg_classes = tre_mem_alloc(ctx->mem, - (sizeof(l->neg_classes) - * (num_neg_classes + 1))); - if (l->neg_classes == NULL) - { - status = REG_ESPACE; - goto parse_bracket_done; - } - for (k = 0; k < num_neg_classes; k++) - l->neg_classes[k] = neg_classes[k]; - l->neg_classes[k] = (tre_ctype_t)0; - } - else - l->neg_classes = NULL; - if (node == NULL) - node = n; - else - { - u = tre_ast_new_union(ctx->mem, node, n); - if (u == NULL) - status = REG_ESPACE; - node = u; - } + status = REG_ESPACE; + goto parse_bracket_done; } + memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items)); } - if (status != REG_OK) - goto parse_bracket_done; - #ifdef TRE_DEBUG - tre_ast_print(node); + { + int i; + tre_bracket_match_t *b; + DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n", + items->num_bracket_matches, items->flags)); + for (i = 0, b = items->bracket_matches; + i < items->num_bracket_matches; i++, b++) + { + DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type], + b->value)); + } + } #endif /* TRE_DEBUG */ parse_bracket_done: @@ -598,25 +900,46 @@ tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) static reg_errcode_t tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) { - int min, max, i; + int min, max; +#ifdef TRE_APPROX + int i; int cost_ins, cost_del, cost_subst, cost_max; int limit_ins, limit_del, limit_subst, limit_err; - const tre_char_t *r = ctx->re; const tre_char_t *start; +#endif /* TRE_APPROX */ + const tre_char_t *r = ctx->re; int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; +#ifdef TRE_APPROX int approx = 0; int costs_set = 0; int counts_set = 0; cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET; limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET; +#endif /* TRE_APPROX */ /* Parse number (minimum repetition count). */ min = -1; - if (r < ctx->re_end && *r >= L'0' && *r <= L'9') { + if (r >= ctx->re_end) +#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND + return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE; +#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ + return REG_EBRACE; +#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ + if (*r >= L'0' && *r <= L'9') { DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r))); min = tre_parse_int(&r, ctx->re_end); } +#ifndef TRE_APPROX + else +#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND + /* For ERE, return REG_NOMATCH to signal that the lbrace should + be treated as a literal */ + return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR; +#else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ + return REG_BADBR; +#endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ +#endif /* !TRE_APPROX */ /* Parse comma and second number (maximum repetition count). */ max = min; @@ -628,10 +951,11 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) } /* Check that the repeat counts are sane. */ - if ((max >= 0 && min > max) || max > RE_DUP_MAX) + if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX) return REG_BADBR; +#ifdef TRE_APPROX /* '{' optionally followed immediately by a number == minimum repcount @@ -785,8 +1109,9 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) } } } while (start != r); +#endif /* TRE_APPROX */ - /* Missing }. */ + /*{*//* Missing }. */ if (r >= ctx->re_end) return REG_EBRACE; @@ -800,6 +1125,27 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) if (r >= ctx->re_end || *r != CHAR_RBRACE) return REG_BADBR; r++; + /* Parse trailing '?' marking minimal repetition. */ + if (r < ctx->re_end) + { + if (*r == CHAR_QUESTIONMARK) + { + /* Process the question mark only in enhanced mode. + Otherwise, the question mark is an error in ERE + or a literal in BRE */ + if (ctx->cflags & REG_ENHANCED) + { + minimal = !(ctx->cflags & REG_UNGREEDY); + r++; + } + else return REG_BADRPT; + } + else if (*r == CHAR_STAR || *r == CHAR_PLUS) + { + /* These are reserved for future extensions. */ + return REG_BADRPT; + } + } } else { @@ -808,108 +1154,118 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) || *(r + 1) != CHAR_RBRACE) return REG_BADBR; r += 2; - } - - - /* Parse trailing '?' marking minimal repetition. */ - if (r < ctx->re_end) - { - if (*r == CHAR_QUESTIONMARK) - { - minimal = !(ctx->cflags & REG_UNGREEDY); - r++; - } - else if (*r == CHAR_STAR || *r == CHAR_PLUS) + if (r < ctx->re_end && *r == CHAR_STAR) { - /* These are reserved for future extensions. */ + /* This is reserved for future extensions. */ return REG_BADRPT; } } + if (minimal) + ctx->num_reorder_tags++; + + if (!result) goto parse_bound_exit; /* Create the AST node(s). */ - if (min == 0 && max == 0) - { - *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (*result == NULL) - return REG_ESPACE; - } - else - { - if (min < 0 && max < 0) - /* Only approximate parameters set, no repetitions. */ - min = max = 1; + /* Originally, if min == 0 && max == 0, we immediately replace the whole + iteration with EMPTY. This unfortunately drops any submatches, and + messes up setting the pmatch values (we can get tags of -1, and + tag values in the billions). So we leave it and process this case as + usual, and wait until tre_expand_ast() to replace with EMPTY */ +#ifdef TRE_APPROX + if (min < 0 && max < 0) + /* Only approximate parameters set, no repetitions. */ + min = max = 1; +#endif /* TRE_APPROX */ + + *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); + if (!*result) + return REG_ESPACE; - *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); - if (!*result) - return REG_ESPACE; +#ifdef TRE_APPROX + /* If approximate matching parameters are set, add them to the + iteration node. */ + if (approx || costs_set || counts_set) + { + int *params; + tre_iteration_t *iter = (*result)->obj; - /* If approximate matching parameters are set, add them to the - iteration node. */ - if (approx || costs_set || counts_set) + if (costs_set || counts_set) { - int *params; - tre_iteration_t *iter = (*result)->obj; - - if (costs_set || counts_set) + if (limit_ins == TRE_PARAM_UNSET) { - if (limit_ins == TRE_PARAM_UNSET) - { - if (cost_ins == TRE_PARAM_UNSET) - limit_ins = 0; - else - limit_ins = INT_MAX; - } - - if (limit_del == TRE_PARAM_UNSET) - { - if (cost_del == TRE_PARAM_UNSET) - limit_del = 0; - else - limit_del = INT_MAX; - } + if (cost_ins == TRE_PARAM_UNSET) + limit_ins = 0; + else + limit_ins = INT_MAX; + } - if (limit_subst == TRE_PARAM_UNSET) - { - if (cost_subst == TRE_PARAM_UNSET) - limit_subst = 0; - else - limit_subst = INT_MAX; - } + if (limit_del == TRE_PARAM_UNSET) + { + if (cost_del == TRE_PARAM_UNSET) + limit_del = 0; + else + limit_del = INT_MAX; } - if (cost_max == TRE_PARAM_UNSET) - cost_max = INT_MAX; - if (limit_err == TRE_PARAM_UNSET) - limit_err = INT_MAX; - - ctx->have_approx = 1; - params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST); - if (!params) - return REG_ESPACE; - for (i = 0; i < TRE_PARAM_LAST; i++) - params[i] = TRE_PARAM_UNSET; - params[TRE_PARAM_COST_INS] = cost_ins; - params[TRE_PARAM_COST_DEL] = cost_del; - params[TRE_PARAM_COST_SUBST] = cost_subst; - params[TRE_PARAM_COST_MAX] = cost_max; - params[TRE_PARAM_MAX_INS] = limit_ins; - params[TRE_PARAM_MAX_DEL] = limit_del; - params[TRE_PARAM_MAX_SUBST] = limit_subst; - params[TRE_PARAM_MAX_ERR] = limit_err; - iter->params = params; + if (limit_subst == TRE_PARAM_UNSET) + { + if (cost_subst == TRE_PARAM_UNSET) + limit_subst = 0; + else + limit_subst = INT_MAX; + } } + + if (cost_max == TRE_PARAM_UNSET) + cost_max = INT_MAX; + if (limit_err == TRE_PARAM_UNSET) + limit_err = INT_MAX; + + ctx->have_approx = 1; + params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST); + if (!params) + return REG_ESPACE; + for (i = 0; i < TRE_PARAM_LAST; i++) + params[i] = TRE_PARAM_UNSET; + params[TRE_PARAM_COST_INS] = cost_ins; + params[TRE_PARAM_COST_DEL] = cost_del; + params[TRE_PARAM_COST_SUBST] = cost_subst; + params[TRE_PARAM_COST_MAX] = cost_max; + params[TRE_PARAM_MAX_INS] = limit_ins; + params[TRE_PARAM_MAX_DEL] = limit_del; + params[TRE_PARAM_MAX_SUBST] = limit_subst; + params[TRE_PARAM_MAX_ERR] = limit_err; + iter->params = params; } +#endif /* TRE_APPROX */ +parse_bound_exit: +#ifdef TRE_APPROX DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], " "limits [%d,%d,%d, total %d]\n", min, max, cost_ins, cost_del, cost_subst, cost_max, limit_ins, limit_del, limit_subst, limit_err)); +#else /* !TRE_APPROX */ + DPRINT(("tre_parse_bound: min %d, max %d\n", min, max)); +#endif /* !TRE_APPROX */ ctx->re = r; return REG_OK; } +/* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for + non-self-contained options, like (?i), this causes ((?i)fu)bar to be + treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect. + Because we now set up tags for even non-capturing parenthesized + subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we + pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and + have it restore cflags after the subexpression, we don't need to have + a separate PARSE_RESTORE_CFLAGS, and then after processing the + non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE. + This has the side-benefit of now matching the perl behavior: the RE + foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior + of foo AND (?i) (bar OR zap). */ typedef enum { PARSE_RE = 0, PARSE_ATOM, @@ -921,7 +1277,6 @@ typedef enum { PARSE_UNION, PARSE_POST_UNION, PARSE_POSTFIX, - PARSE_RESTORE_CFLAGS } tre_parse_re_stack_symbol_t; @@ -935,16 +1290,23 @@ tre_parse(tre_parse_ctx_t *ctx) int bottom = tre_stack_num_objects(stack); int depth = 0; int temporary_cflags = 0; + int bre_branch_begin; +#ifdef TRE_DEBUG + const tre_char_t *tmp_re; +#endif - DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n", - ctx->len, ctx->re, ctx->len)); + DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n", + ctx->len, ctx->re, ctx->len, ctx->cflags)); + if (ctx->len <= 0) return REG_EMPTY; if (!ctx->nofirstsub) { + STACK_PUSH(stack, int, ctx->cflags); STACK_PUSH(stack, int, ctx->submatch_id); STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); ctx->submatch_id++; } + STACK_PUSH(stack, int, 0); // bre_branch_begin STACK_PUSH(stack, int, PARSE_RE); ctx->re_start = ctx->re; ctx->re_end = ctx->re + ctx->len; @@ -954,38 +1316,40 @@ tre_parse(tre_parse_ctx_t *ctx) an explicit stack instead of recursive functions mostly because of two reasons: compatibility with systems which have an overflowable call stack, and efficiency (both in lines of code and speed). */ - while (tre_stack_num_objects(stack) > bottom && status == REG_OK) + while (tre_stack_num_objects(stack) > bottom) { - if (status != REG_OK) - break; symbol = tre_stack_pop_int(stack); switch (symbol) { case PARSE_RE: /* Parse a full regexp. A regexp is one or more branches, separated by the union operator `|'. */ + bre_branch_begin = tre_stack_pop_int(stack); + if ( #ifdef REG_LITERAL - if (!(ctx->cflags & REG_LITERAL) - && ctx->cflags & REG_EXTENDED) + !(ctx->cflags & REG_LITERAL) && #endif /* REG_LITERAL */ + ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) STACK_PUSHX(stack, int, PARSE_UNION); + STACK_PUSHX(stack, int, bre_branch_begin); STACK_PUSHX(stack, int, PARSE_BRANCH); break; case PARSE_BRANCH: /* Parse a branch. A branch is one or more pieces, concatenated. A piece is an atom possibly followed by a postfix operator. */ + bre_branch_begin = tre_stack_pop_int(stack); STACK_PUSHX(stack, int, PARSE_CATENATION); + STACK_PUSHX(stack, int, bre_branch_begin); STACK_PUSHX(stack, int, PARSE_PIECE); break; case PARSE_PIECE: /* Parse a piece. A piece is an atom possibly followed by one or more postfix operators. */ -#ifdef REG_LITERAL - if (!(ctx->cflags & REG_LITERAL)) -#endif /* REG_LITERAL */ - STACK_PUSHX(stack, int, PARSE_POSTFIX); + bre_branch_begin = tre_stack_pop_int(stack); + STACK_PUSHX(stack, int, PARSE_POSTFIX); + STACK_PUSHX(stack, int, bre_branch_begin); STACK_PUSHX(stack, int, PARSE_ATOM); break; @@ -1000,20 +1364,23 @@ tre_parse(tre_parse_ctx_t *ctx) if (!(ctx->cflags & REG_LITERAL)) { #endif /* REG_LITERAL */ - if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) + if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) || + ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED + && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH && + *(ctx->re + 1) == CHAR_PIPE)) break; if ((ctx->cflags & REG_EXTENDED && c == CHAR_RPAREN && depth > 0) || (!(ctx->cflags & REG_EXTENDED) - && (c == CHAR_BACKSLASH - && *(ctx->re + 1) == CHAR_RPAREN))) + && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH + && *(ctx->re + 1) == CHAR_RPAREN)) { if (!(ctx->cflags & REG_EXTENDED) && depth == 0) - status = REG_EPAREN; + return REG_EPAREN; DPRINT(("tre_parse: group end: '%.*" STRF "'\n", REST(ctx->re))); depth--; - if (!(ctx->cflags & REG_EXTENDED)) + if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED))) ctx->re += 2; break; } @@ -1021,22 +1388,24 @@ tre_parse(tre_parse_ctx_t *ctx) } #endif /* REG_LITERAL */ -#ifdef REG_RIGHT_ASSOC - if (ctx->cflags & REG_RIGHT_ASSOC) +#ifdef REG_LEFT_ASSOC + if (ctx->cflags & REG_LEFT_ASSOC) { - /* Right associative concatenation. */ + /* Left associative concatenation. */ + STACK_PUSHX(stack, int, PARSE_CATENATION); STACK_PUSHX(stack, voidptr, result); STACK_PUSHX(stack, int, PARSE_POST_CATENATION); - STACK_PUSHX(stack, int, PARSE_CATENATION); + STACK_PUSHX(stack, int, 0); // bre_branch_begin STACK_PUSHX(stack, int, PARSE_PIECE); } else -#endif /* REG_RIGHT_ASSOC */ +#endif /* REG_LEFT_ASSOC */ { - /* Default case, left associative concatenation. */ - STACK_PUSHX(stack, int, PARSE_CATENATION); + /* Default case, right associative concatenation. */ STACK_PUSHX(stack, voidptr, result); STACK_PUSHX(stack, int, PARSE_POST_CATENATION); + STACK_PUSHX(stack, int, PARSE_CATENATION); + STACK_PUSHX(stack, int, 0); // bre_branch_begin STACK_PUSHX(stack, int, PARSE_PIECE); } break; @@ -1060,14 +1429,24 @@ tre_parse(tre_parse_ctx_t *ctx) if (ctx->cflags & REG_LITERAL) break; #endif /* REG_LITERAL */ + if (!(ctx->cflags & REG_EXTENDED)) + { + if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end) + break; + ctx->re++; + } switch (*ctx->re) { case CHAR_PIPE: DPRINT(("tre_parse: union: '%.*" STRF "'\n", REST(ctx->re))); STACK_PUSHX(stack, int, PARSE_UNION); + STACK_PUSHX(stack, voidptr, (void *)ctx->re); STACK_PUSHX(stack, voidptr, result); STACK_PUSHX(stack, int, PARSE_POST_UNION); + /* We need to pass a boolean (eventually) to PARSE_ATOM to + indicate if this is the beginning of a BRE extended branch. */ + STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin STACK_PUSHX(stack, int, PARSE_BRANCH); ctx->re++; break; @@ -1077,6 +1456,8 @@ tre_parse(tre_parse_ctx_t *ctx) break; default: + if (!(ctx->cflags & REG_EXTENDED)) + ctx->re--; break; } break; @@ -1085,6 +1466,12 @@ tre_parse(tre_parse_ctx_t *ctx) { tre_ast_node_t *tmp_node; tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); + const tre_char_t *pipechar = tre_stack_pop_voidptr(stack); + /* error on empty expression at end of union */ + if (pipechar == ctx->re - 1) + { + return REG_EMPTY; + } tmp_node = tre_ast_new_union(ctx->mem, tree, result); if (!tmp_node) return REG_ESPACE; @@ -1100,6 +1487,12 @@ tre_parse(tre_parse_ctx_t *ctx) if (ctx->cflags & REG_LITERAL) break; #endif /* REG_LITERAL */ + int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; + int rep_min = 0; + int rep_max = -1; +#ifdef TRE_DEBUG + int lbrace_off; +#endif switch (*ctx->re) { case CHAR_PLUS: @@ -1110,598 +1503,803 @@ tre_parse(tre_parse_ctx_t *ctx) case CHAR_STAR: { tre_ast_node_t *tmp_node; - int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; - int rep_min = 0; - int rep_max = -1; #ifdef TRE_DEBUG - const tre_char_t *tmp_re; + const char *tstr = "star"; + tmp_re = ctx->re; #endif + handle_plus_or_question: + /* error on iteration of raw assertion (not in subexpression) */ + if (result->type == LITERAL && result->submatch_id < 0 && + IS_ASSERTION((tre_literal_t *)result->obj)) + { + if (!(ctx->cflags & REG_EXTENDED)) break; + return REG_BADRPT; + } if (*ctx->re == CHAR_PLUS) - rep_min = 1; + { + rep_min = 1; +#ifdef TRE_DEBUG + tstr = "plus"; +#endif + } if (*ctx->re == CHAR_QUESTIONMARK) - rep_max = 1; + { + rep_max = 1; #ifdef TRE_DEBUG - tmp_re = ctx->re; + tstr = "questionmark"; #endif + } - if (ctx->re + 1 < ctx->re_end) + if (ctx->cflags & REG_EXTENDED) { - if (*(ctx->re + 1) == CHAR_QUESTIONMARK) + if (ctx->re + 1 < ctx->re_end) { - minimal = !(ctx->cflags & REG_UNGREEDY); - ctx->re++; + if (*(ctx->re + 1) == CHAR_QUESTIONMARK) + { + /* Process the question mark only in enhanced mode. + Otherwise, the question mark is an error in ERE */ + if (ctx->cflags & REG_ENHANCED) + { + minimal = !(ctx->cflags & REG_UNGREEDY); + ctx->re++; + } + else return REG_BADRPT; + } + else if (*(ctx->re + 1) == CHAR_STAR + || *(ctx->re + 1) == CHAR_PLUS) + { + /* These are reserved for future extensions. */ + return REG_BADRPT; + } } - else if (*(ctx->re + 1) == CHAR_STAR - || *(ctx->re + 1) == CHAR_PLUS) + } + else + { + if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR) { - /* These are reserved for future extensions. */ + /* This is reserved for future extensions. */ return REG_BADRPT; } - } + if (ctx->re + 2 < ctx->re_end) + { + if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_QUESTIONMARK) + { + /* Process the question mark only in enhanced mode. + Otherwise, the question mark is a literal in BRE */ + if (ctx->cflags & REG_ENHANCED) + { + minimal = !(ctx->cflags & REG_UNGREEDY); + ctx->re += 2; + } + } + else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS) + { + /* This is reserved for future extensions. */ + return REG_BADRPT; + } + } + } - DPRINT(("tre_parse: %s star: '%.*" STRF "'\n", - minimal ? " minimal" : "greedy", REST(tmp_re))); + if (minimal) + ctx->num_reorder_tags++; + + DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n", + minimal ? " minimal" : "greedy", tstr, REST(tmp_re))); + if (result == NULL) + { + if (ctx->cflags & REG_EXTENDED) return REG_BADRPT; + else goto parse_literal; + } ctx->re++; tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, minimal); if (tmp_node == NULL) return REG_ESPACE; result = tmp_node; + + /* Set the iterator with a submatch id in the invisible range + * (which will be overridden if a real submatch is needed) */ + result->submatch_id = ctx->submatch_id_invisible++; + +#if 0 + /* We don't allow multiple postfixes, but this might be needed + to support approximate matching */ STACK_PUSHX(stack, int, PARSE_POSTFIX); +#endif } break; case CHAR_BACKSLASH: /* "\{" is special without REG_EXTENDED */ + /* "\+" and "\?" are special with REG_ENHANCED for BRE */ if (!(ctx->cflags & REG_EXTENDED) - && ctx->re + 1 < ctx->re_end - && *(ctx->re + 1) == CHAR_LBRACE) + && ctx->re + 1 < ctx->re_end) { - ctx->re++; - goto parse_brace; + switch (*(ctx->re + 1)) + { + case CHAR_LBRACE: + ctx->re++; +#ifdef TRE_DEBUG + lbrace_off = 2; +#endif + goto parse_brace; + case CHAR_PLUS: + case CHAR_QUESTIONMARK: + if (ctx->cflags & REG_ENHANCED) + { +#ifdef TRE_DEBUG + tmp_re = ctx->re; +#endif + ctx->re++; + goto handle_plus_or_question; + } + break; + } + break; } else break; case CHAR_LBRACE: - /* "{" is literal without REG_EXTENDED */ - if (!(ctx->cflags & REG_EXTENDED)) - break; + { + int raw_assertion; + + /* "{" is literal without REG_EXTENDED */ + if (!(ctx->cflags & REG_EXTENDED)) + break; +#ifdef TRE_DEBUG + lbrace_off = 1; +#endif parse_brace: - DPRINT(("tre_parse: bound: '%.*" STRF "'\n", - REST(ctx->re))); - ctx->re++; + /* error on iteration of raw assertion (not in subexpression), + but wait until after parsing bounds */ + raw_assertion = (result->type == LITERAL + && result->submatch_id < 0 + && IS_ASSERTION((tre_literal_t *)result->obj)); + ctx->re++; - status = tre_parse_bound(ctx, &result); - if (status != REG_OK) - return status; - STACK_PUSHX(stack, int, PARSE_POSTFIX); - break; + status = tre_parse_bound(ctx, &result); +#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND + /* For ERE, if status is REG_NOMATCH, this mean the lbrace + is to be treated as a literal. */ + if (status == REG_NOMATCH) + { + ctx->re--; + break; + } +#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ + DPRINT(("tre_parse: bound: '%.*" STRF "'\n", + REST(ctx->re - lbrace_off))); + if (status != REG_OK) + return status; + if (raw_assertion) return REG_BADRPT; + + /* Set the iterator with a submatch id in the invisible range + * (which will be overridden if a real submatch is needed) */ + if (result->type == ITERATION) + result->submatch_id = ctx->submatch_id_invisible++; + +#if 0 + /* We don't allow multiple postfixes, but this might be needed + to support approximate matching */ + STACK_PUSHX(stack, int, PARSE_POSTFIX); +#endif + break; + } } break; case PARSE_ATOM: - /* Parse an atom. An atom is a regular expression enclosed in `()', - an empty set of `()', a bracket expression, `.', `^', `$', - a `\' followed by a character, or a single character. */ + { + /* Parse an atom. An atom is a regular expression enclosed in `()', + an empty set of `()', a bracket expression, `.', `^', `$', + a `\' followed by a character, or a single character. */ - /* End of regexp? (empty string). */ - if (ctx->re >= ctx->re_end) - goto parse_literal; + /* The stack contains a boolean value, whether PARSE_ATOM is + being called just after the start of a group (left paren) + in a BRE */ + bre_branch_begin = tre_stack_pop_int(stack); + + /* End of regexp? (empty string). */ + if (ctx->re >= ctx->re_end) + goto parse_literal; #ifdef REG_LITERAL - if (ctx->cflags & REG_LITERAL) - goto parse_literal; + if (ctx->cflags & REG_LITERAL) + goto parse_literal; #endif /* REG_LITERAL */ - switch (*ctx->re) - { - case CHAR_LPAREN: /* parenthesized subexpression */ + switch (*ctx->re) + { + case CHAR_LPAREN: /* parenthesized subexpression */ - /* Handle "(?...)" extensions. They work in a way similar - to Perls corresponding extensions. */ - if (ctx->cflags & REG_EXTENDED - && *(ctx->re + 1) == CHAR_QUESTIONMARK) - { - int new_cflags = ctx->cflags; - int bit = 1; - DPRINT(("tre_parse: extension: '%.*" STRF "\n", - REST(ctx->re))); - ctx->re += 2; - while (/*CONSTCOND*/1) - { - if (*ctx->re == L'i') - { - DPRINT(("tre_parse: icase: '%.*" STRF "\n", - REST(ctx->re))); - if (bit) - new_cflags |= REG_ICASE; - else - new_cflags &= ~REG_ICASE; - ctx->re++; - } - else if (*ctx->re == L'n') - { - DPRINT(("tre_parse: newline: '%.*" STRF "\n", - REST(ctx->re))); - if (bit) - new_cflags |= REG_NEWLINE; - else - new_cflags &= ~REG_NEWLINE; - ctx->re++; - } -#ifdef REG_RIGHT_ASSOC - else if (*ctx->re == L'r') - { - DPRINT(("tre_parse: right assoc: '%.*" STRF "\n", - REST(ctx->re))); - if (bit) - new_cflags |= REG_RIGHT_ASSOC; - else - new_cflags &= ~REG_RIGHT_ASSOC; - ctx->re++; - } -#endif /* REG_RIGHT_ASSOC */ + /* Handle "(?...)" extensions. They work in a way similar + to Perls corresponding extensions. */ + if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) == + (REG_EXTENDED|REG_ENHANCED) + && *(ctx->re + 1) == CHAR_QUESTIONMARK) + { + int new_cflags = ctx->cflags; + int bit = 1; + int invisible_submatch = 0; + DPRINT(("tre_parse: extension: '%.*" STRF "'\n", + REST(ctx->re))); + ctx->re += 2; + while (/*CONSTCOND*/1) + { + if (*ctx->re == L'i') + { + DPRINT(("tre_parse: icase: '%.*" STRF "'\n", + REST(ctx->re))); + if (bit) + new_cflags |= REG_ICASE; + else + new_cflags &= ~REG_ICASE; + ctx->re++; + } + else if (*ctx->re == L'n') + { + DPRINT(("tre_parse: newline: '%.*" STRF "'\n", + REST(ctx->re))); + if (bit) + new_cflags |= REG_NEWLINE; + else + new_cflags &= ~REG_NEWLINE; + ctx->re++; + } +#ifdef REG_LEFT_ASSOC + else if (*ctx->re == L'l') + { + DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n", + REST(ctx->re))); + if (bit) + new_cflags |= REG_LEFT_ASSOC; + else + new_cflags &= ~REG_LEFT_ASSOC; + ctx->re++; + } +#endif /* REG_LEFT_ASSOC */ #ifdef REG_UNGREEDY - else if (*ctx->re == L'U') - { - DPRINT(("tre_parse: ungreedy: '%.*" STRF "\n", - REST(ctx->re))); - if (bit) - new_cflags |= REG_UNGREEDY; - else - new_cflags &= ~REG_UNGREEDY; - ctx->re++; - } + else if (*ctx->re == L'U') + { + DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n", + REST(ctx->re))); + if (bit) + new_cflags |= REG_UNGREEDY; + else + new_cflags &= ~REG_UNGREEDY; + ctx->re++; + } #endif /* REG_UNGREEDY */ - else if (*ctx->re == CHAR_MINUS) - { - DPRINT(("tre_parse: turn off: '%.*" STRF "\n", - REST(ctx->re))); - ctx->re++; - bit = 0; - } - else if (*ctx->re == CHAR_COLON) - { - DPRINT(("tre_parse: no group: '%.*" STRF "\n", - REST(ctx->re))); - ctx->re++; - depth++; - break; - } - else if (*ctx->re == CHAR_HASH) - { - DPRINT(("tre_parse: comment: '%.*" STRF "\n", - REST(ctx->re))); - /* A comment can contain any character except a - right parenthesis */ - while (*ctx->re != CHAR_RPAREN - && ctx->re < ctx->re_end) + else if (*ctx->re == CHAR_MINUS) + { + DPRINT(("tre_parse: turn off: '%.*" STRF "'\n", + REST(ctx->re))); ctx->re++; - if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end) - { + bit = 0; + } + else if (*ctx->re == CHAR_COLON) + { + DPRINT(("tre_parse: no group: '%.*" STRF + "', (invisible submatch %d)\n", + REST(ctx->re), ctx->submatch_id_invisible)); + ctx->re++; + depth++; + invisible_submatch = 1; + break; + } + else if (*ctx->re == CHAR_HASH) + { + DPRINT(("tre_parse: comment: '%.*" STRF "'\n", + REST(ctx->re))); + /* A comment can contain any character except a + right parenthesis */ + while (*ctx->re != CHAR_RPAREN + && ctx->re < ctx->re_end) ctx->re++; - break; - } - else - return REG_BADPAT; - } - else if (*ctx->re == CHAR_RPAREN) - { - ctx->re++; - break; - } - else - return REG_BADPAT; - } - - /* Turn on the cflags changes for the rest of the - enclosing group. */ - STACK_PUSHX(stack, int, ctx->cflags); - STACK_PUSHX(stack, int, PARSE_RESTORE_CFLAGS); - STACK_PUSHX(stack, int, PARSE_RE); - ctx->cflags = new_cflags; - break; - } + if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end) + { + ctx->re++; + break; + } + else + return REG_BADPAT; + } + else if (*ctx->re == CHAR_RPAREN) + { + ctx->re++; + break; + } + else + return REG_BADRPT; + } - if (ctx->cflags & REG_EXTENDED - || (ctx->re > ctx->re_start - && *(ctx->re - 1) == CHAR_BACKSLASH)) - { - depth++; - if (ctx->re + 2 < ctx->re_end - && *(ctx->re + 1) == CHAR_QUESTIONMARK - && *(ctx->re + 2) == CHAR_COLON) - { - DPRINT(("tre_parse: group begin: '%.*" STRF - "', no submatch\n", REST(ctx->re))); - /* Don't mark for submatching. */ - ctx->re += 3; - STACK_PUSHX(stack, int, PARSE_RE); - } - else - { - DPRINT(("tre_parse: group begin: '%.*" STRF - "', submatch %d\n", REST(ctx->re), - ctx->submatch_id)); - ctx->re++; - /* First parse a whole RE, then mark the resulting tree - for submatching. */ - STACK_PUSHX(stack, int, ctx->submatch_id); - STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); - STACK_PUSHX(stack, int, PARSE_RE); - ctx->submatch_id++; + /* Turn on the cflags changes for the rest of the + enclosing group. */ + if (invisible_submatch) + { + STACK_PUSHX(stack, int, ctx->cflags); + STACK_PUSHX(stack, int, ctx->submatch_id_invisible); + STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); + ctx->submatch_id_invisible++; + STACK_PUSHX(stack, int, 0); // bre_branch_begin + STACK_PUSHX(stack, int, PARSE_RE); + } + else { + STACK_PUSHX(stack, int, 0); // bre_branch_begin + STACK_PUSHX(stack, int, PARSE_ATOM); } - } - else - goto parse_literal; - break; + ctx->cflags = new_cflags; + break; + } - case CHAR_RPAREN: /* end of current subexpression */ - if ((ctx->cflags & REG_EXTENDED && depth > 0) - || (ctx->re > ctx->re_start - && *(ctx->re - 1) == CHAR_BACKSLASH)) - { - DPRINT(("tre_parse: empty: '%.*" STRF "'\n", - REST(ctx->re))); - /* We were expecting an atom, but instead the current - subexpression was closed. POSIX leaves the meaning of - this to be implementation-defined. We interpret this as - an empty expression (which matches an empty string). */ - result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (result == NULL) - return REG_ESPACE; - if (!(ctx->cflags & REG_EXTENDED)) - ctx->re--; - } - else - goto parse_literal; - break; + if (ctx->cflags & REG_EXTENDED) + { + parse_bre_lparen: + DPRINT(("tre_parse: group begin: '%.*" STRF + "', submatch %d\n", REST(ctx->re), + ctx->submatch_id)); + ctx->re++; + /* First parse a whole RE, then mark the resulting tree + for submatching. */ + STACK_PUSHX(stack, int, ctx->cflags); + STACK_PUSHX(stack, int, ctx->submatch_id); + STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); + /* We need to pass a boolean (eventually) to PARSE_ATOM to + indicate if this is the beginning of a BRE group. */ + STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED)); + STACK_PUSHX(stack, int, PARSE_RE); + ctx->submatch_id++; + depth++; + } + else + goto parse_literal; + break; - case CHAR_LBRACKET: /* bracket expression */ - DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", - REST(ctx->re))); - ctx->re++; - status = tre_parse_bracket(ctx, &result); - if (status != REG_OK) - return status; - break; + case CHAR_RPAREN: /* end of current subexpression */ + if (ctx->cflags & REG_EXTENDED && depth > 0) + { + parse_bre_rparen_empty: + if (!(ctx->cflags & REG_EXTENDED) && depth == 0) + return REG_EPAREN; + DPRINT(("tre_parse: empty: '%.*" STRF "'\n", + REST(ctx->re))); + /* We were expecting an atom, but instead the current + subexpression was closed. POSIX leaves the meaning of + this to be implementation-defined. We interpret this as + an empty expression (which matches an empty string). */ + result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + if (result == NULL) + return REG_ESPACE; + if (!(ctx->cflags & REG_EXTENDED)) + ctx->re--; + } + else + goto parse_literal; + break; - case CHAR_BACKSLASH: - /* If this is "\(" or "\)" chew off the backslash and - try again. */ - if (!(ctx->cflags & REG_EXTENDED) - && ctx->re + 1 < ctx->re_end - && (*(ctx->re + 1) == CHAR_LPAREN - || *(ctx->re + 1) == CHAR_RPAREN)) - { - ctx->re++; - STACK_PUSHX(stack, int, PARSE_ATOM); - break; - } + case CHAR_LBRACKET: /* bracket expression */ + DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", + REST(ctx->re))); + ctx->re++; + status = tre_parse_bracket(ctx, &result); + if (status != REG_OK) + return status; + break; - /* If a macro is used, parse the expanded macro recursively. */ - { - tre_char_t buf[64]; - tre_expand_macro(ctx->re + 1, ctx->re_end, - buf, elementsof(buf)); - if (buf[0] != 0) + case CHAR_BACKSLASH: + /* Deal with "\(", "\)" or "\{" for BREs */ + if (!(ctx->cflags & REG_EXTENDED) + && ctx->re + 1 < ctx->re_end) { - tre_parse_ctx_t subctx; - memcpy(&subctx, ctx, sizeof(subctx)); - subctx.re = buf; - subctx.len = tre_strlen(buf); - subctx.nofirstsub = 1; - status = tre_parse(&subctx); - if (status != REG_OK) - return status; - ctx->re += 2; - ctx->position = subctx.position; - result = subctx.result; - break; + if (*(ctx->re + 1) == CHAR_LPAREN) + { + ctx->re++; + goto parse_bre_lparen; + } + else if (*(ctx->re + 1) == CHAR_RPAREN) + { + ctx->re++; + goto parse_bre_rparen_empty; + } + if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal; } - } - if (ctx->re + 1 >= ctx->re_end) - /* Trailing backslash. */ - return REG_EESCAPE; + if (ctx->re + 1 >= ctx->re_end) + /* Trailing backslash. */ + return REG_EESCAPE; -#ifdef REG_LITERAL - if (*(ctx->re + 1) == L'Q') - { - DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n", - REST(ctx->re))); - ctx->cflags |= REG_LITERAL; - temporary_cflags |= REG_LITERAL; - ctx->re += 2; - STACK_PUSHX(stack, int, PARSE_ATOM); - break; - } -#endif /* REG_LITERAL */ + if (!(ctx->cflags & REG_ENHANCED)) + { + DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re))); + ctx->re++; + goto unenhanced_backslash; + } - DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re))); - ctx->re++; - switch (*ctx->re) + /* If a macro is used, parse the expanded macro recursively. */ { - case L'b': - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_WB, -1); - ctx->re++; - break; - case L'B': - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_WB_NEG, -1); - ctx->re++; - break; - case L'<': - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_BOW, -1); - ctx->re++; - break; - case L'>': - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_EOW, -1); - ctx->re++; - break; - case L'x': - ctx->re++; - if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end) + tre_char_t buf[64]; + tre_expand_macro(ctx->re + 1, ctx->re_end, + buf, elementsof(buf)); + if (buf[0] != 0) { - /* 8 bit hex char. */ - char tmp[3] = {0, 0, 0}; - long val; - DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n", - REST(ctx->re - 2))); - - if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end) - { - tmp[0] = (char)ctx->re[0]; - ctx->re++; - } - if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end) - { - tmp[1] = (char)ctx->re[0]; - ctx->re++; - } - val = strtol(tmp, NULL, 16); - result = tre_ast_new_literal(ctx->mem, (int)val, - (int)val, ctx->position); - ctx->position++; - break; - } - else if (ctx->re < ctx->re_end) - { - /* Wide char. */ - char tmp[32]; - long val; - int i = 0; - ctx->re++; - while (ctx->re_end - ctx->re >= 0) - { - if (ctx->re[0] == CHAR_RBRACE) - break; - if (tre_isxdigit(ctx->re[0])) - { - tmp[i] = (char)ctx->re[0]; - i++; - ctx->re++; - continue; - } - return REG_EBRACE; - } - ctx->re++; - tmp[i] = 0; - val = strtol(tmp, NULL, 16); - result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, - ctx->position); - ctx->position++; + tre_parse_ctx_t subctx; + memcpy(&subctx, ctx, sizeof(subctx)); + subctx.re = buf; + subctx.len = tre_strlen(buf); + subctx.nofirstsub = 1; + status = tre_parse(&subctx); + if (status != REG_OK) + return status; + ctx->re += 2; + ctx->position = subctx.position; + result = subctx.result; break; } - /*FALLTHROUGH*/ - - default: - if (tre_isdigit(*ctx->re)) - { - /* Back reference. */ - int val = *ctx->re - L'0'; - DPRINT(("tre_parse: backref: '%.*" STRF "'\n", - REST(ctx->re - 1))); - result = tre_ast_new_literal(ctx->mem, BACKREF, val, - ctx->position); - if (result == NULL) - return REG_ESPACE; - ctx->position++; - ctx->max_backref = MAX(val, ctx->max_backref); - ctx->re++; - } - else - { - /* Escaped character. */ - DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", - REST(ctx->re - 1))); - result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, - ctx->position); - ctx->position++; - ctx->re++; - } - break; } - if (result == NULL) - return REG_ESPACE; - break; - case CHAR_PERIOD: /* the any-symbol */ - DPRINT(("tre_parse: any: '%.*" STRF "'\n", - REST(ctx->re))); - if (ctx->cflags & REG_NEWLINE) - { - tre_ast_node_t *tmp1; - tre_ast_node_t *tmp2; - tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1, - ctx->position); - if (!tmp1) - return REG_ESPACE; - tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX, - ctx->position + 1); - if (!tmp2) - return REG_ESPACE; - result = tre_ast_new_union(ctx->mem, tmp1, tmp2); - if (!result) - return REG_ESPACE; - ctx->position += 2; - } - else - { - result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, +#ifdef REG_LITERAL + if (*(ctx->re + 1) == L'Q') + { + DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n", + REST(ctx->re))); + ctx->cflags |= REG_LITERAL; + temporary_cflags |= REG_LITERAL; + ctx->re += 2; + STACK_PUSHX(stack, int, 0); + STACK_PUSHX(stack, int, PARSE_ATOM); + break; + } +#endif /* REG_LITERAL */ + + DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re))); + ctx->re++; + switch (*ctx->re) + { + case L'b': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_WB, -1); + ctx->re++; + break; + case L'B': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_WB_NEG, -1); + ctx->re++; + break; + case L'<': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_BOW, -1); + ctx->re++; + break; + case L'>': + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_EOW, -1); + ctx->re++; + break; + case L'x': + ctx->re++; + if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end) + { + /* 8 bit hex char. */ + char tmp[3] = {0, 0, 0}; + long val; + DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n", + REST(ctx->re - 2))); + + if (tre_isxdigit_l(ctx->re[0], ctx->loc) && + ctx->re < ctx->re_end) + { + tmp[0] = (char)ctx->re[0]; + ctx->re++; + } + if (tre_isxdigit_l(ctx->re[0], ctx->loc) && + ctx->re < ctx->re_end) + { + tmp[1] = (char)ctx->re[0]; + ctx->re++; + } + val = strtol(tmp, NULL, 16); + result = tre_ast_new_literal(ctx->mem, (int)val, + (int)val, ctx->position); + ctx->position++; + break; + } + else if (ctx->re < ctx->re_end) + { + /* Wide char. */ + char tmp[32]; + long val; + int i = 0; + ctx->re++; + while (ctx->re_end - ctx->re >= 0) + { + if (ctx->re[0] == CHAR_RBRACE) + break; + if (tre_isxdigit_l(ctx->re[0], ctx->loc)) + { + tmp[i] = (char)ctx->re[0]; + i++; + ctx->re++; + continue; + } + return REG_EBRACE; + } + ctx->re++; + tmp[i] = 0; + val = strtol(tmp, NULL, 16); + result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, + ctx->position); + ctx->position++; + break; + } + /*FALLTHROUGH*/ + + default: + unenhanced_backslash: + if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) != + REG_EXTENDED && + tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0') + { + /* Back reference (only in BRE or enhanced). */ + int val = *ctx->re - L'0'; + DPRINT(("tre_parse: backref: '%.*" STRF "'\n", + REST(ctx->re - 1))); + result = tre_ast_new_literal(ctx->mem, BACKREF, val, + ctx->position); + if (result == NULL) + return REG_ESPACE; + + /* Set the backref with a submatch id in the invisible + * range (which will be overridden if a real submatch + * is needed) */ + result->submatch_id = ctx->submatch_id_invisible++; + + ctx->position++; + ctx->num_reorder_tags++; + ctx->max_backref = MAX(val, ctx->max_backref); + ctx->re++; + } + else + { + /* Escaped character. */ + DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", + REST(ctx->re - 1))); + result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, + ctx->position); + ctx->position++; + ctx->re++; + } + break; + } + if (result == NULL) + return REG_ESPACE; + break; + + case CHAR_PERIOD: /* the any-symbol */ + DPRINT(("tre_parse: any: '%.*" STRF "'\n", + REST(ctx->re))); + if (ctx->cflags & REG_NEWLINE) + { + tre_ast_node_t *tmp1; + tre_ast_node_t *tmp2; + tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1, ctx->position); - if (!result) - return REG_ESPACE; - ctx->position++; - } - ctx->re++; - break; + if (!tmp1) + return REG_ESPACE; + tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX, + ctx->position + 1); + if (!tmp2) + return REG_ESPACE; + result = tre_ast_new_union(ctx->mem, tmp1, tmp2); + if (!result) + return REG_ESPACE; + ctx->position += 2; + } + else + { + result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, + ctx->position); + if (!result) + return REG_ESPACE; + ctx->position++; + } + ctx->re++; + break; - case CHAR_CARET: /* beginning of line assertion */ - /* '^' has a special meaning everywhere in EREs, and in the - beginning of the RE and after \( is BREs. */ - if (ctx->cflags & REG_EXTENDED - || (ctx->re - 2 >= ctx->re_start - && *(ctx->re - 2) == CHAR_BACKSLASH - && *(ctx->re - 1) == CHAR_LPAREN) - || ctx->re == ctx->re_start) - { - DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", - REST(ctx->re))); - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_BOL, -1); - if (result == NULL) - return REG_ESPACE; - ctx->re++; - } - else - goto parse_literal; - break; + case CHAR_CARET: /* beginning of line assertion */ + /* '^' has a special meaning everywhere in EREs, at the + beginning of the RE and after \( is BREs. It is also + special in enhanced BREs at the beginning of each branches + of a union */ + if (ctx->cflags & REG_EXTENDED + || bre_branch_begin + || ctx->re == ctx->re_start) + { + DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", + REST(ctx->re))); + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_BOL, -1); + if (result == NULL) + return REG_ESPACE; + ctx->re++; + } + else + goto parse_literal; + break; - case CHAR_DOLLAR: /* end of line assertion. */ - /* '$' is special everywhere in EREs, and in the end of the - string and before \) is BREs. */ - if (ctx->cflags & REG_EXTENDED - || (ctx->re + 2 < ctx->re_end - && *(ctx->re + 1) == CHAR_BACKSLASH - && *(ctx->re + 2) == CHAR_RPAREN) - || ctx->re + 1 == ctx->re_end) - { - DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", - REST(ctx->re))); - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_EOL, -1); - if (result == NULL) - return REG_ESPACE; - ctx->re++; - } - else - goto parse_literal; - break; + case CHAR_DOLLAR: /* end of line assertion. */ + /* '$' is special everywhere in EREs, and in the end of the + string and before \) is BREs. */ + if (ctx->cflags & REG_EXTENDED + || (ctx->re + 2 < ctx->re_end + && *(ctx->re + 1) == CHAR_BACKSLASH + && *(ctx->re + 2) == CHAR_RPAREN) + || ctx->re + 1 == ctx->re_end) + { + DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", + REST(ctx->re))); + result = tre_ast_new_literal(ctx->mem, ASSERTION, + ASSERT_AT_EOL, -1); + if (result == NULL) + return REG_ESPACE; + ctx->re++; + } + else + goto parse_literal; + break; - default: - parse_literal: + default: + parse_literal: - if (temporary_cflags && ctx->re + 1 < ctx->re_end - && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E') - { - DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n", - REST(ctx->re))); - ctx->cflags &= ~temporary_cflags; - temporary_cflags = 0; - ctx->re += 2; - STACK_PUSHX(stack, int, PARSE_PIECE); - break; - } + if (temporary_cflags && ctx->re + 1 < ctx->re_end + && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E') + { + DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n", + REST(ctx->re))); + ctx->cflags &= ~temporary_cflags; + temporary_cflags = 0; + ctx->re += 2; + if (ctx->re < ctx->re_end) + { + STACK_PUSHX(stack, int, 0); + STACK_PUSHX(stack, int, PARSE_ATOM); + } + else + { + result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + if (!result) return REG_ESPACE; + } + break; + } - /* We are expecting an atom. If the subexpression (or the whole - regexp ends here, we interpret it as an empty expression - (which matches an empty string). */ - if ( + /* We are expecting an atom. If the subexpression (or the whole + regexp ends here, we interpret it as an empty expression + (which matches an empty string), which is an error. + Iterations of an empty expression is also an error. */ #ifdef REG_LITERAL - !(ctx->cflags & REG_LITERAL) && + if (!(ctx->cflags & REG_LITERAL)) + { +#endif /* REG_LITERAL */ + /* error on end of string */ + if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN + : REG_EMPTY; + /* error on unions and iterations of empty expressions */ + if (ctx->cflags & REG_EXTENDED) + { + if (ctx->re < ctx->re_end) + { + if (*ctx->re == CHAR_PIPE) return REG_EMPTY; + if (*ctx->re == CHAR_LBRACE) + { + ctx->re++; + empty_parse_bound: + /* We need to parse the bound first and return + any error, before returning REG_BADRPT */ + status = tre_parse_bound(ctx, NULL); +#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND + /* For ERE, if REG_NOMATCH is returned, we + treat the lbrace as a literal. */ + if (status == REG_NOMATCH) + { + ctx->re--; + /* Drop down to literal-handling code */ + } + else + { +#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ + if (status != REG_OK) + return status; + return REG_BADRPT; +#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND + } +#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ + } +#ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND + else +#endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ + if (*ctx->re == CHAR_STAR + || *ctx->re == CHAR_PLUS + || *ctx->re == CHAR_QUESTIONMARK) + { + return REG_BADRPT; + } + } + } + else if (ctx->re + 1 < ctx->re_end + && *ctx->re == CHAR_BACKSLASH + && *(ctx->re + 1) == CHAR_LBRACE) + { + ctx->re += 2; + goto empty_parse_bound; + } +#ifdef REG_LITERAL + } #endif /* REG_LITERAL */ - (ctx->re >= ctx->re_end - || *ctx->re == CHAR_STAR - || (ctx->cflags & REG_EXTENDED - && (*ctx->re == CHAR_PIPE - || *ctx->re == CHAR_LBRACE - || *ctx->re == CHAR_PLUS - || *ctx->re == CHAR_QUESTIONMARK)) - /* Test for "\)" in BRE mode. */ - || (!(ctx->cflags & REG_EXTENDED) - && ctx->re + 1 < ctx->re_end - && *ctx->re == CHAR_BACKSLASH - && *(ctx->re + 1) == CHAR_LBRACE))) - { - DPRINT(("tre_parse: empty: '%.*" STRF "'\n", - REST(ctx->re))); - result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (!result) - return REG_ESPACE; - break; - } - DPRINT(("tre_parse: literal: '%.*" STRF "'\n", - REST(ctx->re))); - /* Note that we can't use an tre_isalpha() test here, since there - may be characters which are alphabetic but neither upper or - lower case. */ - if (ctx->cflags & REG_ICASE - && (tre_isupper(*ctx->re) || tre_islower(*ctx->re))) - { - tre_ast_node_t *tmp1; - tre_ast_node_t *tmp2; - - /* XXX - Can there be more than one opposite-case - counterpoints for some character in some locale? Or - more than two characters which all should be regarded - the same character if case is ignored? If yes, there - does not seem to be a portable way to detect it. I guess - that at least for multi-character collating elements there - could be several opposite-case counterpoints, but they - cannot be supported portably anyway. */ - tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re), - tre_toupper(*ctx->re), - ctx->position); - if (!tmp1) - return REG_ESPACE; - tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re), - tre_tolower(*ctx->re), - ctx->position); - if (!tmp2) - return REG_ESPACE; - result = tre_ast_new_union(ctx->mem, tmp1, tmp2); - if (!result) - return REG_ESPACE; - } - else - { - result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, + DPRINT(("tre_parse: literal: '%.*" STRF "'\n", + REST(ctx->re))); + /* Note that we can't use an tre_isalpha() test here, since there + may be characters which are alphabetic but neither upper or + lower case. */ + if (ctx->cflags & REG_ICASE + && (tre_isupper_l(*ctx->re, ctx->loc) || + tre_islower_l(*ctx->re, ctx->loc))) + { + tre_ast_node_t *tmp1; + tre_ast_node_t *tmp2; + + /* XXX - Can there be more than one opposite-case + counterpoints for some character in some locale? Or + more than two characters which all should be regarded + the same character if case is ignored? If yes, there + does not seem to be a portable way to detect it. I guess + that at least for multi-character collating elements there + could be several opposite-case counterpoints, but they + cannot be supported portably anyway. */ + tmp1 = tre_ast_new_literal(ctx->mem, + tre_toupper_l(*ctx->re, ctx->loc), + tre_toupper_l(*ctx->re, ctx->loc), ctx->position); - if (!result) - return REG_ESPACE; - } - ctx->position++; - ctx->re++; - break; - } - break; + if (!tmp1) + return REG_ESPACE; + tmp2 = tre_ast_new_literal(ctx->mem, + tre_tolower_l(*ctx->re, ctx->loc), + tre_tolower_l(*ctx->re, ctx->loc), + ctx->position); + if (!tmp2) + return REG_ESPACE; + result = tre_ast_new_union(ctx->mem, tmp1, tmp2); + if (!result) + return REG_ESPACE; + } + else + { + result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, + ctx->position); + if (!result) + return REG_ESPACE; + } + ctx->position++; + ctx->re++; + break; + } + break; + } case PARSE_MARK_FOR_SUBMATCH: { int submatch_id = tre_stack_pop_int(stack); - if (result->submatch_id >= 0) + ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */ + if (result->submatch_id >= 0 && + result->submatch_id < SUBMATCH_ID_INVISIBLE_START) { tre_ast_node_t *n, *tmp_node; + if (submatch_id >= SUBMATCH_ID_INVISIBLE_START) + break; n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); if (n == NULL) return REG_ESPACE; @@ -1712,14 +2310,11 @@ tre_parse(tre_parse_ctx_t *ctx) result = tmp_node; } result->submatch_id = submatch_id; - result->num_submatches++; + if (submatch_id < SUBMATCH_ID_INVISIBLE_START) + result->num_submatches++; break; } - case PARSE_RESTORE_CFLAGS: - ctx->cflags = tre_stack_pop_int(stack); - break; - default: assert(0); break; @@ -1730,10 +2325,9 @@ tre_parse(tre_parse_ctx_t *ctx) if (depth > 0) return REG_EPAREN; - if (status == REG_OK) - ctx->result = result; + ctx->result = result; - return status; + return REG_OK; } /* EOF */ diff --git a/contrib/tre/lib/tre-parse.h b/contrib/tre/lib/tre-parse.h index 8cfedf15f7..f0aeac22bd 100644 --- a/contrib/tre/lib/tre-parse.h +++ b/contrib/tre/lib/tre-parse.h @@ -9,6 +9,8 @@ #ifndef TRE_PARSE_H #define TRE_PARSE_H 1 +#include "xlocale_private.h" + /* Parse context. */ typedef struct { /* Memory allocator. The AST is allocated using this. */ @@ -23,13 +25,19 @@ typedef struct { const tre_char_t *re_start; /* The first character after the end of the regexp. */ const tre_char_t *re_end; + /* The current locale */ + locale_t loc; int len; /* Current submatch ID. */ int submatch_id; + /* Current invisible submatch ID. */ + int submatch_id_invisible; /* Current position (number of literal). */ int position; /* The highest back reference or -1 if none seen so far. */ int max_backref; + /* Number of tags that need reordering. */ + int num_reorder_tags; /* This flag is set if the regexp uses approximate matching. */ int have_approx; /* Compilation flags. */