2 tre-match-backtrack.c - TRE backtracking regex matching engine
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
10 This matcher is for regexps that use back referencing. Regexp matching
11 with back referencing is an NP-complete problem on the number of back
12 references. The easiest way to match them is to use a backtracking
13 routine which basically goes through all possible paths in the TNFA
14 and chooses the one which results in the best (leftmost and longest)
15 match. This can be spectacularly expensive and may run out of stack
16 space, but there really is no better known generic algorithm. Quoting
17 Henry Spencer from comp.compilers:
18 <URL: http://compilers.iecc.com/comparch/article/93-03-102>
20 POSIX.2 REs require longest match, which is really exciting to
21 implement since the obsolete ("basic") variant also includes
22 \<digit>. I haven't found a better way of tackling this than doing
23 a preliminary match using a DFA (or simulation) on a modified RE
24 that just replicates subREs for \<digit>, and then doing a
25 backtracking match to determine whether the subRE matches were
26 right. This can be rather slow, but I console myself with the
27 thought that people who use \<digit> deserve very slow execution.
28 (Pun unintentional but very appropriate.)
35 #endif /* HAVE_CONFIG_H */
38 /* AIX requires this to be the first thing in the file. */
46 # ifndef alloca /* predefined by HP cc +Olibcalls */
52 #endif /* TRE_USE_ALLOCA */
59 #endif /* HAVE_WCHAR_H */
62 #endif /* HAVE_WCTYPE_H */
65 #endif /* !TRE_WCHAR */
68 #endif /* HAVE_MALLOC_H */
70 #include "tre-internal.h"
72 #include "tre-match-utils.h"
80 const wchar_t *str_wide;
81 #endif /* TRE_WCHAR */
82 tre_tnfa_transition_t *state;
88 #endif /* TRE_MBSTATE */
89 } tre_backtrack_item_t;
91 typedef struct tre_backtrack_struct {
92 tre_backtrack_item_t item;
93 struct tre_backtrack_struct *prev;
94 struct tre_backtrack_struct *next;
98 #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide)
99 #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide
100 #else /* !TRE_WCHAR */
101 #define BT_STACK_WIDE_IN(_str_wide)
102 #define BT_STACK_WIDE_OUT
103 #endif /* !TRE_WCHAR */
106 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
107 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
108 #else /* !TRE_MBSTATE */
109 #define BT_STACK_MBSTATE_IN
110 #define BT_STACK_MBSTATE_OUT
111 #endif /* !TRE_MBSTATE */
114 #ifdef TRE_USE_ALLOCA
115 #define tre_bt_mem_new tre_mem_newa
116 #define tre_bt_mem_alloc tre_mem_alloca
117 #define tre_bt_mem_destroy(obj) do { } while (0)
118 #else /* !TRE_USE_ALLOCA */
119 #define tre_bt_mem_new tre_mem_new
120 #define tre_bt_mem_alloc tre_mem_alloc
121 #define tre_bt_mem_destroy tre_mem_destroy
122 #endif /* !TRE_USE_ALLOCA */
125 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
132 s = tre_bt_mem_alloc(mem, sizeof(*s)); \
135 tre_bt_mem_destroy(mem); \
141 xfree(states_seen); \
146 s->item.tags = tre_bt_mem_alloc(mem, \
147 sizeof(*tags) * tnfa->num_tags); \
150 tre_bt_mem_destroy(mem); \
156 xfree(states_seen); \
163 stack = stack->next; \
164 stack->item.pos = (_pos); \
165 stack->item.str_byte = (_str_byte); \
166 BT_STACK_WIDE_IN(_str_wide); \
167 stack->item.state = (_state); \
168 stack->item.state_id = (_state_id); \
169 stack->item.next_c = (_next_c); \
170 for (i = 0; i < tnfa->num_tags; i++) \
171 stack->item.tags[i] = (_tags)[i]; \
172 BT_STACK_MBSTATE_IN; \
174 while (/*CONSTCOND*/0)
176 #define BT_STACK_POP() \
180 assert(stack->prev); \
181 pos = stack->item.pos; \
182 if (type == STR_USER) \
183 str_source->rewind(pos + pos_add_next, str_source->context); \
184 str_byte = stack->item.str_byte; \
186 state = stack->item.state; \
187 next_c = stack->item.next_c; \
188 for (i = 0; i < tnfa->num_tags; i++) \
189 tags[i] = stack->item.tags[i]; \
190 BT_STACK_MBSTATE_OUT; \
191 stack = stack->prev; \
193 while (/*CONSTCOND*/0)
196 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
199 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
200 int len, tre_str_type_t type, int *match_tags,
201 int eflags, int *match_end_ofs)
203 /* State variables required by GET_NEXT_WCHAR. */
204 tre_char_t prev_c = 0, next_c = 0;
205 const char *str_byte = string;
207 unsigned int pos_add_next = 1;
209 const wchar_t *str_wide = string;
212 #endif /* TRE_MBSTATE */
213 #endif /* TRE_WCHAR */
214 int reg_notbol = eflags & REG_NOTBOL;
215 int reg_noteol = eflags & REG_NOTEOL;
216 int reg_newline = tnfa->cflags & REG_NEWLINE;
217 int str_user_end = 0;
219 /* These are used to remember the necessary values of the above
220 variables to return to the position where the current search
223 const char *str_byte_start;
226 const wchar_t *str_wide_start;
227 #endif /* TRE_WCHAR */
229 mbstate_t mbstate_start;
230 #endif /* TRE_MBSTATE */
232 /* End offset of best match so far, or -1 if no match found yet. */
235 int *next_tags, *tags = NULL;
236 /* Current TNFA state. */
237 tre_tnfa_transition_t *state;
238 int *states_seen = NULL;
240 /* Memory allocator to for allocating the backtracking stack. */
241 tre_mem_t mem = tre_bt_mem_new();
243 /* The backtracking stack. */
244 tre_backtrack_t stack;
246 tre_tnfa_transition_t *trans_i;
247 regmatch_t *pmatch = NULL;
251 memset(&mbstate, '\0', sizeof(mbstate));
252 #endif /* TRE_MBSTATE */
256 stack = tre_bt_mem_alloc(mem, sizeof(*stack));
265 DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
266 DPRINT(("len = %d\n", len));
268 #ifdef TRE_USE_ALLOCA
269 tags = alloca(sizeof(*tags) * tnfa->num_tags);
270 pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
271 states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
272 #else /* !TRE_USE_ALLOCA */
275 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
282 if (tnfa->num_submatches)
284 pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
291 if (tnfa->num_states)
293 states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
300 #endif /* !TRE_USE_ALLOCA */
305 for (i = 0; i < tnfa->num_tags; i++)
311 for (i = 0; i < tnfa->num_states; i++)
317 if (type == STR_USER)
318 str_source->rewind(pos + pos_add_next, str_source->context);
321 next_c_start = next_c;
322 str_byte_start = str_byte;
324 str_wide_start = str_wide;
325 #endif /* TRE_WCHAR */
327 mbstate_start = mbstate;
328 #endif /* TRE_MBSTATE */
330 /* Handle initial states. */
332 for (trans_i = tnfa->initial; trans_i->state; trans_i++)
334 DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
335 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
337 DPRINT(("assert failed\n"));
342 /* Start from this state. */
343 state = trans_i->state;
344 next_tags = trans_i->tags;
348 /* Backtrack to this state. */
349 DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
350 BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
351 trans_i->state_id, next_c, tags, mbstate);
353 int *tmp = trans_i->tags;
356 stack->item.tags[*tmp++] = pos;
362 for (; *next_tags >= 0; next_tags++)
363 tags[*next_tags] = pos;
366 DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
367 DPRINT(("pos:chr/code | state and tags\n"));
368 DPRINT(("-------------+------------------------------------------------\n"));
373 while (/*CONSTCOND*/1)
375 tre_tnfa_transition_t *next_state;
378 DPRINT(("start loop\n"));
379 if (state == tnfa->final)
381 DPRINT((" match found, %d %d\n", match_eo, pos));
385 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
389 /* This match wins the previous match. */
390 DPRINT((" win previous\n"));
393 for (i = 0; i < tnfa->num_tags; i++)
394 match_tags[i] = tags[i];
396 /* Our TNFAs never have transitions leaving from the final state,
397 so we jump right to backtracking. */
402 DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
406 for (i = 0; i < tnfa->num_tags; i++)
407 DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
410 #endif /* TRE_DEBUG */
412 /* Go to the next character in the input string. */
415 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
417 /* This is a back reference state. All transitions leaving from
418 this state have the same back reference "assertion". Instead
419 of reading the next character, we match the back reference. */
420 int so, eo, bt = trans_i->u.backref;
424 DPRINT((" should match back reference %d\n", bt));
425 /* Get the substring we need to match against. Remember to
426 turn off REG_NOSUB temporarily. */
427 tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & /*LINTED*/!REG_NOSUB,
429 so = pmatch[bt].rm_so;
430 eo = pmatch[bt].rm_eo;
439 slen = MIN(bt_len, len - pos);
441 if (type == STR_BYTE)
443 DPRINT((" substring (len %d) is [%d, %d[: '%.*s'\n",
444 bt_len, so, eo, bt_len, (char*)string + so));
445 DPRINT((" current string is '%.*s'\n", slen, str_byte - 1));
448 else if (type == STR_WIDE)
450 DPRINT((" substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
451 bt_len, so, eo, bt_len, (wchar_t*)string + so));
452 DPRINT((" current string is '%.*" STRF "'\n",
453 slen, str_wide - 1));
455 #endif /* TRE_WCHAR */
461 if (type == STR_USER)
462 result = str_source->compare((unsigned)so, (unsigned)pos,
464 str_source->context);
466 else if (type == STR_WIDE)
467 result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
469 #endif /* TRE_WCHAR */
471 result = strncmp((const char*)string + so, str_byte - 1,
474 else if (len - pos < bt_len)
477 else if (type == STR_WIDE)
478 result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
480 #endif /* TRE_WCHAR */
482 result = memcmp((const char*)string + so, str_byte - 1,
487 /* Back reference matched. Check for infinite loop. */
490 if (empty_br_match && states_seen[trans_i->state_id])
492 DPRINT((" avoid loop\n"));
496 states_seen[trans_i->state_id] = empty_br_match;
498 /* Advance in input string and resync `prev_c', `next_c'
500 DPRINT((" back reference matched\n"));
501 str_byte += bt_len - 1;
503 str_wide += bt_len - 1;
504 #endif /* TRE_WCHAR */
507 DPRINT((" pos now %d\n", pos));
511 DPRINT((" back reference did not match\n"));
517 /* Check for end of string. */
520 if (type == STR_USER)
525 else if (next_c == L'\0')
534 /* Read the next character. */
539 for (trans_i = state; trans_i->state; trans_i++)
541 DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
542 trans_i->code_min, trans_i->code_max,
543 trans_i->code_min, trans_i->code_max,
544 trans_i->assertions, trans_i->state_id));
545 if (trans_i->code_min <= (tre_cint_t)prev_c
546 && trans_i->code_max >= (tre_cint_t)prev_c)
548 if (trans_i->assertions
549 && (CHECK_ASSERTIONS(trans_i->assertions)
550 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
552 DPRINT((" assertion failed\n"));
556 if (next_state == NULL)
558 /* First matching transition. */
559 DPRINT((" Next state is %d\n", trans_i->state_id));
560 next_state = trans_i->state;
561 next_tags = trans_i->tags;
565 /* Second matching transition. We may need to backtrack here
566 to take this transition instead of the first one, so we
567 push this transition in the backtracking stack so we can
568 jump back here if needed. */
569 DPRINT((" saving state %d for backtracking\n",
571 BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
572 trans_i->state_id, next_c, tags, mbstate);
575 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
576 stack->item.tags[*tmp] = pos;
578 #if 0 /* XXX - it's important not to look at all transitions here to keep
586 if (next_state != NULL)
588 /* Matching transitions were found. Take the first one. */
591 /* Update the tag values. */
593 while (*next_tags >= 0)
594 tags[*next_tags++] = pos;
599 /* A matching transition was not found. Try to backtrack. */
602 DPRINT((" backtracking\n"));
603 if (stack->item.state->assertions && ASSERT_BACKREF)
605 DPRINT((" states_seen[%d] = 0\n",
606 stack->item.state_id));
607 states_seen[stack->item.state_id] = 0;
612 else if (match_eo < 0)
614 /* Try starting from a later position in the input string. */
615 /* Check for end of string. */
620 DPRINT(("end of string.\n"));
628 DPRINT(("end of string.\n"));
632 DPRINT(("restarting from next start position\n"));
633 next_c = next_c_start;
635 mbstate = mbstate_start;
636 #endif /* TRE_MBSTATE */
637 str_byte = str_byte_start;
639 str_wide = str_wide_start;
640 #endif /* TRE_WCHAR */
645 DPRINT(("finished\n"));
651 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
652 *match_end_ofs = match_eo;
655 tre_bt_mem_destroy(mem);
656 #ifndef TRE_USE_ALLOCA
663 #endif /* !TRE_USE_ALLOCA */