2 tre-match-backtrack.c - TRE backtracking regex matching engine
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
10 This matcher is for regexps that use back referencing. Regexp matching
11 with back referencing is an NP-complete problem on the number of back
12 references. The easiest way to match them is to use a backtracking
13 routine which basically goes through all possible paths in the TNFA
14 and chooses the one which results in the best (leftmost and longest)
15 match. This can be spectacularly expensive and may run out of stack
16 space, but there really is no better known generic algorithm. Quoting
17 Henry Spencer from comp.compilers:
18 <URL: http://compilers.iecc.com/comparch/article/93-03-102>
20 POSIX.2 REs require longest match, which is really exciting to
21 implement since the obsolete ("basic") variant also includes
22 \<digit>. I haven't found a better way of tackling this than doing
23 a preliminary match using a DFA (or simulation) on a modified RE
24 that just replicates subREs for \<digit>, and then doing a
25 backtracking match to determine whether the subRE matches were
26 right. This can be rather slow, but I console myself with the
27 thought that people who use \<digit> deserve very slow execution.
28 (Pun unintentional but very appropriate.)
35 #endif /* HAVE_CONFIG_H */
37 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
42 /* AIX requires this to be the first thing in the file. */
50 # ifndef alloca /* predefined by HP cc +Olibcalls */
56 #endif /* TRE_USE_ALLOCA */
63 #endif /* HAVE_WCHAR_H */
66 #endif /* HAVE_WCTYPE_H */
69 #endif /* !TRE_WCHAR */
72 #endif /* HAVE_MALLOC_H */
74 #include "tre-internal.h"
76 #include "tre-match-utils.h"
82 unsigned int pos_add_next;
85 const wchar_t *str_wide;
86 #endif /* TRE_WCHAR */
87 tre_tnfa_transition_t *state;
93 #endif /* TRE_MBSTATE */
94 } tre_backtrack_item_t;
96 typedef struct tre_backtrack_struct {
97 tre_backtrack_item_t item;
98 struct tre_backtrack_struct *prev;
99 struct tre_backtrack_struct *next;
103 #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide)
104 #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide
105 #else /* !TRE_WCHAR */
106 #define BT_STACK_WIDE_IN(_str_wide)
107 #define BT_STACK_WIDE_OUT
108 #endif /* !TRE_WCHAR */
111 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
112 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
113 #else /* !TRE_MBSTATE */
114 #define BT_STACK_MBSTATE_IN
115 #define BT_STACK_MBSTATE_OUT
116 #endif /* !TRE_MBSTATE */
119 #ifdef TRE_USE_ALLOCA
120 #define tre_bt_mem_new tre_mem_newa
121 #define tre_bt_mem_alloc tre_mem_alloca
122 #define tre_bt_mem_destroy(obj) do { } while (0)
123 #else /* !TRE_USE_ALLOCA */
124 #define tre_bt_mem_new tre_mem_new
125 #define tre_bt_mem_alloc tre_mem_alloc
126 #define tre_bt_mem_destroy tre_mem_destroy
127 #endif /* !TRE_USE_ALLOCA */
130 #define BT_STACK_PUSH(_pos, _pos_add_next, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
136 s = tre_bt_mem_alloc(mem, sizeof(*s)); \
139 tre_bt_mem_destroy(mem); \
145 xfree(states_seen); \
150 s->item.tags = tre_bt_mem_alloc(mem, \
151 num_tags * sizeof(*tags)); \
154 tre_bt_mem_destroy(mem); \
160 xfree(states_seen); \
167 stack = stack->next; \
168 stack->item.pos = (_pos); \
169 stack->item.pos_add_next = (_pos_add_next); \
170 stack->item.str_byte = (_str_byte); \
171 BT_STACK_WIDE_IN(_str_wide); \
172 stack->item.state = (_state); \
173 stack->item.state_id = (_state_id); \
174 stack->item.next_c = (_next_c); \
175 memcpy(stack->item.tags, (_tags), num_tags * sizeof(*(_tags))); \
176 BT_STACK_MBSTATE_IN; \
178 while (/*CONSTCOND*/0)
180 #define BT_STACK_POP() \
183 assert(stack->prev); \
184 pos = stack->item.pos; \
185 pos_add_next = stack->item.pos_add_next; \
186 str_byte = stack->item.str_byte; \
188 state = stack->item.state; \
189 next_c = stack->item.next_c; \
190 memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \
191 BT_STACK_MBSTATE_OUT; \
192 stack = stack->prev; \
194 while (/*CONSTCOND*/0)
197 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
200 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
201 int len, tre_str_type_t type, tre_tag_t *match_tags,
202 int eflags, int *match_end_ofs)
204 /* State variables required by GET_NEXT_WCHAR. */
205 tre_char_t prev_c = 0, next_c = 0;
206 const char *str_byte = string;
208 unsigned int pos_add_next = 1;
210 const wchar_t *str_wide = string;
213 #endif /* TRE_MBSTATE */
214 #endif /* TRE_WCHAR */
215 int reg_notbol = eflags & REG_NOTBOL;
216 int reg_noteol = eflags & REG_NOTEOL;
217 int reg_newline = tnfa->cflags & REG_NEWLINE;
220 /* These are used to remember the necessary values of the above
221 variables to return to the position where the current search
224 const char *str_byte_start;
227 const wchar_t *str_wide_start;
228 #endif /* TRE_WCHAR */
230 mbstate_t mbstate_start;
231 #endif /* TRE_MBSTATE */
233 /* End offset of best match so far, or -1 if no match found yet. */
237 tre_tag_t *tags = NULL;
238 /* Current TNFA state. */
239 tre_tnfa_transition_t *state;
240 int *states_seen = NULL;
242 /* Memory allocator to for allocating the backtracking stack. */
243 tre_mem_t mem = tre_bt_mem_new();
245 /* The backtracking stack. */
246 tre_backtrack_t stack;
248 tre_tnfa_transition_t *trans_i;
249 regmatch_t *pmatch = NULL;
252 int num_tags = tnfa->num_tags;
258 memset(&mbstate, '\0', sizeof(mbstate));
259 #endif /* TRE_MBSTATE */
263 stack = tre_bt_mem_alloc(mem, sizeof(*stack));
272 DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
273 DPRINT(("len = %d\n", len));
276 int pbytes, sbytes, total_bytes;
278 /* Compute the length of the block we need. */
279 tbytes = sizeof(*tags) * num_tags;
280 pbytes = sizeof(*pmatch) * tnfa->num_submatches;
281 sbytes = sizeof(*states_seen) * tnfa->num_states;
283 (sizeof(long) - 1) * 2 /* for alignment paddings */
284 + tbytes + pbytes + sbytes;
286 DPRINT(("tre_tnfa_run_backtrack, allocate %d bytes\n", total_bytes));
287 /* Allocate the memory. */
288 #ifdef TRE_USE_ALLOCA
289 buf = alloca(total_bytes);
290 #else /* !TRE_USE_ALLOCA */
291 buf = xmalloc((unsigned)total_bytes);
292 #endif /* !TRE_USE_ALLOCA */
296 /* Get the various pointers within tmp_buf (properly aligned). */
298 tmp_buf = buf + tbytes;
299 tmp_buf += ALIGN(tmp_buf, long);
300 pmatch = (void *)tmp_buf;
302 tmp_buf += ALIGN(tmp_buf, long);
303 states_seen = (void *)tmp_buf;
308 memset(tags, 0, num_tags * sizeof(*tags));
310 memset(match_tags, 0, num_tags * sizeof(*match_tags));
311 for (i = 0; i < tnfa->num_states; i++)
319 next_c_start = next_c;
320 str_byte_start = str_byte;
322 str_wide_start = str_wide;
323 #endif /* TRE_WCHAR */
325 mbstate_start = mbstate;
326 #endif /* TRE_MBSTATE */
328 /* Handle initial states. */
330 for (trans_i = tnfa->initial; trans_i->state; trans_i++)
332 DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
333 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
335 DPRINT(("assert failed\n"));
340 /* Start from this state. */
341 state = trans_i->state;
342 next_tags = trans_i->tags;
346 /* Backtrack to this state. */
347 DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
348 BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, trans_i->state,
349 trans_i->state_id, next_c, tags, mbstate);
351 int *tmp = trans_i->tags;
355 tre_tag_set(stack->item.tags, *tmp++, pos, touch);
364 for (; *next_tags >= 0; next_tags++)
365 tre_tag_set(tags, *next_tags, pos, touch);
370 DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
371 DPRINT(("pos:chr/code | state and tags\n"));
372 DPRINT(("-------------+------------------------------------------------\n"));
377 while (/*CONSTCOND*/1)
379 tre_tnfa_transition_t *next_state;
382 DPRINT(("start loop\n"));
384 if (match_eo >= 0 && tnfa->num_minimals)
388 DPRINT(("Checking minimal conditions: match_eo=%d match_tags=",
390 tre_print_tags(match_tags, tnfa->num_tags);
392 #endif /* TRE_DEBUG */
393 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
395 int end = tnfa->minimal_tags[i];
396 int start = tnfa->minimal_tags[i + 1];
397 DPRINT((" Minimal start %d, end %d\n", start, end));
398 if (tre_minimal_tag_order(start, end, match_tags, tags) > 0)
407 DPRINT((" Keeping tags="));
408 tre_print_tags(tags, tnfa->num_tags);
410 #endif /* TRE_DEBUG */
415 DPRINT((" Throwing out tags="));
416 tre_print_tags(tags, tnfa->num_tags);
418 #endif /* TRE_DEBUG */
423 if (state == tnfa->final)
425 DPRINT((" match found, match_eo=%d pos=%d\n", match_eo, pos));
427 if (match_eo >= 0 && tnfa->num_minimals)
431 DPRINT(("Checking minimal conditions: match_eo=%d "
432 "match_tags=", match_eo));
433 tre_print_tags(match_tags, tnfa->num_tags);
435 #endif /* TRE_DEBUG */
436 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
438 int end = tnfa->minimal_tags[i];
439 int start = tnfa->minimal_tags[i + 1];
440 DPRINT((" Minimal start %d, end %d\n", start, end));
441 if ((compare = tre_minimal_tag_order(start, end,
442 match_tags, tags)) != 0)
448 DPRINT((" Throwing out new match, tags="));
449 tre_print_tags(tags, tnfa->num_tags);
451 #endif /* TRE_DEBUG */
454 else if (compare < 0)
457 DPRINT((" Throwing out old match, tags="));
458 tre_print_tags(match_tags, tnfa->num_tags);
460 #endif /* TRE_DEBUG */
468 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
471 /* This match wins the previous match. */
473 DPRINT((" win previous tags="));
474 tre_print_tags(tags, tnfa->num_tags);
476 #endif /* TRE_DEBUG */
479 memcpy(match_tags, tags, num_tags * sizeof(*tags));
481 /* Our TNFAs never have transitions leaving from the final state,
482 so we jump right to backtracking. */
487 DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
489 tre_print_tags(tags, tnfa->num_tags);
491 #endif /* TRE_DEBUG */
493 /* Go to the next character in the input string. */
496 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
498 /* This is a back reference state. All transitions leaving from
499 this state have the same back reference "assertion". Instead
500 of reading the next character, we match the back reference. */
501 int so, eo, bt = trans_i->u.backref;
505 DPRINT((" should match back reference %d\n", bt));
506 /* Get the substring we need to match against. Remember to
507 turn off REG_NOSUB temporarily. */
508 ret = tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
510 if (ret != REG_OK) goto error_exit;
511 so = pmatch[bt].rm_so;
512 eo = pmatch[bt].rm_eo;
521 slen = MIN(bt_len, len - pos);
523 if (type == STR_BYTE)
525 DPRINT((" substring (len %d) is [%d, %d]: '%.*s'\n",
526 bt_len, so, eo, bt_len, (char*)string + so));
527 DPRINT((" current string is '%.*s'\n", slen, str_byte - 1));
530 else if (type == STR_WIDE)
532 DPRINT((" substring (len %d) is [%d, %d]: '%.*" STRF "'\n",
533 bt_len, so, eo, bt_len, (wchar_t*)string + so));
534 DPRINT((" current string is '%.*" STRF "'\n",
535 slen, str_wide - 1));
537 #endif /* TRE_WCHAR */
543 result = 1; /* Back reference of nomatch doesn't match */
548 if (type == STR_WIDE)
549 result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
552 #endif /* TRE_WCHAR */
553 result = strncmp((const char*)string + so, str_byte - 1,
556 else if (len - pos < bt_len)
559 else if (type == STR_WIDE)
560 result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
562 #endif /* TRE_WCHAR */
564 result = memcmp((const char*)string + so, str_byte - 1,
569 /* Back reference matched. Check for infinite loop. */
572 if (empty_br_match && states_seen[trans_i->state_id])
574 DPRINT((" avoid loop\n"));
578 states_seen[trans_i->state_id] = empty_br_match;
580 /* Advance in input string and resync `prev_c', `next_c'
582 DPRINT((" back reference matched\n"));
583 str_byte += bt_len - 1;
585 str_wide += bt_len - 1;
586 #endif /* TRE_WCHAR */
589 DPRINT((" pos now %d\n", pos));
593 DPRINT((" back reference did not match\n"));
599 /* Check for end of string. */
611 /* Read the next character. */
616 for (trans_i = state; trans_i->state; trans_i++)
618 DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
619 trans_i->code_min, trans_i->code_max,
620 trans_i->code_min, trans_i->code_max,
621 trans_i->assertions, trans_i->state_id));
622 if (trans_i->code_min <= (tre_cint_t)prev_c
623 && trans_i->code_max >= (tre_cint_t)prev_c)
625 if (trans_i->assertions
626 && (CHECK_ASSERTIONS(trans_i->assertions)
627 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
629 DPRINT((" assertion failed\n"));
633 if (next_state == NULL)
635 /* First matching transition. */
636 DPRINT((" Next state is %d\n", trans_i->state_id));
637 next_state = trans_i->state;
638 next_tags = trans_i->tags;
642 /* Second matching transition. We may need to backtrack here
643 to take this transition instead of the first one, so we
644 push this transition in the backtracking stack so we can
645 jump back here if needed. */
646 DPRINT((" saving state %d for backtracking\n",
648 BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide,
649 trans_i->state, trans_i->state_id, next_c,
653 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
654 tre_tag_set(stack->item.tags, *tmp, pos, touch);
657 #if 0 /* XXX - it's important not to look at all transitions here to keep
665 if (next_state != NULL)
667 /* Matching transitions were found. Take the first one. */
670 /* Update the tag values. */
673 while (*next_tags >= 0)
674 tre_tag_set(tags, *next_tags++, pos, touch);
681 /* A matching transition was not found. Try to backtrack. */
684 DPRINT((" backtracking\n"));
685 if (stack->item.state->assertions & ASSERT_BACKREF)
687 DPRINT((" states_seen[%d] = 0\n",
688 stack->item.state_id));
689 states_seen[stack->item.state_id] = 0;
694 else if (match_eo < 0)
696 /* Try starting from a later position in the input string. */
697 /* Check for end of string. */
698 if (pos == pos_start)
704 DPRINT(("end of string.\n"));
712 DPRINT(("end of string.\n"));
717 DPRINT(("restarting from next start position\n"));
718 next_c = next_c_start;
720 mbstate = mbstate_start;
721 #endif /* TRE_MBSTATE */
722 str_byte = str_byte_start;
724 str_wide = str_wide_start;
725 #endif /* TRE_WCHAR */
730 DPRINT(("finished\n"));
736 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
737 *match_end_ofs = match_eo;
740 tre_bt_mem_destroy(mem);
741 #ifndef TRE_USE_ALLOCA
744 #endif /* !TRE_USE_ALLOCA */