Import TRE regex library v0.8.0 to vendor branch
[dragonfly.git] / contrib / tre / lib / tre-match-backtrack.c
1 /*
2   tre-match-backtrack.c - TRE backtracking regex matching engine
3
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10   This matcher is for regexps that use back referencing.  Regexp matching
11   with back referencing is an NP-complete problem on the number of back
12   references.  The easiest way to match them is to use a backtracking
13   routine which basically goes through all possible paths in the TNFA
14   and chooses the one which results in the best (leftmost and longest)
15   match.  This can be spectacularly expensive and may run out of stack
16   space, but there really is no better known generic algorithm.  Quoting
17   Henry Spencer from comp.compilers:
18   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
19
20     POSIX.2 REs require longest match, which is really exciting to
21     implement since the obsolete ("basic") variant also includes
22     \<digit>.  I haven't found a better way of tackling this than doing
23     a preliminary match using a DFA (or simulation) on a modified RE
24     that just replicates subREs for \<digit>, and then doing a
25     backtracking match to determine whether the subRE matches were
26     right.  This can be rather slow, but I console myself with the
27     thought that people who use \<digit> deserve very slow execution.
28     (Pun unintentional but very appropriate.)
29
30 */
31
32
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif /* HAVE_CONFIG_H */
36
37 #ifdef TRE_USE_ALLOCA
38 /* AIX requires this to be the first thing in the file.  */
39 #ifndef __GNUC__
40 # if HAVE_ALLOCA_H
41 #  include <alloca.h>
42 # else
43 #  ifdef _AIX
44  #pragma alloca
45 #  else
46 #   ifndef alloca /* predefined by HP cc +Olibcalls */
47 char *alloca ();
48 #   endif
49 #  endif
50 # endif
51 #endif
52 #endif /* TRE_USE_ALLOCA */
53
54 #include <assert.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #ifdef HAVE_WCHAR_H
58 #include <wchar.h>
59 #endif /* HAVE_WCHAR_H */
60 #ifdef HAVE_WCTYPE_H
61 #include <wctype.h>
62 #endif /* HAVE_WCTYPE_H */
63 #ifndef TRE_WCHAR
64 #include <ctype.h>
65 #endif /* !TRE_WCHAR */
66 #ifdef HAVE_MALLOC_H
67 #include <malloc.h>
68 #endif /* HAVE_MALLOC_H */
69
70 #include "tre-internal.h"
71 #include "tre-mem.h"
72 #include "tre-match-utils.h"
73 #include "tre.h"
74 #include "xmalloc.h"
75
76 typedef struct {
77   int pos;
78   const char *str_byte;
79 #ifdef TRE_WCHAR
80   const wchar_t *str_wide;
81 #endif /* TRE_WCHAR */
82   tre_tnfa_transition_t *state;
83   int state_id;
84   int next_c;
85   int *tags;
86 #ifdef TRE_MBSTATE
87   mbstate_t mbstate;
88 #endif /* TRE_MBSTATE */
89 } tre_backtrack_item_t;
90
91 typedef struct tre_backtrack_struct {
92   tre_backtrack_item_t item;
93   struct tre_backtrack_struct *prev;
94   struct tre_backtrack_struct *next;
95 } *tre_backtrack_t;
96
97 #ifdef TRE_WCHAR
98 #define BT_STACK_WIDE_IN(_str_wide)     stack->item.str_wide = (_str_wide)
99 #define BT_STACK_WIDE_OUT               (str_wide) = stack->item.str_wide
100 #else /* !TRE_WCHAR */
101 #define BT_STACK_WIDE_IN(_str_wide)
102 #define BT_STACK_WIDE_OUT
103 #endif /* !TRE_WCHAR */
104
105 #ifdef TRE_MBSTATE
106 #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
107 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
108 #else /* !TRE_MBSTATE */
109 #define BT_STACK_MBSTATE_IN
110 #define BT_STACK_MBSTATE_OUT
111 #endif /* !TRE_MBSTATE */
112
113
114 #ifdef TRE_USE_ALLOCA
115 #define tre_bt_mem_new            tre_mem_newa
116 #define tre_bt_mem_alloc          tre_mem_alloca
117 #define tre_bt_mem_destroy(obj)   do { } while (0)
118 #else /* !TRE_USE_ALLOCA */
119 #define tre_bt_mem_new            tre_mem_new
120 #define tre_bt_mem_alloc          tre_mem_alloc
121 #define tre_bt_mem_destroy        tre_mem_destroy
122 #endif /* !TRE_USE_ALLOCA */
123
124
125 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
126   do                                                                          \
127     {                                                                         \
128       int i;                                                                  \
129       if (!stack->next)                                                       \
130         {                                                                     \
131           tre_backtrack_t s;                                                  \
132           s = tre_bt_mem_alloc(mem, sizeof(*s));                              \
133           if (!s)                                                             \
134             {                                                                 \
135               tre_bt_mem_destroy(mem);                                        \
136               if (tags)                                                       \
137                 xfree(tags);                                                  \
138               if (pmatch)                                                     \
139                 xfree(pmatch);                                                \
140               if (states_seen)                                                \
141                 xfree(states_seen);                                           \
142               return REG_ESPACE;                                              \
143             }                                                                 \
144           s->prev = stack;                                                    \
145           s->next = NULL;                                                     \
146           s->item.tags = tre_bt_mem_alloc(mem,                                \
147                                           sizeof(*tags) * tnfa->num_tags);    \
148           if (!s->item.tags)                                                  \
149             {                                                                 \
150               tre_bt_mem_destroy(mem);                                        \
151               if (tags)                                                       \
152                 xfree(tags);                                                  \
153               if (pmatch)                                                     \
154                 xfree(pmatch);                                                \
155               if (states_seen)                                                \
156                 xfree(states_seen);                                           \
157               return REG_ESPACE;                                              \
158             }                                                                 \
159           stack->next = s;                                                    \
160           stack = s;                                                          \
161         }                                                                     \
162       else                                                                    \
163         stack = stack->next;                                                  \
164       stack->item.pos = (_pos);                                               \
165       stack->item.str_byte = (_str_byte);                                     \
166       BT_STACK_WIDE_IN(_str_wide);                                            \
167       stack->item.state = (_state);                                           \
168       stack->item.state_id = (_state_id);                                     \
169       stack->item.next_c = (_next_c);                                         \
170       for (i = 0; i < tnfa->num_tags; i++)                                    \
171         stack->item.tags[i] = (_tags)[i];                                     \
172       BT_STACK_MBSTATE_IN;                                                    \
173     }                                                                         \
174   while (/*CONSTCOND*/0)
175
176 #define BT_STACK_POP()                                                        \
177   do                                                                          \
178     {                                                                         \
179       int i;                                                                  \
180       assert(stack->prev);                                                    \
181       pos = stack->item.pos;                                                  \
182       if (type == STR_USER)                                                   \
183         str_source->rewind(pos + pos_add_next, str_source->context);          \
184       str_byte = stack->item.str_byte;                                        \
185       BT_STACK_WIDE_OUT;                                                      \
186       state = stack->item.state;                                              \
187       next_c = stack->item.next_c;                                            \
188       for (i = 0; i < tnfa->num_tags; i++)                                    \
189         tags[i] = stack->item.tags[i];                                        \
190       BT_STACK_MBSTATE_OUT;                                                   \
191       stack = stack->prev;                                                    \
192     }                                                                         \
193   while (/*CONSTCOND*/0)
194
195 #undef MIN
196 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
197
198 reg_errcode_t
199 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
200                        int len, tre_str_type_t type, int *match_tags,
201                        int eflags, int *match_end_ofs)
202 {
203   /* State variables required by GET_NEXT_WCHAR. */
204   tre_char_t prev_c = 0, next_c = 0;
205   const char *str_byte = string;
206   int pos = 0;
207   unsigned int pos_add_next = 1;
208 #ifdef TRE_WCHAR
209   const wchar_t *str_wide = string;
210 #ifdef TRE_MBSTATE
211   mbstate_t mbstate;
212 #endif /* TRE_MBSTATE */
213 #endif /* TRE_WCHAR */
214   int reg_notbol = eflags & REG_NOTBOL;
215   int reg_noteol = eflags & REG_NOTEOL;
216   int reg_newline = tnfa->cflags & REG_NEWLINE;
217   int str_user_end = 0;
218
219   /* These are used to remember the necessary values of the above
220      variables to return to the position where the current search
221      started from. */
222   int next_c_start;
223   const char *str_byte_start;
224   int pos_start = -1;
225 #ifdef TRE_WCHAR
226   const wchar_t *str_wide_start;
227 #endif /* TRE_WCHAR */
228 #ifdef TRE_MBSTATE
229   mbstate_t mbstate_start;
230 #endif /* TRE_MBSTATE */
231
232   /* End offset of best match so far, or -1 if no match found yet. */
233   int match_eo = -1;
234   /* Tag arrays. */
235   int *next_tags, *tags = NULL;
236   /* Current TNFA state. */
237   tre_tnfa_transition_t *state;
238   int *states_seen = NULL;
239
240   /* Memory allocator to for allocating the backtracking stack. */
241   tre_mem_t mem = tre_bt_mem_new();
242
243   /* The backtracking stack. */
244   tre_backtrack_t stack;
245
246   tre_tnfa_transition_t *trans_i;
247   regmatch_t *pmatch = NULL;
248   int ret;
249
250 #ifdef TRE_MBSTATE
251   memset(&mbstate, '\0', sizeof(mbstate));
252 #endif /* TRE_MBSTATE */
253
254   if (!mem)
255     return REG_ESPACE;
256   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
257   if (!stack)
258     {
259       ret = REG_ESPACE;
260       goto error_exit;
261     }
262   stack->prev = NULL;
263   stack->next = NULL;
264
265   DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
266   DPRINT(("len = %d\n", len));
267
268 #ifdef TRE_USE_ALLOCA
269   tags = alloca(sizeof(*tags) * tnfa->num_tags);
270   pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
271   states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
272 #else /* !TRE_USE_ALLOCA */
273   if (tnfa->num_tags)
274     {
275       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
276       if (!tags)
277         {
278           ret = REG_ESPACE;
279           goto error_exit;
280         }
281     }
282   if (tnfa->num_submatches)
283     {
284       pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
285       if (!pmatch)
286         {
287           ret = REG_ESPACE;
288           goto error_exit;
289         }
290     }
291   if (tnfa->num_states)
292     {
293       states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
294       if (!states_seen)
295         {
296           ret = REG_ESPACE;
297           goto error_exit;
298         }
299     }
300 #endif /* !TRE_USE_ALLOCA */
301
302  retry:
303   {
304     int i;
305     for (i = 0; i < tnfa->num_tags; i++)
306       {
307         tags[i] = -1;
308         if (match_tags)
309           match_tags[i] = -1;
310       }
311     for (i = 0; i < tnfa->num_states; i++)
312       states_seen[i] = 0;
313   }
314
315   state = NULL;
316   pos = pos_start;
317   if (type == STR_USER)
318     str_source->rewind(pos + pos_add_next, str_source->context);
319   GET_NEXT_WCHAR();
320   pos_start = pos;
321   next_c_start = next_c;
322   str_byte_start = str_byte;
323 #ifdef TRE_WCHAR
324   str_wide_start = str_wide;
325 #endif /* TRE_WCHAR */
326 #ifdef TRE_MBSTATE
327   mbstate_start = mbstate;
328 #endif /* TRE_MBSTATE */
329
330   /* Handle initial states. */
331   next_tags = NULL;
332   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
333     {
334       DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
335       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
336         {
337           DPRINT(("assert failed\n"));
338           continue;
339         }
340       if (state == NULL)
341         {
342           /* Start from this state. */
343           state = trans_i->state;
344           next_tags = trans_i->tags;
345         }
346       else
347         {
348           /* Backtrack to this state. */
349           DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
350           BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
351                         trans_i->state_id, next_c, tags, mbstate);
352           {
353             int *tmp = trans_i->tags;
354             if (tmp)
355               while (*tmp >= 0)
356                 stack->item.tags[*tmp++] = pos;
357           }
358         }
359     }
360
361   if (next_tags)
362     for (; *next_tags >= 0; next_tags++)
363       tags[*next_tags] = pos;
364
365
366   DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
367   DPRINT(("pos:chr/code | state and tags\n"));
368   DPRINT(("-------------+------------------------------------------------\n"));
369
370   if (state == NULL)
371     goto backtrack;
372
373   while (/*CONSTCOND*/1)
374     {
375       tre_tnfa_transition_t *next_state;
376       int empty_br_match;
377
378       DPRINT(("start loop\n"));
379       if (state == tnfa->final)
380         {
381           DPRINT(("  match found, %d %d\n", match_eo, pos));
382           if (match_eo < pos
383               || (match_eo == pos
384                   && match_tags
385                   && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
386                                    tags, match_tags)))
387             {
388               int i;
389               /* This match wins the previous match. */
390               DPRINT(("  win previous\n"));
391               match_eo = pos;
392               if (match_tags)
393                 for (i = 0; i < tnfa->num_tags; i++)
394                   match_tags[i] = tags[i];
395             }
396           /* Our TNFAs never have transitions leaving from the final state,
397              so we jump right to backtracking. */
398           goto backtrack;
399         }
400
401 #ifdef TRE_DEBUG
402       DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
403               state));
404       {
405         int i;
406         for (i = 0; i < tnfa->num_tags; i++)
407           DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
408         DPRINT(("\n"));
409       }
410 #endif /* TRE_DEBUG */
411
412       /* Go to the next character in the input string. */
413       empty_br_match = 0;
414       trans_i = state;
415       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
416         {
417           /* This is a back reference state.  All transitions leaving from
418              this state have the same back reference "assertion".  Instead
419              of reading the next character, we match the back reference. */
420           int so, eo, bt = trans_i->u.backref;
421           int bt_len;
422           int result;
423
424           DPRINT(("  should match back reference %d\n", bt));
425           /* Get the substring we need to match against.  Remember to
426              turn off REG_NOSUB temporarily. */
427           tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & /*LINTED*/!REG_NOSUB,
428                           tnfa, tags, pos);
429           so = pmatch[bt].rm_so;
430           eo = pmatch[bt].rm_eo;
431           bt_len = eo - so;
432
433 #ifdef TRE_DEBUG
434           {
435             int slen;
436             if (len < 0)
437               slen = bt_len;
438             else
439               slen = MIN(bt_len, len - pos);
440
441             if (type == STR_BYTE)
442               {
443                 DPRINT(("  substring (len %d) is [%d, %d[: '%.*s'\n",
444                         bt_len, so, eo, bt_len, (char*)string + so));
445                 DPRINT(("  current string is '%.*s'\n", slen, str_byte - 1));
446               }
447 #ifdef TRE_WCHAR
448             else if (type == STR_WIDE)
449               {
450                 DPRINT(("  substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
451                         bt_len, so, eo, bt_len, (wchar_t*)string + so));
452                 DPRINT(("  current string is '%.*" STRF "'\n",
453                         slen, str_wide - 1));
454               }
455 #endif /* TRE_WCHAR */
456           }
457 #endif
458
459           if (len < 0)
460             {
461               if (type == STR_USER)
462                 result = str_source->compare((unsigned)so, (unsigned)pos,
463                                              (unsigned)bt_len,
464                                              str_source->context);
465 #ifdef TRE_WCHAR
466               else if (type == STR_WIDE)
467                 result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
468                                  (size_t)bt_len);
469 #endif /* TRE_WCHAR */
470               else
471                 result = strncmp((const char*)string + so, str_byte - 1,
472                                  (size_t)bt_len);
473             }
474           else if (len - pos < bt_len)
475             result = 1;
476 #ifdef TRE_WCHAR
477           else if (type == STR_WIDE)
478             result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
479                              (size_t)bt_len);
480 #endif /* TRE_WCHAR */
481           else
482             result = memcmp((const char*)string + so, str_byte - 1,
483                             (size_t)bt_len);
484
485           if (result == 0)
486             {
487               /* Back reference matched.  Check for infinite loop. */
488               if (bt_len == 0)
489                 empty_br_match = 1;
490               if (empty_br_match && states_seen[trans_i->state_id])
491                 {
492                   DPRINT(("  avoid loop\n"));
493                   goto backtrack;
494                 }
495
496               states_seen[trans_i->state_id] = empty_br_match;
497
498               /* Advance in input string and resync `prev_c', `next_c'
499                  and pos. */
500               DPRINT(("  back reference matched\n"));
501               str_byte += bt_len - 1;
502 #ifdef TRE_WCHAR
503               str_wide += bt_len - 1;
504 #endif /* TRE_WCHAR */
505               pos += bt_len - 1;
506               GET_NEXT_WCHAR();
507               DPRINT(("  pos now %d\n", pos));
508             }
509           else
510             {
511               DPRINT(("  back reference did not match\n"));
512               goto backtrack;
513             }
514         }
515       else
516         {
517           /* Check for end of string. */
518           if (len < 0)
519             {
520               if (type == STR_USER)
521                 {
522                   if (str_user_end)
523                     goto backtrack;
524                 }
525               else if (next_c == L'\0')
526                 goto backtrack;
527             }
528           else
529             {
530               if (pos >= len)
531                 goto backtrack;
532             }
533
534           /* Read the next character. */
535           GET_NEXT_WCHAR();
536         }
537
538       next_state = NULL;
539       for (trans_i = state; trans_i->state; trans_i++)
540         {
541           DPRINT(("  transition %d-%d (%c-%c) %d to %d\n",
542                   trans_i->code_min, trans_i->code_max,
543                   trans_i->code_min, trans_i->code_max,
544                   trans_i->assertions, trans_i->state_id));
545           if (trans_i->code_min <= (tre_cint_t)prev_c
546               && trans_i->code_max >= (tre_cint_t)prev_c)
547             {
548               if (trans_i->assertions
549                   && (CHECK_ASSERTIONS(trans_i->assertions)
550                       || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
551                 {
552                   DPRINT(("  assertion failed\n"));
553                   continue;
554                 }
555
556               if (next_state == NULL)
557                 {
558                   /* First matching transition. */
559                   DPRINT(("  Next state is %d\n", trans_i->state_id));
560                   next_state = trans_i->state;
561                   next_tags = trans_i->tags;
562                 }
563               else
564                 {
565                   /* Second matching transition.  We may need to backtrack here
566                      to take this transition instead of the first one, so we
567                      push this transition in the backtracking stack so we can
568                      jump back here if needed. */
569                   DPRINT(("  saving state %d for backtracking\n",
570                           trans_i->state_id));
571                   BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
572                                 trans_i->state_id, next_c, tags, mbstate);
573                   {
574                     int *tmp;
575                     for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
576                       stack->item.tags[*tmp] = pos;
577                   }
578 #if 0 /* XXX - it's important not to look at all transitions here to keep
579          the stack small! */
580                   break;
581 #endif
582                 }
583             }
584         }
585
586       if (next_state != NULL)
587         {
588           /* Matching transitions were found.  Take the first one. */
589           state = next_state;
590
591           /* Update the tag values. */
592           if (next_tags)
593             while (*next_tags >= 0)
594               tags[*next_tags++] = pos;
595         }
596       else
597         {
598         backtrack:
599           /* A matching transition was not found.  Try to backtrack. */
600           if (stack->prev)
601             {
602               DPRINT(("  backtracking\n"));
603               if (stack->item.state->assertions && ASSERT_BACKREF)
604                 {
605                   DPRINT(("  states_seen[%d] = 0\n",
606                           stack->item.state_id));
607                   states_seen[stack->item.state_id] = 0;
608                 }
609
610               BT_STACK_POP();
611             }
612           else if (match_eo < 0)
613             {
614               /* Try starting from a later position in the input string. */
615               /* Check for end of string. */
616               if (len < 0)
617                 {
618                   if (next_c == L'\0')
619                     {
620                       DPRINT(("end of string.\n"));
621                       break;
622                     }
623                 }
624               else
625                 {
626                   if (pos >= len)
627                     {
628                       DPRINT(("end of string.\n"));
629                       break;
630                     }
631                 }
632               DPRINT(("restarting from next start position\n"));
633               next_c = next_c_start;
634 #ifdef TRE_MBSTATE
635               mbstate = mbstate_start;
636 #endif /* TRE_MBSTATE */
637               str_byte = str_byte_start;
638 #ifdef TRE_WCHAR
639               str_wide = str_wide_start;
640 #endif /* TRE_WCHAR */
641               goto retry;
642             }
643           else
644             {
645               DPRINT(("finished\n"));
646               break;
647             }
648         }
649     }
650
651   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
652   *match_end_ofs = match_eo;
653
654  error_exit:
655   tre_bt_mem_destroy(mem);
656 #ifndef TRE_USE_ALLOCA
657   if (tags)
658     xfree(tags);
659   if (pmatch)
660     xfree(pmatch);
661   if (states_seen)
662     xfree(states_seen);
663 #endif /* !TRE_USE_ALLOCA */
664
665   return ret;
666 }