TRE: Add local modifications to extend functionality
[dragonfly.git] / contrib / tre / lib / tre-match-parallel.c
1 /*
2   tre-match-parallel.c - TRE parallel regex matching engine
3
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10   This algorithm searches for matches basically by reading characters
11   in the searched string one by one, starting at the beginning.  All
12   matching paths in the TNFA are traversed in parallel.  When two or
13   more paths reach the same state, exactly one is chosen according to
14   tag ordering rules; if returning submatches is not required it does
15   not matter which path is chosen.
16
17   The worst case time required for finding the leftmost and longest
18   match, or determining that there is no match, is always linearly
19   dependent on the length of the text being searched.
20
21   This algorithm cannot handle TNFAs with back referencing nodes.
22   See `tre-match-backtrack.c'.
23 */
24
25
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif /* HAVE_CONFIG_H */
29
30 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
31    info while running */
32 #undef TRE_USE_ALLOCA
33
34 #ifdef TRE_USE_ALLOCA
35 /* AIX requires this to be the first thing in the file.  */
36 #ifndef __GNUC__
37 # if HAVE_ALLOCA_H
38 #  include <alloca.h>
39 # else
40 #  ifdef _AIX
41  #pragma alloca
42 #  else
43 #   ifndef alloca /* predefined by HP cc +Olibcalls */
44 char *alloca ();
45 #   endif
46 #  endif
47 # endif
48 #endif
49 #endif /* TRE_USE_ALLOCA */
50
51 #include <assert.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #ifdef HAVE_WCHAR_H
55 #include <wchar.h>
56 #endif /* HAVE_WCHAR_H */
57 #ifdef HAVE_WCTYPE_H
58 #include <wctype.h>
59 #endif /* HAVE_WCTYPE_H */
60 #ifndef TRE_WCHAR
61 #include <ctype.h>
62 #endif /* !TRE_WCHAR */
63 #ifdef HAVE_MALLOC_H
64 #include <malloc.h>
65 #endif /* HAVE_MALLOC_H */
66
67 #include "tre-internal.h"
68 #include "tre-match-utils.h"
69 #include "tre.h"
70 #include "xmalloc.h"
71
72
73
74 typedef struct {
75   tre_tnfa_transition_t *state;
76   tre_tag_t *tags;
77 } tre_tnfa_reach_t;
78
79 typedef struct {
80   int pos;
81   tre_tag_t **tags;
82 } tre_reach_pos_t;
83
84
85 #ifdef TRE_DEBUG
86 static void
87 tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags)
88 {
89   DPRINT((" %p", (void *)state));
90   if (num_tags > 0)
91     {
92       DPRINT(("/"));
93       tre_print_tags(tags, num_tags);
94     }
95 }
96
97 static void
98 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
99 {
100   while (reach->state != NULL)
101     {
102       tre_print_reach1(reach->state, reach->tags, num_tags);
103       reach++;
104     }
105   DPRINT(("\n"));
106
107 }
108 #endif /* TRE_DEBUG */
109
110 reg_errcode_t
111 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
112                       tre_str_type_t type, tre_tag_t *match_tags, int eflags,
113                       int *match_end_ofs)
114 {
115   /* State variables required by GET_NEXT_WCHAR. */
116   tre_char_t prev_c = 0, next_c = 0;
117   const char *str_byte = string;
118   int pos = -1;
119   unsigned int pos_add_next = 1;
120 #ifdef TRE_WCHAR
121   const wchar_t *str_wide = string;
122 #ifdef TRE_MBSTATE
123   mbstate_t mbstate;
124 #endif /* TRE_MBSTATE */
125 #endif /* TRE_WCHAR */
126   int reg_notbol = eflags & REG_NOTBOL;
127   int reg_noteol = eflags & REG_NOTEOL;
128   int reg_newline = tnfa->cflags & REG_NEWLINE;
129
130   char *buf;
131   tre_tnfa_transition_t *trans_i;
132   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
133   tre_reach_pos_t *reach_pos;
134   int *tag_i;
135   int num_tags, i;
136
137   int match_eo = -1;       /* end offset of match (-1 if no match found yet) */
138 #ifdef TRE_DEBUG
139   int once;
140 #endif /* TRE_DEBUG */
141   tre_tag_t *tmp_tags = NULL;
142   tre_tag_t *tmp_iptr;
143   int tbytes;
144   int touch = 1;
145
146 #ifdef TRE_MBSTATE
147   memset(&mbstate, '\0', sizeof(mbstate));
148 #endif /* TRE_MBSTATE */
149
150   DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
151
152   if (!match_tags)
153     num_tags = 0;
154   else
155     num_tags = tnfa->num_tags;
156
157   /* Allocate memory for temporary data required for matching.  This needs to
158      be done for every matching operation to be thread safe.  This allocates
159      everything in a single large block from the stack frame using alloca()
160      or with malloc() if alloca is unavailable. */
161   {
162     int rbytes, pbytes, total_bytes;
163     char *tmp_buf;
164     /* Compute the length of the block we need. */
165     tbytes = sizeof(*tmp_tags) * num_tags;
166     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
167     pbytes = sizeof(*reach_pos) * tnfa->num_states;
168     total_bytes =
169       (sizeof(long) - 1) * 4 /* for alignment paddings */
170       + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes;
171
172     DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes));
173     /* Allocate the memory. */
174 #ifdef TRE_USE_ALLOCA
175     buf = alloca(total_bytes);
176 #else /* !TRE_USE_ALLOCA */
177     buf = xmalloc((unsigned)total_bytes);
178 #endif /* !TRE_USE_ALLOCA */
179     if (buf == NULL)
180       return REG_ESPACE;
181     memset(buf, 0, (size_t)total_bytes);
182
183     /* Get the various pointers within tmp_buf (properly aligned). */
184     tmp_tags = (void *)buf;
185     tmp_buf = buf + tbytes;
186     tmp_buf += ALIGN(tmp_buf, long);
187     reach_next = (void *)tmp_buf;
188     tmp_buf += rbytes;
189     tmp_buf += ALIGN(tmp_buf, long);
190     reach = (void *)tmp_buf;
191     tmp_buf += rbytes;
192     tmp_buf += ALIGN(tmp_buf, long);
193     reach_pos = (void *)tmp_buf;
194     tmp_buf += pbytes;
195     tmp_buf += ALIGN(tmp_buf, long);
196     for (i = 0; i < tnfa->num_states; i++)
197       {
198         reach[i].tags = (void *)tmp_buf;
199         tmp_buf += tbytes;
200         reach_next[i].tags = (void *)tmp_buf;
201         tmp_buf += tbytes;
202       }
203   }
204
205   for (i = 0; i < tnfa->num_states; i++)
206     reach_pos[i].pos = -1;
207
208   /* If only one character can start a match, find it first. */
209   if (tnfa->first_char >= 0 && str_byte)
210     {
211       const char *orig_str = str_byte;
212       int first = tnfa->first_char;
213       int found_high_bit = 0;
214
215
216       if (type == STR_BYTE)
217         {
218           if (len >= 0)
219             str_byte = memchr(orig_str, first, (size_t)len);
220           else
221             str_byte = strchr(orig_str, first);
222         }
223       else if (type == STR_MBS)
224         {
225           /*
226            * If the match character is ASCII, try to match the character
227            * directly, but if a high bit character is found, we stop there.
228            */
229           if (first < 0x80)
230             {
231               if (len >= 0)
232                 {
233                   int i;
234                   for (i = 0; ; str_byte++, i++)
235                     {
236                       if (i >= len)
237                         {
238                           str_byte = NULL;
239                           break;
240                         }
241                       if (*str_byte == first)
242                         break;
243                       if (*str_byte & 0x80)
244                         {
245                           found_high_bit = 1;
246                           break;
247                         }
248                     }
249                 }
250               else
251                 {
252                   for (; ; str_byte++)
253                     {
254                       if (!*str_byte)
255                         {
256                           str_byte = NULL;
257                           break;
258                         }
259                       if (*str_byte == first)
260                         break;
261                       if (*str_byte & 0x80)
262                         {
263                           found_high_bit = 1;
264                           break;
265                         }
266                     }
267                 }
268             }
269           else
270             {
271               if (len >= 0)
272                 {
273                   int i;
274                   for (i = 0; ; str_byte++, i++)
275                     {
276                       if (i >= len)
277                         {
278                           str_byte = NULL;
279                           break;
280                         }
281                       if (*str_byte & 0x80)
282                         {
283                           found_high_bit = 1;
284                           break;
285                         }
286                     }
287                 }
288               else
289                 {
290                   for (; ; str_byte++)
291                     {
292                       if (!*str_byte)
293                         {
294                           str_byte = NULL;
295                           break;
296                         }
297                       if (*str_byte & 0x80)
298                         {
299                           found_high_bit = 1;
300                           break;
301                         }
302                     }
303                 }
304             }
305         }
306       if (str_byte == NULL)
307         {
308 #ifndef TRE_USE_ALLOCA
309           if (buf)
310             xfree(buf);
311 #endif /* !TRE_USE_ALLOCA */
312           return REG_NOMATCH;
313         }
314       DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
315       if (!found_high_bit)
316         {
317           if (str_byte >= orig_str + 1)
318             prev_c = (unsigned char)*(str_byte - 1);
319           next_c = (unsigned char)*str_byte;
320           pos = str_byte - orig_str;
321           if (len < 0 || pos < len)
322             str_byte++;
323         }
324       else
325         {
326           if (str_byte == orig_str)
327             goto no_first_optimization;
328           /*
329            * Back up one character, fix up the position, then call
330            * GET_NEXT_WCHAR() to process the multibyte character.
331            */
332           /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */
333           next_c = (unsigned char)*(str_byte - 1);
334           pos = (str_byte - 1) - orig_str;
335           GET_NEXT_WCHAR();
336         }
337     }
338   else
339     {
340 no_first_optimization:
341       GET_NEXT_WCHAR();
342       pos = 0;
343     }
344
345   DPRINT(("length: %d\n", len));
346   DPRINT(("pos:chr/code | states and tags\n"));
347   DPRINT(("-------------+------------------------------------------------\n"));
348
349   reach_next_i = reach_next;
350   while (/*CONSTCOND*/1)
351     {
352       /* If no match found yet, add the initial states to `reach_next'. */
353       if (match_eo < 0)
354         {
355           DPRINT((" init >"));
356           trans_i = tnfa->initial;
357           while (trans_i->state != NULL)
358             {
359               if (reach_pos[trans_i->state_id].pos < pos)
360                 {
361                   if (trans_i->assertions
362                       && CHECK_ASSERTIONS(trans_i->assertions))
363                     {
364                       DPRINT(("assertion failed\n"));
365                       trans_i++;
366                       continue;
367                     }
368
369                   DPRINT((" %p", (void *)trans_i->state));
370                   reach_next_i->state = trans_i->state;
371                   memset(reach_next_i->tags, 0, tbytes);
372                   tag_i = trans_i->tags;
373                   if (tag_i)
374                     {
375                       while (*tag_i >= 0)
376                         {
377                           if (*tag_i < num_tags)
378                             tre_tag_set(reach_next_i->tags, *tag_i, pos, touch);
379                           tag_i++;
380                         }
381                         touch++;
382                     }
383                   if (reach_next_i->state == tnfa->final)
384                     {
385                       DPRINT(("  found empty match\n"));
386                       match_eo = pos;
387                       memcpy(match_tags, reach_next_i->tags, tbytes);
388                     }
389                   reach_pos[trans_i->state_id].pos = pos;
390                   reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
391                   reach_next_i++;
392                 }
393               trans_i++;
394             }
395           DPRINT(("\n"));
396           reach_next_i->state = NULL;
397         }
398       else
399         {
400           if (num_tags == 0 || reach_next_i == reach_next)
401             /*¬†We have found a match. */
402             break;
403         }
404
405       /* Check for end of string. */
406       if (len < 0)
407         {
408           if (next_c == L'\0')
409             break;
410         }
411       else
412         {
413           if (pos >= len)
414             break;
415         }
416
417       GET_NEXT_WCHAR();
418
419 #ifdef TRE_DEBUG
420       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
421       tre_print_reach(tnfa, reach_next, num_tags);
422 #endif /* TRE_DEBUG */
423
424       /* Swap `reach' and `reach_next'. */
425       reach_i = reach;
426       reach = reach_next;
427       reach_next = reach_i;
428
429 #ifdef TRE_DEBUG
430       once = 0;
431 #endif /* TRE_DEBUG */
432
433       /* For each state in `reach' see if there is a transition leaving with
434          the current input symbol to a state not yet in `reach_next', and
435          add the destination states to `reach_next'. */
436       reach_next_i = reach_next;
437       for (reach_i = reach; reach_i->state; reach_i++)
438         {
439           for (trans_i = reach_i->state; trans_i->state; trans_i++)
440             {
441               /* Does this transition match the input symbol? */
442               if (trans_i->code_min <= (tre_cint_t)prev_c &&
443                   trans_i->code_max >= (tre_cint_t)prev_c)
444                 {
445                   if (trans_i->assertions
446                       && (CHECK_ASSERTIONS(trans_i->assertions)
447                           || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
448                     {
449                       DPRINT(("assertion failed\n"));
450                       continue;
451                     }
452
453                   /* Compute the tags after this transition. */
454                   memcpy(tmp_tags, reach_i->tags, tbytes);
455                   tag_i = trans_i->tags;
456                   if (tag_i != NULL)
457                     {
458                       while (*tag_i >= 0)
459                         {
460                           if (*tag_i < num_tags)
461                             tre_tag_set(tmp_tags, *tag_i, pos, touch);
462                           tag_i++;
463                         }
464                         touch++;
465                     }
466
467                   /* For each new transition, weed out those that don't
468                      fulfill the minimal matching conditions. */
469                   if (tnfa->num_minimals && match_eo >= 0)
470                     {
471                       int skip = 0;
472 #ifdef TRE_DEBUG
473                       if (!once)
474                         {
475                           DPRINT(("Checking minimal conditions: match_eo=%d "
476                                   "match_tags=", match_eo));
477                           tre_print_tags(match_tags, num_tags);
478                           DPRINT(("\n"));
479                           once++;
480                         }
481 #endif /* TRE_DEBUG */
482                       for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
483                         {
484                            int end = tnfa->minimal_tags[i];
485                            int start = tnfa->minimal_tags[i + 1];
486                            DPRINT(("  Minimal start %d, end %d\n", start, end));
487                            if (tre_minimal_tag_order(start, end, match_tags,
488                                                      tmp_tags) > 0)
489                              {
490                                 skip = 1;
491                                 break;
492                              }
493                         }
494                       if (skip)
495                         {
496 #ifdef TRE_DEBUG
497                            DPRINT(("     Throwing out"));
498                            tre_print_reach1(reach_i->state, tmp_tags,
499                                              num_tags);
500                            DPRINT(("\n"));
501 #endif /* TRE_DEBUG */
502                            continue;
503                         }
504                     }
505
506                   if (reach_pos[trans_i->state_id].pos < pos)
507                     {
508                       /* Found an unvisited node. */
509                       reach_next_i->state = trans_i->state;
510                       tmp_iptr = reach_next_i->tags;
511                       reach_next_i->tags = tmp_tags;
512                       tmp_tags = tmp_iptr;
513                       reach_pos[trans_i->state_id].pos = pos;
514                       reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
515
516                       if (reach_next_i->state == tnfa->final
517                           && (match_eo == -1
518                               || (num_tags > 0
519                                   && tre_tag_get(reach_next_i->tags, 0) <=
520                                   tre_tag_get(match_tags, 0))))
521                         {
522 #ifdef TRE_DEBUG
523                           DPRINT(("  found match"));
524                           tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags);
525                           DPRINT(("\n"));
526 #endif /* TRE_DEBUG */
527                           match_eo = pos;
528                           memcpy(match_tags, reach_next_i->tags, tbytes);
529                         }
530                       reach_next_i++;
531
532                     }
533                   else
534                     {
535                       assert(reach_pos[trans_i->state_id].pos == pos);
536                       /* Another path has also reached this state.  We choose
537                          the winner by examining the tag values for both
538                          paths. */
539                       if (tre_tag_order(num_tags, tnfa->tag_directions,
540                                         tmp_tags,
541                                         *reach_pos[trans_i->state_id].tags))
542                         {
543                           /* The new path wins. */
544                           tmp_iptr = *reach_pos[trans_i->state_id].tags;
545                           *reach_pos[trans_i->state_id].tags = tmp_tags;
546                           if (trans_i->state == tnfa->final)
547                             {
548 #ifdef TRE_DEBUG
549                               DPRINT(("  found better match"));
550                               tre_print_reach1(trans_i->state, tmp_tags, num_tags);
551                               DPRINT(("\n"));
552 #endif /* TRE_DEBUG */
553                               match_eo = pos;
554                               memcpy(match_tags, tmp_tags, tbytes);
555                             }
556                           tmp_tags = tmp_iptr;
557                         }
558                     }
559                 }
560             }
561         }
562       reach_next_i->state = NULL;
563     }
564
565   DPRINT(("match end offset = %d\n", match_eo));
566
567   *match_end_ofs = match_eo;
568 #ifdef TRE_DEBUG
569   if (match_tags)
570     {
571       DPRINT(("Winning tags="));
572       tre_print_tags_all(match_tags, num_tags);
573       DPRINT((" touch=%d\n", touch));
574     }
575 #endif /* TRE_DEBUG */
576
577 #ifndef TRE_USE_ALLOCA
578   if (buf)
579     xfree(buf);
580 #endif /* !TRE_USE_ALLOCA */
581
582   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
583 }
584
585 /* EOF */