0d6ec173c53321e823cdb6bafc4ababc1096df8b
[dragonfly.git] / contrib / tre / lib / tre-match-parallel.c
1 /*
2   tre-match-parallel.c - TRE parallel regex matching engine
3
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10   This algorithm searches for matches basically by reading characters
11   in the searched string one by one, starting at the beginning.  All
12   matching paths in the TNFA are traversed in parallel.  When two or
13   more paths reach the same state, exactly one is chosen according to
14   tag ordering rules; if returning submatches is not required it does
15   not matter which path is chosen.
16
17   The worst case time required for finding the leftmost and longest
18   match, or determining that there is no match, is always linearly
19   dependent on the length of the text being searched.
20
21   This algorithm cannot handle TNFAs with back referencing nodes.
22   See `tre-match-backtrack.c'.
23 */
24
25
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif /* HAVE_CONFIG_H */
29
30 #ifdef TRE_USE_ALLOCA
31 /* AIX requires this to be the first thing in the file.  */
32 #ifndef __GNUC__
33 # if HAVE_ALLOCA_H
34 #  include <alloca.h>
35 # else
36 #  ifdef _AIX
37  #pragma alloca
38 #  else
39 #   ifndef alloca /* predefined by HP cc +Olibcalls */
40 char *alloca ();
41 #   endif
42 #  endif
43 # endif
44 #endif
45 #endif /* TRE_USE_ALLOCA */
46
47 #include <assert.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #ifdef HAVE_WCHAR_H
51 #include <wchar.h>
52 #endif /* HAVE_WCHAR_H */
53 #ifdef HAVE_WCTYPE_H
54 #include <wctype.h>
55 #endif /* HAVE_WCTYPE_H */
56 #ifndef TRE_WCHAR
57 #include <ctype.h>
58 #endif /* !TRE_WCHAR */
59 #ifdef HAVE_MALLOC_H
60 #include <malloc.h>
61 #endif /* HAVE_MALLOC_H */
62
63 #include "tre-internal.h"
64 #include "tre-match-utils.h"
65 #include "tre.h"
66 #include "xmalloc.h"
67
68
69
70 typedef struct {
71   tre_tnfa_transition_t *state;
72   int *tags;
73 } tre_tnfa_reach_t;
74
75 typedef struct {
76   int pos;
77   int **tags;
78 } tre_reach_pos_t;
79
80
81 #ifdef TRE_DEBUG
82 static void
83 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
84 {
85   int i;
86
87   while (reach->state != NULL)
88     {
89       DPRINT((" %p", (void *)reach->state));
90       if (num_tags > 0)
91         {
92           DPRINT(("/"));
93           for (i = 0; i < num_tags; i++)
94             {
95               DPRINT(("%d:%d", i, reach->tags[i]));
96               if (i < (num_tags-1))
97                 DPRINT((","));
98             }
99         }
100       reach++;
101     }
102   DPRINT(("\n"));
103
104 }
105 #endif /* TRE_DEBUG */
106
107 reg_errcode_t
108 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
109                       tre_str_type_t type, int *match_tags, int eflags,
110                       int *match_end_ofs)
111 {
112   /* State variables required by GET_NEXT_WCHAR. */
113   tre_char_t prev_c = 0, next_c = 0;
114   const char *str_byte = string;
115   int pos = -1;
116   unsigned int pos_add_next = 1;
117 #ifdef TRE_WCHAR
118   const wchar_t *str_wide = string;
119 #ifdef TRE_MBSTATE
120   mbstate_t mbstate;
121 #endif /* TRE_MBSTATE */
122 #endif /* TRE_WCHAR */
123   int reg_notbol = eflags & REG_NOTBOL;
124   int reg_noteol = eflags & REG_NOTEOL;
125   int reg_newline = tnfa->cflags & REG_NEWLINE;
126   int str_user_end = 0;
127
128   char *buf;
129   tre_tnfa_transition_t *trans_i;
130   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
131   tre_reach_pos_t *reach_pos;
132   int *tag_i;
133   int num_tags, i;
134
135   int match_eo = -1;       /* end offset of match (-1 if no match found yet) */
136   int new_match = 0;
137   int *tmp_tags = NULL;
138   int *tmp_iptr;
139
140 #ifdef TRE_MBSTATE
141   memset(&mbstate, '\0', sizeof(mbstate));
142 #endif /* TRE_MBSTATE */
143
144   DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
145
146   if (!match_tags)
147     num_tags = 0;
148   else
149     num_tags = tnfa->num_tags;
150
151   /* Allocate memory for temporary data required for matching.  This needs to
152      be done for every matching operation to be thread safe.  This allocates
153      everything in a single large block from the stack frame using alloca()
154      or with malloc() if alloca is unavailable. */
155   {
156     int tbytes, rbytes, pbytes, xbytes, total_bytes;
157     char *tmp_buf;
158     /* Compute the length of the block we need. */
159     tbytes = sizeof(*tmp_tags) * num_tags;
160     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
161     pbytes = sizeof(*reach_pos) * tnfa->num_states;
162     xbytes = sizeof(int) * num_tags;
163     total_bytes =
164       (sizeof(long) - 1) * 4 /* for alignment paddings */
165       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
166
167     /* Allocate the memory. */
168 #ifdef TRE_USE_ALLOCA
169     buf = alloca(total_bytes);
170 #else /* !TRE_USE_ALLOCA */
171     buf = xmalloc((unsigned)total_bytes);
172 #endif /* !TRE_USE_ALLOCA */
173     if (buf == NULL)
174       return REG_ESPACE;
175     memset(buf, 0, (size_t)total_bytes);
176
177     /* Get the various pointers within tmp_buf (properly aligned). */
178     tmp_tags = (void *)buf;
179     tmp_buf = buf + tbytes;
180     tmp_buf += ALIGN(tmp_buf, long);
181     reach_next = (void *)tmp_buf;
182     tmp_buf += rbytes;
183     tmp_buf += ALIGN(tmp_buf, long);
184     reach = (void *)tmp_buf;
185     tmp_buf += rbytes;
186     tmp_buf += ALIGN(tmp_buf, long);
187     reach_pos = (void *)tmp_buf;
188     tmp_buf += pbytes;
189     tmp_buf += ALIGN(tmp_buf, long);
190     for (i = 0; i < tnfa->num_states; i++)
191       {
192         reach[i].tags = (void *)tmp_buf;
193         tmp_buf += xbytes;
194         reach_next[i].tags = (void *)tmp_buf;
195         tmp_buf += xbytes;
196       }
197   }
198
199   for (i = 0; i < tnfa->num_states; i++)
200     reach_pos[i].pos = -1;
201
202   /* If only one character can start a match, find it first. */
203   if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
204     {
205       const char *orig_str = str_byte;
206       int first = tnfa->first_char;
207
208       if (len >= 0)
209         str_byte = memchr(orig_str, first, (size_t)len);
210       else
211         str_byte = strchr(orig_str, first);
212       if (str_byte == NULL)
213         {
214 #ifndef TRE_USE_ALLOCA
215           if (buf)
216             xfree(buf);
217 #endif /* !TRE_USE_ALLOCA */
218           return REG_NOMATCH;
219         }
220       DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
221       if (str_byte >= orig_str + 1)
222         prev_c = (unsigned char)*(str_byte - 1);
223       next_c = (unsigned char)*str_byte;
224       pos = str_byte - orig_str;
225       if (len < 0 || pos < len)
226         str_byte++;
227     }
228   else
229     {
230       GET_NEXT_WCHAR();
231       pos = 0;
232     }
233
234 #if 0
235   /* Skip over characters that cannot possibly be the first character
236      of a match. */
237   if (tnfa->firstpos_chars != NULL)
238     {
239       char *chars = tnfa->firstpos_chars;
240
241       if (len < 0)
242         {
243           const char *orig_str = str_byte;
244           /* XXX - use strpbrk() and wcspbrk() because they might be
245              optimized for the target architecture.  Try also strcspn()
246              and wcscspn() and compare the speeds. */
247           while (next_c != L'\0' && !chars[next_c])
248             {
249               next_c = *str_byte++;
250             }
251           prev_c = *(str_byte - 2);
252           pos += str_byte - orig_str;
253           DPRINT(("skipped %d chars\n", str_byte - orig_str));
254         }
255       else
256         {
257           while (pos <= len && !chars[next_c])
258             {
259               prev_c = next_c;
260               next_c = (unsigned char)(*str_byte++);
261               pos++;
262             }
263         }
264     }
265 #endif
266
267   DPRINT(("length: %d\n", len));
268   DPRINT(("pos:chr/code | states and tags\n"));
269   DPRINT(("-------------+------------------------------------------------\n"));
270
271   reach_next_i = reach_next;
272   while (/*CONSTCOND*/1)
273     {
274       /* If no match found yet, add the initial states to `reach_next'. */
275       if (match_eo < 0)
276         {
277           DPRINT((" init >"));
278           trans_i = tnfa->initial;
279           while (trans_i->state != NULL)
280             {
281               if (reach_pos[trans_i->state_id].pos < pos)
282                 {
283                   if (trans_i->assertions
284                       && CHECK_ASSERTIONS(trans_i->assertions))
285                     {
286                       DPRINT(("assertion failed\n"));
287                       trans_i++;
288                       continue;
289                     }
290
291                   DPRINT((" %p", (void *)trans_i->state));
292                   reach_next_i->state = trans_i->state;
293                   for (i = 0; i < num_tags; i++)
294                     reach_next_i->tags[i] = -1;
295                   tag_i = trans_i->tags;
296                   if (tag_i)
297                     while (*tag_i >= 0)
298                       {
299                         if (*tag_i < num_tags)
300                           reach_next_i->tags[*tag_i] = pos;
301                         tag_i++;
302                       }
303                   if (reach_next_i->state == tnfa->final)
304                     {
305                       DPRINT(("  found empty match\n"));
306                       match_eo = pos;
307                       new_match = 1;
308                       for (i = 0; i < num_tags; i++)
309                         match_tags[i] = reach_next_i->tags[i];
310                     }
311                   reach_pos[trans_i->state_id].pos = pos;
312                   reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
313                   reach_next_i++;
314                 }
315               trans_i++;
316             }
317           DPRINT(("\n"));
318           reach_next_i->state = NULL;
319         }
320       else
321         {
322           if (num_tags == 0 || reach_next_i == reach_next)
323             /*¬†We have found a match. */
324             break;
325         }
326
327       /* Check for end of string. */
328       if (len < 0)
329         {
330           if (type == STR_USER)
331             {
332               if (str_user_end)
333                 break;
334             }
335           else if (next_c == L'\0')
336             break;
337         }
338       else
339         {
340           if (pos >= len)
341             break;
342         }
343
344       GET_NEXT_WCHAR();
345
346 #ifdef TRE_DEBUG
347       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
348       tre_print_reach(tnfa, reach_next, num_tags);
349       DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
350       tre_print_reach(tnfa, reach_next, num_tags);
351 #endif /* TRE_DEBUG */
352
353       /* Swap `reach' and `reach_next'. */
354       reach_i = reach;
355       reach = reach_next;
356       reach_next = reach_i;
357
358       /* For each state in `reach', weed out states that don't fulfill the
359          minimal matching conditions. */
360       if (tnfa->num_minimals && new_match)
361         {
362           new_match = 0;
363           reach_next_i = reach_next;
364           for (reach_i = reach; reach_i->state; reach_i++)
365             {
366               int skip = 0;
367               for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
368                 {
369                   int end = tnfa->minimal_tags[i];
370                   int start = tnfa->minimal_tags[i + 1];
371                   DPRINT(("  Minimal start %d, end %d\n", start, end));
372                   if (end >= num_tags)
373                     {
374                       DPRINT(("  Throwing %p out.\n", reach_i->state));
375                       skip = 1;
376                       break;
377                     }
378                   else if (reach_i->tags[start] == match_tags[start]
379                            && reach_i->tags[end] < match_tags[end])
380                     {
381                       DPRINT(("  Throwing %p out because t%d < %d\n",
382                               reach_i->state, end, match_tags[end]));
383                       skip = 1;
384                       break;
385                     }
386                 }
387               if (!skip)
388                 {
389                   reach_next_i->state = reach_i->state;
390                   tmp_iptr = reach_next_i->tags;
391                   reach_next_i->tags = reach_i->tags;
392                   reach_i->tags = tmp_iptr;
393                   reach_next_i++;
394                 }
395             }
396           reach_next_i->state = NULL;
397
398           /* Swap `reach' and `reach_next'. */
399           reach_i = reach;
400           reach = reach_next;
401           reach_next = reach_i;
402         }
403
404       /* For each state in `reach' see if there is a transition leaving with
405          the current input symbol to a state not yet in `reach_next', and
406          add the destination states to `reach_next'. */
407       reach_next_i = reach_next;
408       for (reach_i = reach; reach_i->state; reach_i++)
409         {
410           for (trans_i = reach_i->state; trans_i->state; trans_i++)
411             {
412               /* Does this transition match the input symbol? */
413               if (trans_i->code_min <= (tre_cint_t)prev_c &&
414                   trans_i->code_max >= (tre_cint_t)prev_c)
415                 {
416                   if (trans_i->assertions
417                       && (CHECK_ASSERTIONS(trans_i->assertions)
418                           || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
419                     {
420                       DPRINT(("assertion failed\n"));
421                       continue;
422                     }
423
424                   /* Compute the tags after this transition. */
425                   for (i = 0; i < num_tags; i++)
426                     tmp_tags[i] = reach_i->tags[i];
427                   tag_i = trans_i->tags;
428                   if (tag_i != NULL)
429                     while (*tag_i >= 0)
430                       {
431                         if (*tag_i < num_tags)
432                           tmp_tags[*tag_i] = pos;
433                         tag_i++;
434                       }
435
436                   if (reach_pos[trans_i->state_id].pos < pos)
437                     {
438                       /* Found an unvisited node. */
439                       reach_next_i->state = trans_i->state;
440                       tmp_iptr = reach_next_i->tags;
441                       reach_next_i->tags = tmp_tags;
442                       tmp_tags = tmp_iptr;
443                       reach_pos[trans_i->state_id].pos = pos;
444                       reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
445
446                       if (reach_next_i->state == tnfa->final
447                           && (match_eo == -1
448                               || (num_tags > 0
449                                   && reach_next_i->tags[0] <= match_tags[0])))
450                         {
451                           DPRINT(("  found match %p\n", trans_i->state));
452                           match_eo = pos;
453                           new_match = 1;
454                           for (i = 0; i < num_tags; i++)
455                             match_tags[i] = reach_next_i->tags[i];
456                         }
457                       reach_next_i++;
458
459                     }
460                   else
461                     {
462                       assert(reach_pos[trans_i->state_id].pos == pos);
463                       /* Another path has also reached this state.  We choose
464                          the winner by examining the tag values for both
465                          paths. */
466                       if (tre_tag_order(num_tags, tnfa->tag_directions,
467                                         tmp_tags,
468                                         *reach_pos[trans_i->state_id].tags))
469                         {
470                           /* The new path wins. */
471                           tmp_iptr = *reach_pos[trans_i->state_id].tags;
472                           *reach_pos[trans_i->state_id].tags = tmp_tags;
473                           if (trans_i->state == tnfa->final)
474                             {
475                               DPRINT(("  found better match\n"));
476                               match_eo = pos;
477                               new_match = 1;
478                               for (i = 0; i < num_tags; i++)
479                                 match_tags[i] = tmp_tags[i];
480                             }
481                           tmp_tags = tmp_iptr;
482                         }
483                     }
484                 }
485             }
486         }
487       reach_next_i->state = NULL;
488     }
489
490   DPRINT(("match end offset = %d\n", match_eo));
491
492 #ifndef TRE_USE_ALLOCA
493   if (buf)
494     xfree(buf);
495 #endif /* !TRE_USE_ALLOCA */
496
497   *match_end_ofs = match_eo;
498   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
499 }
500
501 /* EOF */