TRE: Add local modifications to extend functionality
[dragonfly.git] / contrib / tre / lib / tre-match-utils.h
1 /*
2   tre-match-utils.h - TRE matcher helper definitions
3
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6
7 */
8
9 #include "tre-internal.h"
10
11 #define str_source ((const tre_str_source*)string)
12
13 #ifdef TRE_WCHAR
14
15 #ifdef TRE_MULTIBYTE
16
17 /* Wide character and multibyte support. */
18
19 /*
20  * Because all multibyte encodings are exclusively single-shift encoding,
21  * with the shift codes having the high bit set, we can make an optimization
22  * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character
23  * is detected, and just do a direct copy for ASCII characters.
24  */
25 #define GET_NEXT_WCHAR()                                                      \
26   do {                                                                        \
27     prev_c = next_c;                                                          \
28     switch (type)                                                             \
29       {                                                                       \
30       case STR_BYTE:                                                          \
31         pos++;                                                                \
32         if (len >= 0 && pos >= len)                                           \
33           next_c = '\0';                                                      \
34         else                                                                  \
35           next_c = (unsigned char)(*str_byte++);                              \
36         break;                                                                \
37       case STR_WIDE:                                                          \
38         pos++;                                                                \
39         if (len >= 0 && pos >= len)                                           \
40           next_c = L'\0';                                                     \
41         else                                                                  \
42           next_c = *str_wide++;                                               \
43         break;                                                                \
44       case STR_MBS:                                                           \
45         pos += pos_add_next;                                                  \
46         if (__builtin_expect(len >= 0 && pos >= len, 0))                      \
47           {                                                                   \
48             next_c = L'\0';                                                   \
49             pos_add_next = 1;                                                 \
50           }                                                                   \
51         else if (__builtin_expect(!(*str_byte & 0x80), 1))                    \
52           {                                                                   \
53             next_c = (unsigned char)(*str_byte++);                            \
54             pos_add_next = 1;                                                 \
55           }                                                                   \
56         else                                                                  \
57           {                                                                   \
58             size_t w;                                                         \
59             int max;                                                          \
60             if (len >= 0)                                                     \
61               max = len - pos;                                                \
62             else                                                              \
63               max = 32;                                                       \
64             w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate,       \
65                               tnfa->loc);                                     \
66             if (w == (size_t)-1 || w == (size_t)-2)                           \
67               return REG_ILLSEQ;                                              \
68             if (w == 0 && len >= 0)                                           \
69               {                                                               \
70                 pos_add_next = 1;                                             \
71                 next_c = 0;                                                   \
72                 str_byte++;                                                   \
73               }                                                               \
74             else                                                              \
75               {                                                               \
76                 pos_add_next = w;                                             \
77                 str_byte += w;                                                \
78               }                                                               \
79           }                                                                   \
80         break;                                                                \
81       }                                                                       \
82   } while(/*CONSTCOND*/0)
83
84 #else /* !TRE_MULTIBYTE */
85
86 /* Wide character support, no multibyte support. */
87 #error TRE_MULTIBYTE undefined
88
89 #define GET_NEXT_WCHAR()                                                      \
90   do {                                                                        \
91     prev_c = next_c;                                                          \
92     if (type == STR_BYTE)                                                     \
93       {                                                                       \
94         pos++;                                                                \
95         if (len >= 0 && pos >= len)                                           \
96           next_c = '\0';                                                      \
97         else                                                                  \
98           next_c = (unsigned char)(*str_byte++);                              \
99       }                                                                       \
100     else if (type == STR_WIDE)                                                \
101       {                                                                       \
102         pos++;                                                                \
103         if (len >= 0 && pos >= len)                                           \
104           next_c = L'\0';                                                     \
105         else                                                                  \
106           next_c = *str_wide++;                                               \
107       }                                                                       \
108   } while(/*CONSTCOND*/0)
109
110 #endif /* !TRE_MULTIBYTE */
111
112 #else /* !TRE_WCHAR */
113
114 /* No wide character or multibyte support. */
115 #error TRE_WCHAR undefined
116
117 #define GET_NEXT_WCHAR()                                                      \
118   do {                                                                        \
119     prev_c = next_c;                                                          \
120     if (type == STR_BYTE)                                                     \
121       {                                                                       \
122         pos++;                                                                \
123         if (len >= 0 && pos >= len)                                           \
124           next_c = '\0';                                                      \
125         else                                                                  \
126           next_c = (unsigned char)(*str_byte++);                              \
127       }                                                                       \
128   } while(/*CONSTCOND*/0)
129
130 #endif /* !TRE_WCHAR */
131
132
133
134 /* Assumes tre_tnfa_t *tnfa in scope */
135 #define IS_WORD_CHAR(c)  ((c) == L'_' || tre_isalnum_l(c, tnfa->loc))
136
137 #define CHECK_ASSERTIONS(assertions)                                          \
138   (((assertions & ASSERT_AT_BOL)                                              \
139     && (pos > 0 || reg_notbol)                                                \
140     && (prev_c != L'\n' || !reg_newline))                                     \
141    || ((assertions & ASSERT_AT_EOL)                                           \
142        && (next_c != L'\0' || reg_noteol)                                     \
143        && (next_c != L'\n' || !reg_newline))                                  \
144    || ((assertions & ASSERT_AT_BOW)                                           \
145        && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))                    \
146    || ((assertions & ASSERT_AT_EOW)                                           \
147        && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))                    \
148    || ((assertions & ASSERT_AT_WB)                                            \
149        && (pos != 0 && next_c != L'\0'                                        \
150            && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))                  \
151    || ((assertions & ASSERT_AT_WB_NEG)                                        \
152        && (pos == 0 || next_c == L'\0'                                        \
153            || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
154
155 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
156   ((trans_i->assertions & ASSERT_BRACKET_MATCH)                               \
157     && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c,    \
158                                       tnfa))
159
160
161
162
163 inline static int
164 tre_tag_get(const tre_tag_t *tags, int i)
165 {
166   tags += i;
167   return tags->count > 0 ? tags->value : -1;
168 }
169
170 inline static void
171 tre_tag_set(tre_tag_t *tags, int i, int val, int touch)
172 {
173   tags += i;
174   if (tags->count++ == 0)
175     tags->first = val;
176   tags->value = val;
177   tags->touch = touch;
178 }
179
180 inline static void
181 tre_tag_reset(tre_tag_t *tags, int i)
182 {
183   tags[i].count = 0;
184 }
185
186 inline static int
187 tre_tag_touch_get(const tre_tag_t *tags, int i)
188 {
189   return tags[i].touch;
190 }
191
192 #ifdef TRE_DEBUG
193 inline static void
194 tre_print_tags(const tre_tag_t *tags, int num_tags)
195 {
196   int i;
197   for (i = 0; i < num_tags; i++, tags++)
198     {
199       switch(tags->count)
200         {
201         case 0:
202           DPRINT(("%d:(0,-1)", i));
203           break;
204         case 1:
205           DPRINT(("%d:(1,%d)", i, tags->first));
206           break;
207         default:
208           DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first,
209                   tags->value));
210           break;
211         }
212       if (i < (num_tags - 1))
213         DPRINT((" "));
214     }
215 }
216
217 inline static void
218 tre_print_tags_all(const tre_tag_t *tags, int num_tags)
219 {
220   int i;
221   for (i = 0; i < num_tags; i++, tags++)
222     {
223       switch(tags->count)
224         {
225         case 0:
226           DPRINT(("%d:(0,-1)/%d", i, tags->touch));
227           break;
228         case 1:
229           DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch));
230           break;
231         default:
232           DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first,
233                   tags->value, tags->touch));
234           break;
235         }
236       if (i < (num_tags - 1))
237         DPRINT((" "));
238     }
239 }
240 #endif /* TRE_DEBUG */
241
242 /* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal
243  * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */
244 inline static int
245 tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1,
246                       const tre_tag_t *tags2)
247 {
248   const tre_tag_t *t1, *t2;
249
250   t1 = tags1 + start;
251   t2 = tags2 + start;
252   /* We need both start tags to be set */
253   if (t1->count == 0 || t2->count == 0)
254     return 0;
255
256   /* The start tags must be equal */
257   if (t1->value != t2->value)
258     return 0;
259
260   t1 = tags1 + end;
261   t2 = tags2 + end;
262   /* For the end tags, we prefer set over unset, because unset means that
263    * the end tag is still growing */
264   if (t1->count == 0)
265     {
266       /* if t2 is set, t1 loses since it is unset */
267       if (t2->count != 0)
268         return -1;
269     }
270   /* if t2 not set, t1 wins since it is set */
271   else if (t2->count == 0)
272     return 1;
273
274   /* least current value wins */
275   return t2->value - t1->value;
276 }
277
278 /* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare
279  * (t1 loses if < 0, t1 wins if > 0) */
280 inline static int
281 tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1,
282                 const tre_tag_t *t2)
283 {
284   int diff;
285
286   t1 += i;
287   t2 += i;
288   switch (dir)
289     {
290     case TRE_TAG_MINIMIZE:
291       /* least current value wins (because tags are initialized to all zeros,
292        * unset wins over set; also, tre_minimal_tag_order() will have already
293        * been run, which checks for being unset) */
294       return t2->value - t1->value;
295
296     case TRE_TAG_MAXIMIZE:
297       /* prefer set */
298       if (t1->count == 0)
299         {
300           /* if neither t1 and t2 are set, try next tag */
301           if (t2->count == 0)
302             return 0;
303           /* t2 is set, t1 loses since it is unset */
304           return -1;
305         }
306       /* if t2 not set, t1 wins since it is set */
307       else if (t2->count == 0)
308         return 1;
309       /* greatest initial value wins */
310       if ((diff = t1->first - t2->first) != 0)
311         return diff;
312       /* least number of times the tag was set, wins */
313       if ((diff = t2->count - t1->count) != 0)
314         return diff;
315       /* if the tags were only set once, they only have initial values */
316       if (t1->count == 1)
317         return 0;
318       /* greatest current value wins */
319       return t1->value - t2->value;
320
321     case TRE_TAG_LEFT_MAXIMIZE:
322       /* prefer set */
323       if (t1->count == 0)
324         {
325           /* if neither t1 and t2 are set, try next tag */
326           if (t2->count == 0)
327             return 0;
328           /* t2 is set, t1 loses since it is unset */
329           return -1;
330         }
331       /* if t2 not set, t1 wins since it is set */
332       else if (t2->count == 0)
333         return 1;
334       /* least initial value wins */
335       if ((diff = t2->first - t1->first) != 0)
336         return diff;
337       /* least number of times the tag was set, wins */
338       if ((diff = t2->count - t1->count) != 0)
339         return diff;
340       /* if the tags were only set once, they only have initial values */
341       if (t1->count == 1)
342         return 0;
343       /* greatest current value wins */
344       return t1->value - t2->value;
345
346     default:
347       /* Shouldn't happen: only assert if TRE_DEBUG defined */
348       assert(0);
349       break;
350     }
351   return 0;
352 }
353
354 #ifdef TRE_DEBUG
355 #define _MORE_DEBUGGING
356 #endif /* TRE_DEBUG */
357
358 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
359 inline static int
360 #ifdef _MORE_DEBUGGING
361 _tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
362               const tre_tag_t *t1, const tre_tag_t *t2)
363 #else /* !_MORE_DEBUGGING */
364 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
365               const tre_tag_t *t1, const tre_tag_t *t2)
366 #endif /* !_MORE_DEBUGGING */
367 {
368   int i, ret;
369
370   for (i = 0; i < num_tags; i++)
371     {
372       if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0)
373         return (ret > 0);
374     }
375   /*  assert(0);*/
376   return 0;
377 }
378
379 #ifdef _MORE_DEBUGGING
380 inline static int
381 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
382               const tre_tag_t *t1, const tre_tag_t *t2)
383 {
384   int ret = _tre_tag_order(num_tags, tag_directions, t1, t2);
385   DPRINT(("tre_tag_order: "));
386   tre_print_tags(t1, num_tags);
387   DPRINT((" %s ", ret ? "wins" : "doesn't win"));
388   tre_print_tags(t2, num_tags);
389   DPRINT(("\n"));
390   return ret;
391 }
392 #endif /* _MORE_DEBUGGING */
393
394 int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
395
396 inline static int
397 tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc,
398                   const tre_tnfa_t * __restrict tnfa)
399 {
400   int match = 0;
401   int i;
402   tre_bracket_match_t *b;
403   tre_cint_t uc, lc;
404   int we, ue, le, got_equiv = 0;
405   int icase = ((tnfa->cflags & REG_ICASE) != 0);
406
407   DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase));
408   if (icase)
409     {
410       if (tre_islower_l(wc, tnfa->loc))
411         {
412           lc = wc;
413           uc = tre_toupper_l(wc, tnfa->loc);
414         }
415       else if (tre_isupper_l(wc, tnfa->loc))
416         {
417           uc = wc;
418           lc = tre_tolower_l(wc, tnfa->loc);
419         }
420       else
421         {
422           icase = 0;
423         }
424     }
425   for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches;
426        i++, b++)
427     {
428       switch (b->type)
429         {
430         case TRE_BRACKET_MATCH_TYPE_CHAR:
431           if (icase)
432             match = (b->value == uc || b->value == lc);
433           else
434             match = (b->value == wc);
435           break;
436         case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN:
437           {
438             tre_cint_t start = b->value, end;
439             if (++i >= list->num_bracket_matches ||
440                 (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END)
441               {
442                 DPRINT(("tre_bracket_match: no following range end\n"));
443                 assert(0);
444                 goto error;
445               }
446             end = b->value;
447             if (!got_equiv)
448               {
449                 if (icase)
450                   {
451                     ue = __collate_equiv_value(tnfa->loc, &uc, 1);
452                     le = __collate_equiv_value(tnfa->loc, &lc, 1);
453                   }
454                 else
455                     we = __collate_equiv_value(tnfa->loc, &wc, 1);
456                 got_equiv = 1;
457               }
458             if (icase)
459               match = ((start <= ue && ue <= end) ||
460                       (start <= le && le <= end));
461             else
462               match = (start <= we && we <= end);
463             break;
464           }
465         case TRE_BRACKET_MATCH_TYPE_RANGE_END:
466           DPRINT(("tre_bracket_match: range end without preceeding start\n"));
467           assert(0);
468           break;
469         case TRE_BRACKET_MATCH_TYPE_CLASS:
470           if (icase)
471             match = (tre_isctype_l(uc, b->value, tnfa->loc) ||
472                      tre_isctype_l(lc, b->value, tnfa->loc));
473           else
474             match = (tre_isctype_l(wc, b->value, tnfa->loc));
475           break;
476         case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE:
477           if (!got_equiv)
478             {
479               if (icase)
480                 {
481                   ue = __collate_equiv_value(tnfa->loc, &uc, 1);
482                   le = __collate_equiv_value(tnfa->loc, &lc, 1);
483                 }
484               else
485                   we = __collate_equiv_value(tnfa->loc, &wc, 1);
486               got_equiv = 1;
487             }
488           if (icase)
489             match = (b->value == ue || b->value == le);
490           else
491             match = (b->value == we);
492           break;
493         default:
494           DPRINT(("tre_bracket_match: unknown type %d\n", b->type));
495           assert(0);
496           break;
497         }
498         if (match)
499           break;
500     }
501 error:
502   if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) {
503     if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0;
504     match = !match;
505   }
506   return match;
507 }