1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
5 Software Foundation, Inc.
6 This file is part of the GNU C Library.
7 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3, or (at your option)
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License along
20 with this program; if not, write to the Free Software Foundation,
21 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
25 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
26 Idx n) internal_function;
27 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
28 static void match_ctx_free (re_match_context_t *cache) internal_function;
29 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
30 Idx str_idx, Idx from, Idx to)
32 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
34 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
35 Idx str_idx) internal_function;
36 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
37 Idx node, Idx str_idx)
39 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
40 re_dfastate_t **limited_sts, Idx last_node,
43 static reg_errcode_t re_search_internal (const regex_t *preg,
44 const char *string, Idx length,
45 Idx start, Idx last_start, Idx stop,
46 size_t nmatch, regmatch_t pmatch[],
47 int eflags) internal_function;
48 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
49 const char *string1, Idx length1,
50 const char *string2, Idx length2,
51 Idx start, regoff_t range,
52 struct re_registers *regs,
53 Idx stop, bool ret_len) internal_function;
54 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
55 const char *string, Idx length, Idx start,
56 regoff_t range, Idx stop,
57 struct re_registers *regs,
58 bool ret_len) internal_function;
59 static unsigned int re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
60 Idx nregs, int regs_allocated)
62 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
64 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
65 Idx *p_match_first) internal_function;
66 static Idx check_halt_state_context (const re_match_context_t *mctx,
67 const re_dfastate_t *state, Idx idx)
69 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
70 regmatch_t *prev_idx_match, Idx cur_node,
71 Idx cur_idx, Idx nmatch) internal_function;
72 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
73 Idx str_idx, Idx dest_node, Idx nregs,
75 re_node_set *eps_via_nodes)
77 static reg_errcode_t set_regs (const regex_t *preg,
78 const re_match_context_t *mctx,
79 size_t nmatch, regmatch_t *pmatch,
80 bool fl_backtrack) internal_function;
81 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
85 static int sift_states_iter_mb (const re_match_context_t *mctx,
86 re_sift_context_t *sctx,
87 Idx node_idx, Idx str_idx, Idx max_str_idx)
89 #endif /* RE_ENABLE_I18N */
90 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
91 re_sift_context_t *sctx)
93 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
94 re_sift_context_t *sctx, Idx str_idx,
95 re_node_set *cur_dest)
97 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
98 re_sift_context_t *sctx,
100 re_node_set *dest_nodes)
102 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
103 re_node_set *dest_nodes,
104 const re_node_set *candidates)
106 static bool check_dst_limits (const re_match_context_t *mctx,
107 const re_node_set *limits,
108 Idx dst_node, Idx dst_idx, Idx src_node,
109 Idx src_idx) internal_function;
110 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
111 int boundaries, Idx subexp_idx,
112 Idx from_node, Idx bkref_idx)
114 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
115 Idx limit, Idx subexp_idx,
116 Idx node, Idx str_idx,
117 Idx bkref_idx) internal_function;
118 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
119 re_node_set *dest_nodes,
120 const re_node_set *candidates,
122 struct re_backref_cache_entry *bkref_ents,
123 Idx str_idx) internal_function;
124 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
125 re_sift_context_t *sctx,
126 Idx str_idx, const re_node_set *candidates)
128 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
130 re_dfastate_t **src, Idx num)
132 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
133 re_match_context_t *mctx) internal_function;
134 static re_dfastate_t *transit_state (reg_errcode_t *err,
135 re_match_context_t *mctx,
136 re_dfastate_t *state) internal_function;
137 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
138 re_match_context_t *mctx,
139 re_dfastate_t *next_state)
141 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
142 re_node_set *cur_nodes,
143 Idx str_idx) internal_function;
145 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
146 re_match_context_t *mctx,
147 re_dfastate_t *pstate)
150 #ifdef RE_ENABLE_I18N
151 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
152 re_dfastate_t *pstate)
154 #endif /* RE_ENABLE_I18N */
155 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
156 const re_node_set *nodes)
158 static reg_errcode_t get_subexp (re_match_context_t *mctx,
159 Idx bkref_node, Idx bkref_str_idx)
161 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
162 const re_sub_match_top_t *sub_top,
163 re_sub_match_last_t *sub_last,
164 Idx bkref_node, Idx bkref_str)
166 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
167 Idx subexp_idx, int type) internal_function;
168 static reg_errcode_t check_arrival (re_match_context_t *mctx,
169 state_array_t *path, Idx top_node,
170 Idx top_str, Idx last_node, Idx last_str,
171 int type) internal_function;
172 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
174 re_node_set *cur_nodes,
175 re_node_set *next_nodes)
177 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
178 re_node_set *cur_nodes,
179 Idx ex_subexp, int type)
181 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
182 re_node_set *dst_nodes,
183 Idx target, Idx ex_subexp,
184 int type) internal_function;
185 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
186 re_node_set *cur_nodes, Idx cur_str,
187 Idx subexp_num, int type)
189 static bool build_trtable (const re_dfa_t *dfa,
190 re_dfastate_t *state) internal_function;
191 #ifdef RE_ENABLE_I18N
192 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
193 const re_string_t *input, Idx idx)
196 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
200 #endif /* RE_ENABLE_I18N */
201 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
202 const re_dfastate_t *state,
203 re_node_set *states_node,
204 bitset_t *states_ch) internal_function;
205 static bool check_node_accept (const re_match_context_t *mctx,
206 const re_token_t *node, Idx idx)
208 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
211 /* Entry point for POSIX code. */
213 /* regexec searches for a given pattern, specified by PREG, in the
216 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
217 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
218 least NMATCH elements, and we set them to the offsets of the
219 corresponding matched substrings.
221 EFLAGS specifies `execution flags' which affect matching: if
222 REG_NOTBOL is set, then ^ does not match at the beginning of the
223 string; if REG_NOTEOL is set, then $ does not match at the end.
225 We return 0 if we find a match and REG_NOMATCH if not. */
228 regexec (preg, string, nmatch, pmatch, eflags)
229 const regex_t *_Restrict_ preg;
230 const char *_Restrict_ string;
232 regmatch_t pmatch[_Restrict_arr_];
238 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
241 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
244 if (eflags & REG_STARTEND)
246 start = pmatch[0].rm_so;
247 length = pmatch[0].rm_eo;
252 length = strlen (string);
255 __libc_lock_lock (dfa->lock);
257 err = re_search_internal (preg, string, length, start, length,
258 length, 0, NULL, eflags);
260 err = re_search_internal (preg, string, length, start, length,
261 length, nmatch, pmatch, eflags);
262 __libc_lock_unlock (dfa->lock);
263 return err != REG_NOERROR;
267 # include <shlib-compat.h>
268 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
270 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
271 __typeof__ (__regexec) __compat_regexec;
274 attribute_compat_text_section
275 __compat_regexec (const regex_t *_Restrict_ preg,
276 const char *_Restrict_ string, size_t nmatch,
277 regmatch_t pmatch[], int eflags)
279 return regexec (preg, string, nmatch, pmatch,
280 eflags & (REG_NOTBOL | REG_NOTEOL));
282 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
286 /* Entry points for GNU code. */
288 /* re_match, re_search, re_match_2, re_search_2
290 The former two functions operate on STRING with length LENGTH,
291 while the later two operate on concatenation of STRING1 and STRING2
292 with lengths LENGTH1 and LENGTH2, respectively.
294 re_match() matches the compiled pattern in BUFP against the string,
295 starting at index START.
297 re_search() first tries matching at index START, then it tries to match
298 starting from index START + 1, and so on. The last start position tried
299 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
302 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
303 the first STOP characters of the concatenation of the strings should be
306 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
307 and all groups is stored in REGS. (For the "_2" variants, the offsets are
308 computed relative to the concatenation, not relative to the individual
311 On success, re_match* functions return the length of the match, re_search*
312 return the position of the start of the match. Return value -1 means no
313 match was found and -2 indicates an internal error. */
316 re_match (bufp, string, length, start, regs)
317 struct re_pattern_buffer *bufp;
320 struct re_registers *regs;
322 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
325 weak_alias (__re_match, re_match)
329 re_search (bufp, string, length, start, range, regs)
330 struct re_pattern_buffer *bufp;
334 struct re_registers *regs;
336 return re_search_stub (bufp, string, length, start, range, length, regs,
340 weak_alias (__re_search, re_search)
344 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
345 struct re_pattern_buffer *bufp;
346 const char *string1, *string2;
347 Idx length1, length2, start, stop;
348 struct re_registers *regs;
350 return re_search_2_stub (bufp, string1, length1, string2, length2,
351 start, 0, regs, stop, true);
354 weak_alias (__re_match_2, re_match_2)
358 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
359 struct re_pattern_buffer *bufp;
360 const char *string1, *string2;
361 Idx length1, length2, start, stop;
363 struct re_registers *regs;
365 return re_search_2_stub (bufp, string1, length1, string2, length2,
366 start, range, regs, stop, false);
369 weak_alias (__re_search_2, re_search_2)
374 re_search_2_stub (struct re_pattern_buffer *bufp,
375 const char *string1, Idx length1,
376 const char *string2, Idx length2,
377 Idx start, regoff_t range, struct re_registers *regs,
378 Idx stop, bool ret_len)
382 Idx len = length1 + length2;
385 verify (! TYPE_SIGNED (Idx));
386 if (BE (len < length1, 0))
388 /* if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
391 /* Concatenate the strings. */
395 s = re_malloc (char, len);
397 if (BE (s == NULL, 0))
400 memcpy (__mempcpy (s, string1, length1), string2, length2);
402 memcpy (s, string1, length1);
403 memcpy (s + length1, string2, length2);
412 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
418 /* The parameters have the same meaning as those of re_search.
419 Additional parameters:
420 If RET_LEN is true the length of the match is returned (re_match style);
421 otherwise the position of the match is returned. */
425 re_search_stub (struct re_pattern_buffer *bufp,
426 const char *string, Idx length,
427 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
430 reg_errcode_t result;
436 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
438 Idx last_start = start + range;
440 /* Check for out-of-range. */
441 verify (! TYPE_SIGNED (Idx));
442 /* if (BE (start < 0, 0))
444 if (BE (start > length, 0))
446 if (BE (length < last_start || (0 <= range && last_start < start), 0))
448 else if (BE (/* last_start < 0 || */ (range < 0 && start <= last_start), 0))
451 __libc_lock_lock (dfa->lock);
453 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
454 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
456 /* Compile fastmap if we haven't yet. */
457 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
458 re_compile_fastmap (bufp);
460 if (BE (bufp->no_sub, 0))
463 /* We need at least 1 register. */
466 else if (BE (bufp->regs_allocated == REGS_FIXED
467 && regs->num_regs <= bufp->re_nsub, 0))
469 nregs = regs->num_regs;
470 if (BE (nregs < 1, 0))
472 /* Nothing can be copied to regs. */
478 nregs = bufp->re_nsub + 1;
479 pmatch = re_malloc (regmatch_t, nregs);
480 if (BE (pmatch == NULL, 0))
486 result = re_search_internal (bufp, string, length, start, last_start, stop,
487 nregs, pmatch, eflags);
491 /* I hope we needn't fill ther regs with -1's when no match was found. */
492 if (result != REG_NOERROR)
494 else if (regs != NULL)
496 /* If caller wants register contents data back, copy them. */
497 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
498 bufp->regs_allocated);
499 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
503 if (BE (rval == 0, 1))
507 assert (pmatch[0].rm_so == start);
508 rval = pmatch[0].rm_eo - start;
511 rval = pmatch[0].rm_so;
515 __libc_lock_unlock (dfa->lock);
521 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
524 int rval = REGS_REALLOCATE;
526 Idx need_regs = nregs + 1;
527 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
530 /* Have the register data arrays been allocated? */
531 if (regs_allocated == REGS_UNALLOCATED)
532 { /* No. So allocate them with malloc. */
533 regs->start = re_malloc (regoff_t, need_regs);
534 if (BE (regs->start == NULL, 0))
535 return REGS_UNALLOCATED;
536 regs->end = re_malloc (regoff_t, need_regs);
537 if (BE (regs->end == NULL, 0))
539 re_free (regs->start);
540 return REGS_UNALLOCATED;
542 regs->num_regs = need_regs;
544 else if (regs_allocated == REGS_REALLOCATE)
545 { /* Yes. If we need more elements than were already
546 allocated, reallocate them. If we need fewer, just
548 if (BE (need_regs > regs->num_regs, 0))
550 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
552 if (BE (new_start == NULL, 0))
553 return REGS_UNALLOCATED;
554 new_end = re_realloc (regs->end, regoff_t, need_regs);
555 if (BE (new_end == NULL, 0))
558 return REGS_UNALLOCATED;
560 regs->start = new_start;
562 regs->num_regs = need_regs;
567 assert (regs_allocated == REGS_FIXED);
568 /* This function may not be called with REGS_FIXED and nregs too big. */
569 assert (regs->num_regs >= nregs);
574 for (i = 0; i < nregs; ++i)
576 regs->start[i] = pmatch[i].rm_so;
577 regs->end[i] = pmatch[i].rm_eo;
579 for ( ; i < regs->num_regs; ++i)
580 regs->start[i] = regs->end[i] = -1;
585 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
586 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
587 this memory for recording register information. STARTS and ENDS
588 must be allocated using the malloc library routine, and must each
589 be at least NUM_REGS * sizeof (regoff_t) bytes long.
591 If NUM_REGS == 0, then subsequent matches should allocate their own
594 Unless this function is called, the first search or match using
595 PATTERN_BUFFER will allocate its own register data, without
596 freeing the old data. */
599 re_set_registers (bufp, regs, num_regs, starts, ends)
600 struct re_pattern_buffer *bufp;
601 struct re_registers *regs;
602 __re_size_t num_regs;
603 regoff_t *starts, *ends;
607 bufp->regs_allocated = REGS_REALLOCATE;
608 regs->num_regs = num_regs;
609 regs->start = starts;
614 bufp->regs_allocated = REGS_UNALLOCATED;
616 regs->start = regs->end = NULL;
620 weak_alias (__re_set_registers, re_set_registers)
623 /* Entry points compatible with 4.2 BSD regex library. We don't define
624 them unless specifically requested. */
626 #if defined _REGEX_RE_COMP || defined _LIBC
634 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
636 #endif /* _REGEX_RE_COMP */
638 /* Internal entry point. */
640 /* Searches for a compiled pattern PREG in the string STRING, whose
641 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
642 meaning as with regexec. LAST_START is START + RANGE, where
643 START and RANGE have the same meaning as with re_search.
644 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
645 otherwise return the error code.
646 Note: We assume front end functions already check ranges.
647 (0 <= LAST_START && LAST_START <= LENGTH) */
650 internal_function __attribute_warn_unused_result__
651 re_search_internal (const regex_t *preg,
652 const char *string, Idx length,
653 Idx start, Idx last_start, Idx stop,
654 size_t nmatch, regmatch_t pmatch[],
658 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
659 Idx left_lim, right_lim;
661 bool fl_longest_match;
664 Idx match_last = REG_MISSING;
668 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
669 re_match_context_t mctx = { .dfa = dfa };
671 re_match_context_t mctx;
673 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
674 && start != last_start && !preg->can_be_null)
675 ? preg->fastmap : NULL);
676 RE_TRANSLATE_TYPE t = preg->translate;
678 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
679 memset (&mctx, '\0', sizeof (re_match_context_t));
683 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
684 nmatch -= extra_nmatch;
686 /* Check if the DFA haven't been compiled. */
687 if (BE (preg->used == 0 || dfa->init_state == NULL
688 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
689 || dfa->init_state_begbuf == NULL, 0))
693 /* We assume front-end functions already check them. */
694 assert (0 <= last_start && last_start <= length);
697 /* If initial states with non-begbuf contexts have no elements,
698 the regex must be anchored. If preg->newline_anchor is set,
699 we'll never use init_state_nl, so do not check it. */
700 if (dfa->init_state->nodes.nelem == 0
701 && dfa->init_state_word->nodes.nelem == 0
702 && (dfa->init_state_nl->nodes.nelem == 0
703 || !preg->newline_anchor))
705 if (start != 0 && last_start != 0)
707 start = last_start = 0;
710 /* We must check the longest matching, if nmatch > 0. */
711 fl_longest_match = (nmatch != 0 || dfa->nbackref);
713 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
714 preg->translate, (preg->syntax & RE_ICASE) != 0,
716 if (BE (err != REG_NOERROR, 0))
718 mctx.input.stop = stop;
719 mctx.input.raw_stop = stop;
720 mctx.input.newline_anchor = preg->newline_anchor;
722 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
723 if (BE (err != REG_NOERROR, 0))
726 /* We will log all the DFA states through which the dfa pass,
727 if nmatch > 1, or this dfa has "multibyte node", which is a
728 back-reference or a node which can accept multibyte character or
729 multi character collating element. */
730 if (nmatch > 1 || dfa->has_mb_node)
732 /* Avoid overflow. */
733 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
739 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
740 if (BE (mctx.state_log == NULL, 0))
747 mctx.state_log = NULL;
750 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
751 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
753 /* Check incrementally whether of not the input string match. */
754 incr = (last_start < start) ? -1 : 1;
755 left_lim = (last_start < start) ? last_start : start;
756 right_lim = (last_start < start) ? start : last_start;
757 sb = dfa->mb_cur_max == 1;
760 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
761 | (start <= last_start ? 2 : 0)
762 | (t != NULL ? 1 : 0))
765 for (;; match_first += incr)
768 if (match_first < left_lim || right_lim < match_first)
771 /* Advance as rapidly as possible through the string, until we
772 find a plausible place to start matching. This may be done
773 with varying efficiency, so there are various possibilities:
774 only the most common of them are specialized, in order to
775 save on code size. We use a switch statement for speed. */
783 /* Fastmap with single-byte translation, match forward. */
784 while (BE (match_first < right_lim, 1)
785 && !fastmap[t[(unsigned char) string[match_first]]])
787 goto forward_match_found_start_or_reached_end;
790 /* Fastmap without translation, match forward. */
791 while (BE (match_first < right_lim, 1)
792 && !fastmap[(unsigned char) string[match_first]])
795 forward_match_found_start_or_reached_end:
796 if (BE (match_first == right_lim, 0))
798 ch = match_first >= length
799 ? 0 : (unsigned char) string[match_first];
800 if (!fastmap[t ? t[ch] : ch])
807 /* Fastmap without multi-byte translation, match backwards. */
808 while (match_first >= left_lim)
810 ch = match_first >= length
811 ? 0 : (unsigned char) string[match_first];
812 if (fastmap[t ? t[ch] : ch])
816 if (match_first < left_lim)
821 /* In this case, we can't determine easily the current byte,
822 since it might be a component byte of a multibyte
823 character. Then we use the constructed buffer instead. */
826 /* If MATCH_FIRST is out of the valid range, reconstruct the
828 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
829 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
831 err = re_string_reconstruct (&mctx.input, match_first,
833 if (BE (err != REG_NOERROR, 0))
836 offset = match_first - mctx.input.raw_mbs_idx;
838 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
839 Note that MATCH_FIRST must not be smaller than 0. */
840 ch = (match_first >= length
841 ? 0 : re_string_byte_at (&mctx.input, offset));
845 if (match_first < left_lim || match_first > right_lim)
854 /* Reconstruct the buffers so that the matcher can assume that
855 the matching starts from the beginning of the buffer. */
856 err = re_string_reconstruct (&mctx.input, match_first, eflags);
857 if (BE (err != REG_NOERROR, 0))
860 #ifdef RE_ENABLE_I18N
861 /* Don't consider this char as a possible match start if it part,
862 yet isn't the head, of a multibyte character. */
863 if (!sb && !re_string_first_byte (&mctx.input, 0))
867 /* It seems to be appropriate one, then use the matcher. */
868 /* We assume that the matching starts from 0. */
869 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
870 match_last = check_matching (&mctx, fl_longest_match,
871 start <= last_start ? &match_first : NULL);
872 if (match_last != REG_MISSING)
874 if (BE (match_last == REG_ERROR, 0))
881 mctx.match_last = match_last;
882 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
884 re_dfastate_t *pstate = mctx.state_log[match_last];
885 mctx.last_node = check_halt_state_context (&mctx, pstate,
888 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
891 err = prune_impossible_nodes (&mctx);
892 if (err == REG_NOERROR)
894 if (BE (err != REG_NOMATCH, 0))
896 match_last = REG_MISSING;
899 break; /* We found a match. */
903 match_ctx_clean (&mctx);
907 assert (match_last != REG_MISSING);
908 assert (err == REG_NOERROR);
911 /* Set pmatch[] if we need. */
916 /* Initialize registers. */
917 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
918 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
920 /* Set the points where matching start/end. */
922 pmatch[0].rm_eo = mctx.match_last;
923 /* FIXME: This function should fail if mctx.match_last exceeds
924 the maximum possible regoff_t value. We need a new error
925 code REG_OVERFLOW. */
927 if (!preg->no_sub && nmatch > 1)
929 err = set_regs (preg, &mctx, nmatch, pmatch,
930 dfa->has_plural_match && dfa->nbackref > 0);
931 if (BE (err != REG_NOERROR, 0))
935 /* At last, add the offset to the each registers, since we slided
936 the buffers so that we could assume that the matching starts
938 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
939 if (pmatch[reg_idx].rm_so != -1)
941 #ifdef RE_ENABLE_I18N
942 if (BE (mctx.input.offsets_needed != 0, 0))
944 pmatch[reg_idx].rm_so =
945 (pmatch[reg_idx].rm_so == mctx.input.valid_len
946 ? mctx.input.valid_raw_len
947 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
948 pmatch[reg_idx].rm_eo =
949 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
950 ? mctx.input.valid_raw_len
951 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
954 assert (mctx.input.offsets_needed == 0);
956 pmatch[reg_idx].rm_so += match_first;
957 pmatch[reg_idx].rm_eo += match_first;
959 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
961 pmatch[nmatch + reg_idx].rm_so = -1;
962 pmatch[nmatch + reg_idx].rm_eo = -1;
966 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
967 if (dfa->subexp_map[reg_idx] != reg_idx)
969 pmatch[reg_idx + 1].rm_so
970 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
971 pmatch[reg_idx + 1].rm_eo
972 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
977 re_free (mctx.state_log);
979 match_ctx_free (&mctx);
980 re_string_destruct (&mctx.input);
985 internal_function __attribute_warn_unused_result__
986 prune_impossible_nodes (re_match_context_t *mctx)
988 const re_dfa_t *const dfa = mctx->dfa;
989 Idx halt_node, match_last;
991 re_dfastate_t **sifted_states;
992 re_dfastate_t **lim_states = NULL;
993 re_sift_context_t sctx;
995 assert (mctx->state_log != NULL);
997 match_last = mctx->match_last;
998 halt_node = mctx->last_node;
1000 /* Avoid overflow. */
1001 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
1004 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
1005 if (BE (sifted_states == NULL, 0))
1012 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
1013 if (BE (lim_states == NULL, 0))
1020 memset (lim_states, '\0',
1021 sizeof (re_dfastate_t *) * (match_last + 1));
1022 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1024 ret = sift_states_backward (mctx, &sctx);
1025 re_node_set_free (&sctx.limits);
1026 if (BE (ret != REG_NOERROR, 0))
1028 if (sifted_states[0] != NULL || lim_states[0] != NULL)
1033 if (! REG_VALID_INDEX (match_last))
1038 } while (mctx->state_log[match_last] == NULL
1039 || !mctx->state_log[match_last]->halt);
1040 halt_node = check_halt_state_context (mctx,
1041 mctx->state_log[match_last],
1044 ret = merge_state_array (dfa, sifted_states, lim_states,
1046 re_free (lim_states);
1048 if (BE (ret != REG_NOERROR, 0))
1053 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1054 ret = sift_states_backward (mctx, &sctx);
1055 re_node_set_free (&sctx.limits);
1056 if (BE (ret != REG_NOERROR, 0))
1058 if (sifted_states[0] == NULL)
1064 re_free (mctx->state_log);
1065 mctx->state_log = sifted_states;
1066 sifted_states = NULL;
1067 mctx->last_node = halt_node;
1068 mctx->match_last = match_last;
1071 re_free (sifted_states);
1072 re_free (lim_states);
1076 /* Acquire an initial state and return it.
1077 We must select appropriate initial state depending on the context,
1078 since initial states may have constraints like "\<", "^", etc.. */
1080 static inline re_dfastate_t *
1081 __attribute ((always_inline)) internal_function
1082 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1085 const re_dfa_t *const dfa = mctx->dfa;
1086 if (dfa->init_state->has_constraint)
1088 unsigned int context;
1089 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1090 if (IS_WORD_CONTEXT (context))
1091 return dfa->init_state_word;
1092 else if (IS_ORDINARY_CONTEXT (context))
1093 return dfa->init_state;
1094 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1095 return dfa->init_state_begbuf;
1096 else if (IS_NEWLINE_CONTEXT (context))
1097 return dfa->init_state_nl;
1098 else if (IS_BEGBUF_CONTEXT (context))
1100 /* It is relatively rare case, then calculate on demand. */
1101 return re_acquire_state_context (err, dfa,
1102 dfa->init_state->entrance_nodes,
1106 /* Must not happen? */
1107 return dfa->init_state;
1110 return dfa->init_state;
1113 /* Check whether the regular expression match input string INPUT or not,
1114 and return the index where the matching end. Return REG_MISSING if
1115 there is no match, and return REG_ERROR in case of an error.
1116 FL_LONGEST_MATCH means we want the POSIX longest matching.
1117 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1118 next place where we may want to try matching.
1119 Note that the matcher assume that the maching starts from the current
1120 index of the buffer. */
1123 internal_function __attribute_warn_unused_result__
1124 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1127 const re_dfa_t *const dfa = mctx->dfa;
1130 Idx match_last = REG_MISSING;
1131 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1132 re_dfastate_t *cur_state;
1133 bool at_init_state = p_match_first != NULL;
1134 Idx next_start_idx = cur_str_idx;
1137 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1138 /* An initial state must not be NULL (invalid). */
1139 if (BE (cur_state == NULL, 0))
1141 assert (err == REG_ESPACE);
1145 if (mctx->state_log != NULL)
1147 mctx->state_log[cur_str_idx] = cur_state;
1149 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1150 later. E.g. Processing back references. */
1151 if (BE (dfa->nbackref, 0))
1153 at_init_state = false;
1154 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1155 if (BE (err != REG_NOERROR, 0))
1158 if (cur_state->has_backref)
1160 err = transit_state_bkref (mctx, &cur_state->nodes);
1161 if (BE (err != REG_NOERROR, 0))
1167 /* If the RE accepts NULL string. */
1168 if (BE (cur_state->halt, 0))
1170 if (!cur_state->has_constraint
1171 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1173 if (!fl_longest_match)
1177 match_last = cur_str_idx;
1183 while (!re_string_eoi (&mctx->input))
1185 re_dfastate_t *old_state = cur_state;
1186 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1188 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1189 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1190 && mctx->input.valid_len < mctx->input.len))
1192 err = extend_buffers (mctx);
1193 if (BE (err != REG_NOERROR, 0))
1195 assert (err == REG_ESPACE);
1200 cur_state = transit_state (&err, mctx, cur_state);
1201 if (mctx->state_log != NULL)
1202 cur_state = merge_state_with_log (&err, mctx, cur_state);
1204 if (cur_state == NULL)
1206 /* Reached the invalid state or an error. Try to recover a valid
1207 state using the state log, if available and if we have not
1208 already found a valid (even if not the longest) match. */
1209 if (BE (err != REG_NOERROR, 0))
1212 if (mctx->state_log == NULL
1213 || (match && !fl_longest_match)
1214 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1218 if (BE (at_init_state, 0))
1220 if (old_state == cur_state)
1221 next_start_idx = next_char_idx;
1223 at_init_state = false;
1226 if (cur_state->halt)
1228 /* Reached a halt state.
1229 Check the halt state can satisfy the current context. */
1230 if (!cur_state->has_constraint
1231 || check_halt_state_context (mctx, cur_state,
1232 re_string_cur_idx (&mctx->input)))
1234 /* We found an appropriate halt state. */
1235 match_last = re_string_cur_idx (&mctx->input);
1238 /* We found a match, do not modify match_first below. */
1239 p_match_first = NULL;
1240 if (!fl_longest_match)
1247 *p_match_first += next_start_idx;
1252 /* Check NODE match the current context. */
1256 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1258 re_token_type_t type = dfa->nodes[node].type;
1259 unsigned int constraint = dfa->nodes[node].constraint;
1260 if (type != END_OF_RE)
1264 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1269 /* Check the halt state STATE match the current context.
1270 Return 0 if not match, if the node, STATE has, is a halt node and
1271 match the context, return the node. */
1275 check_halt_state_context (const re_match_context_t *mctx,
1276 const re_dfastate_t *state, Idx idx)
1279 unsigned int context;
1281 assert (state->halt);
1283 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1284 for (i = 0; i < state->nodes.nelem; ++i)
1285 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1286 return state->nodes.elems[i];
1290 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1291 corresponding to the DFA).
1292 Return the destination node, and update EPS_VIA_NODES;
1293 return REG_MISSING in case of errors. */
1297 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1298 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1299 struct re_fail_stack_t *fs)
1301 const re_dfa_t *const dfa = mctx->dfa;
1304 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1306 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1307 re_node_set *edests = &dfa->edests[node];
1309 ok = re_node_set_insert (eps_via_nodes, node);
1312 /* Pick up a valid destination, or return REG_MISSING if none
1314 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1316 Idx candidate = edests->elems[i];
1317 if (!re_node_set_contains (cur_nodes, candidate))
1319 if (dest_node == REG_MISSING)
1320 dest_node = candidate;
1324 /* In order to avoid infinite loop like "(a*)*", return the second
1325 epsilon-transition if the first was already considered. */
1326 if (re_node_set_contains (eps_via_nodes, dest_node))
1329 /* Otherwise, push the second epsilon-transition on the fail stack. */
1331 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1335 /* We know we are going to exit. */
1344 re_token_type_t type = dfa->nodes[node].type;
1346 #ifdef RE_ENABLE_I18N
1347 if (dfa->nodes[node].accept_mb)
1348 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1350 #endif /* RE_ENABLE_I18N */
1351 if (type == OP_BACK_REF)
1353 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1354 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1357 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1361 char *buf = (char *) re_string_get_buffer (&mctx->input);
1362 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1371 ok = re_node_set_insert (eps_via_nodes, node);
1374 dest_node = dfa->edests[node].elems[0];
1375 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1382 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1384 Idx dest_node = dfa->nexts[node];
1385 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1386 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1387 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1390 re_node_set_empty (eps_via_nodes);
1397 static reg_errcode_t
1398 internal_function __attribute_warn_unused_result__
1399 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1400 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1403 Idx num = fs->num++;
1404 if (fs->num == fs->alloc)
1406 struct re_fail_stack_ent_t *new_array;
1407 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1409 if (new_array == NULL)
1412 fs->stack = new_array;
1414 fs->stack[num].idx = str_idx;
1415 fs->stack[num].node = dest_node;
1416 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1417 if (fs->stack[num].regs == NULL)
1419 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1420 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1426 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1427 regmatch_t *regs, re_node_set *eps_via_nodes)
1429 Idx num = --fs->num;
1430 assert (REG_VALID_INDEX (num));
1431 *pidx = fs->stack[num].idx;
1432 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1433 re_node_set_free (eps_via_nodes);
1434 re_free (fs->stack[num].regs);
1435 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1436 return fs->stack[num].node;
1439 /* Set the positions where the subexpressions are starts/ends to registers
1441 Note: We assume that pmatch[0] is already set, and
1442 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1444 static reg_errcode_t
1445 internal_function __attribute_warn_unused_result__
1446 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1447 regmatch_t *pmatch, bool fl_backtrack)
1449 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1451 re_node_set eps_via_nodes;
1452 struct re_fail_stack_t *fs;
1453 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1454 regmatch_t *prev_idx_match;
1455 bool prev_idx_match_malloced = false;
1458 assert (nmatch > 1);
1459 assert (mctx->state_log != NULL);
1464 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1465 if (fs->stack == NULL)
1471 cur_node = dfa->init_node;
1472 re_node_set_init_empty (&eps_via_nodes);
1474 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1475 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1478 prev_idx_match = re_malloc (regmatch_t, nmatch);
1479 if (prev_idx_match == NULL)
1481 free_fail_stack_return (fs);
1484 prev_idx_match_malloced = true;
1486 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1488 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1490 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1492 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1497 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1498 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1500 if (reg_idx == nmatch)
1502 re_node_set_free (&eps_via_nodes);
1503 if (prev_idx_match_malloced)
1504 re_free (prev_idx_match);
1505 return free_fail_stack_return (fs);
1507 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1512 re_node_set_free (&eps_via_nodes);
1513 if (prev_idx_match_malloced)
1514 re_free (prev_idx_match);
1519 /* Proceed to next node. */
1520 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1521 &eps_via_nodes, fs);
1523 if (BE (! REG_VALID_INDEX (cur_node), 0))
1525 if (BE (cur_node == REG_ERROR, 0))
1527 re_node_set_free (&eps_via_nodes);
1528 if (prev_idx_match_malloced)
1529 re_free (prev_idx_match);
1530 free_fail_stack_return (fs);
1534 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1538 re_node_set_free (&eps_via_nodes);
1539 if (prev_idx_match_malloced)
1540 re_free (prev_idx_match);
1545 re_node_set_free (&eps_via_nodes);
1546 if (prev_idx_match_malloced)
1547 re_free (prev_idx_match);
1548 return free_fail_stack_return (fs);
1551 static reg_errcode_t
1553 free_fail_stack_return (struct re_fail_stack_t *fs)
1558 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1560 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1561 re_free (fs->stack[fs_idx].regs);
1563 re_free (fs->stack);
1570 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1571 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1573 int type = dfa->nodes[cur_node].type;
1574 if (type == OP_OPEN_SUBEXP)
1576 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1578 /* We are at the first node of this sub expression. */
1579 if (reg_num < nmatch)
1581 pmatch[reg_num].rm_so = cur_idx;
1582 pmatch[reg_num].rm_eo = -1;
1585 else if (type == OP_CLOSE_SUBEXP)
1587 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1588 if (reg_num < nmatch)
1590 /* We are at the last node of this sub expression. */
1591 if (pmatch[reg_num].rm_so < cur_idx)
1593 pmatch[reg_num].rm_eo = cur_idx;
1594 /* This is a non-empty match or we are not inside an optional
1595 subexpression. Accept this right away. */
1596 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1600 if (dfa->nodes[cur_node].opt_subexp
1601 && prev_idx_match[reg_num].rm_so != -1)
1602 /* We transited through an empty match for an optional
1603 subexpression, like (a?)*, and this is not the subexp's
1604 first match. Copy back the old content of the registers
1605 so that matches of an inner subexpression are undone as
1606 well, like in ((a?))*. */
1607 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1609 /* We completed a subexpression, but it may be part of
1610 an optional one, so do not update PREV_IDX_MATCH. */
1611 pmatch[reg_num].rm_eo = cur_idx;
1617 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1618 and sift the nodes in each states according to the following rules.
1619 Updated state_log will be wrote to STATE_LOG.
1621 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1622 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1623 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1624 the LAST_NODE, we throw away the node `a'.
1625 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1626 string `s' and transit to `b':
1627 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1629 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1630 thrown away, we throw away the node `a'.
1631 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1632 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1634 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1635 we throw away the node `a'. */
1637 #define STATE_NODE_CONTAINS(state,node) \
1638 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1640 static reg_errcode_t
1642 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1646 Idx str_idx = sctx->last_str_idx;
1647 re_node_set cur_dest;
1650 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1653 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1654 transit to the last_node and the last_node itself. */
1655 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1656 if (BE (err != REG_NOERROR, 0))
1658 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1659 if (BE (err != REG_NOERROR, 0))
1662 /* Then check each states in the state_log. */
1665 /* Update counters. */
1666 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1667 if (null_cnt > mctx->max_mb_elem_len)
1669 memset (sctx->sifted_states, '\0',
1670 sizeof (re_dfastate_t *) * str_idx);
1671 re_node_set_free (&cur_dest);
1674 re_node_set_empty (&cur_dest);
1677 if (mctx->state_log[str_idx])
1679 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1680 if (BE (err != REG_NOERROR, 0))
1684 /* Add all the nodes which satisfy the following conditions:
1685 - It can epsilon transit to a node in CUR_DEST.
1687 And update state_log. */
1688 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1689 if (BE (err != REG_NOERROR, 0))
1694 re_node_set_free (&cur_dest);
1698 static reg_errcode_t
1699 internal_function __attribute_warn_unused_result__
1700 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1701 Idx str_idx, re_node_set *cur_dest)
1703 const re_dfa_t *const dfa = mctx->dfa;
1704 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1707 /* Then build the next sifted state.
1708 We build the next sifted state on `cur_dest', and update
1709 `sifted_states[str_idx]' with `cur_dest'.
1711 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1712 `cur_src' points the node_set of the old `state_log[str_idx]'
1713 (with the epsilon nodes pre-filtered out). */
1714 for (i = 0; i < cur_src->nelem; i++)
1716 Idx prev_node = cur_src->elems[i];
1721 re_token_type_t type = dfa->nodes[prev_node].type;
1722 assert (!IS_EPSILON_NODE (type));
1724 #ifdef RE_ENABLE_I18N
1725 /* If the node may accept `multi byte'. */
1726 if (dfa->nodes[prev_node].accept_mb)
1727 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1728 str_idx, sctx->last_str_idx);
1729 #endif /* RE_ENABLE_I18N */
1731 /* We don't check backreferences here.
1732 See update_cur_sifted_state(). */
1734 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1735 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1736 dfa->nexts[prev_node]))
1742 if (sctx->limits.nelem)
1744 Idx to_idx = str_idx + naccepted;
1745 if (check_dst_limits (mctx, &sctx->limits,
1746 dfa->nexts[prev_node], to_idx,
1747 prev_node, str_idx))
1750 ok = re_node_set_insert (cur_dest, prev_node);
1758 /* Helper functions. */
1760 static reg_errcode_t
1762 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1764 Idx top = mctx->state_log_top;
1766 if (next_state_log_idx >= mctx->input.bufs_len
1767 || (next_state_log_idx >= mctx->input.valid_len
1768 && mctx->input.valid_len < mctx->input.len))
1771 err = extend_buffers (mctx);
1772 if (BE (err != REG_NOERROR, 0))
1776 if (top < next_state_log_idx)
1778 memset (mctx->state_log + top + 1, '\0',
1779 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1780 mctx->state_log_top = next_state_log_idx;
1785 static reg_errcode_t
1787 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1788 re_dfastate_t **src, Idx num)
1792 for (st_idx = 0; st_idx < num; ++st_idx)
1794 if (dst[st_idx] == NULL)
1795 dst[st_idx] = src[st_idx];
1796 else if (src[st_idx] != NULL)
1798 re_node_set merged_set;
1799 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1800 &src[st_idx]->nodes);
1801 if (BE (err != REG_NOERROR, 0))
1803 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1804 re_node_set_free (&merged_set);
1805 if (BE (err != REG_NOERROR, 0))
1812 static reg_errcode_t
1814 update_cur_sifted_state (const re_match_context_t *mctx,
1815 re_sift_context_t *sctx, Idx str_idx,
1816 re_node_set *dest_nodes)
1818 const re_dfa_t *const dfa = mctx->dfa;
1819 reg_errcode_t err = REG_NOERROR;
1820 const re_node_set *candidates;
1821 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1822 : &mctx->state_log[str_idx]->nodes);
1824 if (dest_nodes->nelem == 0)
1825 sctx->sifted_states[str_idx] = NULL;
1830 /* At first, add the nodes which can epsilon transit to a node in
1832 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1833 if (BE (err != REG_NOERROR, 0))
1836 /* Then, check the limitations in the current sift_context. */
1837 if (sctx->limits.nelem)
1839 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1840 mctx->bkref_ents, str_idx);
1841 if (BE (err != REG_NOERROR, 0))
1846 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1847 if (BE (err != REG_NOERROR, 0))
1851 if (candidates && mctx->state_log[str_idx]->has_backref)
1853 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1854 if (BE (err != REG_NOERROR, 0))
1860 static reg_errcode_t
1861 internal_function __attribute_warn_unused_result__
1862 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1863 const re_node_set *candidates)
1865 reg_errcode_t err = REG_NOERROR;
1868 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1869 if (BE (err != REG_NOERROR, 0))
1872 if (!state->inveclosure.alloc)
1874 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1875 if (BE (err != REG_NOERROR, 0))
1877 for (i = 0; i < dest_nodes->nelem; i++)
1879 err = re_node_set_merge (&state->inveclosure,
1880 dfa->inveclosures + dest_nodes->elems[i]);
1881 if (BE (err != REG_NOERROR, 0))
1885 return re_node_set_add_intersect (dest_nodes, candidates,
1886 &state->inveclosure);
1889 static reg_errcode_t
1891 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1892 const re_node_set *candidates)
1896 re_node_set *inv_eclosure = dfa->inveclosures + node;
1897 re_node_set except_nodes;
1898 re_node_set_init_empty (&except_nodes);
1899 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1901 Idx cur_node = inv_eclosure->elems[ecl_idx];
1902 if (cur_node == node)
1904 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1906 Idx edst1 = dfa->edests[cur_node].elems[0];
1907 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1908 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1909 if ((!re_node_set_contains (inv_eclosure, edst1)
1910 && re_node_set_contains (dest_nodes, edst1))
1911 || (REG_VALID_NONZERO_INDEX (edst2)
1912 && !re_node_set_contains (inv_eclosure, edst2)
1913 && re_node_set_contains (dest_nodes, edst2)))
1915 err = re_node_set_add_intersect (&except_nodes, candidates,
1916 dfa->inveclosures + cur_node);
1917 if (BE (err != REG_NOERROR, 0))
1919 re_node_set_free (&except_nodes);
1925 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1927 Idx cur_node = inv_eclosure->elems[ecl_idx];
1928 if (!re_node_set_contains (&except_nodes, cur_node))
1930 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1931 re_node_set_remove_at (dest_nodes, idx);
1934 re_node_set_free (&except_nodes);
1940 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1941 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1943 const re_dfa_t *const dfa = mctx->dfa;
1944 Idx lim_idx, src_pos, dst_pos;
1946 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1947 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1948 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1951 struct re_backref_cache_entry *ent;
1952 ent = mctx->bkref_ents + limits->elems[lim_idx];
1953 subexp_idx = dfa->nodes[ent->node].opr.idx;
1955 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1956 subexp_idx, dst_node, dst_idx,
1958 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1959 subexp_idx, src_node, src_idx,
1963 <src> <dst> ( <subexp> )
1964 ( <subexp> ) <src> <dst>
1965 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1966 if (src_pos == dst_pos)
1967 continue; /* This is unrelated limitation. */
1976 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1977 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1979 const re_dfa_t *const dfa = mctx->dfa;
1980 const re_node_set *eclosures = dfa->eclosures + from_node;
1983 /* Else, we are on the boundary: examine the nodes on the epsilon
1985 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1987 Idx node = eclosures->elems[node_idx];
1988 switch (dfa->nodes[node].type)
1991 if (bkref_idx != REG_MISSING)
1993 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1999 if (ent->node != node)
2002 if (subexp_idx < BITSET_WORD_BITS
2003 && !(ent->eps_reachable_subexps_map
2004 & ((bitset_word_t) 1 << subexp_idx)))
2007 /* Recurse trying to reach the OP_OPEN_SUBEXP and
2008 OP_CLOSE_SUBEXP cases below. But, if the
2009 destination node is the same node as the source
2010 node, don't recurse because it would cause an
2011 infinite loop: a regex that exhibits this behavior
2013 dst = dfa->edests[node].elems[0];
2014 if (dst == from_node)
2018 else /* if (boundaries & 2) */
2023 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2025 if (cpos == -1 /* && (boundaries & 1) */)
2027 if (cpos == 0 && (boundaries & 2))
2030 if (subexp_idx < BITSET_WORD_BITS)
2031 ent->eps_reachable_subexps_map
2032 &= ~((bitset_word_t) 1 << subexp_idx);
2034 while (ent++->more);
2038 case OP_OPEN_SUBEXP:
2039 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2043 case OP_CLOSE_SUBEXP:
2044 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2053 return (boundaries & 2) ? 1 : 0;
2058 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
2059 Idx subexp_idx, Idx from_node, Idx str_idx,
2062 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2065 /* If we are outside the range of the subexpression, return -1 or 1. */
2066 if (str_idx < lim->subexp_from)
2069 if (lim->subexp_to < str_idx)
2072 /* If we are within the subexpression, return 0. */
2073 boundaries = (str_idx == lim->subexp_from);
2074 boundaries |= (str_idx == lim->subexp_to) << 1;
2075 if (boundaries == 0)
2078 /* Else, examine epsilon closure. */
2079 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2080 from_node, bkref_idx);
2083 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2084 which are against limitations from DEST_NODES. */
2086 static reg_errcode_t
2088 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2089 const re_node_set *candidates, re_node_set *limits,
2090 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2093 Idx node_idx, lim_idx;
2095 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2098 struct re_backref_cache_entry *ent;
2099 ent = bkref_ents + limits->elems[lim_idx];
2101 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2102 continue; /* This is unrelated limitation. */
2104 subexp_idx = dfa->nodes[ent->node].opr.idx;
2105 if (ent->subexp_to == str_idx)
2107 Idx ops_node = REG_MISSING;
2108 Idx cls_node = REG_MISSING;
2109 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2111 Idx node = dest_nodes->elems[node_idx];
2112 re_token_type_t type = dfa->nodes[node].type;
2113 if (type == OP_OPEN_SUBEXP
2114 && subexp_idx == dfa->nodes[node].opr.idx)
2116 else if (type == OP_CLOSE_SUBEXP
2117 && subexp_idx == dfa->nodes[node].opr.idx)
2121 /* Check the limitation of the open subexpression. */
2122 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2123 if (REG_VALID_INDEX (ops_node))
2125 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2127 if (BE (err != REG_NOERROR, 0))
2131 /* Check the limitation of the close subexpression. */
2132 if (REG_VALID_INDEX (cls_node))
2133 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2135 Idx node = dest_nodes->elems[node_idx];
2136 if (!re_node_set_contains (dfa->inveclosures + node,
2138 && !re_node_set_contains (dfa->eclosures + node,
2141 /* It is against this limitation.
2142 Remove it form the current sifted state. */
2143 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2145 if (BE (err != REG_NOERROR, 0))
2151 else /* (ent->subexp_to != str_idx) */
2153 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2155 Idx node = dest_nodes->elems[node_idx];
2156 re_token_type_t type = dfa->nodes[node].type;
2157 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2159 if (subexp_idx != dfa->nodes[node].opr.idx)
2161 /* It is against this limitation.
2162 Remove it form the current sifted state. */
2163 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2165 if (BE (err != REG_NOERROR, 0))
2174 static reg_errcode_t
2175 internal_function __attribute_warn_unused_result__
2176 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2177 Idx str_idx, const re_node_set *candidates)
2179 const re_dfa_t *const dfa = mctx->dfa;
2182 re_sift_context_t local_sctx;
2183 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2185 if (first_idx == REG_MISSING)
2188 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2190 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2193 re_token_type_t type;
2194 struct re_backref_cache_entry *entry;
2195 node = candidates->elems[node_idx];
2196 type = dfa->nodes[node].type;
2197 /* Avoid infinite loop for the REs like "()\1+". */
2198 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2200 if (type != OP_BACK_REF)
2203 entry = mctx->bkref_ents + first_idx;
2204 enabled_idx = first_idx;
2211 re_dfastate_t *cur_state;
2213 if (entry->node != node)
2215 subexp_len = entry->subexp_to - entry->subexp_from;
2216 to_idx = str_idx + subexp_len;
2217 dst_node = (subexp_len ? dfa->nexts[node]
2218 : dfa->edests[node].elems[0]);
2220 if (to_idx > sctx->last_str_idx
2221 || sctx->sifted_states[to_idx] == NULL
2222 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2223 || check_dst_limits (mctx, &sctx->limits, node,
2224 str_idx, dst_node, to_idx))
2227 if (local_sctx.sifted_states == NULL)
2230 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2231 if (BE (err != REG_NOERROR, 0))
2234 local_sctx.last_node = node;
2235 local_sctx.last_str_idx = str_idx;
2236 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2242 cur_state = local_sctx.sifted_states[str_idx];
2243 err = sift_states_backward (mctx, &local_sctx);
2244 if (BE (err != REG_NOERROR, 0))
2246 if (sctx->limited_states != NULL)
2248 err = merge_state_array (dfa, sctx->limited_states,
2249 local_sctx.sifted_states,
2251 if (BE (err != REG_NOERROR, 0))
2254 local_sctx.sifted_states[str_idx] = cur_state;
2255 re_node_set_remove (&local_sctx.limits, enabled_idx);
2257 /* mctx->bkref_ents may have changed, reload the pointer. */
2258 entry = mctx->bkref_ents + enabled_idx;
2260 while (enabled_idx++, entry++->more);
2264 if (local_sctx.sifted_states != NULL)
2266 re_node_set_free (&local_sctx.limits);
2273 #ifdef RE_ENABLE_I18N
2276 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2277 Idx node_idx, Idx str_idx, Idx max_str_idx)
2279 const re_dfa_t *const dfa = mctx->dfa;
2281 /* Check the node can accept `multi byte'. */
2282 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2283 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2284 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2285 dfa->nexts[node_idx]))
2286 /* The node can't accept the `multi byte', or the
2287 destination was already thrown away, then the node
2288 could't accept the current input `multi byte'. */
2290 /* Otherwise, it is sure that the node could accept
2291 `naccepted' bytes input. */
2294 #endif /* RE_ENABLE_I18N */
2297 /* Functions for state transition. */
2299 /* Return the next state to which the current state STATE will transit by
2300 accepting the current input byte, and update STATE_LOG if necessary.
2301 If STATE can accept a multibyte char/collating element/back reference
2302 update the destination of STATE_LOG. */
2304 static re_dfastate_t *
2305 internal_function __attribute_warn_unused_result__
2306 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2307 re_dfastate_t *state)
2309 re_dfastate_t **trtable;
2312 #ifdef RE_ENABLE_I18N
2313 /* If the current state can accept multibyte. */
2314 if (BE (state->accept_mb, 0))
2316 *err = transit_state_mb (mctx, state);
2317 if (BE (*err != REG_NOERROR, 0))
2320 #endif /* RE_ENABLE_I18N */
2322 /* Then decide the next state with the single byte. */
2325 /* don't use transition table */
2326 return transit_state_sb (err, mctx, state);
2329 /* Use transition table */
2330 ch = re_string_fetch_byte (&mctx->input);
2333 trtable = state->trtable;
2334 if (BE (trtable != NULL, 1))
2337 trtable = state->word_trtable;
2338 if (BE (trtable != NULL, 1))
2340 unsigned int context;
2342 = re_string_context_at (&mctx->input,
2343 re_string_cur_idx (&mctx->input) - 1,
2345 if (IS_WORD_CONTEXT (context))
2346 return trtable[ch + SBC_MAX];
2351 if (!build_trtable (mctx->dfa, state))
2357 /* Retry, we now have a transition table. */
2361 /* Update the state_log if we need */
2362 static re_dfastate_t *
2364 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2365 re_dfastate_t *next_state)
2367 const re_dfa_t *const dfa = mctx->dfa;
2368 Idx cur_idx = re_string_cur_idx (&mctx->input);
2370 if (cur_idx > mctx->state_log_top)
2372 mctx->state_log[cur_idx] = next_state;
2373 mctx->state_log_top = cur_idx;
2375 else if (mctx->state_log[cur_idx] == 0)
2377 mctx->state_log[cur_idx] = next_state;
2381 re_dfastate_t *pstate;
2382 unsigned int context;
2383 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2384 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2385 the destination of a multibyte char/collating element/
2386 back reference. Then the next state is the union set of
2387 these destinations and the results of the transition table. */
2388 pstate = mctx->state_log[cur_idx];
2389 log_nodes = pstate->entrance_nodes;
2390 if (next_state != NULL)
2392 table_nodes = next_state->entrance_nodes;
2393 *err = re_node_set_init_union (&next_nodes, table_nodes,
2395 if (BE (*err != REG_NOERROR, 0))
2399 next_nodes = *log_nodes;
2400 /* Note: We already add the nodes of the initial state,
2401 then we don't need to add them here. */
2403 context = re_string_context_at (&mctx->input,
2404 re_string_cur_idx (&mctx->input) - 1,
2406 next_state = mctx->state_log[cur_idx]
2407 = re_acquire_state_context (err, dfa, &next_nodes, context);
2408 /* We don't need to check errors here, since the return value of
2409 this function is next_state and ERR is already set. */
2411 if (table_nodes != NULL)
2412 re_node_set_free (&next_nodes);
2415 if (BE (dfa->nbackref, 0) && next_state != NULL)
2417 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2418 later. We must check them here, since the back references in the
2419 next state might use them. */
2420 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2422 if (BE (*err != REG_NOERROR, 0))
2425 /* If the next state has back references. */
2426 if (next_state->has_backref)
2428 *err = transit_state_bkref (mctx, &next_state->nodes);
2429 if (BE (*err != REG_NOERROR, 0))
2431 next_state = mctx->state_log[cur_idx];
2438 /* Skip bytes in the input that correspond to part of a
2439 multi-byte match, then look in the log for a state
2440 from which to restart matching. */
2441 static re_dfastate_t *
2443 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2445 re_dfastate_t *cur_state;
2448 Idx max = mctx->state_log_top;
2449 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2453 if (++cur_str_idx > max)
2455 re_string_skip_bytes (&mctx->input, 1);
2457 while (mctx->state_log[cur_str_idx] == NULL);
2459 cur_state = merge_state_with_log (err, mctx, NULL);
2461 while (*err == REG_NOERROR && cur_state == NULL);
2465 /* Helper functions for transit_state. */
2467 /* From the node set CUR_NODES, pick up the nodes whose types are
2468 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2469 expression. And register them to use them later for evaluating the
2470 correspoding back references. */
2472 static reg_errcode_t
2474 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2477 const re_dfa_t *const dfa = mctx->dfa;
2481 /* TODO: This isn't efficient.
2482 Because there might be more than one nodes whose types are
2483 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2486 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2488 Idx node = cur_nodes->elems[node_idx];
2489 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2490 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2491 && (dfa->used_bkref_map
2492 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2494 err = match_ctx_add_subtop (mctx, node, str_idx);
2495 if (BE (err != REG_NOERROR, 0))
2503 /* Return the next state to which the current state STATE will transit by
2504 accepting the current input byte. */
2506 static re_dfastate_t *
2507 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2508 re_dfastate_t *state)
2510 const re_dfa_t *const dfa = mctx->dfa;
2511 re_node_set next_nodes;
2512 re_dfastate_t *next_state;
2513 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2514 unsigned int context;
2516 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2517 if (BE (*err != REG_NOERROR, 0))
2519 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2521 Idx cur_node = state->nodes.elems[node_cnt];
2522 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2524 *err = re_node_set_merge (&next_nodes,
2525 dfa->eclosures + dfa->nexts[cur_node]);
2526 if (BE (*err != REG_NOERROR, 0))
2528 re_node_set_free (&next_nodes);
2533 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2534 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2535 /* We don't need to check errors here, since the return value of
2536 this function is next_state and ERR is already set. */
2538 re_node_set_free (&next_nodes);
2539 re_string_skip_bytes (&mctx->input, 1);
2544 #ifdef RE_ENABLE_I18N
2545 static reg_errcode_t
2547 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2549 const re_dfa_t *const dfa = mctx->dfa;
2553 for (i = 0; i < pstate->nodes.nelem; ++i)
2555 re_node_set dest_nodes, *new_nodes;
2556 Idx cur_node_idx = pstate->nodes.elems[i];
2559 unsigned int context;
2560 re_dfastate_t *dest_state;
2562 if (!dfa->nodes[cur_node_idx].accept_mb)
2565 if (dfa->nodes[cur_node_idx].constraint)
2567 context = re_string_context_at (&mctx->input,
2568 re_string_cur_idx (&mctx->input),
2570 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2575 /* How many bytes the node can accept? */
2576 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2577 re_string_cur_idx (&mctx->input));
2581 /* The node can accepts `naccepted' bytes. */
2582 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2583 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2584 : mctx->max_mb_elem_len);
2585 err = clean_state_log_if_needed (mctx, dest_idx);
2586 if (BE (err != REG_NOERROR, 0))
2589 assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2591 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2593 dest_state = mctx->state_log[dest_idx];
2594 if (dest_state == NULL)
2595 dest_nodes = *new_nodes;
2598 err = re_node_set_init_union (&dest_nodes,
2599 dest_state->entrance_nodes, new_nodes);
2600 if (BE (err != REG_NOERROR, 0))
2603 context = re_string_context_at (&mctx->input, dest_idx - 1,
2605 mctx->state_log[dest_idx]
2606 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2607 if (dest_state != NULL)
2608 re_node_set_free (&dest_nodes);
2609 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2614 #endif /* RE_ENABLE_I18N */
2616 static reg_errcode_t
2618 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2620 const re_dfa_t *const dfa = mctx->dfa;
2623 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2625 for (i = 0; i < nodes->nelem; ++i)
2627 Idx dest_str_idx, prev_nelem, bkc_idx;
2628 Idx node_idx = nodes->elems[i];
2629 unsigned int context;
2630 const re_token_t *node = dfa->nodes + node_idx;
2631 re_node_set *new_dest_nodes;
2633 /* Check whether `node' is a backreference or not. */
2634 if (node->type != OP_BACK_REF)
2637 if (node->constraint)
2639 context = re_string_context_at (&mctx->input, cur_str_idx,
2641 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2645 /* `node' is a backreference.
2646 Check the substring which the substring matched. */
2647 bkc_idx = mctx->nbkref_ents;
2648 err = get_subexp (mctx, node_idx, cur_str_idx);
2649 if (BE (err != REG_NOERROR, 0))
2652 /* And add the epsilon closures (which is `new_dest_nodes') of
2653 the backreference to appropriate state_log. */
2655 assert (dfa->nexts[node_idx] != REG_MISSING);
2657 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2660 re_dfastate_t *dest_state;
2661 struct re_backref_cache_entry *bkref_ent;
2662 bkref_ent = mctx->bkref_ents + bkc_idx;
2663 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2665 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2666 new_dest_nodes = (subexp_len == 0
2667 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2668 : dfa->eclosures + dfa->nexts[node_idx]);
2669 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2670 - bkref_ent->subexp_from);
2671 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2673 dest_state = mctx->state_log[dest_str_idx];
2674 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2675 : mctx->state_log[cur_str_idx]->nodes.nelem);
2676 /* Add `new_dest_node' to state_log. */
2677 if (dest_state == NULL)
2679 mctx->state_log[dest_str_idx]
2680 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2682 if (BE (mctx->state_log[dest_str_idx] == NULL
2683 && err != REG_NOERROR, 0))
2688 re_node_set dest_nodes;
2689 err = re_node_set_init_union (&dest_nodes,
2690 dest_state->entrance_nodes,
2692 if (BE (err != REG_NOERROR, 0))
2694 re_node_set_free (&dest_nodes);
2697 mctx->state_log[dest_str_idx]
2698 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2699 re_node_set_free (&dest_nodes);
2700 if (BE (mctx->state_log[dest_str_idx] == NULL
2701 && err != REG_NOERROR, 0))
2704 /* We need to check recursively if the backreference can epsilon
2707 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2709 err = check_subexp_matching_top (mctx, new_dest_nodes,
2711 if (BE (err != REG_NOERROR, 0))
2713 err = transit_state_bkref (mctx, new_dest_nodes);
2714 if (BE (err != REG_NOERROR, 0))
2724 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2725 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2726 Note that we might collect inappropriate candidates here.
2727 However, the cost of checking them strictly here is too high, then we
2728 delay these checking for prune_impossible_nodes(). */
2730 static reg_errcode_t
2731 internal_function __attribute_warn_unused_result__
2732 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2734 const re_dfa_t *const dfa = mctx->dfa;
2735 Idx subexp_num, sub_top_idx;
2736 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2737 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2738 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2739 if (cache_idx != REG_MISSING)
2741 const struct re_backref_cache_entry *entry
2742 = mctx->bkref_ents + cache_idx;
2744 if (entry->node == bkref_node)
2745 return REG_NOERROR; /* We already checked it. */
2746 while (entry++->more);
2749 subexp_num = dfa->nodes[bkref_node].opr.idx;
2751 /* For each sub expression */
2752 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2755 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2756 re_sub_match_last_t *sub_last;
2757 Idx sub_last_idx, sl_str, bkref_str_off;
2759 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2760 continue; /* It isn't related. */
2762 sl_str = sub_top->str_idx;
2763 bkref_str_off = bkref_str_idx;
2764 /* At first, check the last node of sub expressions we already
2766 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2768 regoff_t sl_str_diff;
2769 sub_last = sub_top->lasts[sub_last_idx];
2770 sl_str_diff = sub_last->str_idx - sl_str;
2771 /* The matched string by the sub expression match with the substring
2772 at the back reference? */
2773 if (sl_str_diff > 0)
2775 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2777 /* Not enough chars for a successful match. */
2778 if (bkref_str_off + sl_str_diff > mctx->input.len)
2781 err = clean_state_log_if_needed (mctx,
2784 if (BE (err != REG_NOERROR, 0))
2786 buf = (const char *) re_string_get_buffer (&mctx->input);
2788 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2789 /* We don't need to search this sub expression any more. */
2792 bkref_str_off += sl_str_diff;
2793 sl_str += sl_str_diff;
2794 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2797 /* Reload buf, since the preceding call might have reallocated
2799 buf = (const char *) re_string_get_buffer (&mctx->input);
2801 if (err == REG_NOMATCH)
2803 if (BE (err != REG_NOERROR, 0))
2807 if (sub_last_idx < sub_top->nlasts)
2809 if (sub_last_idx > 0)
2811 /* Then, search for the other last nodes of the sub expression. */
2812 for (; sl_str <= bkref_str_idx; ++sl_str)
2815 regoff_t sl_str_off;
2816 const re_node_set *nodes;
2817 sl_str_off = sl_str - sub_top->str_idx;
2818 /* The matched string by the sub expression match with the substring
2819 at the back reference? */
2822 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2824 /* If we are at the end of the input, we cannot match. */
2825 if (bkref_str_off >= mctx->input.len)
2828 err = extend_buffers (mctx);
2829 if (BE (err != REG_NOERROR, 0))
2832 buf = (const char *) re_string_get_buffer (&mctx->input);
2834 if (buf [bkref_str_off++] != buf[sl_str - 1])
2835 break; /* We don't need to search this sub expression
2838 if (mctx->state_log[sl_str] == NULL)
2840 /* Does this state have a ')' of the sub expression? */
2841 nodes = &mctx->state_log[sl_str]->nodes;
2842 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2844 if (cls_node == REG_MISSING)
2846 if (sub_top->path == NULL)
2848 sub_top->path = calloc (sizeof (state_array_t),
2849 sl_str - sub_top->str_idx + 1);
2850 if (sub_top->path == NULL)
2853 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2854 in the current context? */
2855 err = check_arrival (mctx, sub_top->path, sub_top->node,
2856 sub_top->str_idx, cls_node, sl_str,
2858 if (err == REG_NOMATCH)
2860 if (BE (err != REG_NOERROR, 0))
2862 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2863 if (BE (sub_last == NULL, 0))
2865 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2867 if (err == REG_NOMATCH)
2874 /* Helper functions for get_subexp(). */
2876 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2877 If it can arrive, register the sub expression expressed with SUB_TOP
2880 static reg_errcode_t
2882 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2883 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2887 /* Can the subexpression arrive the back reference? */
2888 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2889 sub_last->str_idx, bkref_node, bkref_str,
2891 if (err != REG_NOERROR)
2893 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2895 if (BE (err != REG_NOERROR, 0))
2897 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2898 return clean_state_log_if_needed (mctx, to_idx);
2901 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2902 Search '(' if FL_OPEN, or search ')' otherwise.
2903 TODO: This function isn't efficient...
2904 Because there might be more than one nodes whose types are
2905 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2911 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2912 Idx subexp_idx, int type)
2915 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2917 Idx cls_node = nodes->elems[cls_idx];
2918 const re_token_t *node = dfa->nodes + cls_node;
2919 if (node->type == type
2920 && node->opr.idx == subexp_idx)
2926 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2927 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2929 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2931 static reg_errcode_t
2932 internal_function __attribute_warn_unused_result__
2933 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2934 Idx top_str, Idx last_node, Idx last_str, int type)
2936 const re_dfa_t *const dfa = mctx->dfa;
2937 reg_errcode_t err = REG_NOERROR;
2938 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2939 re_dfastate_t *cur_state = NULL;
2940 re_node_set *cur_nodes, next_nodes;
2941 re_dfastate_t **backup_state_log;
2942 unsigned int context;
2944 subexp_num = dfa->nodes[top_node].opr.idx;
2945 /* Extend the buffer if we need. */
2946 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2948 re_dfastate_t **new_array;
2949 Idx old_alloc = path->alloc;
2950 Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2951 if (BE (new_alloc < old_alloc, 0)
2952 || BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2954 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2955 if (BE (new_array == NULL, 0))
2957 path->array = new_array;
2958 path->alloc = new_alloc;
2959 memset (new_array + old_alloc, '\0',
2960 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2963 str_idx = path->next_idx ? path->next_idx : top_str;
2965 /* Temporary modify MCTX. */
2966 backup_state_log = mctx->state_log;
2967 backup_cur_idx = mctx->input.cur_idx;
2968 mctx->state_log = path->array;
2969 mctx->input.cur_idx = str_idx;
2971 /* Setup initial node set. */
2972 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2973 if (str_idx == top_str)
2975 err = re_node_set_init_1 (&next_nodes, top_node);
2976 if (BE (err != REG_NOERROR, 0))
2978 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2979 if (BE (err != REG_NOERROR, 0))
2981 re_node_set_free (&next_nodes);
2987 cur_state = mctx->state_log[str_idx];
2988 if (cur_state && cur_state->has_backref)
2990 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2991 if (BE (err != REG_NOERROR, 0))
2995 re_node_set_init_empty (&next_nodes);
2997 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2999 if (next_nodes.nelem)
3001 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3003 if (BE (err != REG_NOERROR, 0))
3005 re_node_set_free (&next_nodes);
3009 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3010 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3012 re_node_set_free (&next_nodes);
3015 mctx->state_log[str_idx] = cur_state;
3018 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
3020 re_node_set_empty (&next_nodes);
3021 if (mctx->state_log[str_idx + 1])
3023 err = re_node_set_merge (&next_nodes,
3024 &mctx->state_log[str_idx + 1]->nodes);
3025 if (BE (err != REG_NOERROR, 0))
3027 re_node_set_free (&next_nodes);
3033 err = check_arrival_add_next_nodes (mctx, str_idx,
3034 &cur_state->non_eps_nodes,
3036 if (BE (err != REG_NOERROR, 0))
3038 re_node_set_free (&next_nodes);
3043 if (next_nodes.nelem)
3045 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3046 if (BE (err != REG_NOERROR, 0))
3048 re_node_set_free (&next_nodes);
3051 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3053 if (BE (err != REG_NOERROR, 0))
3055 re_node_set_free (&next_nodes);
3059 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3060 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3061 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3063 re_node_set_free (&next_nodes);
3066 mctx->state_log[str_idx] = cur_state;
3067 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3069 re_node_set_free (&next_nodes);
3070 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3071 : &mctx->state_log[last_str]->nodes);
3072 path->next_idx = str_idx;
3075 mctx->state_log = backup_state_log;
3076 mctx->input.cur_idx = backup_cur_idx;
3078 /* Then check the current node set has the node LAST_NODE. */
3079 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3085 /* Helper functions for check_arrival. */
3087 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3089 TODO: This function is similar to the functions transit_state*(),
3090 however this function has many additional works.
3091 Can't we unify them? */
3093 static reg_errcode_t
3094 internal_function __attribute_warn_unused_result__
3095 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3096 re_node_set *cur_nodes, re_node_set *next_nodes)
3098 const re_dfa_t *const dfa = mctx->dfa;
3101 #ifdef RE_ENABLE_I18N
3102 reg_errcode_t err = REG_NOERROR;
3104 re_node_set union_set;
3105 re_node_set_init_empty (&union_set);
3106 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3109 Idx cur_node = cur_nodes->elems[cur_idx];
3111 re_token_type_t type = dfa->nodes[cur_node].type;
3112 assert (!IS_EPSILON_NODE (type));
3114 #ifdef RE_ENABLE_I18N
3115 /* If the node may accept `multi byte'. */
3116 if (dfa->nodes[cur_node].accept_mb)
3118 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3122 re_dfastate_t *dest_state;
3123 Idx next_node = dfa->nexts[cur_node];
3124 Idx next_idx = str_idx + naccepted;
3125 dest_state = mctx->state_log[next_idx];
3126 re_node_set_empty (&union_set);
3129 err = re_node_set_merge (&union_set, &dest_state->nodes);
3130 if (BE (err != REG_NOERROR, 0))
3132 re_node_set_free (&union_set);
3136 ok = re_node_set_insert (&union_set, next_node);
3139 re_node_set_free (&union_set);
3142 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3144 if (BE (mctx->state_log[next_idx] == NULL
3145 && err != REG_NOERROR, 0))
3147 re_node_set_free (&union_set);
3152 #endif /* RE_ENABLE_I18N */
3154 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3156 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3159 re_node_set_free (&union_set);
3164 re_node_set_free (&union_set);
3168 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3169 CUR_NODES, however exclude the nodes which are:
3170 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3171 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3174 static reg_errcode_t
3176 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3177 Idx ex_subexp, int type)
3180 Idx idx, outside_node;
3181 re_node_set new_nodes;
3183 assert (cur_nodes->nelem);
3185 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3186 if (BE (err != REG_NOERROR, 0))
3188 /* Create a new node set NEW_NODES with the nodes which are epsilon
3189 closures of the node in CUR_NODES. */
3191 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3193 Idx cur_node = cur_nodes->elems[idx];
3194 const re_node_set *eclosure = dfa->eclosures + cur_node;
3195 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3196 if (outside_node == REG_MISSING)
3198 /* There are no problematic nodes, just merge them. */
3199 err = re_node_set_merge (&new_nodes, eclosure);
3200 if (BE (err != REG_NOERROR, 0))
3202 re_node_set_free (&new_nodes);
3208 /* There are problematic nodes, re-calculate incrementally. */
3209 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3211 if (BE (err != REG_NOERROR, 0))
3213 re_node_set_free (&new_nodes);
3218 re_node_set_free (cur_nodes);
3219 *cur_nodes = new_nodes;
3223 /* Helper function for check_arrival_expand_ecl.
3224 Check incrementally the epsilon closure of TARGET, and if it isn't
3225 problematic append it to DST_NODES. */
3227 static reg_errcode_t
3228 internal_function __attribute_warn_unused_result__
3229 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3230 Idx target, Idx ex_subexp, int type)
3233 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3237 if (dfa->nodes[cur_node].type == type
3238 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3240 if (type == OP_CLOSE_SUBEXP)
3242 ok = re_node_set_insert (dst_nodes, cur_node);
3248 ok = re_node_set_insert (dst_nodes, cur_node);
3251 if (dfa->edests[cur_node].nelem == 0)
3253 if (dfa->edests[cur_node].nelem == 2)
3256 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3257 dfa->edests[cur_node].elems[1],
3259 if (BE (err != REG_NOERROR, 0))
3262 cur_node = dfa->edests[cur_node].elems[0];
3268 /* For all the back references in the current state, calculate the
3269 destination of the back references by the appropriate entry
3270 in MCTX->BKREF_ENTS. */
3272 static reg_errcode_t
3273 internal_function __attribute_warn_unused_result__
3274 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3275 Idx cur_str, Idx subexp_num, int type)
3277 const re_dfa_t *const dfa = mctx->dfa;
3279 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3280 struct re_backref_cache_entry *ent;
3282 if (cache_idx_start == REG_MISSING)
3286 ent = mctx->bkref_ents + cache_idx_start;
3289 Idx to_idx, next_node;
3291 /* Is this entry ENT is appropriate? */
3292 if (!re_node_set_contains (cur_nodes, ent->node))
3295 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3296 /* Calculate the destination of the back reference, and append it
3297 to MCTX->STATE_LOG. */
3298 if (to_idx == cur_str)
3300 /* The backreference did epsilon transit, we must re-check all the
3301 node in the current state. */
3302 re_node_set new_dests;
3303 reg_errcode_t err2, err3;
3304 next_node = dfa->edests[ent->node].elems[0];
3305 if (re_node_set_contains (cur_nodes, next_node))
3307 err = re_node_set_init_1 (&new_dests, next_node);
3308 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3309 err3 = re_node_set_merge (cur_nodes, &new_dests);
3310 re_node_set_free (&new_dests);
3311 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3312 || err3 != REG_NOERROR, 0))
3314 err = (err != REG_NOERROR ? err
3315 : (err2 != REG_NOERROR ? err2 : err3));
3318 /* TODO: It is still inefficient... */
3323 re_node_set union_set;
3324 next_node = dfa->nexts[ent->node];
3325 if (mctx->state_log[to_idx])
3328 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3331 err = re_node_set_init_copy (&union_set,
3332 &mctx->state_log[to_idx]->nodes);
3333 ok = re_node_set_insert (&union_set, next_node);
3334 if (BE (err != REG_NOERROR || ! ok, 0))
3336 re_node_set_free (&union_set);
3337 err = err != REG_NOERROR ? err : REG_ESPACE;
3343 err = re_node_set_init_1 (&union_set, next_node);
3344 if (BE (err != REG_NOERROR, 0))
3347 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3348 re_node_set_free (&union_set);
3349 if (BE (mctx->state_log[to_idx] == NULL
3350 && err != REG_NOERROR, 0))
3354 while (ent++->more);
3358 /* Build transition table for the state.
3359 Return true if successful. */
3363 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3368 bool need_word_trtable = false;
3369 bitset_word_t elem, mask;
3370 bool dests_node_malloced = false;
3371 bool dest_states_malloced = false;
3372 Idx ndests; /* Number of the destination states from `state'. */
3373 re_dfastate_t **trtable;
3374 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3375 re_node_set follows, *dests_node;
3377 bitset_t acceptable;
3381 re_node_set dests_node[SBC_MAX];
3382 bitset_t dests_ch[SBC_MAX];
3385 /* We build DFA states which corresponds to the destination nodes
3386 from `state'. `dests_node[i]' represents the nodes which i-th
3387 destination state contains, and `dests_ch[i]' represents the
3388 characters which i-th destination state accepts. */
3389 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3390 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3393 dests_alloc = re_malloc (struct dests_alloc, 1);
3394 if (BE (dests_alloc == NULL, 0))
3396 dests_node_malloced = true;
3398 dests_node = dests_alloc->dests_node;
3399 dests_ch = dests_alloc->dests_ch;
3401 /* Initialize transiton table. */
3402 state->word_trtable = state->trtable = NULL;
3404 /* At first, group all nodes belonging to `state' into several
3406 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3407 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3409 if (dests_node_malloced)
3413 state->trtable = (re_dfastate_t **)
3414 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3420 err = re_node_set_alloc (&follows, ndests + 1);
3421 if (BE (err != REG_NOERROR, 0))
3424 /* Avoid arithmetic overflow in size calculation. */
3425 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3426 / (3 * sizeof (re_dfastate_t *)))
3431 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3432 + ndests * 3 * sizeof (re_dfastate_t *)))
3433 dest_states = (re_dfastate_t **)
3434 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3437 dest_states = (re_dfastate_t **)
3438 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3439 if (BE (dest_states == NULL, 0))
3442 if (dest_states_malloced)
3444 re_node_set_free (&follows);
3445 for (i = 0; i < ndests; ++i)
3446 re_node_set_free (dests_node + i);
3447 if (dests_node_malloced)
3451 dest_states_malloced = true;
3453 dest_states_word = dest_states + ndests;
3454 dest_states_nl = dest_states_word + ndests;
3455 bitset_empty (acceptable);
3457 /* Then build the states for all destinations. */
3458 for (i = 0; i < ndests; ++i)
3461 re_node_set_empty (&follows);
3462 /* Merge the follows of this destination states. */
3463 for (j = 0; j < dests_node[i].nelem; ++j)
3465 next_node = dfa->nexts[dests_node[i].elems[j]];
3466 if (next_node != REG_MISSING)
3468 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3469 if (BE (err != REG_NOERROR, 0))
3473 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3474 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3476 /* If the new state has context constraint,
3477 build appropriate states for these contexts. */
3478 if (dest_states[i]->has_constraint)
3480 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3482 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3485 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3486 need_word_trtable = true;
3488 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3490 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3495 dest_states_word[i] = dest_states[i];
3496 dest_states_nl[i] = dest_states[i];
3498 bitset_merge (acceptable, dests_ch[i]);
3501 if (!BE (need_word_trtable, 0))
3503 /* We don't care about whether the following character is a word
3504 character, or we are in a single-byte character set so we can
3505 discern by looking at the character code: allocate a
3506 256-entry transition table. */
3507 trtable = state->trtable =
3508 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3509 if (BE (trtable == NULL, 0))
3512 /* For all characters ch...: */
3513 for (i = 0; i < BITSET_WORDS; ++i)
3514 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3516 mask <<= 1, elem >>= 1, ++ch)
3517 if (BE (elem & 1, 0))
3519 /* There must be exactly one destination which accepts
3520 character ch. See group_nodes_into_DFAstates. */
3521 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3524 /* j-th destination accepts the word character ch. */
3525 if (dfa->word_char[i] & mask)
3526 trtable[ch] = dest_states_word[j];
3528 trtable[ch] = dest_states[j];
3533 /* We care about whether the following character is a word
3534 character, and we are in a multi-byte character set: discern
3535 by looking at the character code: build two 256-entry
3536 transition tables, one starting at trtable[0] and one
3537 starting at trtable[SBC_MAX]. */
3538 trtable = state->word_trtable =
3539 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3540 if (BE (trtable == NULL, 0))
3543 /* For all characters ch...: */
3544 for (i = 0; i < BITSET_WORDS; ++i)
3545 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3547 mask <<= 1, elem >>= 1, ++ch)
3548 if (BE (elem & 1, 0))
3550 /* There must be exactly one destination which accepts
3551 character ch. See group_nodes_into_DFAstates. */
3552 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3555 /* j-th destination accepts the word character ch. */
3556 trtable[ch] = dest_states[j];
3557 trtable[ch + SBC_MAX] = dest_states_word[j];
3562 if (bitset_contain (acceptable, NEWLINE_CHAR))
3564 /* The current state accepts newline character. */
3565 for (j = 0; j < ndests; ++j)
3566 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3568 /* k-th destination accepts newline character. */
3569 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3570 if (need_word_trtable)
3571 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3572 /* There must be only one destination which accepts
3573 newline. See group_nodes_into_DFAstates. */
3578 if (dest_states_malloced)
3581 re_node_set_free (&follows);
3582 for (i = 0; i < ndests; ++i)
3583 re_node_set_free (dests_node + i);
3585 if (dests_node_malloced)
3591 /* Group all nodes belonging to STATE into several destinations.
3592 Then for all destinations, set the nodes belonging to the destination
3593 to DESTS_NODE[i] and set the characters accepted by the destination
3594 to DEST_CH[i]. This function return the number of destinations. */
3598 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3599 re_node_set *dests_node, bitset_t *dests_ch)
3604 Idx ndests; /* Number of the destinations from `state'. */
3605 bitset_t accepts; /* Characters a node can accept. */
3606 const re_node_set *cur_nodes = &state->nodes;
3607 bitset_empty (accepts);
3610 /* For all the nodes belonging to `state', */
3611 for (i = 0; i < cur_nodes->nelem; ++i)
3613 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3614 re_token_type_t type = node->type;
3615 unsigned int constraint = node->constraint;
3617 /* Enumerate all single byte character this node can accept. */
3618 if (type == CHARACTER)
3619 bitset_set (accepts, node->opr.c);
3620 else if (type == SIMPLE_BRACKET)
3622 bitset_merge (accepts, node->opr.sbcset);
3624 else if (type == OP_PERIOD)
3626 #ifdef RE_ENABLE_I18N
3627 if (dfa->mb_cur_max > 1)
3628 bitset_merge (accepts, dfa->sb_char);
3631 bitset_set_all (accepts);
3632 if (!(dfa->syntax & RE_DOT_NEWLINE))
3633 bitset_clear (accepts, '\n');
3634 if (dfa->syntax & RE_DOT_NOT_NULL)
3635 bitset_clear (accepts, '\0');
3637 #ifdef RE_ENABLE_I18N
3638 else if (type == OP_UTF8_PERIOD)
3640 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3641 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3643 bitset_merge (accepts, utf8_sb_map);
3644 if (!(dfa->syntax & RE_DOT_NEWLINE))
3645 bitset_clear (accepts, '\n');
3646 if (dfa->syntax & RE_DOT_NOT_NULL)
3647 bitset_clear (accepts, '\0');
3653 /* Check the `accepts' and sift the characters which are not
3654 match it the context. */
3657 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3659 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3660 bitset_empty (accepts);
3661 if (accepts_newline)
3662 bitset_set (accepts, NEWLINE_CHAR);
3666 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3668 bitset_empty (accepts);
3672 if (constraint & NEXT_WORD_CONSTRAINT)
3674 bitset_word_t any_set = 0;
3675 if (type == CHARACTER && !node->word_char)
3677 bitset_empty (accepts);
3680 #ifdef RE_ENABLE_I18N
3681 if (dfa->mb_cur_max > 1)
3682 for (j = 0; j < BITSET_WORDS; ++j)
3683 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3686 for (j = 0; j < BITSET_WORDS; ++j)
3687 any_set |= (accepts[j] &= dfa->word_char[j]);
3691 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3693 bitset_word_t any_set = 0;
3694 if (type == CHARACTER && node->word_char)
3696 bitset_empty (accepts);
3699 #ifdef RE_ENABLE_I18N
3700 if (dfa->mb_cur_max > 1)
3701 for (j = 0; j < BITSET_WORDS; ++j)
3702 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3705 for (j = 0; j < BITSET_WORDS; ++j)
3706 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3712 /* Then divide `accepts' into DFA states, or create a new
3713 state. Above, we make sure that accepts is not empty. */
3714 for (j = 0; j < ndests; ++j)
3716 bitset_t intersec; /* Intersection sets, see below. */
3718 /* Flags, see below. */
3719 bitset_word_t has_intersec, not_subset, not_consumed;
3721 /* Optimization, skip if this state doesn't accept the character. */
3722 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3725 /* Enumerate the intersection set of this state and `accepts'. */
3727 for (k = 0; k < BITSET_WORDS; ++k)
3728 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3729 /* And skip if the intersection set is empty. */
3733 /* Then check if this state is a subset of `accepts'. */
3734 not_subset = not_consumed = 0;
3735 for (k = 0; k < BITSET_WORDS; ++k)
3737 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3738 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3741 /* If this state isn't a subset of `accepts', create a
3742 new group state, which has the `remains'. */
3745 bitset_copy (dests_ch[ndests], remains);
3746 bitset_copy (dests_ch[j], intersec);
3747 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3748 if (BE (err != REG_NOERROR, 0))
3753 /* Put the position in the current group. */
3754 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3758 /* If all characters are consumed, go to next node. */
3762 /* Some characters remain, create a new group. */
3765 bitset_copy (dests_ch[ndests], accepts);
3766 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3767 if (BE (err != REG_NOERROR, 0))
3770 bitset_empty (accepts);
3775 for (j = 0; j < ndests; ++j)
3776 re_node_set_free (dests_node + j);
3780 #ifdef RE_ENABLE_I18N
3781 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3782 Return the number of the bytes the node accepts.
3783 STR_IDX is the current index of the input string.
3785 This function handles the nodes which can accept one character, or
3786 one collating element like '.', '[a-z]', opposite to the other nodes
3787 can only accept one byte. */
3791 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3792 const re_string_t *input, Idx str_idx)
3794 const re_token_t *node = dfa->nodes + node_idx;
3795 int char_len, elem_len;
3798 if (BE (node->type == OP_UTF8_PERIOD, 0))
3800 unsigned char c = re_string_byte_at (input, str_idx), d;
3801 if (BE (c < 0xc2, 1))
3804 if (str_idx + 2 > input->len)
3807 d = re_string_byte_at (input, str_idx + 1);
3809 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3813 if (c == 0xe0 && d < 0xa0)
3819 if (c == 0xf0 && d < 0x90)
3825 if (c == 0xf8 && d < 0x88)
3831 if (c == 0xfc && d < 0x84)
3837 if (str_idx + char_len > input->len)
3840 for (i = 1; i < char_len; ++i)
3842 d = re_string_byte_at (input, str_idx + i);
3843 if (d < 0x80 || d > 0xbf)
3849 char_len = re_string_char_size_at (input, str_idx);
3850 if (node->type == OP_PERIOD)
3854 /* FIXME: I don't think this if is needed, as both '\n'
3855 and '\0' are char_len == 1. */
3856 /* '.' accepts any one character except the following two cases. */
3857 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3858 re_string_byte_at (input, str_idx) == '\n') ||
3859 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3860 re_string_byte_at (input, str_idx) == '\0'))
3865 elem_len = re_string_elem_size_at (input, str_idx);
3866 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3869 if (node->type == COMPLEX_BRACKET)
3871 const re_charset_t *cset = node->opr.mbcset;
3873 const unsigned char *pin
3874 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3879 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3880 ? re_string_wchar_at (input, str_idx) : 0);
3882 /* match with multibyte character? */
3883 for (i = 0; i < cset->nmbchars; ++i)
3884 if (wc == cset->mbchars[i])
3886 match_len = char_len;
3887 goto check_node_accept_bytes_match;
3889 /* match with character_class? */
3890 for (i = 0; i < cset->nchar_classes; ++i)
3892 wctype_t wt = cset->char_classes[i];
3893 if (__iswctype (wc, wt))
3895 match_len = char_len;
3896 goto check_node_accept_bytes_match;
3901 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3904 unsigned int in_collseq = 0;
3905 const int32_t *table, *indirect;
3906 const unsigned char *weights, *extra;
3907 const char *collseqwc;
3909 /* This #include defines a local function! */
3910 # include <locale/weight.h>
3912 /* match with collating_symbol? */
3913 if (cset->ncoll_syms)
3914 extra = (const unsigned char *)
3915 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3916 for (i = 0; i < cset->ncoll_syms; ++i)
3918 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3919 /* Compare the length of input collating element and
3920 the length of current collating element. */
3921 if (*coll_sym != elem_len)
3923 /* Compare each bytes. */
3924 for (j = 0; j < *coll_sym; j++)
3925 if (pin[j] != coll_sym[1 + j])
3929 /* Match if every bytes is equal. */
3931 goto check_node_accept_bytes_match;
3937 if (elem_len <= char_len)
3939 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3940 in_collseq = __collseq_table_lookup (collseqwc, wc);
3943 in_collseq = find_collation_sequence_value (pin, elem_len);
3945 /* match with range expression? */
3946 for (i = 0; i < cset->nranges; ++i)
3947 if (cset->range_starts[i] <= in_collseq
3948 && in_collseq <= cset->range_ends[i])
3950 match_len = elem_len;
3951 goto check_node_accept_bytes_match;
3954 /* match with equivalence_class? */
3955 if (cset->nequiv_classes)
3957 const unsigned char *cp = pin;
3958 table = (const int32_t *)
3959 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3960 weights = (const unsigned char *)
3961 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3962 extra = (const unsigned char *)
3963 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3964 indirect = (const int32_t *)
3965 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3966 int32_t idx = findidx (&cp);
3968 for (i = 0; i < cset->nequiv_classes; ++i)
3970 int32_t equiv_class_idx = cset->equiv_classes[i];
3971 size_t weight_len = weights[idx & 0xffffff];
3972 if (weight_len == weights[equiv_class_idx & 0xffffff]
3973 && (idx >> 24) == (equiv_class_idx >> 24))
3978 equiv_class_idx &= 0xffffff;
3980 while (cnt <= weight_len
3981 && (weights[equiv_class_idx + 1 + cnt]
3982 == weights[idx + 1 + cnt]))
3984 if (cnt > weight_len)
3986 match_len = elem_len;
3987 goto check_node_accept_bytes_match;
3996 /* match with range expression? */
3997 #if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && __STRICT_ANSI__)
3998 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
4000 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
4003 for (i = 0; i < cset->nranges; ++i)
4005 cmp_buf[0] = cset->range_starts[i];
4006 cmp_buf[4] = cset->range_ends[i];
4007 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
4008 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
4010 match_len = char_len;
4011 goto check_node_accept_bytes_match;
4015 check_node_accept_bytes_match:
4016 if (!cset->non_match)
4023 return (elem_len > char_len) ? elem_len : char_len;
4032 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4034 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4039 /* No valid character. Match it as a single byte character. */
4040 const unsigned char *collseq = (const unsigned char *)
4041 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4042 return collseq[mbs[0]];
4049 const unsigned char *extra = (const unsigned char *)
4050 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4051 int32_t extrasize = (const unsigned char *)
4052 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4054 for (idx = 0; idx < extrasize;)
4058 int32_t elem_mbs_len;
4059 /* Skip the name of collating element name. */
4060 idx = idx + extra[idx] + 1;
4061 elem_mbs_len = extra[idx++];
4062 if (mbs_len == elem_mbs_len)
4064 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4065 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4067 if (mbs_cnt == elem_mbs_len)
4068 /* Found the entry. */
4071 /* Skip the byte sequence of the collating element. */
4072 idx += elem_mbs_len;
4073 /* Adjust for the alignment. */
4074 idx = (idx + 3) & ~3;
4075 /* Skip the collation sequence value. */
4076 idx += sizeof (uint32_t);
4077 /* Skip the wide char sequence of the collating element. */
4078 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4079 /* If we found the entry, return the sequence value. */
4081 return *(uint32_t *) (extra + idx);
4082 /* Skip the collation sequence value. */
4083 idx += sizeof (uint32_t);
4089 #endif /* RE_ENABLE_I18N */
4091 /* Check whether the node accepts the byte which is IDX-th
4092 byte of the INPUT. */
4096 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4100 ch = re_string_byte_at (&mctx->input, idx);
4104 if (node->opr.c != ch)
4108 case SIMPLE_BRACKET:
4109 if (!bitset_contain (node->opr.sbcset, ch))
4113 #ifdef RE_ENABLE_I18N
4114 case OP_UTF8_PERIOD:
4115 if (ch >= ASCII_CHARS)
4120 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4121 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4129 if (node->constraint)
4131 /* The node has constraints. Check whether the current context
4132 satisfies the constraints. */
4133 unsigned int context = re_string_context_at (&mctx->input, idx,
4135 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4142 /* Extend the buffers, if the buffers have run out. */
4144 static reg_errcode_t
4145 internal_function __attribute_warn_unused_result__
4146 extend_buffers (re_match_context_t *mctx)
4149 re_string_t *pstr = &mctx->input;
4151 /* Avoid overflow. */
4152 if (BE (SIZE_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4155 /* Double the lengthes of the buffers. */
4156 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4157 if (BE (ret != REG_NOERROR, 0))
4160 if (mctx->state_log != NULL)
4162 /* And double the length of state_log. */
4163 /* XXX We have no indication of the size of this buffer. If this
4164 allocation fail we have no indication that the state_log array
4165 does not have the right size. */
4166 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4167 pstr->bufs_len + 1);
4168 if (BE (new_array == NULL, 0))
4170 mctx->state_log = new_array;
4173 /* Then reconstruct the buffers. */
4176 #ifdef RE_ENABLE_I18N
4177 if (pstr->mb_cur_max > 1)
4179 ret = build_wcs_upper_buffer (pstr);
4180 if (BE (ret != REG_NOERROR, 0))
4184 #endif /* RE_ENABLE_I18N */
4185 build_upper_buffer (pstr);
4189 #ifdef RE_ENABLE_I18N
4190 if (pstr->mb_cur_max > 1)
4191 build_wcs_buffer (pstr);
4193 #endif /* RE_ENABLE_I18N */
4195 if (pstr->trans != NULL)
4196 re_string_translate_buffer (pstr);
4203 /* Functions for matching context. */
4205 /* Initialize MCTX. */
4207 static reg_errcode_t
4208 internal_function __attribute_warn_unused_result__
4209 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4211 mctx->eflags = eflags;
4212 mctx->match_last = REG_MISSING;
4215 /* Avoid overflow. */
4216 size_t max_object_size =
4217 MAX (sizeof (struct re_backref_cache_entry),
4218 sizeof (re_sub_match_top_t *));
4219 if (BE (SIZE_MAX / max_object_size < n, 0))
4222 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4223 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4224 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4227 /* Already zero-ed by the caller.
4229 mctx->bkref_ents = NULL;
4230 mctx->nbkref_ents = 0;
4231 mctx->nsub_tops = 0; */
4232 mctx->abkref_ents = n;
4233 mctx->max_mb_elem_len = 1;
4234 mctx->asub_tops = n;
4238 /* Clean the entries which depend on the current input in MCTX.
4239 This function must be invoked when the matcher changes the start index
4240 of the input, or changes the input string. */
4244 match_ctx_clean (re_match_context_t *mctx)
4247 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4250 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4251 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4253 re_sub_match_last_t *last = top->lasts[sl_idx];
4254 re_free (last->path.array);
4257 re_free (top->lasts);
4260 re_free (top->path->array);
4261 re_free (top->path);
4266 mctx->nsub_tops = 0;
4267 mctx->nbkref_ents = 0;
4270 /* Free all the memory associated with MCTX. */
4274 match_ctx_free (re_match_context_t *mctx)
4276 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4277 match_ctx_clean (mctx);
4278 re_free (mctx->sub_tops);
4279 re_free (mctx->bkref_ents);
4282 /* Add a new backreference entry to MCTX.
4283 Note that we assume that caller never call this function with duplicate
4284 entry, and call with STR_IDX which isn't smaller than any existing entry.
4287 static reg_errcode_t
4288 internal_function __attribute_warn_unused_result__
4289 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4292 if (mctx->nbkref_ents >= mctx->abkref_ents)
4294 struct re_backref_cache_entry* new_entry;
4295 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4296 mctx->abkref_ents * 2);
4297 if (BE (new_entry == NULL, 0))
4299 re_free (mctx->bkref_ents);
4302 mctx->bkref_ents = new_entry;
4303 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4304 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4305 mctx->abkref_ents *= 2;
4307 if (mctx->nbkref_ents > 0
4308 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4309 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4311 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4312 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4313 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4314 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4316 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4317 If bit N is clear, means that this entry won't epsilon-transition to
4318 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4319 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4322 A backreference does not epsilon-transition unless it is empty, so set
4323 to all zeros if FROM != TO. */
4324 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4325 = (from == to ? -1 : 0);
4327 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4328 if (mctx->max_mb_elem_len < to - from)
4329 mctx->max_mb_elem_len = to - from;
4333 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4334 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4338 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4340 Idx left, right, mid, last;
4341 last = right = mctx->nbkref_ents;
4342 for (left = 0; left < right;)
4344 mid = (left + right) / 2;
4345 if (mctx->bkref_ents[mid].str_idx < str_idx)
4350 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4356 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4359 static reg_errcode_t
4360 internal_function __attribute_warn_unused_result__
4361 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4364 assert (mctx->sub_tops != NULL);
4365 assert (mctx->asub_tops > 0);
4367 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4369 Idx new_asub_tops = mctx->asub_tops * 2;
4370 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4371 re_sub_match_top_t *,
4373 if (BE (new_array == NULL, 0))
4375 mctx->sub_tops = new_array;
4376 mctx->asub_tops = new_asub_tops;
4378 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4379 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4381 mctx->sub_tops[mctx->nsub_tops]->node = node;
4382 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4386 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4387 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4389 static re_sub_match_last_t *
4391 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4393 re_sub_match_last_t *new_entry;
4394 if (BE (subtop->nlasts == subtop->alasts, 0))
4396 Idx new_alasts = 2 * subtop->alasts + 1;
4397 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4398 re_sub_match_last_t *,
4400 if (BE (new_array == NULL, 0))
4402 subtop->lasts = new_array;
4403 subtop->alasts = new_alasts;
4405 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4406 if (BE (new_entry != NULL, 1))
4408 subtop->lasts[subtop->nlasts] = new_entry;
4409 new_entry->node = node;
4410 new_entry->str_idx = str_idx;
4418 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4419 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4421 sctx->sifted_states = sifted_sts;
4422 sctx->limited_states = limited_sts;
4423 sctx->last_node = last_node;
4424 sctx->last_str_idx = last_str_idx;
4425 re_node_set_init_empty (&sctx->limits);