1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, see <http://www.gnu.org/licenses/>. */
19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
20 size_t length, reg_syntax_t syntax);
21 static void re_compile_fastmap_iter (regex_t *bufp,
22 const re_dfastate_t *init_state,
24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 static void free_charset (re_charset_t *cset);
27 #endif /* RE_ENABLE_I18N */
28 static void free_workarea_compile (regex_t *preg);
29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 static void optimize_utf8 (re_dfa_t *dfa);
33 static reg_errcode_t analyze (regex_t *preg);
34 static reg_errcode_t preorder (bin_tree_t *root,
35 reg_errcode_t (fn (void *, bin_tree_t *)),
37 static reg_errcode_t postorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
49 unsigned int constraint);
50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
54 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 static int peek_token (re_token_t *token, re_string_t *input,
57 reg_syntax_t syntax) internal_function;
58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
59 reg_syntax_t syntax, reg_errcode_t *err);
60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
61 re_token_t *token, reg_syntax_t syntax,
62 Idx nest, reg_errcode_t *err);
63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
73 re_dfa_t *dfa, re_token_t *token,
74 reg_syntax_t syntax, reg_errcode_t *err);
75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
76 re_token_t *token, reg_syntax_t syntax,
78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_token_t *token, int token_len,
84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 Idx *equiv_class_alloc,
91 const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 Idx *char_class_alloc,
96 const unsigned char *class_name,
98 #else /* not RE_ENABLE_I18N */
99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
100 const unsigned char *name);
101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105 #endif /* not RE_ENABLE_I18N */
106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
107 RE_TRANSLATE_TYPE trans,
108 const unsigned char *class_name,
109 const unsigned char *extra,
110 bool non_match, reg_errcode_t *err);
111 static bin_tree_t *create_tree (re_dfa_t *dfa,
112 bin_tree_t *left, bin_tree_t *right,
113 re_token_type_t type);
114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 const re_token_t *token);
117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
118 static void free_token (re_token_t *node);
119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 /* This table gives an error message for each of the error codes listed
123 in regex.h. Obviously the order here has to be same as there.
124 POSIX doesn't require that we do anything for REG_NOERROR,
125 but why not be nice? */
127 static const char __re_error_msgid[] =
129 #define REG_NOERROR_IDX 0
130 gettext_noop ("Success") /* REG_NOERROR */
132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
133 gettext_noop ("No match") /* REG_NOMATCH */
135 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
136 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
139 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
142 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
145 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
148 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
151 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
154 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
157 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
160 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
163 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
166 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
169 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
172 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
175 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
178 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 static const size_t __re_error_msgid_idx[] =
202 /* Entry points for GNU code. */
204 /* re_compile_pattern is the GNU regular expression compiler: it
205 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
206 Returns 0 if the pattern was valid, otherwise an error string.
208 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
209 are set in BUFP on entry. */
213 re_compile_pattern (pattern, length, bufp)
216 struct re_pattern_buffer *bufp;
217 #else /* size_t might promote */
219 re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
225 /* And GNU code determines whether or not to get register information
226 by passing null for the REGS argument to re_match, etc., not by
227 setting no_sub, unless RE_NO_SUB is set. */
228 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230 /* Match anchors at newline. */
231 bufp->newline_anchor = 1;
233 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
237 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
240 weak_alias (__re_compile_pattern, re_compile_pattern)
243 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
244 also be assigned to arbitrarily: each pattern buffer stores its own
245 syntax, so it can be changed between regex compilations. */
246 /* This has no initializer because initialized variables in Emacs
247 become read-only after dumping. */
248 reg_syntax_t re_syntax_options;
251 /* Specify the precise syntax of regexps for compilation. This provides
252 for compatibility for various utilities which historically have
253 different, incompatible syntaxes.
255 The argument SYNTAX is a bit mask comprised of the various bits
256 defined in regex.h. We return the old syntax. */
259 re_set_syntax (syntax)
262 reg_syntax_t ret = re_syntax_options;
264 re_syntax_options = syntax;
268 weak_alias (__re_set_syntax, re_set_syntax)
272 re_compile_fastmap (bufp)
273 struct re_pattern_buffer *bufp;
275 re_dfa_t *dfa = bufp->buffer;
276 char *fastmap = bufp->fastmap;
278 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
279 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
280 if (dfa->init_state != dfa->init_state_word)
281 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
282 if (dfa->init_state != dfa->init_state_nl)
283 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
284 if (dfa->init_state != dfa->init_state_begbuf)
285 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
286 bufp->fastmap_accurate = 1;
290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
294 __attribute ((always_inline))
295 re_set_fastmap (char *fastmap, bool icase, int ch)
299 fastmap[tolower (ch)] = 1;
302 /* Helper function for re_compile_fastmap.
303 Compile fastmap for the initial_state INIT_STATE. */
306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
309 re_dfa_t *dfa = bufp->buffer;
311 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
312 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314 Idx node = init_state->nodes.elems[node_cnt];
315 re_token_type_t type = dfa->nodes[node].type;
317 if (type == CHARACTER)
319 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
320 #ifdef RE_ENABLE_I18N
321 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 unsigned char buf[MB_LEN_MAX];
329 *p++ = dfa->nodes[node].opr.c;
330 while (++node < dfa->nodes_len
331 && dfa->nodes[node].type == CHARACTER
332 && dfa->nodes[node].mb_partial)
333 *p++ = dfa->nodes[node].opr.c;
334 memset (&state, '\0', sizeof (state));
335 if (__mbrtowc (&wc, (const char *) buf, p - buf,
337 && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 re_set_fastmap (fastmap, false, buf[0]);
343 else if (type == SIMPLE_BRACKET)
346 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
349 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
350 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
351 if (w & ((bitset_word_t) 1 << j))
352 re_set_fastmap (fastmap, icase, ch);
355 #ifdef RE_ENABLE_I18N
356 else if (type == COMPLEX_BRACKET)
358 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
362 /* See if we have to try all bytes which start multiple collation
364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 collation element, and don't catch 'b' since 'b' is
366 the only collation element which starts from 'b' (and
367 it is caught by SIMPLE_BRACKET). */
368 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
369 && (cset->ncoll_syms || cset->nranges))
371 const int32_t *table = (const int32_t *)
372 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
373 for (i = 0; i < SBC_MAX; ++i)
375 re_set_fastmap (fastmap, icase, i);
379 /* See if we have to start the match at all multibyte characters,
380 i.e. where we would not find an invalid sequence. This only
381 applies to multibyte character sets; for single byte character
382 sets, the SIMPLE_BRACKET again suffices. */
383 if (dfa->mb_cur_max > 1
384 && (cset->nchar_classes || cset->non_match || cset->nranges
386 || cset->nequiv_classes
394 memset (&mbs, 0, sizeof (mbs));
395 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
396 re_set_fastmap (fastmap, false, (int) c);
403 /* ... Else catch all bytes which can start the mbchars. */
404 for (i = 0; i < cset->nmbchars; ++i)
408 memset (&state, '\0', sizeof (state));
409 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
410 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
411 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
420 #endif /* RE_ENABLE_I18N */
421 else if (type == OP_PERIOD
422 #ifdef RE_ENABLE_I18N
423 || type == OP_UTF8_PERIOD
424 #endif /* RE_ENABLE_I18N */
425 || type == END_OF_RE)
427 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
428 if (type == END_OF_RE)
429 bufp->can_be_null = 1;
435 /* Entry point for POSIX code. */
436 /* regcomp takes a regular expression as a string and compiles it.
438 PREG is a regex_t *. We do not expect any fields to be initialized,
439 since POSIX says we shouldn't. Thus, we set
441 'buffer' to the compiled pattern;
442 'used' to the length of the compiled pattern;
443 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444 REG_EXTENDED bit in CFLAGS is set; otherwise, to
445 RE_SYNTAX_POSIX_BASIC;
446 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
447 'fastmap' to an allocated space for the fastmap;
448 'fastmap_accurate' to zero;
449 're_nsub' to the number of subexpressions in PATTERN.
451 PATTERN is the address of the pattern string.
453 CFLAGS is a series of bits which affect compilation.
455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456 use POSIX basic syntax.
458 If REG_NEWLINE is set, then . and [^...] don't match newline.
459 Also, regexec will try a match beginning after every newline.
461 If REG_ICASE is set, then we considers upper- and lowercase
462 versions of letters to be equivalent when matching.
464 If REG_NOSUB is set, then when PREG is passed to regexec, that
465 routine will report only success or failure, and nothing about the
468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
469 the return codes and their meanings.) */
472 regcomp (preg, pattern, cflags)
473 regex_t *_Restrict_ preg;
474 const char *_Restrict_ pattern;
478 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
479 : RE_SYNTAX_POSIX_BASIC);
485 /* Try to allocate space for the fastmap. */
486 preg->fastmap = re_malloc (char, SBC_MAX);
487 if (BE (preg->fastmap == NULL, 0))
490 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492 /* If REG_NEWLINE is set, newlines are treated differently. */
493 if (cflags & REG_NEWLINE)
494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
495 syntax &= ~RE_DOT_NEWLINE;
496 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
497 /* It also changes the matching behavior. */
498 preg->newline_anchor = 1;
501 preg->newline_anchor = 0;
502 preg->no_sub = !!(cflags & REG_NOSUB);
503 preg->translate = NULL;
505 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507 /* POSIX doesn't distinguish between an unmatched open-group and an
508 unmatched close-group: both are REG_EPAREN. */
509 if (ret == REG_ERPAREN)
512 /* We have already checked preg->fastmap != NULL. */
513 if (BE (ret == REG_NOERROR, 1))
514 /* Compute the fastmap now, since regexec cannot modify the pattern
515 buffer. This function never fails in this implementation. */
516 (void) re_compile_fastmap (preg);
519 /* Some error occurred while compiling the expression. */
520 re_free (preg->fastmap);
521 preg->fastmap = NULL;
527 weak_alias (__regcomp, regcomp)
530 /* Returns a message corresponding to an error code, ERRCODE, returned
531 from either regcomp or regexec. We don't use PREG here. */
535 regerror (errcode, preg, errbuf, errbuf_size)
537 const regex_t *_Restrict_ preg;
538 char *_Restrict_ errbuf;
540 #else /* size_t might promote */
542 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_,
543 char *_Restrict_ errbuf, size_t errbuf_size)
550 || errcode >= (int) (sizeof (__re_error_msgid_idx)
551 / sizeof (__re_error_msgid_idx[0])), 0))
552 /* Only error codes returned by the rest of the code should be passed
553 to this routine. If we are given anything else, or if other regex
554 code generates an invalid error code, then the program has a bug.
555 Dump core so we can fix it. */
558 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560 msg_size = strlen (msg) + 1; /* Includes the null. */
562 if (BE (errbuf_size != 0, 1))
564 size_t cpy_size = msg_size;
565 if (BE (msg_size > errbuf_size, 0))
567 cpy_size = errbuf_size - 1;
568 errbuf[cpy_size] = '\0';
570 memcpy (errbuf, msg, cpy_size);
576 weak_alias (__regerror, regerror)
580 #ifdef RE_ENABLE_I18N
581 /* This static array is used for the map to single-byte characters when
582 UTF-8 is used. Otherwise we would allocate memory just to initialize
583 it the same all the time. UTF-8 is the preferred encoding so this is
584 a worthwhile optimization. */
585 static const bitset_t utf8_sb_map =
587 /* Set the first 128 bits. */
589 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
592 # error "bitset_word_t is narrower than 32 bits"
593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
594 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
596 BITSET_WORD_MAX, BITSET_WORD_MAX,
597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
601 >> (SBC_MAX % BITSET_WORD_BITS == 0
603 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
610 free_dfa_content (re_dfa_t *dfa)
615 for (i = 0; i < dfa->nodes_len; ++i)
616 free_token (dfa->nodes + i);
617 re_free (dfa->nexts);
618 for (i = 0; i < dfa->nodes_len; ++i)
620 if (dfa->eclosures != NULL)
621 re_node_set_free (dfa->eclosures + i);
622 if (dfa->inveclosures != NULL)
623 re_node_set_free (dfa->inveclosures + i);
624 if (dfa->edests != NULL)
625 re_node_set_free (dfa->edests + i);
627 re_free (dfa->edests);
628 re_free (dfa->eclosures);
629 re_free (dfa->inveclosures);
630 re_free (dfa->nodes);
632 if (dfa->state_table)
633 for (i = 0; i <= dfa->state_hash_mask; ++i)
635 struct re_state_table_entry *entry = dfa->state_table + i;
636 for (j = 0; j < entry->num; ++j)
638 re_dfastate_t *state = entry->array[j];
641 re_free (entry->array);
643 re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645 if (dfa->sb_char != utf8_sb_map)
646 re_free (dfa->sb_char);
648 re_free (dfa->subexp_map);
650 re_free (dfa->re_str);
657 /* Free dynamically allocated space used by PREG. */
663 re_dfa_t *dfa = preg->buffer;
664 if (BE (dfa != NULL, 1))
665 free_dfa_content (dfa);
669 re_free (preg->fastmap);
670 preg->fastmap = NULL;
672 re_free (preg->translate);
673 preg->translate = NULL;
676 weak_alias (__regfree, regfree)
679 /* Entry points compatible with 4.2 BSD regex library. We don't define
680 them unless specifically requested. */
682 #if defined _REGEX_RE_COMP || defined _LIBC
684 /* BSD has one and only one pattern buffer. */
685 static struct re_pattern_buffer re_comp_buf;
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690 these names if they don't use our functions, and still use
691 regcomp/regexec above without link errors. */
702 if (!re_comp_buf.buffer)
703 return gettext ("No previous regular expression");
707 if (re_comp_buf.buffer)
709 fastmap = re_comp_buf.fastmap;
710 re_comp_buf.fastmap = NULL;
711 __regfree (&re_comp_buf);
712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713 re_comp_buf.fastmap = fastmap;
716 if (re_comp_buf.fastmap == NULL)
718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719 if (re_comp_buf.fastmap == NULL)
720 return (char *) gettext (__re_error_msgid
721 + __re_error_msgid_idx[(int) REG_ESPACE]);
724 /* Since 're_exec' always passes NULL for the 'regs' argument, we
725 don't need to initialize the pattern buffer fields which affect it. */
727 /* Match anchors at newlines. */
728 re_comp_buf.newline_anchor = 1;
730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
740 libc_freeres_fn (free_mem)
742 __regfree (&re_comp_buf);
746 #endif /* _REGEX_RE_COMP */
748 /* Internal entry point.
749 Compile the regular expression PATTERN, whose length is LENGTH.
750 SYNTAX indicate regular expression's syntax. */
753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
756 reg_errcode_t err = REG_NOERROR;
760 /* Initialize the pattern buffer. */
761 preg->fastmap_accurate = 0;
762 preg->syntax = syntax;
763 preg->not_bol = preg->not_eol = 0;
766 preg->can_be_null = 0;
767 preg->regs_allocated = REGS_UNALLOCATED;
769 /* Initialize the dfa. */
771 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
773 /* If zero allocated, but buffer is non-null, try to realloc
774 enough space. This loses if buffer's address is bogus, but
775 that is the user's responsibility. If ->buffer is NULL this
776 is a simple allocation. */
777 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
780 preg->allocated = sizeof (re_dfa_t);
783 preg->used = sizeof (re_dfa_t);
785 err = init_dfa (dfa, length);
786 if (BE (err != REG_NOERROR, 0))
788 free_dfa_content (dfa);
794 /* Note: length+1 will not overflow since it is checked in init_dfa. */
795 dfa->re_str = re_malloc (char, length + 1);
796 strncpy (dfa->re_str, pattern, length + 1);
799 __libc_lock_init (dfa->lock);
801 err = re_string_construct (®exp, pattern, length, preg->translate,
802 (syntax & RE_ICASE) != 0, dfa);
803 if (BE (err != REG_NOERROR, 0))
805 re_compile_internal_free_return:
806 free_workarea_compile (preg);
807 re_string_destruct (®exp);
808 free_dfa_content (dfa);
814 /* Parse the regular expression, and build a structure tree. */
816 dfa->str_tree = parse (®exp, preg, syntax, &err);
817 if (BE (dfa->str_tree == NULL, 0))
818 goto re_compile_internal_free_return;
820 /* Analyze the tree and create the nfa. */
821 err = analyze (preg);
822 if (BE (err != REG_NOERROR, 0))
823 goto re_compile_internal_free_return;
825 #ifdef RE_ENABLE_I18N
826 /* If possible, do searching in single byte encoding to speed things up. */
827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
831 /* Then create the initial state of the dfa. */
832 err = create_initial_state (dfa);
834 /* Release work areas. */
835 free_workarea_compile (preg);
836 re_string_destruct (®exp);
838 if (BE (err != REG_NOERROR, 0))
840 free_dfa_content (dfa);
848 /* Initialize DFA. We use the length of the regular expression PAT_LEN
849 as the initial length of some arrays. */
852 init_dfa (re_dfa_t *dfa, size_t pat_len)
854 __re_size_t table_size;
856 const char *codeset_name;
858 #ifdef RE_ENABLE_I18N
859 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
861 size_t max_i18n_object_size = 0;
863 size_t max_object_size =
864 MAX (sizeof (struct re_state_table_entry),
865 MAX (sizeof (re_token_t),
866 MAX (sizeof (re_node_set),
867 MAX (sizeof (regmatch_t),
868 max_i18n_object_size))));
870 memset (dfa, '\0', sizeof (re_dfa_t));
872 /* Force allocation of str_tree_storage the first time. */
873 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
875 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
876 calculation below, and for similar doubling calculations
877 elsewhere. And it's <= rather than <, because some of the
878 doubling calculations add 1 afterwards. */
879 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
882 dfa->nodes_alloc = pat_len + 1;
883 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
885 /* table_size = 2 ^ ceil(log pat_len) */
886 for (table_size = 1; ; table_size <<= 1)
887 if (table_size > pat_len)
890 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
891 dfa->state_hash_mask = table_size - 1;
893 dfa->mb_cur_max = MB_CUR_MAX;
895 if (dfa->mb_cur_max == 6
896 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
898 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
901 codeset_name = nl_langinfo (CODESET);
902 if (strcasecmp (codeset_name, "UTF-8") == 0
903 || strcasecmp (codeset_name, "UTF8") == 0)
906 /* We check exhaustively in the loop below if this charset is a
907 superset of ASCII. */
908 dfa->map_notascii = 0;
911 #ifdef RE_ENABLE_I18N
912 if (dfa->mb_cur_max > 1)
915 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
920 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
921 if (BE (dfa->sb_char == NULL, 0))
924 /* Set the bits corresponding to single byte chars. */
925 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
926 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
928 wint_t wch = __btowc (ch);
930 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
932 if (isascii (ch) && wch != ch)
933 dfa->map_notascii = 1;
940 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
945 /* Initialize WORD_CHAR table, which indicate which character is
946 "word". In this case "word" means that it is the word construction
947 character used by some operators like "\<", "\>", etc. */
951 init_word_char (re_dfa_t *dfa)
953 dfa->word_ops_used = 1;
957 if (BE (dfa->map_notascii == 0, 1))
959 if (BITSET_WORD_BITS == 64)
961 dfa->word_char[0] = UINT64_C (0x03ff000000000000);
962 dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
965 else if (BITSET_WORD_BITS == 32)
967 dfa->word_char[0] = UINT32_C (0x00000000);
968 dfa->word_char[1] = UINT32_C (0x03ff0000);
969 dfa->word_char[2] = UINT32_C (0x87fffffe);
970 dfa->word_char[3] = UINT32_C (0x07fffffe);
977 if (BE (dfa->is_utf8, 1))
979 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
985 for (; i < BITSET_WORDS; ++i)
986 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
987 if (isalnum (ch) || ch == '_')
988 dfa->word_char[i] |= (bitset_word_t) 1 << j;
991 /* Free the work area which are only used while compiling. */
994 free_workarea_compile (regex_t *preg)
996 re_dfa_t *dfa = preg->buffer;
997 bin_tree_storage_t *storage, *next;
998 for (storage = dfa->str_tree_storage; storage; storage = next)
1000 next = storage->next;
1003 dfa->str_tree_storage = NULL;
1004 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1005 dfa->str_tree = NULL;
1006 re_free (dfa->org_indices);
1007 dfa->org_indices = NULL;
1010 /* Create initial states for all contexts. */
1012 static reg_errcode_t
1013 create_initial_state (re_dfa_t *dfa)
1017 re_node_set init_nodes;
1019 /* Initial states have the epsilon closure of the node which is
1020 the first node of the regular expression. */
1021 first = dfa->str_tree->first->node_idx;
1022 dfa->init_node = first;
1023 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1024 if (BE (err != REG_NOERROR, 0))
1027 /* The back-references which are in initial states can epsilon transit,
1028 since in this case all of the subexpressions can be null.
1029 Then we add epsilon closures of the nodes which are the next nodes of
1030 the back-references. */
1031 if (dfa->nbackref > 0)
1032 for (i = 0; i < init_nodes.nelem; ++i)
1034 Idx node_idx = init_nodes.elems[i];
1035 re_token_type_t type = dfa->nodes[node_idx].type;
1038 if (type != OP_BACK_REF)
1040 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1042 re_token_t *clexp_node;
1043 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1044 if (clexp_node->type == OP_CLOSE_SUBEXP
1045 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1048 if (clexp_idx == init_nodes.nelem)
1051 if (type == OP_BACK_REF)
1053 Idx dest_idx = dfa->edests[node_idx].elems[0];
1054 if (!re_node_set_contains (&init_nodes, dest_idx))
1056 reg_errcode_t merge_err
1057 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1058 if (merge_err != REG_NOERROR)
1065 /* It must be the first time to invoke acquire_state. */
1066 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1067 /* We don't check ERR here, since the initial state must not be NULL. */
1068 if (BE (dfa->init_state == NULL, 0))
1070 if (dfa->init_state->has_constraint)
1072 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1074 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1076 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1080 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1081 || dfa->init_state_begbuf == NULL, 0))
1085 dfa->init_state_word = dfa->init_state_nl
1086 = dfa->init_state_begbuf = dfa->init_state;
1088 re_node_set_free (&init_nodes);
1092 #ifdef RE_ENABLE_I18N
1093 /* If it is possible to do searching in single byte encoding instead of UTF-8
1094 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1095 DFA nodes where needed. */
1098 optimize_utf8 (re_dfa_t *dfa)
1102 bool mb_chars = false;
1103 bool has_period = false;
1105 for (node = 0; node < dfa->nodes_len; ++node)
1106 switch (dfa->nodes[node].type)
1109 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1113 switch (dfa->nodes[node].opr.ctx_type)
1121 /* Word anchors etc. cannot be handled. It's okay to test
1122 opr.ctx_type since constraints (for all DFA nodes) are
1123 created by ORing one or more opr.ctx_type values. */
1133 case OP_DUP_ASTERISK:
1134 case OP_OPEN_SUBEXP:
1135 case OP_CLOSE_SUBEXP:
1137 case COMPLEX_BRACKET:
1139 case SIMPLE_BRACKET:
1140 /* Just double check. */
1142 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1144 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1145 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1147 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1157 if (mb_chars || has_period)
1158 for (node = 0; node < dfa->nodes_len; ++node)
1160 if (dfa->nodes[node].type == CHARACTER
1161 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1162 dfa->nodes[node].mb_partial = 0;
1163 else if (dfa->nodes[node].type == OP_PERIOD)
1164 dfa->nodes[node].type = OP_UTF8_PERIOD;
1167 /* The search can be in single byte locale. */
1168 dfa->mb_cur_max = 1;
1170 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1174 /* Analyze the structure tree, and calculate "first", "next", "edest",
1175 "eclosure", and "inveclosure". */
1177 static reg_errcode_t
1178 analyze (regex_t *preg)
1180 re_dfa_t *dfa = preg->buffer;
1183 /* Allocate arrays. */
1184 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1185 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1186 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1187 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1188 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1189 || dfa->eclosures == NULL, 0))
1192 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1193 if (dfa->subexp_map != NULL)
1196 for (i = 0; i < preg->re_nsub; i++)
1197 dfa->subexp_map[i] = i;
1198 preorder (dfa->str_tree, optimize_subexps, dfa);
1199 for (i = 0; i < preg->re_nsub; i++)
1200 if (dfa->subexp_map[i] != i)
1202 if (i == preg->re_nsub)
1204 free (dfa->subexp_map);
1205 dfa->subexp_map = NULL;
1209 ret = postorder (dfa->str_tree, lower_subexps, preg);
1210 if (BE (ret != REG_NOERROR, 0))
1212 ret = postorder (dfa->str_tree, calc_first, dfa);
1213 if (BE (ret != REG_NOERROR, 0))
1215 preorder (dfa->str_tree, calc_next, dfa);
1216 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1217 if (BE (ret != REG_NOERROR, 0))
1219 ret = calc_eclosure (dfa);
1220 if (BE (ret != REG_NOERROR, 0))
1223 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1224 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1225 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1228 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1229 if (BE (dfa->inveclosures == NULL, 0))
1231 ret = calc_inveclosure (dfa);
1237 /* Our parse trees are very unbalanced, so we cannot use a stack to
1238 implement parse tree visits. Instead, we use parent pointers and
1239 some hairy code in these two functions. */
1240 static reg_errcode_t
1241 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1244 bin_tree_t *node, *prev;
1246 for (node = root; ; )
1248 /* Descend down the tree, preferably to the left (or to the right
1249 if that's the only child). */
1250 while (node->left || node->right)
1258 reg_errcode_t err = fn (extra, node);
1259 if (BE (err != REG_NOERROR, 0))
1261 if (node->parent == NULL)
1264 node = node->parent;
1266 /* Go up while we have a node that is reached from the right. */
1267 while (node->right == prev || node->right == NULL);
1272 static reg_errcode_t
1273 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1278 for (node = root; ; )
1280 reg_errcode_t err = fn (extra, node);
1281 if (BE (err != REG_NOERROR, 0))
1284 /* Go to the left node, or up and to the right. */
1289 bin_tree_t *prev = NULL;
1290 while (node->right == prev || node->right == NULL)
1293 node = node->parent;
1302 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1303 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1304 backreferences as well. Requires a preorder visit. */
1305 static reg_errcode_t
1306 optimize_subexps (void *extra, bin_tree_t *node)
1308 re_dfa_t *dfa = (re_dfa_t *) extra;
1310 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1312 int idx = node->token.opr.idx;
1313 node->token.opr.idx = dfa->subexp_map[idx];
1314 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1317 else if (node->token.type == SUBEXP
1318 && node->left && node->left->token.type == SUBEXP)
1320 Idx other_idx = node->left->token.opr.idx;
1322 node->left = node->left->left;
1324 node->left->parent = node;
1326 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1327 if (other_idx < BITSET_WORD_BITS)
1328 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1334 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1335 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1336 static reg_errcode_t
1337 lower_subexps (void *extra, bin_tree_t *node)
1339 regex_t *preg = (regex_t *) extra;
1340 reg_errcode_t err = REG_NOERROR;
1342 if (node->left && node->left->token.type == SUBEXP)
1344 node->left = lower_subexp (&err, preg, node->left);
1346 node->left->parent = node;
1348 if (node->right && node->right->token.type == SUBEXP)
1350 node->right = lower_subexp (&err, preg, node->right);
1352 node->right->parent = node;
1359 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1361 re_dfa_t *dfa = preg->buffer;
1362 bin_tree_t *body = node->left;
1363 bin_tree_t *op, *cls, *tree1, *tree;
1366 /* We do not optimize empty subexpressions, because otherwise we may
1367 have bad CONCAT nodes with NULL children. This is obviously not
1368 very common, so we do not lose much. An example that triggers
1369 this case is the sed "script" /\(\)/x. */
1370 && node->left != NULL
1371 && (node->token.opr.idx >= BITSET_WORD_BITS
1372 || !(dfa->used_bkref_map
1373 & ((bitset_word_t) 1 << node->token.opr.idx))))
1376 /* Convert the SUBEXP node to the concatenation of an
1377 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1378 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1379 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1380 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1381 tree = create_tree (dfa, op, tree1, CONCAT);
1382 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1388 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1389 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1393 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1394 nodes. Requires a postorder visit. */
1395 static reg_errcode_t
1396 calc_first (void *extra, bin_tree_t *node)
1398 re_dfa_t *dfa = (re_dfa_t *) extra;
1399 if (node->token.type == CONCAT)
1401 node->first = node->left->first;
1402 node->node_idx = node->left->node_idx;
1407 node->node_idx = re_dfa_add_node (dfa, node->token);
1408 if (BE (node->node_idx == REG_MISSING, 0))
1410 if (node->token.type == ANCHOR)
1411 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1416 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1417 static reg_errcode_t
1418 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
1420 switch (node->token.type)
1422 case OP_DUP_ASTERISK:
1423 node->left->next = node;
1426 node->left->next = node->right->first;
1427 node->right->next = node->next;
1431 node->left->next = node->next;
1433 node->right->next = node->next;
1439 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1440 static reg_errcode_t
1441 link_nfa_nodes (void *extra, bin_tree_t *node)
1443 re_dfa_t *dfa = (re_dfa_t *) extra;
1444 Idx idx = node->node_idx;
1445 reg_errcode_t err = REG_NOERROR;
1447 switch (node->token.type)
1453 assert (node->next == NULL);
1456 case OP_DUP_ASTERISK:
1460 dfa->has_plural_match = 1;
1461 if (node->left != NULL)
1462 left = node->left->first->node_idx;
1464 left = node->next->node_idx;
1465 if (node->right != NULL)
1466 right = node->right->first->node_idx;
1468 right = node->next->node_idx;
1469 assert (REG_VALID_INDEX (left));
1470 assert (REG_VALID_INDEX (right));
1471 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1476 case OP_OPEN_SUBEXP:
1477 case OP_CLOSE_SUBEXP:
1478 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1482 dfa->nexts[idx] = node->next->node_idx;
1483 if (node->token.type == OP_BACK_REF)
1484 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1488 assert (!IS_EPSILON_NODE (node->token.type));
1489 dfa->nexts[idx] = node->next->node_idx;
1496 /* Duplicate the epsilon closure of the node ROOT_NODE.
1497 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1498 to their own constraint. */
1500 static reg_errcode_t
1502 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1503 Idx root_node, unsigned int init_constraint)
1505 Idx org_node, clone_node;
1507 unsigned int constraint = init_constraint;
1508 for (org_node = top_org_node, clone_node = top_clone_node;;)
1510 Idx org_dest, clone_dest;
1511 if (dfa->nodes[org_node].type == OP_BACK_REF)
1513 /* If the back reference epsilon-transit, its destination must
1514 also have the constraint. Then duplicate the epsilon closure
1515 of the destination of the back reference, and store it in
1516 edests of the back reference. */
1517 org_dest = dfa->nexts[org_node];
1518 re_node_set_empty (dfa->edests + clone_node);
1519 clone_dest = duplicate_node (dfa, org_dest, constraint);
1520 if (BE (clone_dest == REG_MISSING, 0))
1522 dfa->nexts[clone_node] = dfa->nexts[org_node];
1523 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1527 else if (dfa->edests[org_node].nelem == 0)
1529 /* In case of the node can't epsilon-transit, don't duplicate the
1530 destination and store the original destination as the
1531 destination of the node. */
1532 dfa->nexts[clone_node] = dfa->nexts[org_node];
1535 else if (dfa->edests[org_node].nelem == 1)
1537 /* In case of the node can epsilon-transit, and it has only one
1539 org_dest = dfa->edests[org_node].elems[0];
1540 re_node_set_empty (dfa->edests + clone_node);
1541 /* If the node is root_node itself, it means the epsilon closure
1542 has a loop. Then tie it to the destination of the root_node. */
1543 if (org_node == root_node && clone_node != org_node)
1545 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1550 /* In case the node has another constraint, append it. */
1551 constraint |= dfa->nodes[org_node].constraint;
1552 clone_dest = duplicate_node (dfa, org_dest, constraint);
1553 if (BE (clone_dest == REG_MISSING, 0))
1555 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 else /* dfa->edests[org_node].nelem == 2 */
1561 /* In case of the node can epsilon-transit, and it has two
1562 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1563 org_dest = dfa->edests[org_node].elems[0];
1564 re_node_set_empty (dfa->edests + clone_node);
1565 /* Search for a duplicated node which satisfies the constraint. */
1566 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1567 if (clone_dest == REG_MISSING)
1569 /* There is no such duplicated node, create a new one. */
1571 clone_dest = duplicate_node (dfa, org_dest, constraint);
1572 if (BE (clone_dest == REG_MISSING, 0))
1574 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1577 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1578 root_node, constraint);
1579 if (BE (err != REG_NOERROR, 0))
1584 /* There is a duplicated node which satisfies the constraint,
1585 use it to avoid infinite loop. */
1586 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591 org_dest = dfa->edests[org_node].elems[1];
1592 clone_dest = duplicate_node (dfa, org_dest, constraint);
1593 if (BE (clone_dest == REG_MISSING, 0))
1595 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1599 org_node = org_dest;
1600 clone_node = clone_dest;
1605 /* Search for a node which is duplicated from the node ORG_NODE, and
1606 satisfies the constraint CONSTRAINT. */
1609 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1610 unsigned int constraint)
1613 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1615 if (org_node == dfa->org_indices[idx]
1616 && constraint == dfa->nodes[idx].constraint)
1617 return idx; /* Found. */
1619 return REG_MISSING; /* Not found. */
1622 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1623 Return the index of the new node, or REG_MISSING if insufficient storage is
1627 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1629 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1630 if (BE (dup_idx != REG_MISSING, 1))
1632 dfa->nodes[dup_idx].constraint = constraint;
1633 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1634 dfa->nodes[dup_idx].duplicated = 1;
1636 /* Store the index of the original node. */
1637 dfa->org_indices[dup_idx] = org_idx;
1642 static reg_errcode_t
1643 calc_inveclosure (re_dfa_t *dfa)
1647 for (idx = 0; idx < dfa->nodes_len; ++idx)
1648 re_node_set_init_empty (dfa->inveclosures + idx);
1650 for (src = 0; src < dfa->nodes_len; ++src)
1652 Idx *elems = dfa->eclosures[src].elems;
1653 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1655 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1664 /* Calculate "eclosure" for all the node in DFA. */
1666 static reg_errcode_t
1667 calc_eclosure (re_dfa_t *dfa)
1672 assert (dfa->nodes_len > 0);
1675 /* For each nodes, calculate epsilon closure. */
1676 for (node_idx = 0; ; ++node_idx)
1679 re_node_set eclosure_elem;
1680 if (node_idx == dfa->nodes_len)
1689 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1692 /* If we have already calculated, skip it. */
1693 if (dfa->eclosures[node_idx].nelem != 0)
1695 /* Calculate epsilon closure of 'node_idx'. */
1696 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1697 if (BE (err != REG_NOERROR, 0))
1700 if (dfa->eclosures[node_idx].nelem == 0)
1703 re_node_set_free (&eclosure_elem);
1709 /* Calculate epsilon closure of NODE. */
1711 static reg_errcode_t
1712 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1716 re_node_set eclosure;
1718 bool incomplete = false;
1719 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1720 if (BE (err != REG_NOERROR, 0))
1723 /* This indicates that we are calculating this node now.
1724 We reference this value to avoid infinite loop. */
1725 dfa->eclosures[node].nelem = REG_MISSING;
1727 /* If the current node has constraints, duplicate all nodes
1728 since they must inherit the constraints. */
1729 if (dfa->nodes[node].constraint
1730 && dfa->edests[node].nelem
1731 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1733 err = duplicate_node_closure (dfa, node, node, node,
1734 dfa->nodes[node].constraint);
1735 if (BE (err != REG_NOERROR, 0))
1739 /* Expand each epsilon destination nodes. */
1740 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1741 for (i = 0; i < dfa->edests[node].nelem; ++i)
1743 re_node_set eclosure_elem;
1744 Idx edest = dfa->edests[node].elems[i];
1745 /* If calculating the epsilon closure of 'edest' is in progress,
1746 return intermediate result. */
1747 if (dfa->eclosures[edest].nelem == REG_MISSING)
1752 /* If we haven't calculated the epsilon closure of 'edest' yet,
1753 calculate now. Otherwise use calculated epsilon closure. */
1754 if (dfa->eclosures[edest].nelem == 0)
1756 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1757 if (BE (err != REG_NOERROR, 0))
1761 eclosure_elem = dfa->eclosures[edest];
1762 /* Merge the epsilon closure of 'edest'. */
1763 err = re_node_set_merge (&eclosure, &eclosure_elem);
1764 if (BE (err != REG_NOERROR, 0))
1766 /* If the epsilon closure of 'edest' is incomplete,
1767 the epsilon closure of this node is also incomplete. */
1768 if (dfa->eclosures[edest].nelem == 0)
1771 re_node_set_free (&eclosure_elem);
1775 /* An epsilon closure includes itself. */
1776 ok = re_node_set_insert (&eclosure, node);
1779 if (incomplete && !root)
1780 dfa->eclosures[node].nelem = 0;
1782 dfa->eclosures[node] = eclosure;
1783 *new_set = eclosure;
1787 /* Functions for token which are used in the parser. */
1789 /* Fetch a token from INPUT.
1790 We must not use this function inside bracket expressions. */
1794 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1796 re_string_skip_bytes (input, peek_token (result, input, syntax));
1799 /* Peek a token from INPUT, and return the length of the token.
1800 We must not use this function inside bracket expressions. */
1804 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1808 if (re_string_eoi (input))
1810 token->type = END_OF_RE;
1814 c = re_string_peek_byte (input, 0);
1817 token->word_char = 0;
1818 #ifdef RE_ENABLE_I18N
1819 token->mb_partial = 0;
1820 if (input->mb_cur_max > 1 &&
1821 !re_string_first_byte (input, re_string_cur_idx (input)))
1823 token->type = CHARACTER;
1824 token->mb_partial = 1;
1831 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1833 token->type = BACK_SLASH;
1837 c2 = re_string_peek_byte_case (input, 1);
1839 token->type = CHARACTER;
1840 #ifdef RE_ENABLE_I18N
1841 if (input->mb_cur_max > 1)
1843 wint_t wc = re_string_wchar_at (input,
1844 re_string_cur_idx (input) + 1);
1845 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1849 token->word_char = IS_WORD_CHAR (c2) != 0;
1854 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1855 token->type = OP_ALT;
1857 case '1': case '2': case '3': case '4': case '5':
1858 case '6': case '7': case '8': case '9':
1859 if (!(syntax & RE_NO_BK_REFS))
1861 token->type = OP_BACK_REF;
1862 token->opr.idx = c2 - '1';
1866 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = ANCHOR;
1869 token->opr.ctx_type = WORD_FIRST;
1873 if (!(syntax & RE_NO_GNU_OPS))
1875 token->type = ANCHOR;
1876 token->opr.ctx_type = WORD_LAST;
1880 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = WORD_DELIM;
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = NOT_WORD_DELIM;
1894 if (!(syntax & RE_NO_GNU_OPS))
1895 token->type = OP_WORD;
1898 if (!(syntax & RE_NO_GNU_OPS))
1899 token->type = OP_NOTWORD;
1902 if (!(syntax & RE_NO_GNU_OPS))
1903 token->type = OP_SPACE;
1906 if (!(syntax & RE_NO_GNU_OPS))
1907 token->type = OP_NOTSPACE;
1910 if (!(syntax & RE_NO_GNU_OPS))
1912 token->type = ANCHOR;
1913 token->opr.ctx_type = BUF_FIRST;
1917 if (!(syntax & RE_NO_GNU_OPS))
1919 token->type = ANCHOR;
1920 token->opr.ctx_type = BUF_LAST;
1924 if (!(syntax & RE_NO_BK_PARENS))
1925 token->type = OP_OPEN_SUBEXP;
1928 if (!(syntax & RE_NO_BK_PARENS))
1929 token->type = OP_CLOSE_SUBEXP;
1932 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1933 token->type = OP_DUP_PLUS;
1936 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1937 token->type = OP_DUP_QUESTION;
1940 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1941 token->type = OP_OPEN_DUP_NUM;
1944 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1945 token->type = OP_CLOSE_DUP_NUM;
1953 token->type = CHARACTER;
1954 #ifdef RE_ENABLE_I18N
1955 if (input->mb_cur_max > 1)
1957 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1958 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1962 token->word_char = IS_WORD_CHAR (token->opr.c);
1967 if (syntax & RE_NEWLINE_ALT)
1968 token->type = OP_ALT;
1971 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1972 token->type = OP_ALT;
1975 token->type = OP_DUP_ASTERISK;
1978 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1979 token->type = OP_DUP_PLUS;
1982 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1983 token->type = OP_DUP_QUESTION;
1986 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1987 token->type = OP_OPEN_DUP_NUM;
1990 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1991 token->type = OP_CLOSE_DUP_NUM;
1994 if (syntax & RE_NO_BK_PARENS)
1995 token->type = OP_OPEN_SUBEXP;
1998 if (syntax & RE_NO_BK_PARENS)
1999 token->type = OP_CLOSE_SUBEXP;
2002 token->type = OP_OPEN_BRACKET;
2005 token->type = OP_PERIOD;
2008 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2009 re_string_cur_idx (input) != 0)
2011 char prev = re_string_peek_byte (input, -1);
2012 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2015 token->type = ANCHOR;
2016 token->opr.ctx_type = LINE_FIRST;
2019 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2020 re_string_cur_idx (input) + 1 != re_string_length (input))
2023 re_string_skip_bytes (input, 1);
2024 peek_token (&next, input, syntax);
2025 re_string_skip_bytes (input, -1);
2026 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2029 token->type = ANCHOR;
2030 token->opr.ctx_type = LINE_LAST;
2038 /* Peek a token from INPUT, and return the length of the token.
2039 We must not use this function out of bracket expressions. */
2043 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2046 if (re_string_eoi (input))
2048 token->type = END_OF_RE;
2051 c = re_string_peek_byte (input, 0);
2054 #ifdef RE_ENABLE_I18N
2055 if (input->mb_cur_max > 1 &&
2056 !re_string_first_byte (input, re_string_cur_idx (input)))
2058 token->type = CHARACTER;
2061 #endif /* RE_ENABLE_I18N */
2063 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2064 && re_string_cur_idx (input) + 1 < re_string_length (input))
2066 /* In this case, '\' escape a character. */
2068 re_string_skip_bytes (input, 1);
2069 c2 = re_string_peek_byte (input, 0);
2071 token->type = CHARACTER;
2074 if (c == '[') /* '[' is a special char in a bracket exps. */
2078 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2079 c2 = re_string_peek_byte (input, 1);
2087 token->type = OP_OPEN_COLL_ELEM;
2090 token->type = OP_OPEN_EQUIV_CLASS;
2093 if (syntax & RE_CHAR_CLASSES)
2095 token->type = OP_OPEN_CHAR_CLASS;
2098 /* else fall through. */
2100 token->type = CHARACTER;
2110 token->type = OP_CHARSET_RANGE;
2113 token->type = OP_CLOSE_BRACKET;
2116 token->type = OP_NON_MATCH_LIST;
2119 token->type = CHARACTER;
2124 /* Functions for parser. */
2126 /* Entry point of the parser.
2127 Parse the regular expression REGEXP and return the structure tree.
2128 If an error occurs, ERR is set by error code, and return NULL.
2129 This function build the following tree, from regular expression <reg_exp>:
2135 CAT means concatenation.
2136 EOR means end of regular expression. */
2139 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2142 re_dfa_t *dfa = preg->buffer;
2143 bin_tree_t *tree, *eor, *root;
2144 re_token_t current_token;
2145 dfa->syntax = syntax;
2146 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2147 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2148 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2152 root = create_tree (dfa, tree, eor, CONCAT);
2155 if (BE (eor == NULL || root == NULL, 0))
2163 /* This function build the following tree, from regular expression
2164 <branch1>|<branch2>:
2170 ALT means alternative, which represents the operator '|'. */
2173 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2174 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2176 re_dfa_t *dfa = preg->buffer;
2177 bin_tree_t *tree, *branch = NULL;
2178 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2179 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2182 while (token->type == OP_ALT)
2184 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2185 if (token->type != OP_ALT && token->type != END_OF_RE
2186 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2188 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2189 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2194 tree = create_tree (dfa, tree, branch, OP_ALT);
2195 if (BE (tree == NULL, 0))
2204 /* This function build the following tree, from regular expression
2211 CAT means concatenation. */
2214 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2217 bin_tree_t *tree, *expr;
2218 re_dfa_t *dfa = preg->buffer;
2219 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223 while (token->type != OP_ALT && token->type != END_OF_RE
2224 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2226 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2227 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2230 postorder (tree, free_tree, NULL);
2233 if (tree != NULL && expr != NULL)
2235 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2236 if (newtree == NULL)
2238 postorder (expr, free_tree, NULL);
2239 postorder (tree, free_tree, NULL);
2245 else if (tree == NULL)
2247 /* Otherwise expr == NULL, we don't need to create new tree. */
2252 /* This function build the following tree, from regular expression a*:
2259 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2260 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2262 re_dfa_t *dfa = preg->buffer;
2264 switch (token->type)
2267 tree = create_token_tree (dfa, NULL, NULL, token);
2268 if (BE (tree == NULL, 0))
2273 #ifdef RE_ENABLE_I18N
2274 if (dfa->mb_cur_max > 1)
2276 while (!re_string_eoi (regexp)
2277 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2279 bin_tree_t *mbc_remain;
2280 fetch_token (token, regexp, syntax);
2281 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2282 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2283 if (BE (mbc_remain == NULL || tree == NULL, 0))
2292 case OP_OPEN_SUBEXP:
2293 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2294 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2297 case OP_OPEN_BRACKET:
2298 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2299 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2303 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2308 dfa->used_bkref_map |= 1 << token->opr.idx;
2309 tree = create_token_tree (dfa, NULL, NULL, token);
2310 if (BE (tree == NULL, 0))
2316 dfa->has_mb_node = 1;
2318 case OP_OPEN_DUP_NUM:
2319 if (syntax & RE_CONTEXT_INVALID_DUP)
2325 case OP_DUP_ASTERISK:
2327 case OP_DUP_QUESTION:
2328 if (syntax & RE_CONTEXT_INVALID_OPS)
2333 else if (syntax & RE_CONTEXT_INDEP_OPS)
2335 fetch_token (token, regexp, syntax);
2336 return parse_expression (regexp, preg, token, syntax, nest, err);
2338 /* else fall through */
2339 case OP_CLOSE_SUBEXP:
2340 if ((token->type == OP_CLOSE_SUBEXP) &&
2341 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2346 /* else fall through */
2347 case OP_CLOSE_DUP_NUM:
2348 /* We treat it as a normal character. */
2350 /* Then we can these characters as normal characters. */
2351 token->type = CHARACTER;
2352 /* mb_partial and word_char bits should be initialized already
2354 tree = create_token_tree (dfa, NULL, NULL, token);
2355 if (BE (tree == NULL, 0))
2362 if ((token->opr.ctx_type
2363 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2364 && dfa->word_ops_used == 0)
2365 init_word_char (dfa);
2366 if (token->opr.ctx_type == WORD_DELIM
2367 || token->opr.ctx_type == NOT_WORD_DELIM)
2369 bin_tree_t *tree_first, *tree_last;
2370 if (token->opr.ctx_type == WORD_DELIM)
2372 token->opr.ctx_type = WORD_FIRST;
2373 tree_first = create_token_tree (dfa, NULL, NULL, token);
2374 token->opr.ctx_type = WORD_LAST;
2378 token->opr.ctx_type = INSIDE_WORD;
2379 tree_first = create_token_tree (dfa, NULL, NULL, token);
2380 token->opr.ctx_type = INSIDE_NOTWORD;
2382 tree_last = create_token_tree (dfa, NULL, NULL, token);
2383 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2384 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392 tree = create_token_tree (dfa, NULL, NULL, token);
2393 if (BE (tree == NULL, 0))
2399 /* We must return here, since ANCHORs can't be followed
2400 by repetition operators.
2401 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2402 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2403 fetch_token (token, regexp, syntax);
2406 tree = create_token_tree (dfa, NULL, NULL, token);
2407 if (BE (tree == NULL, 0))
2412 if (dfa->mb_cur_max > 1)
2413 dfa->has_mb_node = 1;
2417 tree = build_charclass_op (dfa, regexp->trans,
2418 (const unsigned char *) "alnum",
2419 (const unsigned char *) "_",
2420 token->type == OP_NOTWORD, err);
2421 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2426 tree = build_charclass_op (dfa, regexp->trans,
2427 (const unsigned char *) "space",
2428 (const unsigned char *) "",
2429 token->type == OP_NOTSPACE, err);
2430 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2440 /* Must not happen? */
2446 fetch_token (token, regexp, syntax);
2448 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2449 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2451 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2452 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2454 /* In BRE consecutive duplications are not allowed. */
2455 if ((syntax & RE_CONTEXT_INVALID_DUP)
2456 && (token->type == OP_DUP_ASTERISK
2457 || token->type == OP_OPEN_DUP_NUM))
2467 /* This function build the following tree, from regular expression
2475 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2476 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2478 re_dfa_t *dfa = preg->buffer;
2481 cur_nsub = preg->re_nsub++;
2483 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2485 /* The subexpression may be a null string. */
2486 if (token->type == OP_CLOSE_SUBEXP)
2490 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2491 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2494 postorder (tree, free_tree, NULL);
2497 if (BE (*err != REG_NOERROR, 0))
2501 if (cur_nsub <= '9' - '1')
2502 dfa->completed_bkref_map |= 1 << cur_nsub;
2504 tree = create_tree (dfa, tree, NULL, SUBEXP);
2505 if (BE (tree == NULL, 0))
2510 tree->token.opr.idx = cur_nsub;
2514 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2517 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2518 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2520 bin_tree_t *tree = NULL, *old_tree = NULL;
2521 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2522 re_token_t start_token = *token;
2524 if (token->type == OP_OPEN_DUP_NUM)
2527 start = fetch_number (regexp, token, syntax);
2528 if (start == REG_MISSING)
2530 if (token->type == CHARACTER && token->opr.c == ',')
2531 start = 0; /* We treat "{,m}" as "{0,m}". */
2534 *err = REG_BADBR; /* <re>{} is invalid. */
2538 if (BE (start != REG_ERROR, 1))
2540 /* We treat "{n}" as "{n,n}". */
2541 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2542 : ((token->type == CHARACTER && token->opr.c == ',')
2543 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2545 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2547 /* Invalid sequence. */
2548 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2550 if (token->type == END_OF_RE)
2558 /* If the syntax bit is set, rollback. */
2559 re_string_set_index (regexp, start_idx);
2560 *token = start_token;
2561 token->type = CHARACTER;
2562 /* mb_partial and word_char bits should be already initialized by
2567 if (BE ((end != REG_MISSING && start > end)
2568 || token->type != OP_CLOSE_DUP_NUM, 0))
2570 /* First number greater than second. */
2575 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2583 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2584 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2587 fetch_token (token, regexp, syntax);
2589 if (BE (elem == NULL, 0))
2591 if (BE (start == 0 && end == 0, 0))
2593 postorder (elem, free_tree, NULL);
2597 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2598 if (BE (start > 0, 0))
2601 for (i = 2; i <= start; ++i)
2603 elem = duplicate_tree (elem, dfa);
2604 tree = create_tree (dfa, tree, elem, CONCAT);
2605 if (BE (elem == NULL || tree == NULL, 0))
2606 goto parse_dup_op_espace;
2612 /* Duplicate ELEM before it is marked optional. */
2613 elem = duplicate_tree (elem, dfa);
2619 if (elem->token.type == SUBEXP)
2620 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2622 tree = create_tree (dfa, elem, NULL,
2623 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2624 if (BE (tree == NULL, 0))
2625 goto parse_dup_op_espace;
2627 /* From gnulib's "intprops.h":
2628 True if the arithmetic type T is signed. */
2629 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2631 /* This loop is actually executed only when end != REG_MISSING,
2632 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2633 already created the start+1-th copy. */
2634 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2635 for (i = start + 2; i <= end; ++i)
2637 elem = duplicate_tree (elem, dfa);
2638 tree = create_tree (dfa, tree, elem, CONCAT);
2639 if (BE (elem == NULL || tree == NULL, 0))
2640 goto parse_dup_op_espace;
2642 tree = create_tree (dfa, tree, NULL, OP_ALT);
2643 if (BE (tree == NULL, 0))
2644 goto parse_dup_op_espace;
2648 tree = create_tree (dfa, old_tree, tree, CONCAT);
2652 parse_dup_op_espace:
2657 /* Size of the names for collating symbol/equivalence_class/character_class.
2658 I'm not sure, but maybe enough. */
2659 #define BRACKET_NAME_BUF_SIZE 32
2662 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2663 Build the range expression which starts from START_ELEM, and ends
2664 at END_ELEM. The result are written to MBCSET and SBCSET.
2665 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2666 mbcset->range_ends, is a pointer argument since we may
2669 static reg_errcode_t
2671 # ifdef RE_ENABLE_I18N
2672 build_range_exp (const reg_syntax_t syntax,
2674 re_charset_t *mbcset,
2676 const bracket_elem_t *start_elem,
2677 const bracket_elem_t *end_elem)
2678 # else /* not RE_ENABLE_I18N */
2679 build_range_exp (const reg_syntax_t syntax,
2681 const bracket_elem_t *start_elem,
2682 const bracket_elem_t *end_elem)
2683 # endif /* not RE_ENABLE_I18N */
2685 unsigned int start_ch, end_ch;
2686 /* Equivalence Classes and Character Classes can't be a range start/end. */
2687 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2688 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2692 /* We can handle no multi character collating elements without libc
2694 if (BE ((start_elem->type == COLL_SYM
2695 && strlen ((char *) start_elem->opr.name) > 1)
2696 || (end_elem->type == COLL_SYM
2697 && strlen ((char *) end_elem->opr.name) > 1), 0))
2698 return REG_ECOLLATE;
2700 # ifdef RE_ENABLE_I18N
2705 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2707 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2708 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2710 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2711 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2713 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2714 ? __btowc (start_ch) : start_elem->opr.wch);
2715 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2716 ? __btowc (end_ch) : end_elem->opr.wch);
2717 if (start_wc == WEOF || end_wc == WEOF)
2718 return REG_ECOLLATE;
2719 cmp_buf[0] = start_wc;
2720 cmp_buf[4] = end_wc;
2722 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2723 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2726 /* Got valid collation sequence values, add them as a new entry.
2727 However, for !_LIBC we have no collation elements: if the
2728 character set is single byte, the single byte character set
2729 that we build below suffices. parse_bracket_exp passes
2730 no MBCSET if dfa->mb_cur_max == 1. */
2733 /* Check the space of the arrays. */
2734 if (BE (*range_alloc == mbcset->nranges, 0))
2736 /* There is not enough space, need realloc. */
2737 wchar_t *new_array_start, *new_array_end;
2740 /* +1 in case of mbcset->nranges is 0. */
2741 new_nranges = 2 * mbcset->nranges + 1;
2742 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2743 are NULL if *range_alloc == 0. */
2744 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2746 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2749 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2752 mbcset->range_starts = new_array_start;
2753 mbcset->range_ends = new_array_end;
2754 *range_alloc = new_nranges;
2757 mbcset->range_starts[mbcset->nranges] = start_wc;
2758 mbcset->range_ends[mbcset->nranges++] = end_wc;
2761 /* Build the table for single byte characters. */
2762 for (wc = 0; wc < SBC_MAX; ++wc)
2765 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2766 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2767 bitset_set (sbcset, wc);
2770 # else /* not RE_ENABLE_I18N */
2773 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2774 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2776 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2777 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2779 if (start_ch > end_ch)
2781 /* Build the table for single byte characters. */
2782 for (ch = 0; ch < SBC_MAX; ++ch)
2783 if (start_ch <= ch && ch <= end_ch)
2784 bitset_set (sbcset, ch);
2786 # endif /* not RE_ENABLE_I18N */
2789 #endif /* not _LIBC */
2792 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2793 Build the collating element which is represented by NAME.
2794 The result are written to MBCSET and SBCSET.
2795 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2796 pointer argument since we may update it. */
2798 static reg_errcode_t
2800 # ifdef RE_ENABLE_I18N
2801 build_collating_symbol (bitset_t sbcset,
2802 re_charset_t *mbcset _UNUSED_PARAMETER_,
2803 Idx *coll_sym_alloc _UNUSED_PARAMETER_,
2804 const unsigned char *name)
2805 # else /* not RE_ENABLE_I18N */
2806 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2807 # endif /* not RE_ENABLE_I18N */
2809 size_t name_len = strlen ((const char *) name);
2810 if (BE (name_len != 1, 0))
2811 return REG_ECOLLATE;
2814 bitset_set (sbcset, name[0]);
2818 #endif /* not _LIBC */
2820 /* This function parse bracket expression like "[abc]", "[a-c]",
2824 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2825 reg_syntax_t syntax, reg_errcode_t *err)
2828 const unsigned char *collseqmb;
2829 const char *collseqwc;
2832 const int32_t *symb_table;
2833 const unsigned char *extra;
2835 /* Local function for parse_bracket_exp used in _LIBC environment.
2836 Seek the collating symbol entry corresponding to NAME.
2837 Return the index of the symbol in the SYMB_TABLE. */
2840 __attribute ((always_inline))
2841 seek_collating_symbol_entry (name, name_len)
2842 const unsigned char *name;
2845 int32_t hash = elem_hash ((const char *) name, name_len);
2846 int32_t elem = hash % table_size;
2847 if (symb_table[2 * elem] != 0)
2849 int32_t second = hash % (table_size - 2) + 1;
2853 /* First compare the hashing value. */
2854 if (symb_table[2 * elem] == hash
2855 /* Compare the length of the name. */
2856 && name_len == extra[symb_table[2 * elem + 1]]
2857 /* Compare the name. */
2858 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2861 /* Yep, this is the entry. */
2868 while (symb_table[2 * elem] != 0);
2873 /* Local function for parse_bracket_exp used in _LIBC environment.
2874 Look up the collation sequence value of BR_ELEM.
2875 Return the value if succeeded, UINT_MAX otherwise. */
2877 auto inline unsigned int
2878 __attribute ((always_inline))
2879 lookup_collation_sequence_value (br_elem)
2880 bracket_elem_t *br_elem;
2882 if (br_elem->type == SB_CHAR)
2885 if (MB_CUR_MAX == 1)
2888 return collseqmb[br_elem->opr.ch];
2891 wint_t wc = __btowc (br_elem->opr.ch);
2892 return __collseq_table_lookup (collseqwc, wc);
2895 else if (br_elem->type == MB_CHAR)
2898 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2900 else if (br_elem->type == COLL_SYM)
2902 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2906 elem = seek_collating_symbol_entry (br_elem->opr.name,
2908 if (symb_table[2 * elem] != 0)
2910 /* We found the entry. */
2911 idx = symb_table[2 * elem + 1];
2912 /* Skip the name of collating element name. */
2913 idx += 1 + extra[idx];
2914 /* Skip the byte sequence of the collating element. */
2915 idx += 1 + extra[idx];
2916 /* Adjust for the alignment. */
2917 idx = (idx + 3) & ~3;
2918 /* Skip the multibyte collation sequence value. */
2919 idx += sizeof (unsigned int);
2920 /* Skip the wide char sequence of the collating element. */
2921 idx += sizeof (unsigned int) *
2922 (1 + *(unsigned int *) (extra + idx));
2923 /* Return the collation sequence value. */
2924 return *(unsigned int *) (extra + idx);
2926 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2928 /* No valid character. Match it as a single byte
2930 return collseqmb[br_elem->opr.name[0]];
2933 else if (sym_name_len == 1)
2934 return collseqmb[br_elem->opr.name[0]];
2939 /* Local function for parse_bracket_exp used in _LIBC environment.
2940 Build the range expression which starts from START_ELEM, and ends
2941 at END_ELEM. The result are written to MBCSET and SBCSET.
2942 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2943 mbcset->range_ends, is a pointer argument since we may
2946 auto inline reg_errcode_t
2947 __attribute ((always_inline))
2948 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2949 re_charset_t *mbcset;
2952 bracket_elem_t *start_elem, *end_elem;
2955 uint32_t start_collseq;
2956 uint32_t end_collseq;
2958 /* Equivalence Classes and Character Classes can't be a range
2960 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2961 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2965 start_collseq = lookup_collation_sequence_value (start_elem);
2966 end_collseq = lookup_collation_sequence_value (end_elem);
2967 /* Check start/end collation sequence values. */
2968 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2969 return REG_ECOLLATE;
2970 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2973 /* Got valid collation sequence values, add them as a new entry.
2974 However, if we have no collation elements, and the character set
2975 is single byte, the single byte character set that we
2976 build below suffices. */
2977 if (nrules > 0 || dfa->mb_cur_max > 1)
2979 /* Check the space of the arrays. */
2980 if (BE (*range_alloc == mbcset->nranges, 0))
2982 /* There is not enough space, need realloc. */
2983 uint32_t *new_array_start;
2984 uint32_t *new_array_end;
2987 /* +1 in case of mbcset->nranges is 0. */
2988 new_nranges = 2 * mbcset->nranges + 1;
2989 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2991 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2994 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2997 mbcset->range_starts = new_array_start;
2998 mbcset->range_ends = new_array_end;
2999 *range_alloc = new_nranges;
3002 mbcset->range_starts[mbcset->nranges] = start_collseq;
3003 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3006 /* Build the table for single byte characters. */
3007 for (ch = 0; ch < SBC_MAX; ch++)
3009 uint32_t ch_collseq;
3011 if (MB_CUR_MAX == 1)
3014 ch_collseq = collseqmb[ch];
3016 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3017 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3018 bitset_set (sbcset, ch);
3023 /* Local function for parse_bracket_exp used in _LIBC environment.
3024 Build the collating element which is represented by NAME.
3025 The result are written to MBCSET and SBCSET.
3026 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3027 pointer argument since we may update it. */
3029 auto inline reg_errcode_t
3030 __attribute ((always_inline))
3031 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3032 re_charset_t *mbcset;
3033 Idx *coll_sym_alloc;
3035 const unsigned char *name;
3038 size_t name_len = strlen ((const char *) name);
3041 elem = seek_collating_symbol_entry (name, name_len);
3042 if (symb_table[2 * elem] != 0)
3044 /* We found the entry. */
3045 idx = symb_table[2 * elem + 1];
3046 /* Skip the name of collating element name. */
3047 idx += 1 + extra[idx];
3049 else if (symb_table[2 * elem] == 0 && name_len == 1)
3051 /* No valid character, treat it as a normal
3053 bitset_set (sbcset, name[0]);
3057 return REG_ECOLLATE;
3059 /* Got valid collation sequence, add it as a new entry. */
3060 /* Check the space of the arrays. */
3061 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3063 /* Not enough, realloc it. */
3064 /* +1 in case of mbcset->ncoll_syms is 0. */
3065 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3066 /* Use realloc since mbcset->coll_syms is NULL
3068 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3069 new_coll_sym_alloc);
3070 if (BE (new_coll_syms == NULL, 0))
3072 mbcset->coll_syms = new_coll_syms;
3073 *coll_sym_alloc = new_coll_sym_alloc;
3075 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3080 if (BE (name_len != 1, 0))
3081 return REG_ECOLLATE;
3084 bitset_set (sbcset, name[0]);
3091 re_token_t br_token;
3092 re_bitset_ptr_t sbcset;
3093 #ifdef RE_ENABLE_I18N
3094 re_charset_t *mbcset;
3095 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3096 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3097 #endif /* not RE_ENABLE_I18N */
3098 bool non_match = false;
3099 bin_tree_t *work_tree;
3101 bool first_round = true;
3103 collseqmb = (const unsigned char *)
3104 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3105 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3111 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3112 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3113 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3114 _NL_COLLATE_SYMB_TABLEMB);
3115 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3116 _NL_COLLATE_SYMB_EXTRAMB);
3119 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3120 #ifdef RE_ENABLE_I18N
3121 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3122 #endif /* RE_ENABLE_I18N */
3123 #ifdef RE_ENABLE_I18N
3124 if (BE (sbcset == NULL || mbcset == NULL, 0))
3126 if (BE (sbcset == NULL, 0))
3127 #endif /* RE_ENABLE_I18N */
3130 #ifdef RE_ENABLE_I18N
3137 token_len = peek_token_bracket (token, regexp, syntax);
3138 if (BE (token->type == END_OF_RE, 0))
3141 goto parse_bracket_exp_free_return;
3143 if (token->type == OP_NON_MATCH_LIST)
3145 #ifdef RE_ENABLE_I18N
3146 mbcset->non_match = 1;
3147 #endif /* not RE_ENABLE_I18N */
3149 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3150 bitset_set (sbcset, '\n');
3151 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3152 token_len = peek_token_bracket (token, regexp, syntax);
3153 if (BE (token->type == END_OF_RE, 0))
3156 goto parse_bracket_exp_free_return;
3160 /* We treat the first ']' as a normal character. */
3161 if (token->type == OP_CLOSE_BRACKET)
3162 token->type = CHARACTER;
3166 bracket_elem_t start_elem, end_elem;
3167 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3168 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3171 bool is_range_exp = false;
3174 start_elem.opr.name = start_name_buf;
3175 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3176 syntax, first_round);
3177 if (BE (ret != REG_NOERROR, 0))
3180 goto parse_bracket_exp_free_return;
3182 first_round = false;
3184 /* Get information about the next token. We need it in any case. */
3185 token_len = peek_token_bracket (token, regexp, syntax);
3187 /* Do not check for ranges if we know they are not allowed. */
3188 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3190 if (BE (token->type == END_OF_RE, 0))
3193 goto parse_bracket_exp_free_return;
3195 if (token->type == OP_CHARSET_RANGE)
3197 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3198 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3199 if (BE (token2.type == END_OF_RE, 0))
3202 goto parse_bracket_exp_free_return;
3204 if (token2.type == OP_CLOSE_BRACKET)
3206 /* We treat the last '-' as a normal character. */
3207 re_string_skip_bytes (regexp, -token_len);
3208 token->type = CHARACTER;
3211 is_range_exp = true;
3215 if (is_range_exp == true)
3217 end_elem.opr.name = end_name_buf;
3218 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3220 if (BE (ret != REG_NOERROR, 0))
3223 goto parse_bracket_exp_free_return;
3226 token_len = peek_token_bracket (token, regexp, syntax);
3229 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3230 &start_elem, &end_elem);
3232 # ifdef RE_ENABLE_I18N
3233 *err = build_range_exp (syntax, sbcset,
3234 dfa->mb_cur_max > 1 ? mbcset : NULL,
3235 &range_alloc, &start_elem, &end_elem);
3237 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3239 #endif /* RE_ENABLE_I18N */
3240 if (BE (*err != REG_NOERROR, 0))
3241 goto parse_bracket_exp_free_return;
3245 switch (start_elem.type)
3248 bitset_set (sbcset, start_elem.opr.ch);
3250 #ifdef RE_ENABLE_I18N
3252 /* Check whether the array has enough space. */
3253 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3255 wchar_t *new_mbchars;
3256 /* Not enough, realloc it. */
3257 /* +1 in case of mbcset->nmbchars is 0. */
3258 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3259 /* Use realloc since array is NULL if *alloc == 0. */
3260 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3262 if (BE (new_mbchars == NULL, 0))
3263 goto parse_bracket_exp_espace;
3264 mbcset->mbchars = new_mbchars;
3266 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3268 #endif /* RE_ENABLE_I18N */
3270 *err = build_equiv_class (sbcset,
3271 #ifdef RE_ENABLE_I18N
3272 mbcset, &equiv_class_alloc,
3273 #endif /* RE_ENABLE_I18N */
3274 start_elem.opr.name);
3275 if (BE (*err != REG_NOERROR, 0))
3276 goto parse_bracket_exp_free_return;
3279 *err = build_collating_symbol (sbcset,
3280 #ifdef RE_ENABLE_I18N
3281 mbcset, &coll_sym_alloc,
3282 #endif /* RE_ENABLE_I18N */
3283 start_elem.opr.name);
3284 if (BE (*err != REG_NOERROR, 0))
3285 goto parse_bracket_exp_free_return;
3288 *err = build_charclass (regexp->trans, sbcset,
3289 #ifdef RE_ENABLE_I18N
3290 mbcset, &char_class_alloc,
3291 #endif /* RE_ENABLE_I18N */
3292 start_elem.opr.name, syntax);
3293 if (BE (*err != REG_NOERROR, 0))
3294 goto parse_bracket_exp_free_return;
3301 if (BE (token->type == END_OF_RE, 0))
3304 goto parse_bracket_exp_free_return;
3306 if (token->type == OP_CLOSE_BRACKET)
3310 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3312 /* If it is non-matching list. */
3314 bitset_not (sbcset);
3316 #ifdef RE_ENABLE_I18N
3317 /* Ensure only single byte characters are set. */
3318 if (dfa->mb_cur_max > 1)
3319 bitset_mask (sbcset, dfa->sb_char);
3321 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3322 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3323 || mbcset->non_match)))
3325 bin_tree_t *mbc_tree;
3327 /* Build a tree for complex bracket. */
3328 dfa->has_mb_node = 1;
3329 br_token.type = COMPLEX_BRACKET;
3330 br_token.opr.mbcset = mbcset;
3331 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3332 if (BE (mbc_tree == NULL, 0))
3333 goto parse_bracket_exp_espace;
3334 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3335 if (sbcset[sbc_idx])
3337 /* If there are no bits set in sbcset, there is no point
3338 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3339 if (sbc_idx < BITSET_WORDS)
3341 /* Build a tree for simple bracket. */
3342 br_token.type = SIMPLE_BRACKET;
3343 br_token.opr.sbcset = sbcset;
3344 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3345 if (BE (work_tree == NULL, 0))
3346 goto parse_bracket_exp_espace;
3348 /* Then join them by ALT node. */
3349 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3350 if (BE (work_tree == NULL, 0))
3351 goto parse_bracket_exp_espace;
3356 work_tree = mbc_tree;
3360 #endif /* not RE_ENABLE_I18N */
3362 #ifdef RE_ENABLE_I18N
3363 free_charset (mbcset);
3365 /* Build a tree for simple bracket. */
3366 br_token.type = SIMPLE_BRACKET;
3367 br_token.opr.sbcset = sbcset;
3368 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3369 if (BE (work_tree == NULL, 0))
3370 goto parse_bracket_exp_espace;
3374 parse_bracket_exp_espace:
3376 parse_bracket_exp_free_return:
3378 #ifdef RE_ENABLE_I18N
3379 free_charset (mbcset);
3380 #endif /* RE_ENABLE_I18N */
3384 /* Parse an element in the bracket expression. */
3386 static reg_errcode_t
3387 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3388 re_token_t *token, int token_len,
3389 re_dfa_t *dfa _UNUSED_PARAMETER_,
3390 reg_syntax_t syntax, bool accept_hyphen)
3392 #ifdef RE_ENABLE_I18N
3394 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3395 if (cur_char_size > 1)
3397 elem->type = MB_CHAR;
3398 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3399 re_string_skip_bytes (regexp, cur_char_size);
3402 #endif /* RE_ENABLE_I18N */
3403 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3404 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3405 || token->type == OP_OPEN_EQUIV_CLASS)
3406 return parse_bracket_symbol (elem, regexp, token);
3407 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3409 /* A '-' must only appear as anything but a range indicator before
3410 the closing bracket. Everything else is an error. */
3412 (void) peek_token_bracket (&token2, regexp, syntax);
3413 if (token2.type != OP_CLOSE_BRACKET)
3414 /* The actual error value is not standardized since this whole
3415 case is undefined. But ERANGE makes good sense. */
3418 elem->type = SB_CHAR;
3419 elem->opr.ch = token->opr.c;
3423 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3424 such as [:<character_class>:], [.<collating_element>.], and
3425 [=<equivalent_class>=]. */
3427 static reg_errcode_t
3428 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3431 unsigned char ch, delim = token->opr.c;
3433 if (re_string_eoi(regexp))
3437 if (i >= BRACKET_NAME_BUF_SIZE)
3439 if (token->type == OP_OPEN_CHAR_CLASS)
3440 ch = re_string_fetch_byte_case (regexp);
3442 ch = re_string_fetch_byte (regexp);
3443 if (re_string_eoi(regexp))
3445 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3447 elem->opr.name[i] = ch;
3449 re_string_skip_bytes (regexp, 1);
3450 elem->opr.name[i] = '\0';
3451 switch (token->type)
3453 case OP_OPEN_COLL_ELEM:
3454 elem->type = COLL_SYM;
3456 case OP_OPEN_EQUIV_CLASS:
3457 elem->type = EQUIV_CLASS;
3459 case OP_OPEN_CHAR_CLASS:
3460 elem->type = CHAR_CLASS;
3468 /* Helper function for parse_bracket_exp.
3469 Build the equivalence class which is represented by NAME.
3470 The result are written to MBCSET and SBCSET.
3471 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3472 is a pointer argument since we may update it. */
3474 static reg_errcode_t
3475 #ifdef RE_ENABLE_I18N
3476 build_equiv_class (bitset_t sbcset,
3477 re_charset_t *mbcset _UNUSED_PARAMETER_,
3478 Idx *equiv_class_alloc _UNUSED_PARAMETER_,
3479 const unsigned char *name)
3480 #else /* not RE_ENABLE_I18N */
3481 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3482 #endif /* not RE_ENABLE_I18N */
3485 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3488 const int32_t *table, *indirect;
3489 const unsigned char *weights, *extra, *cp;
3490 unsigned char char_buf[2];
3494 /* This #include defines a local function! */
3495 # include <locale/weight.h>
3496 /* Calculate the index for equivalence class. */
3498 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3499 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3500 _NL_COLLATE_WEIGHTMB);
3501 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3502 _NL_COLLATE_EXTRAMB);
3503 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3504 _NL_COLLATE_INDIRECTMB);
3505 idx1 = findidx (&cp, -1);
3506 if (BE (idx1 == 0 || *cp != '\0', 0))
3507 /* This isn't a valid character. */
3508 return REG_ECOLLATE;
3510 /* Build single byte matching table for this equivalence class. */
3511 len = weights[idx1 & 0xffffff];
3512 for (ch = 0; ch < SBC_MAX; ++ch)
3516 idx2 = findidx (&cp, 1);
3521 /* This isn't a valid character. */
3523 /* Compare only if the length matches and the collation rule
3524 index is the same. */
3525 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3529 while (cnt <= len &&
3530 weights[(idx1 & 0xffffff) + 1 + cnt]
3531 == weights[(idx2 & 0xffffff) + 1 + cnt])
3535 bitset_set (sbcset, ch);
3538 /* Check whether the array has enough space. */
3539 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3541 /* Not enough, realloc it. */
3542 /* +1 in case of mbcset->nequiv_classes is 0. */
3543 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3544 /* Use realloc since the array is NULL if *alloc == 0. */
3545 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3547 new_equiv_class_alloc);
3548 if (BE (new_equiv_classes == NULL, 0))
3550 mbcset->equiv_classes = new_equiv_classes;
3551 *equiv_class_alloc = new_equiv_class_alloc;
3553 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3558 if (BE (strlen ((const char *) name) != 1, 0))
3559 return REG_ECOLLATE;
3560 bitset_set (sbcset, *name);
3565 /* Helper function for parse_bracket_exp.
3566 Build the character class which is represented by NAME.
3567 The result are written to MBCSET and SBCSET.
3568 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3569 is a pointer argument since we may update it. */
3571 static reg_errcode_t
3572 #ifdef RE_ENABLE_I18N
3573 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3574 re_charset_t *mbcset, Idx *char_class_alloc,
3575 const unsigned char *class_name, reg_syntax_t syntax)
3576 #else /* not RE_ENABLE_I18N */
3577 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3578 const unsigned char *class_name, reg_syntax_t syntax)
3579 #endif /* not RE_ENABLE_I18N */
3582 const char *name = (const char *) class_name;
3584 /* In case of REG_ICASE "upper" and "lower" match the both of
3585 upper and lower cases. */
3586 if ((syntax & RE_ICASE)
3587 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3590 #ifdef RE_ENABLE_I18N
3591 /* Check the space of the arrays. */
3592 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3594 /* Not enough, realloc it. */
3595 /* +1 in case of mbcset->nchar_classes is 0. */
3596 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3597 /* Use realloc since array is NULL if *alloc == 0. */
3598 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3599 new_char_class_alloc);
3600 if (BE (new_char_classes == NULL, 0))
3602 mbcset->char_classes = new_char_classes;
3603 *char_class_alloc = new_char_class_alloc;
3605 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3606 #endif /* RE_ENABLE_I18N */
3608 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3610 if (BE (trans != NULL, 0)) \
3612 for (i = 0; i < SBC_MAX; ++i) \
3613 if (ctype_func (i)) \
3614 bitset_set (sbcset, trans[i]); \
3618 for (i = 0; i < SBC_MAX; ++i) \
3619 if (ctype_func (i)) \
3620 bitset_set (sbcset, i); \
3624 if (strcmp (name, "alnum") == 0)
3625 BUILD_CHARCLASS_LOOP (isalnum);
3626 else if (strcmp (name, "cntrl") == 0)
3627 BUILD_CHARCLASS_LOOP (iscntrl);
3628 else if (strcmp (name, "lower") == 0)
3629 BUILD_CHARCLASS_LOOP (islower);
3630 else if (strcmp (name, "space") == 0)
3631 BUILD_CHARCLASS_LOOP (isspace);
3632 else if (strcmp (name, "alpha") == 0)
3633 BUILD_CHARCLASS_LOOP (isalpha);
3634 else if (strcmp (name, "digit") == 0)
3635 BUILD_CHARCLASS_LOOP (isdigit);
3636 else if (strcmp (name, "print") == 0)
3637 BUILD_CHARCLASS_LOOP (isprint);
3638 else if (strcmp (name, "upper") == 0)
3639 BUILD_CHARCLASS_LOOP (isupper);
3640 else if (strcmp (name, "blank") == 0)
3641 BUILD_CHARCLASS_LOOP (isblank);
3642 else if (strcmp (name, "graph") == 0)
3643 BUILD_CHARCLASS_LOOP (isgraph);
3644 else if (strcmp (name, "punct") == 0)
3645 BUILD_CHARCLASS_LOOP (ispunct);
3646 else if (strcmp (name, "xdigit") == 0)
3647 BUILD_CHARCLASS_LOOP (isxdigit);
3655 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3656 const unsigned char *class_name,
3657 const unsigned char *extra, bool non_match,
3660 re_bitset_ptr_t sbcset;
3661 #ifdef RE_ENABLE_I18N
3662 re_charset_t *mbcset;
3664 #endif /* not RE_ENABLE_I18N */
3666 re_token_t br_token;
3669 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3670 #ifdef RE_ENABLE_I18N
3671 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3672 #endif /* RE_ENABLE_I18N */
3674 #ifdef RE_ENABLE_I18N
3675 if (BE (sbcset == NULL || mbcset == NULL, 0))
3676 #else /* not RE_ENABLE_I18N */
3677 if (BE (sbcset == NULL, 0))
3678 #endif /* not RE_ENABLE_I18N */
3686 #ifdef RE_ENABLE_I18N
3687 mbcset->non_match = 1;
3688 #endif /* not RE_ENABLE_I18N */
3691 /* We don't care the syntax in this case. */
3692 ret = build_charclass (trans, sbcset,
3693 #ifdef RE_ENABLE_I18N
3695 #endif /* RE_ENABLE_I18N */
3698 if (BE (ret != REG_NOERROR, 0))
3701 #ifdef RE_ENABLE_I18N
3702 free_charset (mbcset);
3703 #endif /* RE_ENABLE_I18N */
3707 /* \w match '_' also. */
3708 for (; *extra; extra++)
3709 bitset_set (sbcset, *extra);
3711 /* If it is non-matching list. */
3713 bitset_not (sbcset);
3715 #ifdef RE_ENABLE_I18N
3716 /* Ensure only single byte characters are set. */
3717 if (dfa->mb_cur_max > 1)
3718 bitset_mask (sbcset, dfa->sb_char);
3721 /* Build a tree for simple bracket. */
3722 br_token.type = SIMPLE_BRACKET;
3723 br_token.opr.sbcset = sbcset;
3724 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3725 if (BE (tree == NULL, 0))
3726 goto build_word_op_espace;
3728 #ifdef RE_ENABLE_I18N
3729 if (dfa->mb_cur_max > 1)
3731 bin_tree_t *mbc_tree;
3732 /* Build a tree for complex bracket. */
3733 br_token.type = COMPLEX_BRACKET;
3734 br_token.opr.mbcset = mbcset;
3735 dfa->has_mb_node = 1;
3736 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3737 if (BE (mbc_tree == NULL, 0))
3738 goto build_word_op_espace;
3739 /* Then join them by ALT node. */
3740 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3741 if (BE (mbc_tree != NULL, 1))
3746 free_charset (mbcset);
3749 #else /* not RE_ENABLE_I18N */
3751 #endif /* not RE_ENABLE_I18N */
3753 build_word_op_espace:
3755 #ifdef RE_ENABLE_I18N
3756 free_charset (mbcset);
3757 #endif /* RE_ENABLE_I18N */
3762 /* This is intended for the expressions like "a{1,3}".
3763 Fetch a number from 'input', and return the number.
3764 Return REG_MISSING if the number field is empty like "{,1}".
3765 Return RE_DUP_MAX + 1 if the number field is too large.
3766 Return REG_ERROR if an error occurred. */
3769 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3771 Idx num = REG_MISSING;
3775 fetch_token (token, input, syntax);
3777 if (BE (token->type == END_OF_RE, 0))
3779 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3781 num = ((token->type != CHARACTER || c < '0' || '9' < c
3782 || num == REG_ERROR)
3784 : num == REG_MISSING
3786 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3791 #ifdef RE_ENABLE_I18N
3793 free_charset (re_charset_t *cset)
3795 re_free (cset->mbchars);
3797 re_free (cset->coll_syms);
3798 re_free (cset->equiv_classes);
3799 re_free (cset->range_starts);
3800 re_free (cset->range_ends);
3802 re_free (cset->char_classes);
3805 #endif /* RE_ENABLE_I18N */
3807 /* Functions for binary tree operation. */
3809 /* Create a tree node. */
3812 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3813 re_token_type_t type)
3817 return create_token_tree (dfa, left, right, &t);
3821 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3822 const re_token_t *token)
3825 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3827 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3829 if (storage == NULL)
3831 storage->next = dfa->str_tree_storage;
3832 dfa->str_tree_storage = storage;
3833 dfa->str_tree_storage_idx = 0;
3835 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3837 tree->parent = NULL;
3839 tree->right = right;
3840 tree->token = *token;
3841 tree->token.duplicated = 0;
3842 tree->token.opt_subexp = 0;
3845 tree->node_idx = REG_MISSING;
3848 left->parent = tree;
3850 right->parent = tree;
3854 /* Mark the tree SRC as an optional subexpression.
3855 To be called from preorder or postorder. */
3857 static reg_errcode_t
3858 mark_opt_subexp (void *extra, bin_tree_t *node)
3860 Idx idx = (Idx) (long) extra;
3861 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3862 node->token.opt_subexp = 1;
3867 /* Free the allocated memory inside NODE. */
3870 free_token (re_token_t *node)
3872 #ifdef RE_ENABLE_I18N
3873 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3874 free_charset (node->opr.mbcset);
3876 #endif /* RE_ENABLE_I18N */
3877 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3878 re_free (node->opr.sbcset);
3881 /* Worker function for tree walking. Free the allocated memory inside NODE
3882 and its children. */
3884 static reg_errcode_t
3885 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
3887 free_token (&node->token);
3892 /* Duplicate the node SRC, and return new node. This is a preorder
3893 visit similar to the one implemented by the generic visitor, but
3894 we need more infrastructure to maintain two parallel trees --- so,
3895 it's easier to duplicate. */
3898 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3900 const bin_tree_t *node;
3901 bin_tree_t *dup_root;
3902 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3904 for (node = root; ; )
3906 /* Create a new tree and link it back to the current parent. */
3907 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3910 (*p_new)->parent = dup_node;
3911 (*p_new)->token.duplicated = 1;
3914 /* Go to the left node, or up and to the right. */
3918 p_new = &dup_node->left;
3922 const bin_tree_t *prev = NULL;
3923 while (node->right == prev || node->right == NULL)
3926 node = node->parent;
3927 dup_node = dup_node->parent;
3932 p_new = &dup_node->right;