1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2012 Free Software
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
27 #include <sys/types.h>
33 #define STREQ(a, b) (strcmp (a, b) == 0)
35 /* ISASCIIDIGIT differs from isdigit, as follows:
36 - Its arg may be any int or unsigned int; it need not be an unsigned char.
37 - It's guaranteed to evaluate its argument exactly once.
38 - It's typically faster.
39 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
40 only '0' through '9' are digits. Prefer ISASCIIDIGIT to isdigit unless
41 it's important to use the locale's definition of `digit' even when the
42 host does not conform to Posix. */
43 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
45 /* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
47 #define _(str) gettext (str)
49 #include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
53 #if HAVE_LANGINFO_CODESET
54 # include <langinfo.h>
59 #include "hard-locale.h"
62 /* HPUX, define those as macros in sys/param.h */
70 /* Number of bits in an unsigned char. */
75 /* First integer value that is greater than any character code. */
76 #define NOTCHAR (1 << CHARBITS)
78 /* INTBITS need not be exact, just a lower bound. */
80 # define INTBITS (CHARBITS * sizeof (int))
83 /* Number of ints required to hold a bit for every character. */
84 #define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
86 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
87 typedef int charclass[CHARCLASS_INTS];
89 /* Convert a possibly-signed character to an unsigned character. This is
90 a bit safer than casting to unsigned char, since it catches some type
91 errors that the cast doesn't. */
92 static inline unsigned char
98 /* Contexts tell us whether a character is a newline or a word constituent.
99 Word-constituent characters are those that satisfy iswalnum(), plus '_'.
100 Each character has a single CTX_* value; bitmasks of CTX_* values denote
101 a particular character class.
103 A state also stores a context value, which is a bitmask of CTX_* values.
104 A state's context represents a set of characters that the state's
105 predecessors must match. For example, a state whose context does not
106 include CTX_LETTER will never have transitions where the previous
107 character is a word constituent. A state whose context is CTX_ANY
108 might have transitions from any character. */
112 #define CTX_NEWLINE 4
115 /* Sometimes characters can only be matched depending on the surrounding
116 context. Such context decisions depend on what the previous character
117 was, and the value of the current (lookahead) character. Context
118 dependent constraints are encoded as 8 bit integers. Each bit that
119 is set indicates that the constraint succeeds in the corresponding
122 bit 8-11 - valid contexts when next character is CTX_NEWLINE
123 bit 4-7 - valid contexts when next character is CTX_LETTER
124 bit 0-3 - valid contexts when next character is CTX_NONE
126 The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
127 succeeds in a particular context. Prev is a bitmask of possible
128 context values for the previous character, curr is the (single-bit)
129 context value for the lookahead character. */
130 #define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)
131 #define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf)
132 #define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf)
134 #define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
135 ((((curr) & CTX_NONE ? OTHER_CONSTRAINT(constraint) : 0) \
136 | ((curr) & CTX_LETTER ? LETTER_CONSTRAINT(constraint) : 0) \
137 | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT(constraint) : 0)) & (prev))
139 /* The following macros give information about what a constraint depends on. */
140 #define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
141 #define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111)
142 #define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111)
144 #define PREV_NEWLINE_DEPENDENT(constraint) \
145 (PREV_NEWLINE_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
146 #define PREV_LETTER_DEPENDENT(constraint) \
147 (PREV_LETTER_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
149 /* Tokens that match the empty string subject to some constraint actually
150 work by applying that constraint to determine what may follow them,
151 taking into account what has gone before. The following values are
152 the constraints corresponding to the special tokens previously defined. */
153 #define NO_CONSTRAINT 0x777
154 #define BEGLINE_CONSTRAINT 0x444
155 #define ENDLINE_CONSTRAINT 0x700
156 #define BEGWORD_CONSTRAINT 0x050
157 #define ENDWORD_CONSTRAINT 0x202
158 #define LIMWORD_CONSTRAINT 0x252
159 #define NOTLIMWORD_CONSTRAINT 0x525
161 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
162 are operators and others are terminal symbols. Most (but not all) of these
163 codes are returned by the lexical analyzer. */
165 typedef ptrdiff_t token;
167 /* Predefined token values. */
170 END = -1, /* END is a terminal symbol that matches the
171 end of input; any value of END or less in
172 the parse tree is such a symbol. Accepting
173 states of the DFA are those that would have
174 a transition on END. */
176 /* Ordinary character values are terminal symbols that match themselves. */
178 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
181 BACKREF, /* BACKREF is generated by \<digit>; it
182 is not completely handled. If the scanner
183 detects a transition on backref, it returns
184 a kind of "semi-success" indicating that
185 the match will have to be verified with
186 a backtracking matcher. */
188 BEGLINE, /* BEGLINE is a terminal symbol that matches
189 the empty string if it is at the beginning
192 ENDLINE, /* ENDLINE is a terminal symbol that matches
193 the empty string if it is at the end of
196 BEGWORD, /* BEGWORD is a terminal symbol that matches
197 the empty string if it is at the beginning
200 ENDWORD, /* ENDWORD is a terminal symbol that matches
201 the empty string if it is at the end of
204 LIMWORD, /* LIMWORD is a terminal symbol that matches
205 the empty string if it is at the beginning
206 or the end of a word. */
208 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
209 matches the empty string if it is not at
210 the beginning or end of a word. */
212 QMARK, /* QMARK is an operator of one argument that
213 matches zero or one occurrences of its
216 STAR, /* STAR is an operator of one argument that
217 matches the Kleene closure (zero or more
218 occurrences) of its argument. */
220 PLUS, /* PLUS is an operator of one argument that
221 matches the positive closure (one or more
222 occurrences) of its argument. */
224 REPMN, /* REPMN is a lexical token corresponding
225 to the {m,n} construct. REPMN never
226 appears in the compiled token vector. */
228 CAT, /* CAT is an operator of two arguments that
229 matches the concatenation of its
230 arguments. CAT is never returned by the
233 OR, /* OR is an operator of two arguments that
234 matches either of its arguments. */
236 LPAREN, /* LPAREN never appears in the parse tree,
237 it is only a lexeme. */
239 RPAREN, /* RPAREN never appears in the parse tree. */
241 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
242 any multibyte (or single byte) characters.
243 It is used only if MB_CUR_MAX > 1. */
245 MBCSET, /* MBCSET is similar to CSET, but for
246 multibyte characters. */
248 WCHAR, /* Only returned by lex. wctok contains
249 the wide character representation. */
251 CSET /* CSET and (and any value greater) is a
252 terminal symbol that matches any of a
253 class of characters. */
257 /* States of the recognizer correspond to sets of positions in the parse
258 tree, together with the constraints under which they may be matched.
259 So a position is encoded as an index into the parse tree together with
263 size_t index; /* Index into the parse array. */
264 unsigned int constraint; /* Constraint for matching this position. */
267 /* Sets of positions are stored as arrays. */
270 position *elems; /* Elements of this position set. */
271 size_t nelem; /* Number of elements in this set. */
272 size_t alloc; /* Number of elements allocated in ELEMS. */
275 /* Sets of leaves are also stored as arrays. */
278 size_t *elems; /* Elements of this position set. */
279 size_t nelem; /* Number of elements in this set. */
282 /* A state of the dfa consists of a set of positions, some flags,
283 and the token value of the lowest-numbered position of the state that
284 contains an END token. */
287 size_t hash; /* Hash of the positions of this state. */
288 position_set elems; /* Positions this state could match. */
289 unsigned char context; /* Context from previous state. */
290 char backref; /* True if this state matches a \<digit>. */
291 unsigned short constraint; /* Constraint for this state to accept. */
292 token first_end; /* Token value of the first END in elems. */
293 position_set mbps; /* Positions which can match multibyte
294 characters. e.g. period.
295 These staff are used only if
299 /* States are indexed by state_num values. These are normally
300 nonnegative but -1 is used as a special value. */
301 typedef ptrdiff_t state_num;
303 /* A bracket operator.
304 e.g. [a-c], [[:alpha:]], etc. */
305 struct mb_char_classes
309 wchar_t *chars; /* Normal characters. */
311 wctype_t *ch_classes; /* Character classes. */
313 wchar_t *range_sts; /* Range characters (start of the range). */
314 wchar_t *range_ends; /* Range characters (end of the range). */
316 char **equivs; /* Equivalence classes. */
319 size_t ncoll_elems; /* Collating elements. */
322 /* A compiled regular expression. */
325 /* Fields filled by the scanner. */
326 charclass *charclasses; /* Array of character sets for CSET tokens. */
327 size_t cindex; /* Index for adding new charclasses. */
328 size_t calloc; /* Number of charclasses currently allocated. */
330 /* Fields filled by the parser. */
331 token *tokens; /* Postfix parse array. */
332 size_t tindex; /* Index for adding new tokens. */
333 size_t talloc; /* Number of tokens currently allocated. */
334 size_t depth; /* Depth required of an evaluation stack
335 used for depth-first traversal of the
337 size_t nleaves; /* Number of leaves on the parse tree. */
338 size_t nregexps; /* Count of parallel regexps being built
340 unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
341 token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
343 /* The following are used only if MB_CUR_MAX > 1. */
345 /* The value of multibyte_prop[i] is defined by following rule.
346 if tokens[i] < NOTCHAR
347 bit 0 : tokens[i] is the first byte of a character, including
348 single-byte characters.
349 bit 1 : tokens[i] is the last byte of a character, including
350 single-byte characters.
352 if tokens[i] = MBCSET
353 ("the index of mbcsets corresponding to this operator" << 2) + 3
357 = 'single_byte_a', 'multi_byte_A', single_byte_b'
358 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
362 size_t nmultibyte_prop;
365 /* Array of the bracket expression in the DFA. */
366 struct mb_char_classes *mbcsets;
368 size_t mbcsets_alloc;
370 /* Fields filled by the state builder. */
371 dfa_state *states; /* States of the dfa. */
372 state_num sindex; /* Index for adding new states. */
373 state_num salloc; /* Number of states currently allocated. */
375 /* Fields filled by the parse tree->NFA conversion. */
376 position_set *follows; /* Array of follow sets, indexed by position
377 index. The follow of a position is the set
378 of positions containing characters that
379 could conceivably follow a character
380 matching the given position in a string
381 matching the regexp. Allocated to the
382 maximum possible position index. */
383 int searchflag; /* True if we are supposed to build a searching
384 as opposed to an exact matcher. A searching
385 matcher finds the first and shortest string
386 matching a regexp anywhere in the buffer,
387 whereas an exact matcher finds the longest
388 string matching, but anchored to the
389 beginning of the buffer. */
391 /* Fields filled by dfaexec. */
392 state_num tralloc; /* Number of transition tables that have
394 int trcount; /* Number of transition tables that have
395 actually been built. */
396 state_num **trans; /* Transition tables for states that can
397 never accept. If the transitions for a
398 state have not yet been computed, or the
399 state could possibly accept, its entry in
400 this table is NULL. */
401 state_num **realtrans; /* Trans always points to realtrans + 1; this
402 is so trans[-1] can contain NULL. */
403 state_num **fails; /* Transition tables after failing to accept
404 on a state that potentially could do so. */
405 int *success; /* Table of acceptance conditions used in
406 dfaexec and computed in build_state. */
407 state_num *newlines; /* Transitions on newlines. The entry for a
408 newline in any transition table is always
409 -1 so we can count lines without wasting
410 too many cycles. The transition for a
411 newline is stored separately and handled
412 as a special case. Newline is also used
413 as a sentinel at the end of the buffer. */
414 struct dfamust *musts; /* List of strings, at least one of which
415 is known to appear in any r.e. matching
419 /* Some macros for user access to dfa internals. */
421 /* ACCEPTING returns true if s could possibly be an accepting state of r. */
422 #define ACCEPTING(s, r) ((r).states[s].constraint)
424 /* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
425 specified context. */
426 #define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
427 SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
429 static void dfamust (struct dfa *dfa);
430 static void regexp (void);
432 /* These two macros are identical to the ones in gnulib's xalloc.h,
433 except that they not to case the result to "(t *)", and thus may
434 be used via type-free CALLOC and MALLOC macros. */
438 /* Allocate memory for N elements of type T, with error checking. */
439 /* extern t *XNMALLOC (size_t n, typename t); */
440 # define XNMALLOC(n, t) \
441 (sizeof (t) == 1 ? xmalloc (n) : xnmalloc (n, sizeof (t)))
443 /* Allocate memory for N elements of type T, with error checking,
445 /* extern t *XCALLOC (size_t n, typename t); */
446 # define XCALLOC(n, t) \
447 (sizeof (t) == 1 ? xzalloc (n) : xcalloc (n, sizeof (t)))
449 #define CALLOC(p, n) do { (p) = XCALLOC (n, *(p)); } while (0)
450 #define MALLOC(p, n) do { (p) = XNMALLOC (n, *(p)); } while (0)
451 #define REALLOC(p, n) do {(p) = xnrealloc (p, n, sizeof (*(p))); } while (0)
453 /* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED. */
454 #define REALLOC_IF_NECESSARY(p, n_alloc, n_required) \
457 if ((n_alloc) <= (n_required)) \
459 size_t new_n_alloc = (n_required) + !(p); \
460 (p) = x2nrealloc (p, &new_n_alloc, sizeof (*(p))); \
461 (n_alloc) = new_n_alloc; \
475 fprintf (stderr, "END");
476 else if (t < NOTCHAR)
479 fprintf (stderr, "%c", ch);
540 fprintf (stderr, "%s", s);
545 /* Stuff pertaining to charclasses. */
548 tstbit (unsigned int b, charclass const c)
550 return c[b / INTBITS] & 1 << b % INTBITS;
554 setbit (unsigned int b, charclass c)
556 c[b / INTBITS] |= 1 << b % INTBITS;
560 clrbit (unsigned int b, charclass c)
562 c[b / INTBITS] &= ~(1 << b % INTBITS);
566 copyset (charclass const src, charclass dst)
568 memcpy (dst, src, sizeof (charclass));
572 zeroset (charclass s)
574 memset (s, 0, sizeof (charclass));
582 for (i = 0; i < CHARCLASS_INTS; ++i)
587 equal (charclass const s1, charclass const s2)
589 return memcmp (s1, s2, sizeof (charclass)) == 0;
592 /* A pointer to the current dfa is kept here during parsing. */
593 static struct dfa *dfa;
595 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
597 charclass_index (charclass const s)
601 for (i = 0; i < dfa->cindex; ++i)
602 if (equal (s, dfa->charclasses[i]))
604 REALLOC_IF_NECESSARY (dfa->charclasses, dfa->calloc, dfa->cindex + 1);
606 copyset (s, dfa->charclasses[i]);
610 /* Syntax bits controlling the behavior of the lexical analyzer. */
611 static reg_syntax_t syntax_bits, syntax_bits_set;
613 /* Flag for case-folding letters into sets. */
614 static int case_fold;
616 /* End-of-line byte in data. */
617 static unsigned char eolbyte;
619 /* Cache of char-context values. */
620 static int sbit[NOTCHAR];
622 /* Set of characters considered letters. */
623 static charclass letters;
625 /* Set of characters that are newline. */
626 static charclass newline;
628 /* Add this to the test for whether a byte is word-constituent, since on
629 BSD-based systems, many values in the 128..255 range are classified as
630 alphabetic, while on glibc-based systems, they are not. */
632 # define is_valid_unibyte_character(c) 1
634 # define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF))
637 /* Return non-zero if C is a 'word-constituent' byte; zero otherwise. */
638 #define IS_WORD_CONSTITUENT(C) \
639 (is_valid_unibyte_character (C) && (isalnum (C) || (C) == '_'))
642 char_context (unsigned char c)
644 if (c == eolbyte || c == 0)
646 if (IS_WORD_CONSTITUENT (c))
652 wchar_context (wint_t wc)
654 if (wc == (wchar_t) eolbyte || wc == 0)
656 if (wc == L'_' || iswalnum (wc))
661 /* Entry point to set syntax options. */
663 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
672 for (i = 0; i < NOTCHAR; ++i)
674 sbit[i] = char_context (i);
687 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
688 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
689 this may happen when folding case in weird Turkish locales where
690 dotless i/dotted I are not included in the chosen character set.
691 Return whether a bit was set in the charclass. */
694 setbit_wc (wint_t wc, charclass c)
704 /* Set a bit in the charclass for the given single byte character,
705 if it is valid in the current character set. */
707 setbit_c (int b, charclass c)
709 /* Do nothing if b is invalid in this character set. */
710 if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
715 # define setbit_c setbit
717 setbit_wc (wint_t wc, charclass c)
720 /*NOTREACHED*/ return false;
724 /* Like setbit_c, but if case is folded, set both cases of a letter. For
725 MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
726 and the caller takes care of setting the appropriate field of struct
729 setbit_case_fold_c (int b, charclass c)
733 wint_t wc = btowc (b);
737 if (case_fold && iswalpha (wc))
738 setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
743 if (case_fold && isalpha (b))
744 setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
750 /* UTF-8 encoding allows some optimizations that we can't otherwise
751 assume in a multibyte encoding. */
755 static int utf8 = -1;
758 #if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
759 utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
768 /* Lexical analyzer. All the dross that deals with the obnoxious
769 GNU Regex syntax bits is located here. The poor, suffering
770 reader is referred to the GNU Regex documentation for the
771 meaning of the @#%!@#%^!@ syntax bits. */
773 static char const *lexptr; /* Pointer to next input character. */
774 static size_t lexleft; /* Number of characters remaining. */
775 static token lasttok; /* Previous token returned; initially END. */
776 static int laststart; /* True if we're separated from beginning or (, |
777 only by zero-width characters. */
778 static size_t parens; /* Count of outstanding left parens. */
779 static int minrep, maxrep; /* Repeat counts for {m,n}. */
780 static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
782 static int cur_mb_len = 1; /* Length of the multibyte representation of
784 /* These variables are used only if (MB_CUR_MAX > 1). */
785 static mbstate_t mbs; /* Mbstate for mbrlen(). */
786 static wchar_t wctok; /* Wide character representation of the current
787 multibyte character. */
788 static unsigned char *mblen_buf; /* Correspond to the input buffer in dfaexec().
789 Each element store the amount of remain
790 byte of corresponding multibyte character
791 in the input string. A element's value
792 is 0 if corresponding character is a
793 single byte character.
794 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
795 mblen_buf : 0, 3, 2, 1
797 static wchar_t *inputwcs; /* Wide character representation of input
799 The length of this array is same as
800 the length of input string(char array).
801 inputstring[i] is a single-byte char,
802 or 1st byte of a multibyte char.
803 And inputwcs[i] is the codepoint. */
804 static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
805 static unsigned char const *buf_end; /* reference to end in dfaexec(). */
809 /* Note that characters become unsigned here. */
810 # define FETCH_WC(c, wc, eoferr) \
817 return lasttok = END; \
822 cur_mb_len = mbrtowc (&_wc, lexptr, lexleft, &mbs); \
823 if (cur_mb_len <= 0) \
827 (wc) = (c) = to_uchar (*lexptr++); \
831 lexptr += cur_mb_len; \
832 lexleft -= cur_mb_len; \
839 # define FETCH(c, eoferr) \
842 FETCH_WC (c, wc, eoferr); \
846 /* Note that characters become unsigned here. */
847 # define FETCH(c, eoferr) \
854 return lasttok = END; \
856 (c) = to_uchar (*lexptr++); \
860 # define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
862 #endif /* MBS_SUPPORT */
865 # define MIN(a,b) ((a) < (b) ? (a) : (b))
868 typedef int predicate (int);
870 /* The following list maps the names of the Posix named character classes
871 to predicate functions that determine whether a given character is in
872 the class. The leading [ has already been eaten by the lexical analyzer. */
877 bool single_byte_only;
880 static const struct dfa_ctype prednames[] = {
881 {"alpha", isalpha, false},
882 {"upper", isupper, false},
883 {"lower", islower, false},
884 {"digit", isdigit, true},
885 {"xdigit", isxdigit, true},
886 {"space", isspace, false},
887 {"punct", ispunct, false},
888 {"alnum", isalnum, false},
889 {"print", isprint, false},
890 {"graph", isgraph, false},
891 {"cntrl", iscntrl, false},
892 {"blank", isblank, false},
896 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
897 find_pred (const char *str)
900 for (i = 0; prednames[i].name; ++i)
901 if (STREQ (str, prednames[i].name))
904 return &prednames[i];
907 /* Multibyte character handling sub-routine for lex.
908 This function parse a bracket expression and build a struct
911 parse_bracket_exp (void)
917 /* Used to warn about [:space:].
918 Bit 0 = first character is a colon.
919 Bit 1 = last character is a colon.
920 Bit 2 = includes any other character but a colon.
921 Bit 3 = includes ranges, char/equiv classes or collation elements. */
922 int colon_warning_state;
928 /* Work area to build a mb_char_classes. */
929 struct mb_char_classes *work_mbc;
930 size_t chars_al, range_sts_al, range_ends_al, ch_classes_al,
931 equivs_al, coll_elems_al;
934 range_sts_al = range_ends_al = 0;
935 ch_classes_al = equivs_al = coll_elems_al = 0;
938 REALLOC_IF_NECESSARY (dfa->mbcsets, dfa->mbcsets_alloc,
941 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
942 We will update dfa->multibyte_prop[] in addtok(), because we can't
943 decide the index in dfa->tokens[]. */
945 /* Initialize work area. */
946 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
947 memset (work_mbc, 0, sizeof *work_mbc);
952 memset (ccl, 0, sizeof ccl);
953 FETCH_WC (c, wc, _("unbalanced ["));
956 FETCH_WC (c, wc, _("unbalanced ["));
962 colon_warning_state = (c == ':');
965 c1 = EOF; /* mark c1 is not initialized". */
966 colon_warning_state &= ~2;
968 /* Note that if we're looking at some other [:...:] construct,
969 we just treat it as a bunch of ordinary characters. We can do
970 this because we assume regex has checked for syntax errors before
971 dfa is ever called. */
972 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
974 #define BRACKET_BUFFER_SIZE 128
975 char str[BRACKET_BUFFER_SIZE];
976 FETCH_WC (c1, wc1, _("unbalanced ["));
978 /* If pattern contains `[[:', `[[.', or `[[='. */
980 /* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1. */
981 || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '=')))
986 FETCH_WC (c, wc, _("unbalanced ["));
987 if ((c == c1 && *lexptr == ']') || lexleft == 0)
989 if (len < BRACKET_BUFFER_SIZE)
992 /* This is in any case an invalid class name. */
998 FETCH_WC (c, wc, _("unbalanced ["));
1000 /* build character class. */
1003 = (case_fold && (STREQ (str, "upper")
1004 || STREQ (str, "lower")) ? "alpha" : str);
1005 const struct dfa_ctype *pred = find_pred (class);
1007 dfaerror (_("invalid character class"));
1009 if (MB_CUR_MAX > 1 && !pred->single_byte_only)
1011 /* Store the character class as wctype_t. */
1012 wctype_t wt = wctype (class);
1014 REALLOC_IF_NECESSARY (work_mbc->ch_classes,
1016 work_mbc->nch_classes + 1);
1017 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
1020 for (c2 = 0; c2 < NOTCHAR; ++c2)
1021 if (pred->func (c2))
1022 setbit_case_fold_c (c2, ccl);
1025 else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
1027 char *elem = xmemdup (str, len + 1);
1030 /* build equivalence class. */
1032 REALLOC_IF_NECESSARY (work_mbc->equivs,
1033 equivs_al, work_mbc->nequivs + 1);
1034 work_mbc->equivs[work_mbc->nequivs++] = elem;
1038 /* build collating element. */
1040 REALLOC_IF_NECESSARY (work_mbc->coll_elems,
1042 work_mbc->ncoll_elems + 1);
1043 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
1046 colon_warning_state |= 8;
1048 /* Fetch new lookahead character. */
1049 FETCH_WC (c1, wc1, _("unbalanced ["));
1053 /* We treat '[' as a normal character here. c/c1/wc/wc1
1054 are already set up. */
1057 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1058 FETCH_WC (c, wc, _("unbalanced ["));
1061 FETCH_WC (c1, wc1, _("unbalanced ["));
1064 /* build range characters. */
1066 FETCH_WC (c2, wc2, _("unbalanced ["));
1069 /* In the case [x-], the - is an ordinary hyphen,
1070 which is left in c1, the lookahead character. */
1071 lexptr -= cur_mb_len;
1072 lexleft += cur_mb_len;
1076 if (c1 == '-' && c2 != ']')
1078 if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1079 FETCH_WC (c2, wc2, _("unbalanced ["));
1083 /* When case folding map a range, say [m-z] (or even [M-z])
1084 to the pair of ranges, [m-z] [M-Z]. */
1085 REALLOC_IF_NECESSARY (work_mbc->range_sts,
1086 range_sts_al, work_mbc->nranges + 1);
1087 REALLOC_IF_NECESSARY (work_mbc->range_ends,
1088 range_ends_al, work_mbc->nranges + 1);
1089 work_mbc->range_sts[work_mbc->nranges] =
1090 case_fold ? towlower (wc) : (wchar_t) wc;
1091 work_mbc->range_ends[work_mbc->nranges++] =
1092 case_fold ? towlower (wc2) : (wchar_t) wc2;
1095 if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
1097 REALLOC_IF_NECESSARY (work_mbc->range_sts,
1098 range_sts_al, work_mbc->nranges + 1);
1099 work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
1100 REALLOC_IF_NECESSARY (work_mbc->range_ends,
1101 range_ends_al, work_mbc->nranges + 1);
1102 work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
1114 if (!hard_LC_COLLATE)
1115 for (c = c1; c <= c2; c++)
1116 setbit_case_fold_c (c, ccl);
1119 /* Defer to the system regex library about the meaning
1120 of range expressions. */
1122 char pattern[6] = { '[', c1, '-', c2, ']', 0 };
1123 char subject[2] = { 0, 0 };
1124 regcomp (&re, pattern, REG_NOSUB);
1125 for (c = 0; c < NOTCHAR; ++c)
1128 if (!(case_fold && isupper (c))
1129 && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
1130 setbit_case_fold_c (c, ccl);
1136 colon_warning_state |= 8;
1137 FETCH_WC (c1, wc1, _("unbalanced ["));
1141 colon_warning_state |= (c == ':') ? 2 : 4;
1143 if (MB_CUR_MAX == 1)
1145 setbit_case_fold_c (c, ccl);
1149 if (case_fold && iswalpha (wc))
1152 if (!setbit_wc (wc, ccl))
1154 REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
1155 work_mbc->nchars + 1);
1156 work_mbc->chars[work_mbc->nchars++] = wc;
1164 if (!setbit_wc (wc, ccl))
1166 REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
1167 work_mbc->nchars + 1);
1168 work_mbc->chars[work_mbc->nchars++] = wc;
1171 while ((wc = wc1, (c = c1) != ']'));
1173 if (colon_warning_state == 7)
1174 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1178 static charclass zeroclass;
1179 work_mbc->invert = invert;
1180 work_mbc->cset = equal (ccl, zeroclass) ? -1 : charclass_index (ccl);
1186 assert (MB_CUR_MAX == 1);
1188 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1189 clrbit (eolbyte, ccl);
1192 return CSET + charclass_index (ccl);
1203 /* Basic plan: We fetch a character. If it's a backslash,
1204 we set the backslash flag and go through the loop again.
1205 On the plus side, this avoids having a duplicate of the
1206 main switch inside the backslash case. On the minus side,
1207 it means that just about every case begins with
1208 "if (backslash) ...". */
1209 for (i = 0; i < 2; ++i)
1213 FETCH_WC (c, wctok, NULL);
1226 dfaerror (_("unfinished \\ escape"));
1233 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1234 || lasttok == END || lasttok == LPAREN || lasttok == OR)
1235 return lasttok = BEGLINE;
1241 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1243 || (syntax_bits & RE_NO_BK_PARENS
1244 ? lexleft > 0 && *lexptr == ')'
1245 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
1246 || (syntax_bits & RE_NO_BK_VBAR
1247 ? lexleft > 0 && *lexptr == '|'
1248 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
1249 || ((syntax_bits & RE_NEWLINE_ALT)
1250 && lexleft > 0 && *lexptr == '\n'))
1251 return lasttok = ENDLINE;
1263 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
1266 return lasttok = BACKREF;
1271 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1272 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
1276 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1277 return lasttok = ENDLINE; /* FIXME: should be end of string */
1281 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1282 return lasttok = BEGWORD;
1286 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1287 return lasttok = ENDWORD;
1291 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1292 return lasttok = LIMWORD;
1296 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1297 return lasttok = NOTLIMWORD;
1301 if (syntax_bits & RE_LIMITED_OPS)
1303 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1305 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1307 return lasttok = QMARK;
1312 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1314 return lasttok = STAR;
1317 if (syntax_bits & RE_LIMITED_OPS)
1319 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1321 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1323 return lasttok = PLUS;
1326 if (!(syntax_bits & RE_INTERVALS))
1328 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
1330 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1335 {M,} - minimum count, maximum is infinity
1337 {,} - 0 to infinity (same as '*')
1338 {M,N} - M through N */
1340 char const *p = lexptr;
1341 char const *lim = p + lexleft;
1342 minrep = maxrep = -1;
1343 for (; p != lim && ISASCIIDIGIT (*p); p++)
1348 minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
1358 while (++p != lim && ISASCIIDIGIT (*p))
1363 maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0');
1367 if (! ((! backslash || (p != lim && *p++ == '\\'))
1368 && p != lim && *p++ == '}'
1369 && 0 <= minrep && (maxrep < 0 || minrep <= maxrep)))
1371 if (syntax_bits & RE_INVALID_INTERVAL_ORD)
1373 dfaerror (_("Invalid content of \\{\\}"));
1375 if (RE_DUP_MAX < maxrep)
1376 dfaerror (_("Regular expression too big"));
1381 return lasttok = REPMN;
1384 if (syntax_bits & RE_LIMITED_OPS)
1386 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
1389 return lasttok = OR;
1392 if (syntax_bits & RE_LIMITED_OPS
1393 || backslash || !(syntax_bits & RE_NEWLINE_ALT))
1396 return lasttok = OR;
1399 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1403 return lasttok = LPAREN;
1406 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1408 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1412 return lasttok = RPAREN;
1419 /* In multibyte environment period must match with a single
1420 character not a byte. So we use ANYCHAR. */
1422 return lasttok = ANYCHAR;
1426 if (!(syntax_bits & RE_DOT_NEWLINE))
1427 clrbit (eolbyte, ccl);
1428 if (syntax_bits & RE_DOT_NOT_NULL)
1431 return lasttok = CSET + charclass_index (ccl);
1435 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1438 for (c2 = 0; c2 < NOTCHAR; ++c2)
1444 return lasttok = CSET + charclass_index (ccl);
1448 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1451 for (c2 = 0; c2 < NOTCHAR; ++c2)
1452 if (IS_WORD_CONSTITUENT (c2))
1457 return lasttok = CSET + charclass_index (ccl);
1463 return lasttok = parse_bracket_exp ();
1468 /* For multibyte character sets, folding is done in atom. Always
1471 return lasttok = WCHAR;
1473 if (case_fold && isalpha (c))
1476 setbit_case_fold_c (c, ccl);
1477 return lasttok = CSET + charclass_index (ccl);
1484 /* The above loop should consume at most a backslash
1485 and some other character. */
1487 return END; /* keeps pedantic compilers happy. */
1490 /* Recursive descent parser for regular expressions. */
1492 static token tok; /* Lookahead token. */
1493 static size_t depth; /* Current depth of a hypothetical stack
1494 holding deferred productions. This is
1495 used to determine the depth that will be
1496 required of the real stack later on in
1500 addtok_mb (token t, int mbprop)
1504 REALLOC_IF_NECESSARY (dfa->multibyte_prop, dfa->nmultibyte_prop,
1506 dfa->multibyte_prop[dfa->tindex] = mbprop;
1509 REALLOC_IF_NECESSARY (dfa->tokens, dfa->talloc, dfa->tindex + 1);
1510 dfa->tokens[dfa->tindex++] = t;
1530 if (depth > dfa->depth)
1534 static void addtok_wc (wint_t wc);
1536 /* Add the given token to the parse tree, maintaining the depth count and
1537 updating the maximum depth if necessary. */
1541 if (MB_CUR_MAX > 1 && t == MBCSET)
1543 bool need_or = false;
1544 struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
1546 /* Extract wide characters into alternations for better performance.
1547 This does not require UTF-8. */
1548 if (!work_mbc->invert)
1551 for (i = 0; i < work_mbc->nchars; i++)
1553 addtok_wc (work_mbc->chars[i]);
1558 work_mbc->nchars = 0;
1561 /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */
1562 if (work_mbc->invert
1563 || (!using_utf8 () && work_mbc->cset != -1)
1564 || work_mbc->nchars != 0
1565 || work_mbc->nch_classes != 0
1566 || work_mbc->nranges != 0
1567 || work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0)
1569 addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
1575 /* Characters have been handled above, so it is possible
1576 that the mbcset is empty now. Do nothing in that case. */
1577 if (work_mbc->cset != -1)
1579 assert (using_utf8 ());
1580 addtok (CSET + work_mbc->cset);
1593 /* We treat a multibyte character as a single atom, so that DFA
1594 can treat a multibyte character as a single expression.
1596 e.g. We construct following tree from "<mb1><mb2>".
1597 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1598 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1600 addtok_wc (wint_t wc)
1602 unsigned char buf[MB_LEN_MAX];
1605 memset (&s, 0, sizeof s);
1606 cur_mb_len = wcrtomb ((char *) buf, wc, &s);
1608 /* This is merely stop-gap. When cur_mb_len is 0 or negative,
1609 buf[0] is undefined, yet skipping the addtok_mb call altogether
1610 can result in heap corruption. */
1611 if (cur_mb_len <= 0)
1614 addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
1615 for (i = 1; i < cur_mb_len; i++)
1617 addtok_mb (buf[i], i == cur_mb_len - 1 ? 2 : 0);
1623 addtok_wc (wint_t wc)
1629 add_utf8_anychar (void)
1632 static const charclass utf8_classes[5] = {
1633 {0, 0, 0, 0, ~0, ~0, 0, 0}, /* 80-bf: non-lead bytes */
1634 {~0, ~0, ~0, ~0, 0, 0, 0, 0}, /* 00-7f: 1-byte sequence */
1635 {0, 0, 0, 0, 0, 0, 0xfffffffcU, 0}, /* c2-df: 2-byte sequence */
1636 {0, 0, 0, 0, 0, 0, 0, 0xffff}, /* e0-ef: 3-byte sequence */
1637 {0, 0, 0, 0, 0, 0, 0, 0xff0000} /* f0-f7: 4-byte sequence */
1639 const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1642 /* Define the five character classes that are needed below. */
1643 if (dfa->utf8_anychar_classes[0] == 0)
1644 for (i = 0; i < n; i++)
1647 copyset (utf8_classes[i], c);
1650 if (!(syntax_bits & RE_DOT_NEWLINE))
1651 clrbit (eolbyte, c);
1652 if (syntax_bits & RE_DOT_NOT_NULL)
1655 dfa->utf8_anychar_classes[i] = CSET + charclass_index (c);
1658 /* A valid UTF-8 character is
1661 |[0xc2-0xdf][0x80-0xbf]
1662 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1663 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1665 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1666 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1667 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1668 for (i = 1; i < n; i++)
1669 addtok (dfa->utf8_anychar_classes[i]);
1672 addtok (dfa->utf8_anychar_classes[0]);
1679 /* The grammar understood by the parser is as follows.
1698 <multibyte character>
1709 LPAREN regexp RPAREN
1712 The parser builds a parse tree in postfix form in an array of tokens. */
1721 else if (MBS_SUPPORT && tok == WCHAR)
1723 addtok_wc (case_fold ? towlower (wctok) : wctok);
1725 if (case_fold && iswalpha (wctok))
1727 addtok_wc (towupper (wctok));
1734 else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ())
1736 /* For UTF-8 expand the period to a series of CSETs that define a valid
1737 UTF-8 character. This avoids using the slow multibyte path. I'm
1738 pretty sure it would be both profitable and correct to do it for
1739 any encoding; however, the optimization must be done manually as
1740 it is done above in add_utf8_anychar. So, let's start with
1741 UTF-8: it is the most used, and the structure of the encoding
1742 makes the correctness more obvious. */
1743 add_utf8_anychar ();
1746 else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1747 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1749 || tok == ANYCHAR || tok == MBCSET
1750 #endif /* MBS_SUPPORT */
1751 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1756 else if (tok == LPAREN)
1761 dfaerror (_("unbalanced ("));
1768 /* Return the number of tokens in the given subexpression. */
1769 static size_t _GL_ATTRIBUTE_PURE
1770 nsubtoks (size_t tindex)
1774 switch (dfa->tokens[tindex - 1])
1781 return 1 + nsubtoks (tindex - 1);
1784 ntoks1 = nsubtoks (tindex - 1);
1785 return 1 + ntoks1 + nsubtoks (tindex - 1 - ntoks1);
1789 /* Copy the given subexpression to the top of the tree. */
1791 copytoks (size_t tindex, size_t ntokens)
1795 for (i = 0; i < ntokens; ++i)
1797 addtok (dfa->tokens[tindex + i]);
1798 /* Update index into multibyte csets. */
1799 if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
1800 dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
1808 size_t tindex, ntokens;
1811 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1812 if (tok == REPMN && (minrep || maxrep))
1814 ntokens = nsubtoks (dfa->tindex);
1815 tindex = dfa->tindex - ntokens;
1820 for (i = 1; i < minrep; ++i)
1822 copytoks (tindex, ntokens);
1825 for (; i < maxrep; ++i)
1827 copytoks (tindex, ntokens);
1833 else if (tok == REPMN)
1835 dfa->tindex -= nsubtoks (dfa->tindex);
1850 while (tok != RPAREN && tok != OR && tok >= 0)
1869 /* Main entry point for the parser. S is a string to be parsed, len is the
1870 length of the string, so s can include NUL characters. D is a pointer to
1871 the struct dfa to parse into. */
1873 dfaparse (char const *s, size_t len, struct dfa *d)
1882 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1887 memset (&mbs, 0, sizeof mbs);
1890 if (!syntax_bits_set)
1891 dfaerror (_("no syntax specified"));
1899 dfaerror (_("unbalanced )"));
1901 addtok (END - d->nregexps);
1910 /* Some primitives for operating on sets of positions. */
1912 /* Copy one set to another; the destination must be large enough. */
1914 copy (position_set const *src, position_set * dst)
1916 REALLOC_IF_NECESSARY (dst->elems, dst->alloc, src->nelem);
1917 memcpy (dst->elems, src->elems, sizeof (dst->elems[0]) * src->nelem);
1918 dst->nelem = src->nelem;
1922 alloc_position_set (position_set * s, size_t size)
1924 MALLOC (s->elems, size);
1929 /* Insert position P in set S. S is maintained in sorted order on
1930 decreasing index. If there is already an entry in S with P.index
1931 then merge (logically-OR) P's constraints into the one in S.
1932 S->elems must point to an array large enough to hold the resulting set. */
1934 insert (position p, position_set * s)
1936 size_t count = s->nelem;
1937 size_t lo = 0, hi = count;
1941 size_t mid = (lo + hi) >> 1;
1942 if (s->elems[mid].index > p.index)
1948 if (lo < count && p.index == s->elems[lo].index)
1950 s->elems[lo].constraint |= p.constraint;
1954 REALLOC_IF_NECESSARY (s->elems, s->alloc, count + 1);
1955 for (i = count; i > lo; i--)
1956 s->elems[i] = s->elems[i - 1];
1961 /* Merge two sets of positions into a third. The result is exactly as if
1962 the positions of both sets were inserted into an initially empty set. */
1964 merge (position_set const *s1, position_set const *s2, position_set * m)
1966 size_t i = 0, j = 0;
1968 REALLOC_IF_NECESSARY (m->elems, m->alloc, s1->nelem + s2->nelem);
1970 while (i < s1->nelem && j < s2->nelem)
1971 if (s1->elems[i].index > s2->elems[j].index)
1972 m->elems[m->nelem++] = s1->elems[i++];
1973 else if (s1->elems[i].index < s2->elems[j].index)
1974 m->elems[m->nelem++] = s2->elems[j++];
1977 m->elems[m->nelem] = s1->elems[i++];
1978 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1980 while (i < s1->nelem)
1981 m->elems[m->nelem++] = s1->elems[i++];
1982 while (j < s2->nelem)
1983 m->elems[m->nelem++] = s2->elems[j++];
1986 /* Delete a position from a set. */
1988 delete (position p, position_set * s)
1992 for (i = 0; i < s->nelem; ++i)
1993 if (p.index == s->elems[i].index)
1996 for (--s->nelem; i < s->nelem; ++i)
1997 s->elems[i] = s->elems[i + 1];
2000 /* Find the index of the state corresponding to the given position set with
2001 the given preceding context, or create a new state if there is no such
2002 state. Context tells whether we got here on a newline or letter. */
2004 state_index (struct dfa *d, position_set const *s, int context)
2010 for (i = 0; i < s->nelem; ++i)
2011 hash ^= s->elems[i].index + s->elems[i].constraint;
2013 /* Try to find a state that exactly matches the proposed one. */
2014 for (i = 0; i < d->sindex; ++i)
2016 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2017 || context != d->states[i].context)
2019 for (j = 0; j < s->nelem; ++j)
2020 if (s->elems[j].constraint
2021 != d->states[i].elems.elems[j].constraint
2022 || s->elems[j].index != d->states[i].elems.elems[j].index)
2028 /* We'll have to create a new state. */
2029 REALLOC_IF_NECESSARY (d->states, d->salloc, d->sindex + 1);
2030 d->states[i].hash = hash;
2031 alloc_position_set (&d->states[i].elems, s->nelem);
2032 copy (s, &d->states[i].elems);
2033 d->states[i].context = context;
2034 d->states[i].backref = 0;
2035 d->states[i].constraint = 0;
2036 d->states[i].first_end = 0;
2039 d->states[i].mbps.nelem = 0;
2040 d->states[i].mbps.elems = NULL;
2042 for (j = 0; j < s->nelem; ++j)
2043 if (d->tokens[s->elems[j].index] < 0)
2045 constraint = s->elems[j].constraint;
2046 if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_ANY))
2047 d->states[i].constraint |= constraint;
2048 if (!d->states[i].first_end)
2049 d->states[i].first_end = d->tokens[s->elems[j].index];
2051 else if (d->tokens[s->elems[j].index] == BACKREF)
2053 d->states[i].constraint = NO_CONSTRAINT;
2054 d->states[i].backref = 1;
2062 /* Find the epsilon closure of a set of positions. If any position of the set
2063 contains a symbol that matches the empty string in some context, replace
2064 that position with the elements of its follow labeled with an appropriate
2065 constraint. Repeat exhaustively until no funny positions are left.
2066 S->elems must be large enough to hold the result. */
2068 epsclosure (position_set * s, struct dfa const *d)
2071 char *visited; /* array of booleans, enough to use char, not int */
2074 CALLOC (visited, d->tindex);
2076 for (i = 0; i < s->nelem; ++i)
2077 if (d->tokens[s->elems[i].index] >= NOTCHAR
2078 && d->tokens[s->elems[i].index] != BACKREF
2080 && d->tokens[s->elems[i].index] != ANYCHAR
2081 && d->tokens[s->elems[i].index] != MBCSET
2083 && d->tokens[s->elems[i].index] < CSET)
2086 p.constraint = old.constraint;
2087 delete (s->elems[i], s);
2088 if (visited[old.index])
2093 visited[old.index] = 1;
2094 switch (d->tokens[old.index])
2097 p.constraint &= BEGLINE_CONSTRAINT;
2100 p.constraint &= ENDLINE_CONSTRAINT;
2103 p.constraint &= BEGWORD_CONSTRAINT;
2106 p.constraint &= ENDWORD_CONSTRAINT;
2109 p.constraint &= LIMWORD_CONSTRAINT;
2112 p.constraint &= NOTLIMWORD_CONSTRAINT;
2117 for (j = 0; j < d->follows[old.index].nelem; ++j)
2119 p.index = d->follows[old.index].elems[j].index;
2122 /* Force rescan to start at the beginning. */
2129 /* Returns the set of contexts for which there is at least one
2130 character included in C. */
2133 charclass_context (charclass c)
2138 if (tstbit (eolbyte, c))
2139 context |= CTX_NEWLINE;
2141 for (j = 0; j < CHARCLASS_INTS; ++j)
2143 if (c[j] & letters[j])
2144 context |= CTX_LETTER;
2145 if (c[j] & ~(letters[j] | newline[j]))
2146 context |= CTX_NONE;
2152 /* Returns the contexts on which the position set S depends. Each context
2153 in the set of returned contexts (let's call it SC) may have a different
2154 follow set than other contexts in SC, and also different from the
2155 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2156 in the complement set will have the same follow set. */
2158 static int _GL_ATTRIBUTE_PURE
2159 state_separate_contexts (position_set const *s)
2161 int separate_contexts = 0;
2164 for (j = 0; j < s->nelem; ++j)
2166 if (PREV_NEWLINE_DEPENDENT (s->elems[j].constraint))
2167 separate_contexts |= CTX_NEWLINE;
2168 if (PREV_LETTER_DEPENDENT (s->elems[j].constraint))
2169 separate_contexts |= CTX_LETTER;
2172 return separate_contexts;
2176 /* Perform bottom-up analysis on the parse tree, computing various functions.
2177 Note that at this point, we're pretending constructs like \< are real
2178 characters rather than constraints on what can follow them.
2180 Nullable: A node is nullable if it is at the root of a regexp that can
2181 match the empty string.
2182 * EMPTY leaves are nullable.
2183 * No other leaf is nullable.
2184 * A QMARK or STAR node is nullable.
2185 * A PLUS node is nullable if its argument is nullable.
2186 * A CAT node is nullable if both its arguments are nullable.
2187 * An OR node is nullable if either argument is nullable.
2189 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2190 that could correspond to the first character of a string matching the
2191 regexp rooted at the given node.
2192 * EMPTY leaves have empty firstpos.
2193 * The firstpos of a nonempty leaf is that leaf itself.
2194 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2196 * The firstpos of a CAT node is the firstpos of the left argument, union
2197 the firstpos of the right if the left argument is nullable.
2198 * The firstpos of an OR node is the union of firstpos of each argument.
2200 Lastpos: The lastpos of a node is the set of positions that could
2201 correspond to the last character of a string matching the regexp at
2203 * EMPTY leaves have empty lastpos.
2204 * The lastpos of a nonempty leaf is that leaf itself.
2205 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2207 * The lastpos of a CAT node is the lastpos of its right argument, union
2208 the lastpos of the left if the right argument is nullable.
2209 * The lastpos of an OR node is the union of the lastpos of each argument.
2211 Follow: The follow of a position is the set of positions that could
2212 correspond to the character following a character matching the node in
2213 a string matching the regexp. At this point we consider special symbols
2214 that match the empty string in some context to be just normal characters.
2215 Later, if we find that a special symbol is in a follow set, we will
2216 replace it with the elements of its follow, labeled with an appropriate
2218 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2219 the follow of every node in the lastpos.
2220 * Every node in the firstpos of the second argument of a CAT node is in
2221 the follow of every node in the lastpos of the first argument.
2223 Because of the postfix representation of the parse tree, the depth-first
2224 analysis is conveniently done by a linear scan with the aid of a stack.
2225 Sets are stored as arrays of the elements, obeying a stack-like allocation
2226 scheme; the number of elements in each set deeper in the stack can be
2227 used to determine the address of a particular set's array. */
2229 dfaanalyze (struct dfa *d, int searchflag)
2231 int *nullable; /* Nullable stack. */
2232 size_t *nfirstpos; /* Element count stack for firstpos sets. */
2233 position *firstpos; /* Array where firstpos elements are stored. */
2234 size_t *nlastpos; /* Element count stack for lastpos sets. */
2235 position *lastpos; /* Array where lastpos elements are stored. */
2236 position_set tmp; /* Temporary set for merging sets. */
2237 position_set merged; /* Result of merging sets. */
2238 int separate_contexts; /* Context wanted by some position. */
2240 size_t *o_nfirst, *o_nlast;
2241 position *o_firstpos, *o_lastpos;
2246 fprintf (stderr, "dfaanalyze:\n");
2247 for (i = 0; i < d->tindex; ++i)
2249 fprintf (stderr, " %zd:", i);
2250 prtok (d->tokens[i]);
2252 putc ('\n', stderr);
2255 d->searchflag = searchflag;
2257 MALLOC (nullable, d->depth);
2258 o_nullable = nullable;
2259 MALLOC (nfirstpos, d->depth);
2260 o_nfirst = nfirstpos;
2261 MALLOC (firstpos, d->nleaves);
2262 o_firstpos = firstpos, firstpos += d->nleaves;
2263 MALLOC (nlastpos, d->depth);
2265 MALLOC (lastpos, d->nleaves);
2266 o_lastpos = lastpos, lastpos += d->nleaves;
2267 alloc_position_set (&merged, d->nleaves);
2269 CALLOC (d->follows, d->tindex);
2271 for (i = 0; i < d->tindex; ++i)
2273 switch (d->tokens[i])
2276 /* The empty set is nullable. */
2279 /* The firstpos and lastpos of the empty leaf are both empty. */
2280 *nfirstpos++ = *nlastpos++ = 0;
2285 /* Every element in the firstpos of the argument is in the follow
2286 of every element in the lastpos. */
2287 tmp.nelem = nfirstpos[-1];
2288 tmp.elems = firstpos;
2290 for (j = 0; j < nlastpos[-1]; ++j)
2292 merge (&tmp, &d->follows[pos[j].index], &merged);
2293 copy (&merged, &d->follows[pos[j].index]);
2297 /* A QMARK or STAR node is automatically nullable. */
2298 if (d->tokens[i] != PLUS)
2303 /* Every element in the firstpos of the second argument is in the
2304 follow of every element in the lastpos of the first argument. */
2305 tmp.nelem = nfirstpos[-1];
2306 tmp.elems = firstpos;
2307 pos = lastpos + nlastpos[-1];
2308 for (j = 0; j < nlastpos[-2]; ++j)
2310 merge (&tmp, &d->follows[pos[j].index], &merged);
2311 copy (&merged, &d->follows[pos[j].index]);
2314 /* The firstpos of a CAT node is the firstpos of the first argument,
2315 union that of the second argument if the first is nullable. */
2317 nfirstpos[-2] += nfirstpos[-1];
2319 firstpos += nfirstpos[-1];
2322 /* The lastpos of a CAT node is the lastpos of the second argument,
2323 union that of the first argument if the second is nullable. */
2325 nlastpos[-2] += nlastpos[-1];
2328 pos = lastpos + nlastpos[-2];
2329 for (j = nlastpos[-1]; j-- > 0;)
2330 pos[j] = lastpos[j];
2331 lastpos += nlastpos[-2];
2332 nlastpos[-2] = nlastpos[-1];
2336 /* A CAT node is nullable if both arguments are nullable. */
2337 nullable[-2] = nullable[-1] && nullable[-2];
2342 /* The firstpos is the union of the firstpos of each argument. */
2343 nfirstpos[-2] += nfirstpos[-1];
2346 /* The lastpos is the union of the lastpos of each argument. */
2347 nlastpos[-2] += nlastpos[-1];
2350 /* An OR node is nullable if either argument is nullable. */
2351 nullable[-2] = nullable[-1] || nullable[-2];
2356 /* Anything else is a nonempty position. (Note that special
2357 constructs like \< are treated as nonempty strings here;
2358 an "epsilon closure" effectively makes them nullable later.
2359 Backreferences have to get a real position so we can detect
2360 transitions on them later. But they are nullable. */
2361 *nullable++ = d->tokens[i] == BACKREF;
2363 /* This position is in its own firstpos and lastpos. */
2364 *nfirstpos++ = *nlastpos++ = 1;
2365 --firstpos, --lastpos;
2366 firstpos->index = lastpos->index = i;
2367 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2369 /* Allocate the follow set for this position. */
2370 alloc_position_set (&d->follows[i], 1);
2374 /* ... balance the above nonsyntactic #ifdef goo... */
2375 fprintf (stderr, "node %zd:", i);
2376 prtok (d->tokens[i]);
2377 putc ('\n', stderr);
2378 fprintf (stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
2379 fprintf (stderr, " firstpos:");
2380 for (j = nfirstpos[-1]; j-- > 0;)
2382 fprintf (stderr, " %zd:", firstpos[j].index);
2383 prtok (d->tokens[firstpos[j].index]);
2385 fprintf (stderr, "\n lastpos:");
2386 for (j = nlastpos[-1]; j-- > 0;)
2388 fprintf (stderr, " %zd:", lastpos[j].index);
2389 prtok (d->tokens[lastpos[j].index]);
2391 putc ('\n', stderr);
2395 /* For each follow set that is the follow set of a real position, replace
2396 it with its epsilon closure. */
2397 for (i = 0; i < d->tindex; ++i)
2398 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
2400 || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
2402 || d->tokens[i] >= CSET)
2405 fprintf (stderr, "follows(%zd:", i);
2406 prtok (d->tokens[i]);
2407 fprintf (stderr, "):");
2408 for (j = d->follows[i].nelem; j-- > 0;)
2410 fprintf (stderr, " %zd:", d->follows[i].elems[j].index);
2411 prtok (d->tokens[d->follows[i].elems[j].index]);
2413 putc ('\n', stderr);
2415 copy (&d->follows[i], &merged);
2416 epsclosure (&merged, d);
2417 copy (&merged, &d->follows[i]);
2420 /* Get the epsilon closure of the firstpos of the regexp. The result will
2421 be the set of positions of state 0. */
2423 for (i = 0; i < nfirstpos[-1]; ++i)
2424 insert (firstpos[i], &merged);
2425 epsclosure (&merged, d);
2427 /* Build the initial state. */
2430 MALLOC (d->states, d->salloc);
2432 separate_contexts = state_separate_contexts (&merged);
2433 state_index (d, &merged,
2434 (separate_contexts & CTX_NEWLINE
2435 ? CTX_NEWLINE : separate_contexts ^ CTX_ANY));
2442 free (merged.elems);
2446 /* Find, for each character, the transition out of state s of d, and store
2447 it in the appropriate slot of trans.
2449 We divide the positions of s into groups (positions can appear in more
2450 than one group). Each group is labeled with a set of characters that
2451 every position in the group matches (taking into account, if necessary,
2452 preceding context information of s). For each group, find the union
2453 of the its elements' follows. This set is the set of positions of the
2454 new state. For each character in the group's label, set the transition
2455 on this character to be to a state corresponding to the set's positions,
2456 and its associated backward context information, if necessary.
2458 If we are building a searching matcher, we include the positions of state
2461 The collection of groups is constructed by building an equivalence-class
2462 partition of the positions of s.
2464 For each position, find the set of characters C that it matches. Eliminate
2465 any characters from C that fail on grounds of backward context.
2467 Search through the groups, looking for a group whose label L has nonempty
2468 intersection with C. If L - C is nonempty, create a new group labeled
2469 L - C and having the same positions as the current group, and set L to
2470 the intersection of L and C. Insert the position in this group, set
2471 C = C - L, and resume scanning.
2473 If after comparing with every group there are characters remaining in C,
2474 create a new group labeled with the characters of C and insert this
2475 position in that group. */
2477 dfastate (state_num s, struct dfa *d, state_num trans[])
2479 leaf_set *grps; /* As many as will ever be needed. */
2480 charclass *labels; /* Labels corresponding to the groups. */
2481 size_t ngrps = 0; /* Number of groups actually used. */
2482 position pos; /* Current position being considered. */
2483 charclass matches; /* Set of matching characters. */
2484 int matchesf; /* True if matches is nonempty. */
2485 charclass intersect; /* Intersection with some label set. */
2486 int intersectf; /* True if intersect is nonempty. */
2487 charclass leftovers; /* Stuff in the label that didn't match. */
2488 int leftoversf; /* True if leftovers is nonempty. */
2489 position_set follows; /* Union of the follows of some group. */
2490 position_set tmp; /* Temporary space for merging sets. */
2491 int possible_contexts; /* Contexts that this group can match. */
2492 int separate_contexts; /* Context that new state wants to know. */
2493 state_num state; /* New state. */
2494 state_num state_newline; /* New state on a newline transition. */
2495 state_num state_letter; /* New state on a letter transition. */
2496 int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
2499 MALLOC (grps, NOTCHAR);
2500 MALLOC (labels, NOTCHAR);
2504 for (i = 0; i < d->states[s].elems.nelem; ++i)
2506 pos = d->states[s].elems.elems[i];
2507 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2508 setbit (d->tokens[pos.index], matches);
2509 else if (d->tokens[pos.index] >= CSET)
2510 copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
2511 else if (MBS_SUPPORT
2512 && (d->tokens[pos.index] == ANYCHAR
2513 || d->tokens[pos.index] == MBCSET))
2514 /* MB_CUR_MAX > 1 */
2516 /* ANYCHAR and MBCSET must match with a single character, so we
2517 must put it to d->states[s].mbps, which contains the positions
2518 which can match with a single character not a byte. */
2519 if (d->states[s].mbps.nelem == 0)
2520 alloc_position_set (&d->states[s].mbps, 1);
2521 insert (pos, &(d->states[s].mbps));
2527 /* Some characters may need to be eliminated from matches because
2528 they fail in the current context. */
2529 if (pos.constraint != NO_CONSTRAINT)
2531 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2532 d->states[s].context, CTX_NEWLINE))
2533 for (j = 0; j < CHARCLASS_INTS; ++j)
2534 matches[j] &= ~newline[j];
2535 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2536 d->states[s].context, CTX_LETTER))
2537 for (j = 0; j < CHARCLASS_INTS; ++j)
2538 matches[j] &= ~letters[j];
2539 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2540 d->states[s].context, CTX_NONE))
2541 for (j = 0; j < CHARCLASS_INTS; ++j)
2542 matches[j] &= letters[j] | newline[j];
2544 /* If there are no characters left, there's no point in going on. */
2545 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2547 if (j == CHARCLASS_INTS)
2551 for (j = 0; j < ngrps; ++j)
2553 /* If matches contains a single character only, and the current
2554 group's label doesn't contain that character, go on to the
2556 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2557 && !tstbit (d->tokens[pos.index], labels[j]))
2560 /* Check if this group's label has a nonempty intersection with
2563 for (k = 0; k < CHARCLASS_INTS; ++k)
2564 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2568 /* It does; now find the set differences both ways. */
2569 leftoversf = matchesf = 0;
2570 for (k = 0; k < CHARCLASS_INTS; ++k)
2572 /* Even an optimizing compiler can't know this for sure. */
2573 int match = matches[k], label = labels[j][k];
2575 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2576 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2579 /* If there were leftovers, create a new group labeled with them. */
2582 copyset (leftovers, labels[ngrps]);
2583 copyset (intersect, labels[j]);
2584 MALLOC (grps[ngrps].elems, d->nleaves);
2585 memcpy (grps[ngrps].elems, grps[j].elems,
2586 sizeof (grps[j].elems[0]) * grps[j].nelem);
2587 grps[ngrps].nelem = grps[j].nelem;
2591 /* Put the position in the current group. The constraint is
2593 grps[j].elems[grps[j].nelem++] = pos.index;
2595 /* If every character matching the current position has been
2596 accounted for, we're done. */
2601 /* If we've passed the last group, and there are still characters
2602 unaccounted for, then we'll have to create a new group. */
2605 copyset (matches, labels[ngrps]);
2607 MALLOC (grps[ngrps].elems, d->nleaves);
2608 grps[ngrps].nelem = 1;
2609 grps[ngrps].elems[0] = pos.index;
2614 alloc_position_set (&follows, d->nleaves);
2615 alloc_position_set (&tmp, d->nleaves);
2617 /* If we are a searching matcher, the default transition is to a state
2618 containing the positions of state 0, otherwise the default transition
2619 is to fail miserably. */
2622 /* Find the state(s) corresponding to the positions of state 0. */
2623 copy (&d->states[0].elems, &follows);
2624 separate_contexts = state_separate_contexts (&follows);
2625 state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2626 if (separate_contexts & CTX_NEWLINE)
2627 state_newline = state_index (d, &follows, CTX_NEWLINE);
2629 state_newline = state;
2630 if (separate_contexts & CTX_LETTER)
2631 state_letter = state_index (d, &follows, CTX_LETTER);
2633 state_letter = state;
2635 for (i = 0; i < NOTCHAR; ++i)
2636 trans[i] = (IS_WORD_CONSTITUENT (i)) ? state_letter : state;
2637 trans[eolbyte] = state_newline;
2640 for (i = 0; i < NOTCHAR; ++i)
2643 for (i = 0; i < ngrps; ++i)
2647 /* Find the union of the follows of the positions of the group.
2648 This is a hideously inefficient loop. Fix it someday. */
2649 for (j = 0; j < grps[i].nelem; ++j)
2650 for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
2651 insert (d->follows[grps[i].elems[j]].elems[k], &follows);
2653 if (d->mb_cur_max > 1)
2655 /* If a token in follows.elems is not 1st byte of a multibyte
2656 character, or the states of follows must accept the bytes
2657 which are not 1st byte of the multibyte character.
2658 Then, if a state of follows encounter a byte, it must not be
2659 a 1st byte of a multibyte character nor single byte character.
2660 We cansel to add state[0].follows to next state, because
2661 state[0] must accept 1st-byte
2663 For example, we assume <sb a> is a certain single byte
2664 character, <mb A> is a certain multibyte character, and the
2665 codepoint of <sb a> equals the 2nd byte of the codepoint of
2667 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2668 by accepting accepts 1st byte of <mb A>, and state[i+1]
2669 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2670 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2671 <mb A>, so we cannot add state[0]. */
2673 next_isnt_1st_byte = 0;
2674 for (j = 0; j < follows.nelem; ++j)
2676 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2678 next_isnt_1st_byte = 1;
2684 /* If we are building a searching matcher, throw in the positions
2685 of state 0 as well. */
2687 && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
2688 for (j = 0; j < d->states[0].elems.nelem; ++j)
2689 insert (d->states[0].elems.elems[j], &follows);
2691 /* Find out if the new state will want any context information. */
2692 possible_contexts = charclass_context (labels[i]);
2693 separate_contexts = state_separate_contexts (&follows);
2695 /* Find the state(s) corresponding to the union of the follows. */
2696 if ((separate_contexts & possible_contexts) != possible_contexts)
2697 state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2700 if (separate_contexts & possible_contexts & CTX_NEWLINE)
2701 state_newline = state_index (d, &follows, CTX_NEWLINE);
2703 state_newline = state;
2704 if (separate_contexts & possible_contexts & CTX_LETTER)
2705 state_letter = state_index (d, &follows, CTX_LETTER);
2707 state_letter = state;
2709 /* Set the transitions for each character in the current label. */
2710 for (j = 0; j < CHARCLASS_INTS; ++j)
2711 for (k = 0; k < INTBITS; ++k)
2712 if (labels[i][j] & 1 << k)
2714 int c = j * INTBITS + k;
2717 trans[c] = state_newline;
2718 else if (IS_WORD_CONSTITUENT (c))
2719 trans[c] = state_letter;
2720 else if (c < NOTCHAR)
2725 for (i = 0; i < ngrps; ++i)
2726 free (grps[i].elems);
2727 free (follows.elems);
2733 /* Some routines for manipulating a compiled dfa's transition tables.
2734 Each state may or may not have a transition table; if it does, and it
2735 is a non-accepting state, then d->trans[state] points to its table.
2736 If it is an accepting state then d->fails[state] points to its table.
2737 If it has no table at all, then d->trans[state] is NULL.
2738 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2741 build_state (state_num s, struct dfa *d)
2743 state_num *trans; /* The new transition table. */
2746 /* Set an upper limit on the number of transition tables that will ever
2747 exist at once. 1024 is arbitrary. The idea is that the frequently
2748 used transition tables will be quickly rebuilt, whereas the ones that
2749 were only needed once or twice will be cleared away. */
2750 if (d->trcount >= 1024)
2752 for (i = 0; i < d->tralloc; ++i)
2756 d->trans[i] = d->fails[i] = NULL;
2763 /* Set up the success bits for this state. */
2765 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, *d))
2766 d->success[s] |= CTX_NEWLINE;
2767 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_LETTER, s, *d))
2768 d->success[s] |= CTX_LETTER;
2769 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d))
2770 d->success[s] |= CTX_NONE;
2772 MALLOC (trans, NOTCHAR);
2773 dfastate (s, d, trans);
2775 /* Now go through the new transition table, and make sure that the trans
2776 and fail arrays are allocated large enough to hold a pointer for the
2777 largest state mentioned in the table. */
2778 for (i = 0; i < NOTCHAR; ++i)
2779 if (trans[i] >= d->tralloc)
2781 state_num oldalloc = d->tralloc;
2783 while (trans[i] >= d->tralloc)
2785 REALLOC (d->realtrans, d->tralloc + 1);
2786 d->trans = d->realtrans + 1;
2787 REALLOC (d->fails, d->tralloc);
2788 REALLOC (d->success, d->tralloc);
2789 REALLOC (d->newlines, d->tralloc);
2790 while (oldalloc < d->tralloc)
2792 d->trans[oldalloc] = NULL;
2793 d->fails[oldalloc++] = NULL;
2797 /* Keep the newline transition in a special place so we can use it as
2799 d->newlines[s] = trans[eolbyte];
2800 trans[eolbyte] = -1;
2802 if (ACCEPTING (s, *d))
2803 d->fails[s] = trans;
2805 d->trans[s] = trans;
2809 build_state_zero (struct dfa *d)
2813 CALLOC (d->realtrans, d->tralloc + 1);
2814 d->trans = d->realtrans + 1;
2815 CALLOC (d->fails, d->tralloc);
2816 MALLOC (d->success, d->tralloc);
2817 MALLOC (d->newlines, d->tralloc);
2821 /* Multibyte character handling sub-routines for dfaexec. */
2823 /* Initial state may encounter the byte which is not a single byte character
2824 nor 1st byte of a multibyte character. But it is incorrect for initial
2825 state to accept such a byte.
2826 For example, in sjis encoding the regular expression like "\\" accepts
2827 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2828 0x815c. Then Initial state must skip the bytes which are not a single byte
2829 character nor 1st byte of a multibyte character. */
2830 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2833 while (inputwcs[p - buf_begin] == 0 \
2834 && mblen_buf[p - buf_begin] > 0 \
2835 && (unsigned char const *) p < buf_end) \
2837 if ((char *) p >= end) \
2847 realloc_trans_if_necessary (struct dfa *d, state_num new_state)
2849 /* Make sure that the trans and fail arrays are allocated large enough
2850 to hold a pointer for the new state. */
2851 if (new_state >= d->tralloc)
2853 state_num oldalloc = d->tralloc;
2855 while (new_state >= d->tralloc)
2857 REALLOC (d->realtrans, d->tralloc + 1);
2858 d->trans = d->realtrans + 1;
2859 REALLOC (d->fails, d->tralloc);
2860 REALLOC (d->success, d->tralloc);
2861 REALLOC (d->newlines, d->tralloc);
2862 while (oldalloc < d->tralloc)
2864 d->trans[oldalloc] = NULL;
2865 d->fails[oldalloc++] = NULL;
2870 /* Return values of transit_state_singlebyte(), and
2871 transit_state_consume_1char. */
2874 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2875 TRANSIT_STATE_DONE, /* State transition has finished. */
2876 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2877 } status_transit_state;
2879 /* Consume a single byte and transit state from 's' to '*next_state'.
2880 This function is almost same as the state transition routin in dfaexec().
2881 But state transition is done just once, otherwise matching succeed or
2882 reach the end of the buffer. */
2883 static status_transit_state
2884 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
2885 state_num * next_state)
2888 state_num works = s;
2890 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2892 while (rval == TRANSIT_STATE_IN_PROGRESS)
2894 if ((t = d->trans[works]) != NULL)
2897 rval = TRANSIT_STATE_DONE;
2905 /* At the moment, it must not happen. */
2910 else if (d->fails[works])
2912 works = d->fails[works][*p];
2913 rval = TRANSIT_STATE_DONE;
2917 build_state (works, d);
2920 *next_state = works;
2924 /* Match a "." against the current context. buf_begin[IDX] is the
2925 current position. Return the length of the match, in bytes.
2926 POS is the position of the ".". */
2928 match_anychar (struct dfa *d, state_num s, position pos, size_t idx)
2935 mbclen = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
2937 /* Check syntax bits. */
2938 if (wc == (wchar_t) eolbyte)
2940 if (!(syntax_bits & RE_DOT_NEWLINE))
2943 else if (wc == (wchar_t) '\0')
2945 if (syntax_bits & RE_DOT_NOT_NULL)
2949 context = wchar_context (wc);
2950 if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
2956 /* Match a bracket expression against the current context.
2957 buf_begin[IDX] is the current position.
2958 Return the length of the match, in bytes.
2959 POS is the position of the bracket expression. */
2961 match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
2964 int match; /* Flag which represent that matching succeed. */
2965 int match_len; /* Length of the character (or collating element)
2966 with which this operator match. */
2967 int op_len; /* Length of the operator. */
2971 /* Pointer to the structure to which we are currently referring. */
2972 struct mb_char_classes *work_mbc;
2975 wchar_t wc; /* Current referring character. */
2979 /* Check syntax bits. */
2980 if (wc == (wchar_t) eolbyte)
2982 if (!(syntax_bits & RE_DOT_NEWLINE))
2985 else if (wc == (wchar_t) '\0')
2987 if (syntax_bits & RE_DOT_NOT_NULL)
2991 context = wchar_context (wc);
2992 if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
2995 /* Assign the current referring operator to work_mbc. */
2996 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2997 match = !work_mbc->invert;
2998 match_len = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
3000 /* Match in range 0-255? */
3001 if (wc < NOTCHAR && work_mbc->cset != -1
3002 && tstbit ((unsigned char) wc, d->charclasses[work_mbc->cset]))
3003 goto charset_matched;
3005 /* match with a character class? */
3006 for (i = 0; i < work_mbc->nch_classes; i++)
3008 if (iswctype ((wint_t) wc, work_mbc->ch_classes[i]))
3009 goto charset_matched;
3012 strncpy (buffer, (char const *) buf_begin + idx, match_len);
3013 buffer[match_len] = '\0';
3015 /* match with an equivalence class? */
3016 for (i = 0; i < work_mbc->nequivs; i++)
3018 op_len = strlen (work_mbc->equivs[i]);
3019 strncpy (buffer, (char const *) buf_begin + idx, op_len);
3020 buffer[op_len] = '\0';
3021 if (strcoll (work_mbc->equivs[i], buffer) == 0)
3024 goto charset_matched;
3028 /* match with a collating element? */
3029 for (i = 0; i < work_mbc->ncoll_elems; i++)
3031 op_len = strlen (work_mbc->coll_elems[i]);
3032 strncpy (buffer, (char const *) buf_begin + idx, op_len);
3033 buffer[op_len] = '\0';
3035 if (strcoll (work_mbc->coll_elems[i], buffer) == 0)
3038 goto charset_matched;
3043 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
3045 /* match with a range? */
3046 for (i = 0; i < work_mbc->nranges; i++)
3048 wcbuf[2] = work_mbc->range_sts[i];
3049 wcbuf[4] = work_mbc->range_ends[i];
3051 if (wcscoll (wcbuf, wcbuf + 2) >= 0 && wcscoll (wcbuf + 4, wcbuf) >= 0)
3052 goto charset_matched;
3055 /* match with a character? */
3056 for (i = 0; i < work_mbc->nchars; i++)
3058 if (wc == work_mbc->chars[i])
3059 goto charset_matched;
3065 return match ? match_len : 0;
3068 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
3069 array which corresponds to `d->states[s].mbps.elem' and each element of
3070 the array contains the amount of the bytes with which the element can
3072 `idx' is the index from the buf_begin, and it is the current position
3074 Caller MUST free the array which this function return. */
3076 check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)
3081 MALLOC (rarray, d->states[s].mbps.nelem);
3082 for (i = 0; i < d->states[s].mbps.nelem; ++i)
3084 position pos = d->states[s].mbps.elems[i];
3085 switch (d->tokens[pos.index])
3088 rarray[i] = match_anychar (d, s, pos, idx);
3091 rarray[i] = match_mb_charset (d, s, pos, idx);
3094 break; /* cannot happen. */
3100 /* Consume a single character and enumerate all of the positions which can
3101 be next position from the state `s'.
3102 `match_lens' is the input. It can be NULL, but it can also be the output
3103 of check_matching_with_multibyte_ops() for optimization.
3104 `mbclen' and `pps' are the output. `mbclen' is the length of the
3105 character consumed, and `pps' is the set this function enumerate. */
3106 static status_transit_state
3107 transit_state_consume_1char (struct dfa *d, state_num s,
3108 unsigned char const **pp,
3109 int *match_lens, int *mbclen, position_set * pps)
3115 status_transit_state rs = TRANSIT_STATE_DONE;
3117 /* Calculate the length of the (single/multi byte) character
3118 to which p points. */
3119 *mbclen = (mblen_buf[*pp - buf_begin] == 0) ? 1 : mblen_buf[*pp - buf_begin];
3121 /* Calculate the state which can be reached from the state `s' by
3122 consuming `*mbclen' single bytes from the buffer. */
3124 for (k = 0; k < *mbclen; k++)
3127 rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
3129 /* Copy the positions contained by `s1' to the set `pps'. */
3130 copy (&(d->states[s1].elems), pps);
3132 /* Check (input) match_lens, and initialize if it is NULL. */
3133 if (match_lens == NULL && d->states[s].mbps.nelem != 0)
3134 work_mbls = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
3136 work_mbls = match_lens;
3138 /* Add all of the positions which can be reached from `s' by consuming
3139 a single character. */
3140 for (i = 0; i < d->states[s].mbps.nelem; i++)
3142 if (work_mbls[i] == *mbclen)
3143 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
3145 insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], pps);
3148 if (match_lens == NULL && work_mbls != NULL)
3151 /* FIXME: this return value is always ignored. */
3155 /* Transit state from s, then return new state and update the pointer of the
3156 buffer. This function is for some operator which can match with a multi-
3157 byte character or a collating element (which may be multi characters). */
3159 transit_state (struct dfa *d, state_num s, unsigned char const **pp)
3162 int mbclen; /* The length of current input multibyte character. */
3165 int *match_lens = NULL;
3166 size_t nelem = d->states[s].mbps.nelem; /* Just a alias. */
3167 position_set follows;
3168 unsigned char const *p1 = *pp;
3172 /* This state has (a) multibyte operator(s).
3173 We check whether each of them can match or not. */
3175 /* Note: caller must free the return value of this function. */
3176 match_lens = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
3178 for (i = 0; i < nelem; i++)
3179 /* Search the operator which match the longest string,
3182 if (match_lens[i] > maxlen)
3183 maxlen = match_lens[i];
3187 if (nelem == 0 || maxlen == 0)
3188 /* This state has no multibyte operator which can match.
3189 We need to check only one single byte character. */
3191 status_transit_state rs;
3192 rs = transit_state_singlebyte (d, s, *pp, &s1);
3194 /* We must update the pointer if state transition succeeded. */
3195 if (rs == TRANSIT_STATE_DONE)
3202 /* This state has some operators which can match a multibyte character. */
3203 alloc_position_set (&follows, d->nleaves);
3205 /* `maxlen' may be longer than the length of a character, because it may
3206 not be a character but a (multi character) collating element.
3207 We enumerate all of the positions which `s' can reach by consuming
3209 transit_state_consume_1char (d, s, pp, match_lens, &mbclen, &follows);
3211 wc = inputwcs[*pp - mbclen - buf_begin];
3212 s1 = state_index (d, &follows, wchar_context (wc));
3213 realloc_trans_if_necessary (d, s1);
3215 while (*pp - p1 < maxlen)
3217 transit_state_consume_1char (d, s1, pp, NULL, &mbclen, &follows);
3219 for (i = 0; i < nelem; i++)
3221 if (match_lens[i] == *pp - p1)
3223 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
3224 insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
3228 wc = inputwcs[*pp - mbclen - buf_begin];
3229 s1 = state_index (d, &follows, wchar_context (wc));
3230 realloc_trans_if_necessary (d, s1);
3233 free (follows.elems);
3238 /* Initialize mblen_buf and inputwcs with data from the next line. */
3241 prepare_wc_buf (const char *begin, const char *end)
3244 unsigned char eol = eolbyte;
3245 size_t remain_bytes, i;
3247 buf_begin = (unsigned char *) begin;
3250 for (i = 0; i < end - begin + 1; i++)
3252 if (remain_bytes == 0)
3255 = mbrtowc (inputwcs + i, begin + i, end - begin - i + 1, &mbs);
3256 if (remain_bytes < 1
3257 || remain_bytes == (size_t) -1
3258 || remain_bytes == (size_t) -2
3259 || (remain_bytes == 1 && inputwcs[i] == (wchar_t) begin[i]))
3262 inputwcs[i] = (wchar_t) begin[i];
3264 if (begin[i] == eol)
3269 mblen_buf[i] = remain_bytes;
3275 mblen_buf[i] = remain_bytes;
3281 buf_end = (unsigned char *) (begin + i);
3283 inputwcs[i] = 0; /* sentinel */
3284 #endif /* MBS_SUPPORT */
3287 /* Search through a buffer looking for a match to the given struct dfa.
3288 Find the first occurrence of a string matching the regexp in the
3289 buffer, and the shortest possible version thereof. Return a pointer to
3290 the first character after the match, or NULL if none is found. BEGIN
3291 points to the beginning of the buffer, and END points to the first byte
3292 after its end. Note however that we store a sentinel byte (usually
3293 newline) in *END, so the actual buffer must be one byte longer.
3294 When ALLOW_NL is nonzero, newlines may appear in the matching string.
3295 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3296 Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
3297 encountered a back-reference (1) or not (0). The caller may use this
3298 to decide whether to fall back on a backtracking matcher. */
3300 dfaexec (struct dfa *d, char const *begin, char *end,
3301 int allow_nl, size_t *count, int *backref)
3303 state_num s, s1; /* Current state. */
3304 unsigned char const *p; /* Current input character. */
3305 state_num **trans, *t; /* Copy of d->trans so it can be optimized
3307 unsigned char eol = eolbyte; /* Likewise for eolbyte. */
3308 unsigned char saved_end;
3311 build_state_zero (d);
3314 p = (unsigned char const *) begin;
3316 saved_end = *(unsigned char *) end;
3319 if (d->mb_cur_max > 1)
3321 MALLOC (mblen_buf, end - begin + 2);
3322 MALLOC (inputwcs, end - begin + 2);
3323 memset (&mbs, 0, sizeof (mbstate_t));
3324 prepare_wc_buf ((const char *) p, end);
3329 if (d->mb_cur_max > 1)
3330 while ((t = trans[s]) != NULL)
3335 SKIP_REMAINS_MB_IF_INITIAL_STATE (s, p);
3337 if (d->states[s].mbps.nelem == 0)
3343 /* Falling back to the glibc matcher in this case gives
3344 better performance (up to 25% better on [a-z], for
3345 example) and enables support for collating symbols and
3346 equivalence classes. */
3356 /* Can match with a multibyte character (and multi character
3357 collating element). Transition table might be updated. */
3358 s = transit_state (d, s, &p);
3363 while ((t = trans[s]) != NULL)
3366 if ((t = trans[s1]) == NULL)
3370 s1 = tmp; /* swap */
3377 if (s >= 0 && (char *) p <= end && d->fails[s])
3379 if (d->success[s] & sbit[*p])
3382 *backref = (d->states[s].backref != 0);
3383 if (d->mb_cur_max > 1)
3393 if (d->mb_cur_max > 1)
3395 /* Can match with a multibyte character (and multicharacter
3396 collating element). Transition table might be updated. */
3397 s = transit_state (d, s, &p);
3401 s = d->fails[s][*p++];
3405 /* If the previous character was a newline, count it. */
3406 if ((char *) p <= end && p[-1] == eol)
3411 if (d->mb_cur_max > 1)
3412 prepare_wc_buf ((const char *) p, end);
3415 /* Check if we've run off the end of the buffer. */
3416 if ((char *) p > end)
3418 if (d->mb_cur_max > 1)
3434 if (p[-1] == eol && allow_nl)
3436 s = d->newlines[s1];
3445 free_mbdata (struct dfa *d)
3449 free (d->multibyte_prop);
3450 d->multibyte_prop = NULL;
3452 for (i = 0; i < d->nmbcsets; ++i)
3455 struct mb_char_classes *p = &(d->mbcsets[i]);
3457 free (p->ch_classes);
3458 free (p->range_sts);
3459 free (p->range_ends);
3461 for (j = 0; j < p->nequivs; ++j)
3462 free (p->equivs[j]);
3465 for (j = 0; j < p->ncoll_elems; ++j)
3466 free (p->coll_elems[j]);
3467 free (p->coll_elems);
3475 /* Initialize the components of a dfa that the other routines don't
3476 initialize for themselves. */
3478 dfainit (struct dfa *d)
3480 memset (d, 0, sizeof *d);
3483 MALLOC (d->charclasses, d->calloc);
3486 MALLOC (d->tokens, d->talloc);
3488 d->mb_cur_max = MB_CUR_MAX;
3490 if (d->mb_cur_max > 1)
3492 d->nmultibyte_prop = 1;
3493 MALLOC (d->multibyte_prop, d->nmultibyte_prop);
3494 d->mbcsets_alloc = 1;
3495 MALLOC (d->mbcsets, d->mbcsets_alloc);
3500 dfaoptimize (struct dfa *d)
3504 if (!MBS_SUPPORT || !using_utf8 ())
3507 for (i = 0; i < d->tindex; ++i)
3509 switch (d->tokens[i])
3515 /* Requires multi-byte algorithm. */
3526 /* Parse and analyze a single string of the given length. */
3528 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
3531 dfaparse (s, len, d);
3534 dfaanalyze (d, searchflag);
3537 /* Free the storage held by the components of a dfa. */
3539 dfafree (struct dfa *d)
3542 struct dfamust *dm, *ndm;
3544 free (d->charclasses);
3547 if (d->mb_cur_max > 1)
3550 for (i = 0; i < d->sindex; ++i)
3552 free (d->states[i].elems.elems);
3554 free (d->states[i].mbps.elems);
3557 for (i = 0; i < d->tindex; ++i)
3558 free (d->follows[i].elems);
3560 for (i = 0; i < d->tralloc; ++i)
3565 free (d->realtrans);
3569 for (dm = d->musts; dm; dm = ndm)
3577 /* Having found the postfix representation of the regular expression,
3578 try to find a long sequence of characters that must appear in any line
3580 Finding a "longest" sequence is beyond the scope here;
3581 we take an easy way out and hope for the best.
3582 (Take "(ab|a)b"--please.)
3584 We do a bottom-up calculation of sequences of characters that must appear
3585 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3587 sequences that must appear at the left of the match ("left")
3588 sequences that must appear at the right of the match ("right")
3589 lists of sequences that must appear somewhere in the match ("in")
3590 sequences that must constitute the match ("is")
3592 When we get to the root of the tree, we use one of the longest of its
3593 calculated "in" sequences as our answer. The sequence we find is returned in
3594 d->must (where "d" is the single argument passed to "dfamust");
3595 the length of the sequence is returned in d->mustn.
3597 The sequences calculated for the various types of node (in pseudo ANSI c)
3598 are shown below. "p" is the operand of unary operators (and the left-hand
3599 operand of binary operators); "q" is the right-hand operand of binary
3602 "ZERO" means "a zero-length sequence" below.
3604 Type left right is in
3605 ---- ---- ----- -- --
3606 char c # c # c # c # c
3608 ANYCHAR ZERO ZERO ZERO ZERO
3610 MBCSET ZERO ZERO ZERO ZERO
3612 CSET ZERO ZERO ZERO ZERO
3614 STAR ZERO ZERO ZERO ZERO
3616 QMARK ZERO ZERO ZERO ZERO
3618 PLUS p->left p->right ZERO p->in
3620 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3621 p->left : q->right : q->is!=ZERO) ? q->in plus
3622 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3625 OR longest common longest common (do p->is and substrings common to
3626 leading trailing q->is have same p->in and q->in
3627 (sub)sequence (sub)sequence length and
3628 of p->left of p->right content) ?
3629 and q->left and q->right p->is : NULL
3631 If there's anything else we recognize in the tree, all four sequences get set
3632 to zero-length sequences. If there's something we don't recognize in the tree,
3633 we just return a zero-length sequence.
3635 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3638 And. . .is it here or someplace that we might ponder "optimizations" such as
3639 egrep 'psi|epsilon' -> egrep 'psi'
3640 egrep 'pepsi|epsilon' -> egrep 'epsi'
3641 (Yes, we now find "epsi" as a "string
3642 that must occur", but we might also
3643 simplify the *entire* r.e. being sought)
3644 grep '[c]' -> grep 'c'
3645 grep '(ab|a)b' -> grep 'ab'
3646 grep 'ab*' -> grep 'a'
3647 grep 'a*b' -> grep 'b'
3649 There are several issues:
3651 Is optimization easy (enough)?
3653 Does optimization actually accomplish anything,
3654 or is the automaton you get from "psi|epsilon" (for example)
3655 the same as the one you get from "psi" (for example)?
3657 Are optimizable r.e.'s likely to be used in real-life situations
3658 (something like 'ab*' is probably unlikely; something like is
3659 'psi|epsilon' is likelier)? */
3662 icatalloc (char *old, char const *new)
3665 size_t oldsize = old == NULL ? 0 : strlen (old);
3666 size_t newsize = new == NULL ? 0 : strlen (new);
3669 result = xrealloc (old, oldsize + newsize + 1);
3670 memcpy (result + oldsize, new, newsize + 1);
3675 icpyalloc (char const *string)
3677 return icatalloc (NULL, string);
3680 static char *_GL_ATTRIBUTE_PURE
3681 istrstr (char const *lookin, char const *lookfor)
3686 len = strlen (lookfor);
3687 for (cp = lookin; *cp != '\0'; ++cp)
3688 if (strncmp (cp, lookfor, len) == 0)
3694 freelist (char **cpp)
3700 for (i = 0; cpp[i] != NULL; ++i)
3708 enlist (char **cpp, char *new, size_t len)
3714 if ((new = icpyalloc (new)) == NULL)
3720 /* Is there already something in the list that's new (or longer)? */
3721 for (i = 0; cpp[i] != NULL; ++i)
3722 if (istrstr (cpp[i], new) != NULL)
3727 /* Eliminate any obsoleted strings. */
3729 while (cpp[j] != NULL)
3730 if (istrstr (new, cpp[j]) == NULL)
3740 /* Add the new string. */
3741 REALLOC (cpp, i + 2);
3747 /* Given pointers to two strings, return a pointer to an allocated
3748 list of their distinct common substrings. Return NULL if something
3751 comsubs (char *left, char const *right)
3758 if (left == NULL || right == NULL)
3760 cpp = malloc (sizeof *cpp);
3764 for (lcp = left; *lcp != '\0'; ++lcp)
3767 rcp = strchr (right, *lcp);
3770 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3774 rcp = strchr (rcp + 1, *lcp);
3779 char **p = enlist (cpp, lcp, len);
3793 addlists (char **old, char **new)
3797 if (old == NULL || new == NULL)
3799 for (i = 0; new[i] != NULL; ++i)
3801 old = enlist (old, new[i], strlen (new[i]));
3808 /* Given two lists of substrings, return a new list giving substrings
3811 inboth (char **left, char **right)
3817 if (left == NULL || right == NULL)
3819 both = malloc (sizeof *both);
3823 for (lnum = 0; left[lnum] != NULL; ++lnum)
3825 for (rnum = 0; right[rnum] != NULL; ++rnum)
3827 temp = comsubs (left[lnum], right[rnum]);
3833 both = addlists (both, temp);
3852 resetmust (must * mp)
3854 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3859 dfamust (struct dfa *d)
3870 static char empty_string[] = "";
3872 result = empty_string;
3874 MALLOC (musts, d->tindex + 1);
3876 for (i = 0; i <= d->tindex; ++i)
3878 for (i = 0; i <= d->tindex; ++i)
3880 mp[i].in = xmalloc (sizeof *mp[i].in);
3881 mp[i].left = xmalloc (2);
3882 mp[i].right = xmalloc (2);
3883 mp[i].is = xmalloc (2);
3884 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3888 fprintf (stderr, "dfamust:\n");
3889 for (i = 0; i < d->tindex; ++i)
3891 fprintf (stderr, " %zd:", i);
3892 prtok (d->tokens[i]);
3894 putc ('\n', stderr);
3896 for (ri = 0; ri < d->tindex; ++ri)
3898 switch (t = d->tokens[ri])
3902 assert (!"neither LPAREN nor RPAREN may appear here");
3915 assert (musts < mp);
3920 assert (&musts[2] <= mp);
3925 size_t j, ln, rn, n;
3929 /* Guaranteed to be. Unlikely, but. . . */
3930 if (!STREQ (lmp->is, rmp->is))
3932 /* Left side--easy */
3934 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3936 lmp->left[i] = '\0';
3938 ln = strlen (lmp->right);
3939 rn = strlen (rmp->right);
3943 for (i = 0; i < n; ++i)
3944 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3946 for (j = 0; j < i; ++j)
3947 lmp->right[j] = lmp->right[(ln - i) + j];
3948 lmp->right[j] = '\0';
3949 new = inboth (lmp->in, rmp->in);
3958 assert (musts < mp);
3963 assert (mp == &musts[1]);
3964 for (i = 0; musts[0].in[i] != NULL; ++i)
3965 if (strlen (musts[0].in[i]) > strlen (result))
3966 result = musts[0].in[i];
3967 if (STREQ (result, musts[0].is))
3971 assert (&musts[2] <= mp);
3978 /* In. Everything in left, plus everything in
3979 right, plus concatenation of
3980 left's right and right's left. */
3981 lmp->in = addlists (lmp->in, rmp->in);
3982 if (lmp->in == NULL)
3984 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
3988 tp = icpyalloc (lmp->right);
3989 tp = icatalloc (tp, rmp->left);
3990 lmp->in = enlist (lmp->in, tp, strlen (tp));
3992 if (lmp->in == NULL)
3996 if (lmp->is[0] != '\0')
3998 lmp->left = icatalloc (lmp->left, rmp->left);
3999 if (lmp->left == NULL)
4003 if (rmp->is[0] == '\0')
4004 lmp->right[0] = '\0';
4005 lmp->right = icatalloc (lmp->right, rmp->right);
4006 if (lmp->right == NULL)
4008 /* Guaranteed to be */
4009 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
4011 lmp->is = icatalloc (lmp->is, rmp->is);
4012 if (lmp->is == NULL)
4022 assert (!"oops! t >= END");
4026 /* not on *my* shift */
4029 else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET)
4036 /* plain character */
4038 mp->is[0] = mp->left[0] = mp->right[0] = t;
4039 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
4040 mp->in = enlist (mp->in, mp->is, (size_t) 1);
4047 fprintf (stderr, " node: %zd:", ri);
4048 prtok (d->tokens[ri]);
4049 fprintf (stderr, "\n in:");
4050 for (i = 0; mp->in[i]; ++i)
4051 fprintf (stderr, " \"%s\"", mp->in[i]);
4052 fprintf (stderr, "\n is: \"%s\"\n", mp->is);
4053 fprintf (stderr, " left: \"%s\"\n", mp->left);
4054 fprintf (stderr, " right: \"%s\"\n", mp->right);
4059 if (strlen (result))
4063 dm->must = xmemdup (result, strlen (result) + 1);
4064 dm->next = d->musts;
4068 for (i = 0; i <= d->tindex; ++i)
4070 freelist (mp[i].in);
4082 return xmalloc (sizeof (struct dfa));
4085 struct dfamust *_GL_ATTRIBUTE_PURE
4086 dfamusts (struct dfa const *d)
4091 /* vim:set shiftwidth=2: */