1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2014 Free Software
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
35 #define STREQ(a, b) (strcmp (a, b) == 0)
37 /* ISASCIIDIGIT differs from isdigit, as follows:
38 - Its arg may be any int or unsigned int; it need not be an unsigned char.
39 - It's guaranteed to evaluate its argument exactly once.
40 - It's typically faster.
41 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
42 only '0' through '9' are digits. Prefer ISASCIIDIGIT to isdigit unless
43 it's important to use the locale's definition of "digit" even when the
44 host does not conform to Posix. */
45 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
48 #define _(str) gettext (str)
55 /* HPUX defines these as macros in sys/param.h. */
63 /* First integer value that is greater than any character code. */
64 enum { NOTCHAR = 1 << CHAR_BIT };
66 /* This represents part of a character class. It must be unsigned and
67 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
68 typedef unsigned int charclass_word;
70 /* The number of bits used in a charclass word. utf8_classes assumes
71 this is exactly 32. */
72 enum { CHARCLASS_WORD_BITS = 32 };
74 /* The maximum useful value of a charclass_word; all used bits are 1. */
75 #define CHARCLASS_WORD_MASK \
76 (((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1)
78 /* Number of words required to hold a bit for every character. */
81 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
84 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
85 typedef charclass_word charclass[CHARCLASS_WORDS];
87 /* Convert a possibly-signed character to an unsigned character. This is
88 a bit safer than casting to unsigned char, since it catches some type
89 errors that the cast doesn't. */
96 /* Contexts tell us whether a character is a newline or a word constituent.
97 Word-constituent characters are those that satisfy iswalnum, plus '_'.
98 Each character has a single CTX_* value; bitmasks of CTX_* values denote
99 a particular character class.
101 A state also stores a context value, which is a bitmask of CTX_* values.
102 A state's context represents a set of characters that the state's
103 predecessors must match. For example, a state whose context does not
104 include CTX_LETTER will never have transitions where the previous
105 character is a word constituent. A state whose context is CTX_ANY
106 might have transitions from any character. */
110 #define CTX_NEWLINE 4
113 /* Sometimes characters can only be matched depending on the surrounding
114 context. Such context decisions depend on what the previous character
115 was, and the value of the current (lookahead) character. Context
116 dependent constraints are encoded as 8 bit integers. Each bit that
117 is set indicates that the constraint succeeds in the corresponding
120 bit 8-11 - valid contexts when next character is CTX_NEWLINE
121 bit 4-7 - valid contexts when next character is CTX_LETTER
122 bit 0-3 - valid contexts when next character is CTX_NONE
124 The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
125 succeeds in a particular context. Prev is a bitmask of possible
126 context values for the previous character, curr is the (single-bit)
127 context value for the lookahead character. */
128 #define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)
129 #define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf)
130 #define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf)
132 #define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
133 ((((curr) & CTX_NONE ? OTHER_CONSTRAINT (constraint) : 0) \
134 | ((curr) & CTX_LETTER ? LETTER_CONSTRAINT (constraint) : 0) \
135 | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT (constraint) : 0)) & (prev))
137 /* The following macros describe what a constraint depends on. */
138 #define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
139 #define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111)
140 #define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111)
142 #define PREV_NEWLINE_DEPENDENT(constraint) \
143 (PREV_NEWLINE_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
144 #define PREV_LETTER_DEPENDENT(constraint) \
145 (PREV_LETTER_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
147 /* Tokens that match the empty string subject to some constraint actually
148 work by applying that constraint to determine what may follow them,
149 taking into account what has gone before. The following values are
150 the constraints corresponding to the special tokens previously defined. */
151 #define NO_CONSTRAINT 0x777
152 #define BEGLINE_CONSTRAINT 0x444
153 #define ENDLINE_CONSTRAINT 0x700
154 #define BEGWORD_CONSTRAINT 0x050
155 #define ENDWORD_CONSTRAINT 0x202
156 #define LIMWORD_CONSTRAINT 0x252
157 #define NOTLIMWORD_CONSTRAINT 0x525
159 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
160 are operators and others are terminal symbols. Most (but not all) of these
161 codes are returned by the lexical analyzer. */
163 typedef ptrdiff_t token;
165 /* Predefined token values. */
168 END = -1, /* END is a terminal symbol that matches the
169 end of input; any value of END or less in
170 the parse tree is such a symbol. Accepting
171 states of the DFA are those that would have
172 a transition on END. */
174 /* Ordinary character values are terminal symbols that match themselves. */
176 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
179 BACKREF, /* BACKREF is generated by \<digit>
180 or by any other construct that
181 is not completely handled. If the scanner
182 detects a transition on backref, it returns
183 a kind of "semi-success" indicating that
184 the match will have to be verified with
185 a backtracking matcher. */
187 BEGLINE, /* BEGLINE is a terminal symbol that matches
188 the empty string at the beginning of a
191 ENDLINE, /* ENDLINE is a terminal symbol that matches
192 the empty string at the end of a line. */
194 BEGWORD, /* BEGWORD is a terminal symbol that matches
195 the empty string at the beginning of a
198 ENDWORD, /* ENDWORD is a terminal symbol that matches
199 the empty string at the end of a word. */
201 LIMWORD, /* LIMWORD is a terminal symbol that matches
202 the empty string at the beginning or the
205 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
206 matches the empty string not at
207 the beginning or end of a word. */
209 QMARK, /* QMARK is an operator of one argument that
210 matches zero or one occurrences of its
213 STAR, /* STAR is an operator of one argument that
214 matches the Kleene closure (zero or more
215 occurrences) of its argument. */
217 PLUS, /* PLUS is an operator of one argument that
218 matches the positive closure (one or more
219 occurrences) of its argument. */
221 REPMN, /* REPMN is a lexical token corresponding
222 to the {m,n} construct. REPMN never
223 appears in the compiled token vector. */
225 CAT, /* CAT is an operator of two arguments that
226 matches the concatenation of its
227 arguments. CAT is never returned by the
230 OR, /* OR is an operator of two arguments that
231 matches either of its arguments. */
233 LPAREN, /* LPAREN never appears in the parse tree,
234 it is only a lexeme. */
236 RPAREN, /* RPAREN never appears in the parse tree. */
238 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
239 a valid multibyte (or single byte) character.
240 It is used only if MB_CUR_MAX > 1. */
242 MBCSET, /* MBCSET is similar to CSET, but for
243 multibyte characters. */
245 WCHAR, /* Only returned by lex. wctok contains
246 the wide character representation. */
248 CSET /* CSET and (and any value greater) is a
249 terminal symbol that matches any of a
250 class of characters. */
254 /* States of the recognizer correspond to sets of positions in the parse
255 tree, together with the constraints under which they may be matched.
256 So a position is encoded as an index into the parse tree together with
260 size_t index; /* Index into the parse array. */
261 unsigned int constraint; /* Constraint for matching this position. */
264 /* Sets of positions are stored as arrays. */
267 position *elems; /* Elements of this position set. */
268 size_t nelem; /* Number of elements in this set. */
269 size_t alloc; /* Number of elements allocated in ELEMS. */
272 /* Sets of leaves are also stored as arrays. */
275 size_t *elems; /* Elements of this position set. */
276 size_t nelem; /* Number of elements in this set. */
279 /* A state of the dfa consists of a set of positions, some flags,
280 and the token value of the lowest-numbered position of the state that
281 contains an END token. */
284 size_t hash; /* Hash of the positions of this state. */
285 position_set elems; /* Positions this state could match. */
286 unsigned char context; /* Context from previous state. */
287 bool has_backref; /* This state matches a \<digit>. */
288 bool has_mbcset; /* This state matches a MBCSET. */
289 unsigned short constraint; /* Constraint for this state to accept. */
290 token first_end; /* Token value of the first END in elems. */
291 position_set mbps; /* Positions which can match multibyte
292 characters, e.g., period.
293 Used only if MB_CUR_MAX > 1. */
296 /* States are indexed by state_num values. These are normally
297 nonnegative but -1 is used as a special value. */
298 typedef ptrdiff_t state_num;
300 /* A bracket operator.
301 e.g., [a-c], [[:alpha:]], etc. */
302 struct mb_char_classes
306 wchar_t *chars; /* Normal characters. */
308 wctype_t *ch_classes; /* Character classes. */
310 struct /* Range characters. */
312 wchar_t beg; /* Range start. */
313 wchar_t end; /* Range end. */
316 char **equivs; /* Equivalence classes. */
319 size_t ncoll_elems; /* Collating elements. */
322 /* A compiled regular expression. */
325 /* Fields filled by the scanner. */
326 charclass *charclasses; /* Array of character sets for CSET tokens. */
327 size_t cindex; /* Index for adding new charclasses. */
328 size_t calloc; /* Number of charclasses allocated. */
330 /* Fields filled by the parser. */
331 token *tokens; /* Postfix parse array. */
332 size_t tindex; /* Index for adding new tokens. */
333 size_t talloc; /* Number of tokens currently allocated. */
334 size_t depth; /* Depth required of an evaluation stack
335 used for depth-first traversal of the
337 size_t nleaves; /* Number of leaves on the parse tree. */
338 size_t nregexps; /* Count of parallel regexps being built
340 bool fast; /* The DFA is fast. */
341 bool multibyte; /* MB_CUR_MAX > 1. */
342 token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
343 mbstate_t mbs; /* Multibyte conversion state. */
345 /* The following are valid only if MB_CUR_MAX > 1. */
347 /* The value of multibyte_prop[i] is defined by following rule.
348 if tokens[i] < NOTCHAR
349 bit 0 : tokens[i] is the first byte of a character, including
350 single-byte characters.
351 bit 1 : tokens[i] is the last byte of a character, including
352 single-byte characters.
354 if tokens[i] = MBCSET
355 ("the index of mbcsets corresponding to this operator" << 2) + 3
359 = 'single_byte_a', 'multi_byte_A', single_byte_b'
360 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
366 /* A table indexed by byte values that contains the corresponding wide
367 character (if any) for that byte. WEOF means the byte is not a
368 valid single-byte character. */
369 wint_t mbrtowc_cache[NOTCHAR];
371 /* Array of the bracket expression in the DFA. */
372 struct mb_char_classes *mbcsets;
374 size_t mbcsets_alloc;
376 /* Fields filled by the superset. */
377 struct dfa *superset; /* Hint of the dfa. */
379 /* Fields filled by the state builder. */
380 dfa_state *states; /* States of the dfa. */
381 state_num sindex; /* Index for adding new states. */
382 size_t salloc; /* Number of states currently allocated. */
384 /* Fields filled by the parse tree->NFA conversion. */
385 position_set *follows; /* Array of follow sets, indexed by position
386 index. The follow of a position is the set
387 of positions containing characters that
388 could conceivably follow a character
389 matching the given position in a string
390 matching the regexp. Allocated to the
391 maximum possible position index. */
392 bool searchflag; /* We are supposed to build a searching
393 as opposed to an exact matcher. A searching
394 matcher finds the first and shortest string
395 matching a regexp anywhere in the buffer,
396 whereas an exact matcher finds the longest
397 string matching, but anchored to the
398 beginning of the buffer. */
400 /* Fields filled by dfaexec. */
401 state_num tralloc; /* Number of transition tables that have
402 slots so far, not counting trans[-1]. */
403 int trcount; /* Number of transition tables that have
404 actually been built. */
405 state_num **trans; /* Transition tables for states that can
406 never accept. If the transitions for a
407 state have not yet been computed, or the
408 state could possibly accept, its entry in
409 this table is NULL. This points to one
410 past the start of the allocated array,
411 and trans[-1] is always NULL. */
412 state_num **fails; /* Transition tables after failing to accept
413 on a state that potentially could do so. */
414 int *success; /* Table of acceptance conditions used in
415 dfaexec and computed in build_state. */
416 state_num *newlines; /* Transitions on newlines. The entry for a
417 newline in any transition table is always
418 -1 so we can count lines without wasting
419 too many cycles. The transition for a
420 newline is stored separately and handled
421 as a special case. Newline is also used
422 as a sentinel at the end of the buffer. */
423 struct dfamust *musts; /* List of strings, at least one of which
424 is known to appear in any r.e. matching
426 position_set mb_follows; /* Follow set added by ANYCHAR and/or MBCSET
428 int *mb_match_lens; /* Array of length reduced by ANYCHAR and/or
429 MBCSET. Null if mb_follows.elems has not
433 /* Some macros for user access to dfa internals. */
435 /* S could possibly be an accepting state of R. */
436 #define ACCEPTING(s, r) ((r).states[s].constraint)
438 /* STATE accepts in the specified context. */
439 #define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
440 SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
442 static void dfamust (struct dfa *dfa);
443 static void regexp (void);
446 dfambcache (struct dfa *d)
449 for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
452 unsigned char uc = i;
455 d->mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
459 /* Store into *PWC the result of converting the leading bytes of the
460 multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
461 and updating the conversion state in *D. On conversion error,
462 convert just a single byte, to WEOF. Return the number of bytes
465 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
467 * PWC points to wint_t, not to wchar_t.
468 * The last arg is a dfa *D instead of merely a multibyte conversion
469 state D->mbs. D also contains an mbrtowc_cache for speed.
470 * N must be at least 1.
471 * S[N - 1] must be a sentinel byte.
472 * Shift encodings are not supported.
473 * The return value is always in the range 1..N.
474 * D->mbs is always valid afterwards.
475 * *PWC is always set to something. */
477 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
479 unsigned char uc = s[0];
480 wint_t wc = d->mbrtowc_cache[uc];
485 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
486 if (0 < nbytes && nbytes < (size_t) -2)
491 memset (&d->mbs, 0, sizeof d->mbs);
506 fprintf (stderr, "END");
507 else if (t < NOTCHAR)
510 fprintf (stderr, "%c", ch);
571 fprintf (stderr, "%s", s);
576 /* Stuff pertaining to charclasses. */
579 tstbit (unsigned int b, charclass const c)
581 return c[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
585 setbit (unsigned int b, charclass c)
587 c[b / CHARCLASS_WORD_BITS] |= (charclass_word) 1 << b % CHARCLASS_WORD_BITS;
591 clrbit (unsigned int b, charclass c)
593 c[b / CHARCLASS_WORD_BITS] &= ~((charclass_word) 1
594 << b % CHARCLASS_WORD_BITS);
598 copyset (charclass const src, charclass dst)
600 memcpy (dst, src, sizeof (charclass));
604 zeroset (charclass s)
606 memset (s, 0, sizeof (charclass));
614 for (i = 0; i < CHARCLASS_WORDS; ++i)
615 s[i] = CHARCLASS_WORD_MASK & ~s[i];
619 equal (charclass const s1, charclass const s2)
621 return memcmp (s1, s2, sizeof (charclass)) == 0;
624 /* Ensure that the array addressed by PTR holds at least NITEMS +
625 (PTR || !NITEMS) items. Either return PTR, or reallocate the array
626 and return its new address. Although PTR may be null, the returned
629 The array holds *NALLOC items; *NALLOC is updated on reallocation.
630 ITEMSIZE is the size of one item. Avoid O(N**2) behavior on arrays
633 maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
635 if (nitems < *nalloc)
638 return x2nrealloc (ptr, nalloc, itemsize);
641 /* In DFA D, find the index of charclass S, or allocate a new one. */
643 dfa_charclass_index (struct dfa *d, charclass const s)
647 for (i = 0; i < d->cindex; ++i)
648 if (equal (s, d->charclasses[i]))
650 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
651 sizeof *d->charclasses);
653 copyset (s, d->charclasses[i]);
657 /* A pointer to the current dfa is kept here during parsing. */
658 static struct dfa *dfa;
660 /* Find the index of charclass S in the current DFA, or allocate a new one. */
662 charclass_index (charclass const s)
664 return dfa_charclass_index (dfa, s);
667 /* Syntax bits controlling the behavior of the lexical analyzer. */
668 static reg_syntax_t syntax_bits, syntax_bits_set;
670 /* Flag for case-folding letters into sets. */
671 static bool case_fold;
673 /* End-of-line byte in data. */
674 static unsigned char eolbyte;
676 /* Cache of char-context values. */
677 static int sbit[NOTCHAR];
679 /* Set of characters considered letters. */
680 static charclass letters;
682 /* Set of characters that are newline. */
683 static charclass newline;
685 /* Add this to the test for whether a byte is word-constituent, since on
686 BSD-based systems, many values in the 128..255 range are classified as
687 alphabetic, while on glibc-based systems, they are not. */
689 # define is_valid_unibyte_character(c) 1
691 # define is_valid_unibyte_character(c) (btowc (c) != WEOF)
694 /* C is a "word-constituent" byte. */
695 #define IS_WORD_CONSTITUENT(C) \
696 (is_valid_unibyte_character (C) && (isalnum (C) || (C) == '_'))
699 char_context (unsigned char c)
703 if (IS_WORD_CONSTITUENT (c))
709 wchar_context (wint_t wc)
711 if (wc == (wchar_t) eolbyte || wc == 0)
713 if (wc == L'_' || iswalnum (wc))
718 /* Entry point to set syntax options. */
720 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
726 case_fold = fold != 0;
729 for (i = 0; i < NOTCHAR; ++i)
731 sbit[i] = char_context (i);
744 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
745 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
746 this may happen when folding case in weird Turkish locales where
747 dotless i/dotted I are not included in the chosen character set.
748 Return whether a bit was set in the charclass. */
750 setbit_wc (wint_t wc, charclass c)
760 /* Set a bit for B and its case variants in the charclass C.
761 MB_CUR_MAX must be 1. */
763 setbit_case_fold_c (int b, charclass c)
765 int ub = toupper (b);
767 for (i = 0; i < NOTCHAR; i++)
768 if (toupper (i) == ub)
774 /* UTF-8 encoding allows some optimizations that we can't otherwise
775 assume in a multibyte encoding. */
779 static int utf8 = -1;
783 mbstate_t mbs = { 0 };
784 utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
789 /* The current locale is known to be a unibyte locale
790 without multicharacter collating sequences and where range
791 comparisons simply use the native encoding. These locales can be
792 processed more efficiently. */
795 using_simple_locale (void)
797 /* The native character set is known to be compatible with
798 the C locale. The following test isn't perfect, but it's good
799 enough in practice, as only ASCII and EBCDIC are in common use
800 and this test correctly accepts ASCII and rejects EBCDIC. */
801 enum { native_c_charset =
802 ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
803 && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
804 && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
805 && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
806 && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
807 && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
808 && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
809 && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
810 && '}' == 125 && '~' == 126)
813 if (! native_c_charset || dfa->multibyte)
817 static int unibyte_c = -1;
820 char const *locale = setlocale (LC_ALL, NULL);
822 || STREQ (locale, "C")
823 || STREQ (locale, "POSIX"));
829 /* Lexical analyzer. All the dross that deals with the obnoxious
830 GNU Regex syntax bits is located here. The poor, suffering
831 reader is referred to the GNU Regex documentation for the
832 meaning of the @#%!@#%^!@ syntax bits. */
834 static char const *lexptr; /* Pointer to next input character. */
835 static size_t lexleft; /* Number of characters remaining. */
836 static token lasttok; /* Previous token returned; initially END. */
837 static bool laststart; /* We're separated from beginning or (,
838 | only by zero-width characters. */
839 static size_t parens; /* Count of outstanding left parens. */
840 static int minrep, maxrep; /* Repeat counts for {m,n}. */
842 static int cur_mb_len = 1; /* Length of the multibyte representation of
845 static wint_t wctok; /* Wide character representation of the current
846 multibyte character, or WEOF if there was
847 an encoding error. Used only if
851 /* Fetch the next lexical input character. Set C (of type int) to the
852 next input byte, except set C to EOF if the input is a multibyte
853 character of length greater than 1. Set WC (of type wint_t) to the
854 value of the input if it is a valid multibyte character (possibly
855 of length 1); otherwise set WC to WEOF. If there is no more input,
856 report EOFERR if EOFERR is not null, and return lasttok = END
858 # define FETCH_WC(c, wc, eoferr) \
865 return lasttok = END; \
870 size_t nbytes = mbs_to_wchar (&_wc, lexptr, lexleft, dfa); \
871 cur_mb_len = nbytes; \
873 (c) = nbytes == 1 ? to_uchar (*lexptr) : EOF; \
880 # define MIN(a,b) ((a) < (b) ? (a) : (b))
883 /* The set of wchar_t values C such that there's a useful locale
884 somewhere where C != towupper (C) && C != towlower (towupper (C)).
885 For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
886 towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
887 towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
888 static short const lonesome_lower[] =
890 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
891 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
893 /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
894 counterpart in locales predating Unicode 4.0.0 (April 2003). */
897 0x03F5, 0x1E9B, 0x1FBE,
900 /* Maximum number of characters that can be the case-folded
901 counterparts of a single character, not counting the character
902 itself. This is 1 for towupper, 1 for towlower, and 1 for each
903 entry in LONESOME_LOWER. */
905 { CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
907 /* Find the characters equal to C after case-folding, other than C
908 itself, and store them into FOLDED. Return the number of characters
911 case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
915 wint_t uc = towupper (c);
916 wint_t lc = towlower (uc);
919 if (lc != uc && lc != c && towupper (lc) == uc)
921 for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
923 wint_t li = lonesome_lower[i];
924 if (li != lc && li != uc && li != c && towupper (li) == uc)
930 typedef int predicate (int);
932 /* The following list maps the names of the Posix named character classes
933 to predicate functions that determine whether a given character is in
934 the class. The leading [ has already been eaten by the lexical
940 bool single_byte_only;
943 static const struct dfa_ctype prednames[] = {
944 {"alpha", isalpha, false},
945 {"upper", isupper, false},
946 {"lower", islower, false},
947 {"digit", isdigit, true},
948 {"xdigit", isxdigit, false},
949 {"space", isspace, false},
950 {"punct", ispunct, false},
951 {"alnum", isalnum, false},
952 {"print", isprint, false},
953 {"graph", isgraph, false},
954 {"cntrl", iscntrl, false},
955 {"blank", isblank, false},
959 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
960 find_pred (const char *str)
963 for (i = 0; prednames[i].name; ++i)
964 if (STREQ (str, prednames[i].name))
967 return &prednames[i];
970 /* Multibyte character handling sub-routine for lex.
971 Parse a bracket expression and build a struct mb_char_classes. */
973 parse_bracket_exp (void)
979 /* This is a bracket expression that dfaexec is known to
980 process correctly. */
981 bool known_bracket_exp = true;
983 /* Used to warn about [:space:].
984 Bit 0 = first character is a colon.
985 Bit 1 = last character is a colon.
986 Bit 2 = includes any other character but a colon.
987 Bit 3 = includes ranges, char/equiv classes or collation elements. */
988 int colon_warning_state;
994 /* Work area to build a mb_char_classes. */
995 struct mb_char_classes *work_mbc;
996 size_t chars_al, ranges_al, ch_classes_al, equivs_al, coll_elems_al;
998 chars_al = ranges_al = ch_classes_al = equivs_al = coll_elems_al = 0;
1001 dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
1002 &dfa->mbcsets_alloc,
1003 sizeof *dfa->mbcsets);
1005 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
1006 We will update dfa->multibyte_prop[] in addtok, because we can't
1007 decide the index in dfa->tokens[]. */
1009 /* Initialize work area. */
1010 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
1011 memset (work_mbc, 0, sizeof *work_mbc);
1016 memset (ccl, 0, sizeof ccl);
1017 FETCH_WC (c, wc, _("unbalanced ["));
1020 FETCH_WC (c, wc, _("unbalanced ["));
1022 known_bracket_exp = using_simple_locale ();
1027 colon_warning_state = (c == ':');
1030 c1 = NOTCHAR; /* Mark c1 as not initialized. */
1031 colon_warning_state &= ~2;
1033 /* Note that if we're looking at some other [:...:] construct,
1034 we just treat it as a bunch of ordinary characters. We can do
1035 this because we assume regex has checked for syntax errors before
1036 dfa is ever called. */
1039 FETCH_WC (c1, wc1, _("unbalanced ["));
1041 if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES))
1042 || c1 == '.' || c1 == '=')
1044 enum { MAX_BRACKET_STRING_LEN = 32 };
1045 char str[MAX_BRACKET_STRING_LEN + 1];
1049 FETCH_WC (c, wc, _("unbalanced ["));
1050 if ((c == c1 && *lexptr == ']') || lexleft == 0)
1052 if (len < MAX_BRACKET_STRING_LEN)
1055 /* This is in any case an invalid class name. */
1060 /* Fetch bracket. */
1061 FETCH_WC (c, wc, _("unbalanced ["));
1063 /* Build character class. POSIX allows character
1064 classes to match multicharacter collating elements,
1065 but the regex code does not support that, so do not
1066 worry about that possibility. */
1069 = (case_fold && (STREQ (str, "upper")
1070 || STREQ (str, "lower")) ? "alpha" : str);
1071 const struct dfa_ctype *pred = find_pred (class);
1073 dfaerror (_("invalid character class"));
1075 if (dfa->multibyte && !pred->single_byte_only)
1077 /* Store the character class as wctype_t. */
1078 wctype_t wt = wctype (class);
1080 work_mbc->ch_classes
1081 = maybe_realloc (work_mbc->ch_classes,
1082 work_mbc->nch_classes, &ch_classes_al,
1083 sizeof *work_mbc->ch_classes);
1084 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
1087 for (c2 = 0; c2 < NOTCHAR; ++c2)
1088 if (pred->func (c2))
1092 known_bracket_exp = false;
1094 colon_warning_state |= 8;
1096 /* Fetch new lookahead character. */
1097 FETCH_WC (c1, wc1, _("unbalanced ["));
1101 /* We treat '[' as a normal character here. c/c1/wc/wc1
1102 are already set up. */
1105 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1106 FETCH_WC (c, wc, _("unbalanced ["));
1109 FETCH_WC (c1, wc1, _("unbalanced ["));
1112 /* build range characters. */
1114 FETCH_WC (c2, wc2, _("unbalanced ["));
1116 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1117 Treat it like [-a[.aa.]] while parsing it, and
1118 remember that the set is unknown. */
1119 if (c2 == '[' && *lexptr == '.')
1121 known_bracket_exp = false;
1127 if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1128 FETCH_WC (c2, wc2, _("unbalanced ["));
1132 /* When case folding map a range, say [m-z] (or even [M-z])
1133 to the pair of ranges, [m-z] [M-Z]. Although this code
1134 is wrong in multiple ways, it's never used in practice.
1135 FIXME: Remove this (and related) unused code. */
1136 if (wc != WEOF && wc2 != WEOF)
1139 = maybe_realloc (work_mbc->ranges, work_mbc->nranges + 2,
1140 &ranges_al, sizeof *work_mbc->ranges);
1141 work_mbc->ranges[work_mbc->nranges].beg
1142 = case_fold ? towlower (wc) : wc;
1143 work_mbc->ranges[work_mbc->nranges++].end
1144 = case_fold ? towlower (wc2) : wc2;
1146 if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
1148 work_mbc->ranges[work_mbc->nranges].beg = towupper (wc);
1149 work_mbc->ranges[work_mbc->nranges++].end
1154 else if (using_simple_locale ())
1156 for (c1 = c; c1 <= c2; c1++)
1160 int uc = toupper (c);
1161 int uc2 = toupper (c2);
1162 for (c1 = 0; c1 < NOTCHAR; c1++)
1164 int uc1 = toupper (c1);
1165 if (uc <= uc1 && uc1 <= uc2)
1171 known_bracket_exp = false;
1173 colon_warning_state |= 8;
1174 FETCH_WC (c1, wc1, _("unbalanced ["));
1178 /* In the case [x-], the - is an ordinary hyphen,
1179 which is left in c1, the lookahead character. */
1180 lexptr -= cur_mb_len;
1181 lexleft += cur_mb_len;
1184 colon_warning_state |= (c == ':') ? 2 : 4;
1186 if (!dfa->multibyte)
1189 setbit_case_fold_c (c, ccl);
1196 known_bracket_exp = false;
1199 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1201 int n = (case_fold ? case_folded_counterparts (wc, folded + 1) + 1
1204 for (i = 0; i < n; i++)
1205 if (!setbit_wc (folded[i], ccl))
1208 = maybe_realloc (work_mbc->chars, work_mbc->nchars,
1209 &chars_al, sizeof *work_mbc->chars);
1210 work_mbc->chars[work_mbc->nchars++] = folded[i];
1214 while ((wc = wc1, (c = c1) != ']'));
1216 if (colon_warning_state == 7)
1217 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1219 if (! known_bracket_exp)
1224 static charclass zeroclass;
1225 work_mbc->invert = invert;
1226 work_mbc->cset = equal (ccl, zeroclass) ? -1 : charclass_index (ccl);
1232 assert (!dfa->multibyte);
1234 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1235 clrbit (eolbyte, ccl);
1238 return CSET + charclass_index (ccl);
1245 bool backslash = false;
1249 /* Basic plan: We fetch a character. If it's a backslash,
1250 we set the backslash flag and go through the loop again.
1251 On the plus side, this avoids having a duplicate of the
1252 main switch inside the backslash case. On the minus side,
1253 it means that just about every case begins with
1254 "if (backslash) ...". */
1255 for (i = 0; i < 2; ++i)
1257 FETCH_WC (c, wctok, NULL);
1265 dfaerror (_("unfinished \\ escape"));
1272 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1273 || lasttok == END || lasttok == LPAREN || lasttok == OR)
1274 return lasttok = BEGLINE;
1280 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1282 || (syntax_bits & RE_NO_BK_PARENS
1283 ? lexleft > 0 && *lexptr == ')'
1284 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
1285 || (syntax_bits & RE_NO_BK_VBAR
1286 ? lexleft > 0 && *lexptr == '|'
1287 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
1288 || ((syntax_bits & RE_NEWLINE_ALT)
1289 && lexleft > 0 && *lexptr == '\n'))
1290 return lasttok = ENDLINE;
1302 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
1305 return lasttok = BACKREF;
1310 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1311 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
1315 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1316 return lasttok = ENDLINE; /* FIXME: should be end of string */
1320 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1321 return lasttok = BEGWORD;
1325 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1326 return lasttok = ENDWORD;
1330 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1331 return lasttok = LIMWORD;
1335 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1336 return lasttok = NOTLIMWORD;
1340 if (syntax_bits & RE_LIMITED_OPS)
1342 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1344 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1346 return lasttok = QMARK;
1351 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1353 return lasttok = STAR;
1356 if (syntax_bits & RE_LIMITED_OPS)
1358 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1360 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1362 return lasttok = PLUS;
1365 if (!(syntax_bits & RE_INTERVALS))
1367 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
1369 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1374 {M,} - minimum count, maximum is infinity
1376 {,} - 0 to infinity (same as '*')
1377 {M,N} - M through N */
1379 char const *p = lexptr;
1380 char const *lim = p + lexleft;
1381 minrep = maxrep = -1;
1382 for (; p != lim && ISASCIIDIGIT (*p); p++)
1387 minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
1397 while (++p != lim && ISASCIIDIGIT (*p))
1402 maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0');
1406 if (! ((! backslash || (p != lim && *p++ == '\\'))
1407 && p != lim && *p++ == '}'
1408 && 0 <= minrep && (maxrep < 0 || minrep <= maxrep)))
1410 if (syntax_bits & RE_INVALID_INTERVAL_ORD)
1412 dfaerror (_("invalid content of \\{\\}"));
1414 if (RE_DUP_MAX < maxrep)
1415 dfaerror (_("regular expression too big"));
1420 return lasttok = REPMN;
1423 if (syntax_bits & RE_LIMITED_OPS)
1425 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
1428 return lasttok = OR;
1431 if (syntax_bits & RE_LIMITED_OPS
1432 || backslash || !(syntax_bits & RE_NEWLINE_ALT))
1435 return lasttok = OR;
1438 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1442 return lasttok = LPAREN;
1445 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1447 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1451 return lasttok = RPAREN;
1458 /* In multibyte environment period must match with a single
1459 character not a byte. So we use ANYCHAR. */
1461 return lasttok = ANYCHAR;
1465 if (!(syntax_bits & RE_DOT_NEWLINE))
1466 clrbit (eolbyte, ccl);
1467 if (syntax_bits & RE_DOT_NOT_NULL)
1470 return lasttok = CSET + charclass_index (ccl);
1474 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1476 if (!dfa->multibyte)
1479 for (c2 = 0; c2 < NOTCHAR; ++c2)
1485 return lasttok = CSET + charclass_index (ccl);
1488 #define PUSH_LEX_STATE(s) \
1491 char const *lexptr_saved = lexptr; \
1492 size_t lexleft_saved = lexleft; \
1494 lexleft = strlen (lexptr)
1496 #define POP_LEX_STATE() \
1497 lexptr = lexptr_saved; \
1498 lexleft = lexleft_saved; \
1502 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1503 add_utf8_anychar, makes sense. */
1505 /* \s and \S are documented to be equivalent to [[:space:]] and
1506 [^[:space:]] respectively, so tell the lexer to process those
1507 strings, each minus its "already processed" '['. */
1508 PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]");
1510 lasttok = parse_bracket_exp ();
1519 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1522 for (c2 = 0; c2 < NOTCHAR; ++c2)
1523 if (IS_WORD_CONSTITUENT (c2))
1528 return lasttok = CSET + charclass_index (ccl);
1534 return lasttok = parse_bracket_exp ();
1539 /* For multibyte character sets, folding is done in atom. Always
1542 return lasttok = WCHAR;
1544 if (case_fold && isalpha (c))
1547 setbit_case_fold_c (c, ccl);
1548 return lasttok = CSET + charclass_index (ccl);
1555 /* The above loop should consume at most a backslash
1556 and some other character. */
1558 return END; /* keeps pedantic compilers happy. */
1561 /* Recursive descent parser for regular expressions. */
1563 static token tok; /* Lookahead token. */
1564 static size_t depth; /* Current depth of a hypothetical stack
1565 holding deferred productions. This is
1566 used to determine the depth that will be
1567 required of the real stack later on in
1571 addtok_mb (token t, int mbprop)
1573 if (dfa->talloc == dfa->tindex)
1575 dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
1576 sizeof *dfa->tokens);
1578 dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1579 sizeof *dfa->multibyte_prop);
1582 dfa->multibyte_prop[dfa->tindex] = mbprop;
1583 dfa->tokens[dfa->tindex++] = t;
1607 if (depth > dfa->depth)
1611 static void addtok_wc (wint_t wc);
1613 /* Add the given token to the parse tree, maintaining the depth count and
1614 updating the maximum depth if necessary. */
1618 if (dfa->multibyte && t == MBCSET)
1620 bool need_or = false;
1621 struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
1623 /* Extract wide characters into alternations for better performance.
1624 This does not require UTF-8. */
1625 if (!work_mbc->invert)
1628 for (i = 0; i < work_mbc->nchars; i++)
1630 addtok_wc (work_mbc->chars[i]);
1635 work_mbc->nchars = 0;
1638 /* If the MBCSET is non-inverted and doesn't include neither
1639 character classes including multibyte characters, range
1640 expressions, equivalence classes nor collating elements,
1641 it can be replaced to a simple CSET. */
1642 if (work_mbc->invert
1643 || work_mbc->nch_classes != 0
1644 || work_mbc->nranges != 0
1645 || work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0)
1647 addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
1653 /* Characters have been handled above, so it is possible
1654 that the mbcset is empty now. Do nothing in that case. */
1655 if (work_mbc->cset != -1)
1657 addtok (CSET + work_mbc->cset);
1669 /* We treat a multibyte character as a single atom, so that DFA
1670 can treat a multibyte character as a single expression.
1672 e.g., we construct the following tree from "<mb1><mb2>".
1673 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1674 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1676 addtok_wc (wint_t wc)
1678 unsigned char buf[MB_LEN_MAX];
1679 mbstate_t s = { 0 };
1681 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1683 if (stored_bytes != (size_t) -1)
1684 cur_mb_len = stored_bytes;
1687 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1688 the addtok_mb call altogether can corrupt the heap. */
1693 addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
1694 for (i = 1; i < cur_mb_len; i++)
1696 addtok_mb (buf[i], i == cur_mb_len - 1 ? 2 : 0);
1702 add_utf8_anychar (void)
1704 static const charclass utf8_classes[5] = {
1705 /* 80-bf: non-leading bytes. */
1706 {0, 0, 0, 0, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, 0, 0},
1708 /* 00-7f: 1-byte sequence. */
1709 {CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK,
1710 CHARCLASS_WORD_MASK, 0, 0, 0, 0},
1712 /* c2-df: 2-byte sequence. */
1713 {0, 0, 0, 0, 0, 0, ~3 & CHARCLASS_WORD_MASK, 0},
1715 /* e0-ef: 3-byte sequence. */
1716 {0, 0, 0, 0, 0, 0, 0, 0xffff},
1718 /* f0-f7: 4-byte sequence. */
1719 {0, 0, 0, 0, 0, 0, 0, 0xff0000}
1721 const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1724 /* Define the five character classes that are needed below. */
1725 if (dfa->utf8_anychar_classes[0] == 0)
1726 for (i = 0; i < n; i++)
1729 copyset (utf8_classes[i], c);
1732 if (!(syntax_bits & RE_DOT_NEWLINE))
1733 clrbit (eolbyte, c);
1734 if (syntax_bits & RE_DOT_NOT_NULL)
1737 dfa->utf8_anychar_classes[i] = CSET + charclass_index (c);
1740 /* A valid UTF-8 character is
1743 |[0xc2-0xdf][0x80-0xbf]
1744 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1745 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1747 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1748 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1749 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1750 for (i = 1; i < n; i++)
1751 addtok (dfa->utf8_anychar_classes[i]);
1754 addtok (dfa->utf8_anychar_classes[0]);
1760 /* The grammar understood by the parser is as follows.
1779 <multibyte character>
1790 LPAREN regexp RPAREN
1793 The parser builds a parse tree in postfix form in an array of tokens. */
1808 wchar_t folded[CASE_FOLDED_BUFSIZE];
1809 int i, n = case_folded_counterparts (wctok, folded);
1810 for (i = 0; i < n; i++)
1812 addtok_wc (folded[i]);
1820 else if (tok == ANYCHAR && using_utf8 ())
1822 /* For UTF-8 expand the period to a series of CSETs that define a valid
1823 UTF-8 character. This avoids using the slow multibyte path. I'm
1824 pretty sure it would be both profitable and correct to do it for
1825 any encoding; however, the optimization must be done manually as
1826 it is done above in add_utf8_anychar. So, let's start with
1827 UTF-8: it is the most used, and the structure of the encoding
1828 makes the correctness more obvious. */
1829 add_utf8_anychar ();
1832 else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1833 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1834 || tok == ANYCHAR || tok == MBCSET
1835 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1840 else if (tok == LPAREN)
1845 dfaerror (_("unbalanced ("));
1852 /* Return the number of tokens in the given subexpression. */
1853 static size_t _GL_ATTRIBUTE_PURE
1854 nsubtoks (size_t tindex)
1858 switch (dfa->tokens[tindex - 1])
1865 return 1 + nsubtoks (tindex - 1);
1868 ntoks1 = nsubtoks (tindex - 1);
1869 return 1 + ntoks1 + nsubtoks (tindex - 1 - ntoks1);
1873 /* Copy the given subexpression to the top of the tree. */
1875 copytoks (size_t tindex, size_t ntokens)
1880 for (i = 0; i < ntokens; ++i)
1881 addtok_mb (dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
1883 for (i = 0; i < ntokens; ++i)
1884 addtok_mb (dfa->tokens[tindex + i], 3);
1891 size_t tindex, ntokens;
1894 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1895 if (tok == REPMN && (minrep || maxrep))
1897 ntokens = nsubtoks (dfa->tindex);
1898 tindex = dfa->tindex - ntokens;
1903 for (i = 1; i < minrep; ++i)
1905 copytoks (tindex, ntokens);
1908 for (; i < maxrep; ++i)
1910 copytoks (tindex, ntokens);
1916 else if (tok == REPMN)
1918 dfa->tindex -= nsubtoks (dfa->tindex);
1933 while (tok != RPAREN && tok != OR && tok >= 0)
1952 /* Main entry point for the parser. S is a string to be parsed, len is the
1953 length of the string, so s can include NUL characters. D is a pointer to
1954 the struct dfa to parse into. */
1956 dfaparse (char const *s, size_t len, struct dfa *d)
1967 memset (&d->mbs, 0, sizeof d->mbs);
1970 if (!syntax_bits_set)
1971 dfaerror (_("no syntax specified"));
1979 dfaerror (_("unbalanced )"));
1981 addtok (END - d->nregexps);
1990 /* Some primitives for operating on sets of positions. */
1992 /* Copy one set to another. */
1994 copy (position_set const *src, position_set * dst)
1996 if (dst->alloc < src->nelem)
1999 dst->alloc = src->nelem;
2000 dst->elems = x2nrealloc (NULL, &dst->alloc, sizeof *dst->elems);
2002 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2003 dst->nelem = src->nelem;
2007 alloc_position_set (position_set * s, size_t size)
2009 s->elems = xnmalloc (size, sizeof *s->elems);
2014 /* Insert position P in set S. S is maintained in sorted order on
2015 decreasing index. If there is already an entry in S with P.index
2016 then merge (logically-OR) P's constraints into the one in S.
2017 S->elems must point to an array large enough to hold the resulting set. */
2019 insert (position p, position_set * s)
2021 size_t count = s->nelem;
2022 size_t lo = 0, hi = count;
2026 size_t mid = (lo + hi) >> 1;
2027 if (s->elems[mid].index > p.index)
2033 if (lo < count && p.index == s->elems[lo].index)
2035 s->elems[lo].constraint |= p.constraint;
2039 s->elems = maybe_realloc (s->elems, count, &s->alloc, sizeof *s->elems);
2040 for (i = count; i > lo; i--)
2041 s->elems[i] = s->elems[i - 1];
2046 /* Merge two sets of positions into a third. The result is exactly as if
2047 the positions of both sets were inserted into an initially empty set. */
2049 merge (position_set const *s1, position_set const *s2, position_set * m)
2051 size_t i = 0, j = 0;
2053 if (m->alloc < s1->nelem + s2->nelem)
2056 m->elems = maybe_realloc (NULL, s1->nelem + s2->nelem, &m->alloc,
2060 while (i < s1->nelem && j < s2->nelem)
2061 if (s1->elems[i].index > s2->elems[j].index)
2062 m->elems[m->nelem++] = s1->elems[i++];
2063 else if (s1->elems[i].index < s2->elems[j].index)
2064 m->elems[m->nelem++] = s2->elems[j++];
2067 m->elems[m->nelem] = s1->elems[i++];
2068 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
2070 while (i < s1->nelem)
2071 m->elems[m->nelem++] = s1->elems[i++];
2072 while (j < s2->nelem)
2073 m->elems[m->nelem++] = s2->elems[j++];
2076 /* Delete a position from a set. */
2078 delete (position p, position_set * s)
2082 for (i = 0; i < s->nelem; ++i)
2083 if (p.index == s->elems[i].index)
2086 for (--s->nelem; i < s->nelem; ++i)
2087 s->elems[i] = s->elems[i + 1];
2090 /* Find the index of the state corresponding to the given position set with
2091 the given preceding context, or create a new state if there is no such
2092 state. Context tells whether we got here on a newline or letter. */
2094 state_index (struct dfa *d, position_set const *s, int context)
2100 for (i = 0; i < s->nelem; ++i)
2101 hash ^= s->elems[i].index + s->elems[i].constraint;
2103 /* Try to find a state that exactly matches the proposed one. */
2104 for (i = 0; i < d->sindex; ++i)
2106 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2107 || context != d->states[i].context)
2109 for (j = 0; j < s->nelem; ++j)
2110 if (s->elems[j].constraint
2111 != d->states[i].elems.elems[j].constraint
2112 || s->elems[j].index != d->states[i].elems.elems[j].index)
2118 /* We'll have to create a new state. */
2119 d->states = maybe_realloc (d->states, d->sindex, &d->salloc,
2121 d->states[i].hash = hash;
2122 alloc_position_set (&d->states[i].elems, s->nelem);
2123 copy (s, &d->states[i].elems);
2124 d->states[i].context = context;
2125 d->states[i].has_backref = false;
2126 d->states[i].has_mbcset = false;
2127 d->states[i].constraint = 0;
2128 d->states[i].first_end = 0;
2129 d->states[i].mbps.nelem = 0;
2130 d->states[i].mbps.elems = NULL;
2132 for (j = 0; j < s->nelem; ++j)
2133 if (d->tokens[s->elems[j].index] < 0)
2135 constraint = s->elems[j].constraint;
2136 if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_ANY))
2137 d->states[i].constraint |= constraint;
2138 if (!d->states[i].first_end)
2139 d->states[i].first_end = d->tokens[s->elems[j].index];
2141 else if (d->tokens[s->elems[j].index] == BACKREF)
2143 d->states[i].constraint = NO_CONSTRAINT;
2144 d->states[i].has_backref = true;
2152 /* Find the epsilon closure of a set of positions. If any position of the set
2153 contains a symbol that matches the empty string in some context, replace
2154 that position with the elements of its follow labeled with an appropriate
2155 constraint. Repeat exhaustively until no funny positions are left.
2156 S->elems must be large enough to hold the result. */
2158 epsclosure (position_set *s, struct dfa const *d, char *visited)
2162 bool initialized = false;
2164 for (i = 0; i < s->nelem; ++i)
2165 if (d->tokens[s->elems[i].index] >= NOTCHAR
2166 && d->tokens[s->elems[i].index] != BACKREF
2167 && d->tokens[s->elems[i].index] != ANYCHAR
2168 && d->tokens[s->elems[i].index] != MBCSET
2169 && d->tokens[s->elems[i].index] < CSET)
2173 memset (visited, 0, d->tindex * sizeof (*visited));
2177 p.constraint = old.constraint;
2178 delete (s->elems[i], s);
2179 if (visited[old.index])
2184 visited[old.index] = 1;
2185 switch (d->tokens[old.index])
2188 p.constraint &= BEGLINE_CONSTRAINT;
2191 p.constraint &= ENDLINE_CONSTRAINT;
2194 p.constraint &= BEGWORD_CONSTRAINT;
2197 p.constraint &= ENDWORD_CONSTRAINT;
2200 p.constraint &= LIMWORD_CONSTRAINT;
2203 p.constraint &= NOTLIMWORD_CONSTRAINT;
2208 for (j = 0; j < d->follows[old.index].nelem; ++j)
2210 p.index = d->follows[old.index].elems[j].index;
2213 /* Force rescan to start at the beginning. */
2218 /* Returns the set of contexts for which there is at least one
2219 character included in C. */
2222 charclass_context (charclass c)
2227 if (tstbit (eolbyte, c))
2228 context |= CTX_NEWLINE;
2230 for (j = 0; j < CHARCLASS_WORDS; ++j)
2232 if (c[j] & letters[j])
2233 context |= CTX_LETTER;
2234 if (c[j] & ~(letters[j] | newline[j]))
2235 context |= CTX_NONE;
2241 /* Returns the contexts on which the position set S depends. Each context
2242 in the set of returned contexts (let's call it SC) may have a different
2243 follow set than other contexts in SC, and also different from the
2244 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2245 in the complement set will have the same follow set. */
2247 static int _GL_ATTRIBUTE_PURE
2248 state_separate_contexts (position_set const *s)
2250 int separate_contexts = 0;
2253 for (j = 0; j < s->nelem; ++j)
2255 if (PREV_NEWLINE_DEPENDENT (s->elems[j].constraint))
2256 separate_contexts |= CTX_NEWLINE;
2257 if (PREV_LETTER_DEPENDENT (s->elems[j].constraint))
2258 separate_contexts |= CTX_LETTER;
2261 return separate_contexts;
2265 /* Perform bottom-up analysis on the parse tree, computing various functions.
2266 Note that at this point, we're pretending constructs like \< are real
2267 characters rather than constraints on what can follow them.
2269 Nullable: A node is nullable if it is at the root of a regexp that can
2270 match the empty string.
2271 * EMPTY leaves are nullable.
2272 * No other leaf is nullable.
2273 * A QMARK or STAR node is nullable.
2274 * A PLUS node is nullable if its argument is nullable.
2275 * A CAT node is nullable if both its arguments are nullable.
2276 * An OR node is nullable if either argument is nullable.
2278 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2279 that could correspond to the first character of a string matching the
2280 regexp rooted at the given node.
2281 * EMPTY leaves have empty firstpos.
2282 * The firstpos of a nonempty leaf is that leaf itself.
2283 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2285 * The firstpos of a CAT node is the firstpos of the left argument, union
2286 the firstpos of the right if the left argument is nullable.
2287 * The firstpos of an OR node is the union of firstpos of each argument.
2289 Lastpos: The lastpos of a node is the set of positions that could
2290 correspond to the last character of a string matching the regexp at
2292 * EMPTY leaves have empty lastpos.
2293 * The lastpos of a nonempty leaf is that leaf itself.
2294 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2296 * The lastpos of a CAT node is the lastpos of its right argument, union
2297 the lastpos of the left if the right argument is nullable.
2298 * The lastpos of an OR node is the union of the lastpos of each argument.
2300 Follow: The follow of a position is the set of positions that could
2301 correspond to the character following a character matching the node in
2302 a string matching the regexp. At this point we consider special symbols
2303 that match the empty string in some context to be just normal characters.
2304 Later, if we find that a special symbol is in a follow set, we will
2305 replace it with the elements of its follow, labeled with an appropriate
2307 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2308 the follow of every node in the lastpos.
2309 * Every node in the firstpos of the second argument of a CAT node is in
2310 the follow of every node in the lastpos of the first argument.
2312 Because of the postfix representation of the parse tree, the depth-first
2313 analysis is conveniently done by a linear scan with the aid of a stack.
2314 Sets are stored as arrays of the elements, obeying a stack-like allocation
2315 scheme; the number of elements in each set deeper in the stack can be
2316 used to determine the address of a particular set's array. */
2318 dfaanalyze (struct dfa *d, int searchflag)
2320 /* Array allocated to hold position sets. */
2321 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2322 /* Firstpos and lastpos elements. */
2323 position *firstpos = posalloc + d->nleaves;
2324 position *lastpos = firstpos + d->nleaves;
2326 /* Stack for element counts and nullable flags. */
2329 /* Whether the entry is nullable. */
2332 /* Counts of firstpos and lastpos sets. */
2335 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2337 position_set tmp; /* Temporary set for merging sets. */
2338 position_set merged; /* Result of merging sets. */
2339 int separate_contexts; /* Context wanted by some position. */
2342 char *visited = xnmalloc (d->tindex, sizeof *visited);
2345 fprintf (stderr, "dfaanalyze:\n");
2346 for (i = 0; i < d->tindex; ++i)
2348 fprintf (stderr, " %zd:", i);
2349 prtok (d->tokens[i]);
2351 putc ('\n', stderr);
2354 d->searchflag = searchflag != 0;
2355 alloc_position_set (&merged, d->nleaves);
2356 d->follows = xcalloc (d->tindex, sizeof *d->follows);
2358 for (i = 0; i < d->tindex; ++i)
2360 switch (d->tokens[i])
2363 /* The empty set is nullable. */
2364 stk->nullable = true;
2366 /* The firstpos and lastpos of the empty leaf are both empty. */
2367 stk->nfirstpos = stk->nlastpos = 0;
2373 /* Every element in the firstpos of the argument is in the follow
2374 of every element in the lastpos. */
2375 tmp.nelem = stk[-1].nfirstpos;
2376 tmp.elems = firstpos;
2378 for (j = 0; j < stk[-1].nlastpos; ++j)
2380 merge (&tmp, &d->follows[pos[j].index], &merged);
2381 copy (&merged, &d->follows[pos[j].index]);
2386 /* A QMARK or STAR node is automatically nullable. */
2387 if (d->tokens[i] != PLUS)
2388 stk[-1].nullable = true;
2392 /* Every element in the firstpos of the second argument is in the
2393 follow of every element in the lastpos of the first argument. */
2394 tmp.nelem = stk[-1].nfirstpos;
2395 tmp.elems = firstpos;
2396 pos = lastpos + stk[-1].nlastpos;
2397 for (j = 0; j < stk[-2].nlastpos; ++j)
2399 merge (&tmp, &d->follows[pos[j].index], &merged);
2400 copy (&merged, &d->follows[pos[j].index]);
2403 /* The firstpos of a CAT node is the firstpos of the first argument,
2404 union that of the second argument if the first is nullable. */
2405 if (stk[-2].nullable)
2406 stk[-2].nfirstpos += stk[-1].nfirstpos;
2408 firstpos += stk[-1].nfirstpos;
2410 /* The lastpos of a CAT node is the lastpos of the second argument,
2411 union that of the first argument if the second is nullable. */
2412 if (stk[-1].nullable)
2413 stk[-2].nlastpos += stk[-1].nlastpos;
2416 pos = lastpos + stk[-2].nlastpos;
2417 for (j = stk[-1].nlastpos; j-- > 0;)
2418 pos[j] = lastpos[j];
2419 lastpos += stk[-2].nlastpos;
2420 stk[-2].nlastpos = stk[-1].nlastpos;
2423 /* A CAT node is nullable if both arguments are nullable. */
2424 stk[-2].nullable &= stk[-1].nullable;
2429 /* The firstpos is the union of the firstpos of each argument. */
2430 stk[-2].nfirstpos += stk[-1].nfirstpos;
2432 /* The lastpos is the union of the lastpos of each argument. */
2433 stk[-2].nlastpos += stk[-1].nlastpos;
2435 /* An OR node is nullable if either argument is nullable. */
2436 stk[-2].nullable |= stk[-1].nullable;
2441 /* Anything else is a nonempty position. (Note that special
2442 constructs like \< are treated as nonempty strings here;
2443 an "epsilon closure" effectively makes them nullable later.
2444 Backreferences have to get a real position so we can detect
2445 transitions on them later. But they are nullable. */
2446 stk->nullable = d->tokens[i] == BACKREF;
2448 /* This position is in its own firstpos and lastpos. */
2449 stk->nfirstpos = stk->nlastpos = 1;
2452 --firstpos, --lastpos;
2453 firstpos->index = lastpos->index = i;
2454 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2456 /* Allocate the follow set for this position. */
2457 alloc_position_set (&d->follows[i], 1);
2461 /* ... balance the above nonsyntactic #ifdef goo... */
2462 fprintf (stderr, "node %zd:", i);
2463 prtok (d->tokens[i]);
2464 putc ('\n', stderr);
2466 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2467 fprintf (stderr, " firstpos:");
2468 for (j = stk[-1].nfirstpos; j-- > 0;)
2470 fprintf (stderr, " %zd:", firstpos[j].index);
2471 prtok (d->tokens[firstpos[j].index]);
2473 fprintf (stderr, "\n lastpos:");
2474 for (j = stk[-1].nlastpos; j-- > 0;)
2476 fprintf (stderr, " %zd:", lastpos[j].index);
2477 prtok (d->tokens[lastpos[j].index]);
2479 putc ('\n', stderr);
2483 /* For each follow set that is the follow set of a real position, replace
2484 it with its epsilon closure. */
2485 for (i = 0; i < d->tindex; ++i)
2486 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
2487 || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
2488 || d->tokens[i] >= CSET)
2491 fprintf (stderr, "follows(%zd:", i);
2492 prtok (d->tokens[i]);
2493 fprintf (stderr, "):");
2494 for (j = d->follows[i].nelem; j-- > 0;)
2496 fprintf (stderr, " %zd:", d->follows[i].elems[j].index);
2497 prtok (d->tokens[d->follows[i].elems[j].index]);
2499 putc ('\n', stderr);
2501 copy (&d->follows[i], &merged);
2502 epsclosure (&merged, d, visited);
2503 copy (&merged, &d->follows[i]);
2506 /* Get the epsilon closure of the firstpos of the regexp. The result will
2507 be the set of positions of state 0. */
2509 for (i = 0; i < stk[-1].nfirstpos; ++i)
2510 insert (firstpos[i], &merged);
2511 epsclosure (&merged, d, visited);
2513 /* Build the initial state. */
2514 separate_contexts = state_separate_contexts (&merged);
2515 state_index (d, &merged,
2516 (separate_contexts & CTX_NEWLINE
2517 ? CTX_NEWLINE : separate_contexts ^ CTX_ANY));
2521 free (merged.elems);
2526 /* Find, for each character, the transition out of state s of d, and store
2527 it in the appropriate slot of trans.
2529 We divide the positions of s into groups (positions can appear in more
2530 than one group). Each group is labeled with a set of characters that
2531 every position in the group matches (taking into account, if necessary,
2532 preceding context information of s). For each group, find the union
2533 of the its elements' follows. This set is the set of positions of the
2534 new state. For each character in the group's label, set the transition
2535 on this character to be to a state corresponding to the set's positions,
2536 and its associated backward context information, if necessary.
2538 If we are building a searching matcher, we include the positions of state
2541 The collection of groups is constructed by building an equivalence-class
2542 partition of the positions of s.
2544 For each position, find the set of characters C that it matches. Eliminate
2545 any characters from C that fail on grounds of backward context.
2547 Search through the groups, looking for a group whose label L has nonempty
2548 intersection with C. If L - C is nonempty, create a new group labeled
2549 L - C and having the same positions as the current group, and set L to
2550 the intersection of L and C. Insert the position in this group, set
2551 C = C - L, and resume scanning.
2553 If after comparing with every group there are characters remaining in C,
2554 create a new group labeled with the characters of C and insert this
2555 position in that group. */
2557 dfastate (state_num s, struct dfa *d, state_num trans[])
2559 leaf_set grps[NOTCHAR]; /* As many as will ever be needed. */
2560 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
2561 size_t ngrps = 0; /* Number of groups actually used. */
2562 position pos; /* Current position being considered. */
2563 charclass matches; /* Set of matching characters. */
2564 charclass_word matchesf; /* Nonzero if matches is nonempty. */
2565 charclass intersect; /* Intersection with some label set. */
2566 charclass_word intersectf; /* Nonzero if intersect is nonempty. */
2567 charclass leftovers; /* Stuff in the label that didn't match. */
2568 charclass_word leftoversf; /* Nonzero if leftovers is nonempty. */
2569 position_set follows; /* Union of the follows of some group. */
2570 position_set tmp; /* Temporary space for merging sets. */
2571 int possible_contexts; /* Contexts that this group can match. */
2572 int separate_contexts; /* Context that new state wants to know. */
2573 state_num state; /* New state. */
2574 state_num state_newline; /* New state on a newline transition. */
2575 state_num state_letter; /* New state on a letter transition. */
2576 bool next_isnt_1st_byte = false; /* We can't add state0. */
2581 for (i = 0; i < d->states[s].elems.nelem; ++i)
2583 pos = d->states[s].elems.elems[i];
2584 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2585 setbit (d->tokens[pos.index], matches);
2586 else if (d->tokens[pos.index] >= CSET)
2587 copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
2590 if (d->tokens[pos.index] == MBCSET
2591 || d->tokens[pos.index] == ANYCHAR)
2593 /* MB_CUR_MAX > 1 */
2594 if (d->tokens[pos.index] == MBCSET)
2595 d->states[s].has_mbcset = true;
2596 /* ANYCHAR and MBCSET must match with a single character, so we
2597 must put it to d->states[s].mbps, which contains the positions
2598 which can match with a single character not a byte. */
2599 if (d->states[s].mbps.nelem == 0)
2600 alloc_position_set (&d->states[s].mbps, 1);
2601 insert (pos, &(d->states[s].mbps));
2606 /* Some characters may need to be eliminated from matches because
2607 they fail in the current context. */
2608 if (pos.constraint != NO_CONSTRAINT)
2610 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2611 d->states[s].context, CTX_NEWLINE))
2612 for (j = 0; j < CHARCLASS_WORDS; ++j)
2613 matches[j] &= ~newline[j];
2614 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2615 d->states[s].context, CTX_LETTER))
2616 for (j = 0; j < CHARCLASS_WORDS; ++j)
2617 matches[j] &= ~letters[j];
2618 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2619 d->states[s].context, CTX_NONE))
2620 for (j = 0; j < CHARCLASS_WORDS; ++j)
2621 matches[j] &= letters[j] | newline[j];
2623 /* If there are no characters left, there's no point in going on. */
2624 for (j = 0; j < CHARCLASS_WORDS && !matches[j]; ++j)
2626 if (j == CHARCLASS_WORDS)
2630 for (j = 0; j < ngrps; ++j)
2632 /* If matches contains a single character only, and the current
2633 group's label doesn't contain that character, go on to the
2635 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2636 && !tstbit (d->tokens[pos.index], labels[j]))
2639 /* Check if this group's label has a nonempty intersection with
2642 for (k = 0; k < CHARCLASS_WORDS; ++k)
2643 intersectf |= intersect[k] = matches[k] & labels[j][k];
2647 /* It does; now find the set differences both ways. */
2648 leftoversf = matchesf = 0;
2649 for (k = 0; k < CHARCLASS_WORDS; ++k)
2651 /* Even an optimizing compiler can't know this for sure. */
2652 charclass_word match = matches[k], label = labels[j][k];
2654 leftoversf |= leftovers[k] = ~match & label;
2655 matchesf |= matches[k] = match & ~label;
2658 /* If there were leftovers, create a new group labeled with them. */
2661 copyset (leftovers, labels[ngrps]);
2662 copyset (intersect, labels[j]);
2663 grps[ngrps].elems = xnmalloc (d->nleaves,
2664 sizeof *grps[ngrps].elems);
2665 memcpy (grps[ngrps].elems, grps[j].elems,
2666 sizeof (grps[j].elems[0]) * grps[j].nelem);
2667 grps[ngrps].nelem = grps[j].nelem;
2671 /* Put the position in the current group. The constraint is
2673 grps[j].elems[grps[j].nelem++] = pos.index;
2675 /* If every character matching the current position has been
2676 accounted for, we're done. */
2681 /* If we've passed the last group, and there are still characters
2682 unaccounted for, then we'll have to create a new group. */
2685 copyset (matches, labels[ngrps]);
2687 grps[ngrps].elems = xnmalloc (d->nleaves, sizeof *grps[ngrps].elems);
2688 grps[ngrps].nelem = 1;
2689 grps[ngrps].elems[0] = pos.index;
2694 alloc_position_set (&follows, d->nleaves);
2695 alloc_position_set (&tmp, d->nleaves);
2697 /* If we are a searching matcher, the default transition is to a state
2698 containing the positions of state 0, otherwise the default transition
2699 is to fail miserably. */
2702 /* Find the state(s) corresponding to the positions of state 0. */
2703 copy (&d->states[0].elems, &follows);
2704 separate_contexts = state_separate_contexts (&follows);
2705 state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2706 if (separate_contexts & CTX_NEWLINE)
2707 state_newline = state_index (d, &follows, CTX_NEWLINE);
2709 state_newline = state;
2710 if (separate_contexts & CTX_LETTER)
2711 state_letter = state_index (d, &follows, CTX_LETTER);
2713 state_letter = state;
2715 for (i = 0; i < NOTCHAR; ++i)
2716 trans[i] = (IS_WORD_CONSTITUENT (i)) ? state_letter : state;
2717 trans[eolbyte] = state_newline;
2720 for (i = 0; i < NOTCHAR; ++i)
2723 for (i = 0; i < ngrps; ++i)
2727 /* Find the union of the follows of the positions of the group.
2728 This is a hideously inefficient loop. Fix it someday. */
2729 for (j = 0; j < grps[i].nelem; ++j)
2730 for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
2731 insert (d->follows[grps[i].elems[j]].elems[k], &follows);
2735 /* If a token in follows.elems is not 1st byte of a multibyte
2736 character, or the states of follows must accept the bytes
2737 which are not 1st byte of the multibyte character.
2738 Then, if a state of follows encounter a byte, it must not be
2739 a 1st byte of a multibyte character nor single byte character.
2740 We cansel to add state[0].follows to next state, because
2741 state[0] must accept 1st-byte
2743 For example, we assume <sb a> is a certain single byte
2744 character, <mb A> is a certain multibyte character, and the
2745 codepoint of <sb a> equals the 2nd byte of the codepoint of
2747 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2748 by accepting accepts 1st byte of <mb A>, and state[i+1]
2749 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2750 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2751 <mb A>, so we cannot add state[0]. */
2753 next_isnt_1st_byte = false;
2754 for (j = 0; j < follows.nelem; ++j)
2756 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2758 next_isnt_1st_byte = true;
2764 /* If we are building a searching matcher, throw in the positions
2765 of state 0 as well. */
2766 if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte))
2768 merge (&d->states[0].elems, &follows, &tmp);
2769 copy (&tmp, &follows);
2772 /* Find out if the new state will want any context information. */
2773 possible_contexts = charclass_context (labels[i]);
2774 separate_contexts = state_separate_contexts (&follows);
2776 /* Find the state(s) corresponding to the union of the follows. */
2777 if ((separate_contexts & possible_contexts) != possible_contexts)
2778 state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2781 if (separate_contexts & possible_contexts & CTX_NEWLINE)
2782 state_newline = state_index (d, &follows, CTX_NEWLINE);
2784 state_newline = state;
2785 if (separate_contexts & possible_contexts & CTX_LETTER)
2786 state_letter = state_index (d, &follows, CTX_LETTER);
2788 state_letter = state;
2790 /* Set the transitions for each character in the current label. */
2791 for (j = 0; j < CHARCLASS_WORDS; ++j)
2792 for (k = 0; k < CHARCLASS_WORD_BITS; ++k)
2793 if (labels[i][j] >> k & 1)
2795 int c = j * CHARCLASS_WORD_BITS + k;
2798 trans[c] = state_newline;
2799 else if (IS_WORD_CONSTITUENT (c))
2800 trans[c] = state_letter;
2801 else if (c < NOTCHAR)
2806 for (i = 0; i < ngrps; ++i)
2807 free (grps[i].elems);
2808 free (follows.elems);
2812 /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2814 realloc_trans_if_necessary (struct dfa *d, state_num new_state)
2816 state_num oldalloc = d->tralloc;
2817 if (oldalloc <= new_state)
2819 state_num **realtrans = d->trans ? d->trans - 1 : NULL;
2820 size_t newalloc, newalloc1;
2821 newalloc1 = new_state + 1;
2822 realtrans = x2nrealloc (realtrans, &newalloc1, sizeof *realtrans);
2823 realtrans[0] = NULL;
2824 d->trans = realtrans + 1;
2825 d->tralloc = newalloc = newalloc1 - 1;
2826 d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2827 d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2828 d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2829 for (; oldalloc < newalloc; oldalloc++)
2831 d->trans[oldalloc] = NULL;
2832 d->fails[oldalloc] = NULL;
2837 /* Some routines for manipulating a compiled dfa's transition tables.
2838 Each state may or may not have a transition table; if it does, and it
2839 is a non-accepting state, then d->trans[state] points to its table.
2840 If it is an accepting state then d->fails[state] points to its table.
2841 If it has no table at all, then d->trans[state] is NULL.
2842 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2845 build_state (state_num s, struct dfa *d)
2847 state_num *trans; /* The new transition table. */
2848 state_num i, maxstate;
2850 /* Set an upper limit on the number of transition tables that will ever
2851 exist at once. 1024 is arbitrary. The idea is that the frequently
2852 used transition tables will be quickly rebuilt, whereas the ones that
2853 were only needed once or twice will be cleared away. However, do
2854 not clear the initial state, as it's always used. */
2855 if (d->trcount >= 1024)
2857 for (i = 1; i < d->tralloc; ++i)
2861 d->trans[i] = d->fails[i] = NULL;
2868 /* Set up the success bits for this state. */
2870 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, *d))
2871 d->success[s] |= CTX_NEWLINE;
2872 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_LETTER, s, *d))
2873 d->success[s] |= CTX_LETTER;
2874 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d))
2875 d->success[s] |= CTX_NONE;
2877 trans = xmalloc (NOTCHAR * sizeof *trans);
2878 dfastate (s, d, trans);
2880 /* Now go through the new transition table, and make sure that the trans
2881 and fail arrays are allocated large enough to hold a pointer for the
2882 largest state mentioned in the table. */
2884 for (i = 0; i < NOTCHAR; ++i)
2885 if (maxstate < trans[i])
2886 maxstate = trans[i];
2887 realloc_trans_if_necessary (d, maxstate);
2889 /* Keep the newline transition in a special place so we can use it as
2891 d->newlines[s] = trans[eolbyte];
2892 trans[eolbyte] = -1;
2894 if (ACCEPTING (s, *d))
2895 d->fails[s] = trans;
2897 d->trans[s] = trans;
2900 /* Multibyte character handling sub-routines for dfaexec. */
2902 /* Return values of transit_state_singlebyte, and
2903 transit_state_consume_1char. */
2906 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2907 TRANSIT_STATE_DONE, /* State transition has finished. */
2908 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2909 } status_transit_state;
2911 /* Consume a single byte and transit state from 's' to '*next_state'.
2912 This function is almost same as the state transition routin in dfaexec.
2913 But state transition is done just once, otherwise matching succeed or
2914 reach the end of the buffer. */
2915 static status_transit_state
2916 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
2917 state_num * next_state)
2920 state_num works = s;
2922 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2924 while (rval == TRANSIT_STATE_IN_PROGRESS)
2926 if ((t = d->trans[works]) != NULL)
2929 rval = TRANSIT_STATE_DONE;
2935 else if (d->fails[works])
2937 works = d->fails[works][*p];
2938 rval = TRANSIT_STATE_DONE;
2942 build_state (works, d);
2945 *next_state = works;
2949 /* Match a "." against the current context. Return the length of the
2950 match, in bytes. POS is the position of the ".". */
2952 match_anychar (struct dfa *d, state_num s, position pos,
2953 wint_t wc, size_t mbclen)
2957 /* Check syntax bits. */
2958 if (wc == (wchar_t) eolbyte)
2960 if (!(syntax_bits & RE_DOT_NEWLINE))
2963 else if (wc == (wchar_t) '\0')
2965 if (syntax_bits & RE_DOT_NOT_NULL)
2968 else if (wc == WEOF)
2971 context = wchar_context (wc);
2972 if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
2978 /* Match a bracket expression against the current context.
2979 Return the length of the match, in bytes.
2980 POS is the position of the bracket expression. */
2982 match_mb_charset (struct dfa *d, state_num s, position pos,
2983 char const *p, wint_t wc, size_t match_len)
2986 bool match; /* Matching succeeded. */
2987 int op_len; /* Length of the operator. */
2990 /* Pointer to the structure to which we are currently referring. */
2991 struct mb_char_classes *work_mbc;
2995 /* Check syntax bits. */
2996 if (wc == (wchar_t) eolbyte)
2998 if (!(syntax_bits & RE_DOT_NEWLINE))
3001 else if (wc == (wchar_t) '\0')
3003 if (syntax_bits & RE_DOT_NOT_NULL)
3006 else if (wc == WEOF)
3009 context = wchar_context (wc);
3010 if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
3013 /* Assign the current referring operator to work_mbc. */
3014 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
3015 match = !work_mbc->invert;
3017 /* Match in range 0-255? */
3018 if (wc < NOTCHAR && work_mbc->cset != -1
3019 && tstbit (to_uchar (wc), d->charclasses[work_mbc->cset]))
3020 goto charset_matched;
3022 /* match with a character class? */
3023 for (i = 0; i < work_mbc->nch_classes; i++)
3025 if (iswctype ((wint_t) wc, work_mbc->ch_classes[i]))
3026 goto charset_matched;
3029 strncpy (buffer, p, match_len);
3030 buffer[match_len] = '\0';
3032 /* match with an equivalence class? */
3033 for (i = 0; i < work_mbc->nequivs; i++)
3035 op_len = strlen (work_mbc->equivs[i]);
3036 strncpy (buffer, p, op_len);
3037 buffer[op_len] = '\0';
3038 if (strcoll (work_mbc->equivs[i], buffer) == 0)
3041 goto charset_matched;
3045 /* match with a collating element? */
3046 for (i = 0; i < work_mbc->ncoll_elems; i++)
3048 op_len = strlen (work_mbc->coll_elems[i]);
3049 strncpy (buffer, p, op_len);
3050 buffer[op_len] = '\0';
3052 if (strcoll (work_mbc->coll_elems[i], buffer) == 0)
3055 goto charset_matched;
3059 /* match with a range? */
3060 for (i = 0; i < work_mbc->nranges; i++)
3062 if (work_mbc->ranges[i].beg <= wc && wc <= work_mbc->ranges[i].end)
3063 goto charset_matched;
3066 /* match with a character? */
3067 for (i = 0; i < work_mbc->nchars; i++)
3069 if (wc == work_mbc->chars[i])
3070 goto charset_matched;
3076 return match ? match_len : 0;
3079 /* Check whether each of 'd->states[s].mbps.elem' can match. Then return the
3080 array which corresponds to 'd->states[s].mbps.elem'; each element of the
3081 array contains the number of bytes with which the element can match.
3083 The caller MUST free the array which this function return. */
3085 check_matching_with_multibyte_ops (struct dfa *d, state_num s,
3086 char const *p, wint_t wc, size_t mbclen)
3091 rarray = d->mb_match_lens;
3092 for (i = 0; i < d->states[s].mbps.nelem; ++i)
3094 position pos = d->states[s].mbps.elems[i];
3095 switch (d->tokens[pos.index])
3098 rarray[i] = match_anychar (d, s, pos, wc, mbclen);
3101 rarray[i] = match_mb_charset (d, s, pos, p, wc, mbclen);
3104 break; /* cannot happen. */
3110 /* Consume a single character and enumerate all of the positions which can
3111 be the next position from the state 's'.
3113 'match_lens' is the input. It can be NULL, but it can also be the output
3114 of check_matching_with_multibyte_ops for optimization.
3116 'mbclen' and 'pps' are the output. 'mbclen' is the length of the
3117 character consumed, and 'pps' is the set this function enumerates. */
3118 static status_transit_state
3119 transit_state_consume_1char (struct dfa *d, state_num s,
3120 unsigned char const **pp,
3121 wint_t wc, size_t mbclen,
3127 status_transit_state rs = TRANSIT_STATE_DONE;
3129 if (! match_lens && d->states[s].mbps.nelem != 0)
3130 match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
3133 /* Calculate the state which can be reached from the state 's' by
3134 consuming 'mbclen' single bytes from the buffer. */
3136 for (k = 0; k < mbclen; k++)
3139 rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
3141 copy (&d->states[s1].elems, &d->mb_follows);
3143 /* Add all of the positions which can be reached from 's' by consuming
3144 a single character. */
3145 for (i = 0; i < d->states[s].mbps.nelem; i++)
3147 if (match_lens[i] == mbclen)
3148 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
3150 insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
3154 /* FIXME: this return value is always ignored. */
3158 /* Transit state from s, then return new state and update the pointer of the
3159 buffer. This function is for some operator which can match with a multi-
3160 byte character or a collating element (which may be multi characters). */
3162 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3163 unsigned char const *end)
3166 int mbclen; /* The length of current input multibyte character. */
3169 int *match_lens = NULL;
3170 size_t nelem = d->states[s].mbps.nelem; /* Just a alias. */
3171 unsigned char const *p1 = *pp;
3175 /* This state has (a) multibyte operator(s).
3176 We check whether each of them can match or not. */
3178 /* Note: caller must free the return value of this function. */
3179 mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3180 match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp,
3183 for (i = 0; i < nelem; i++)
3184 /* Search the operator which match the longest string,
3187 if (match_lens[i] > maxlen)
3188 maxlen = match_lens[i];
3192 if (nelem == 0 || maxlen == 0)
3193 /* This state has no multibyte operator which can match.
3194 We need to check only one single byte character. */
3196 status_transit_state rs;
3197 rs = transit_state_singlebyte (d, s, *pp, &s1);
3199 /* We must update the pointer if state transition succeeded. */
3200 if (rs == TRANSIT_STATE_DONE)
3206 /* This state has some operators which can match a multibyte character. */
3207 d->mb_follows.nelem = 0;
3209 /* 'maxlen' may be longer than the length of a character, because it may
3210 not be a character but a (multi character) collating element.
3211 We enumerate all of the positions which 's' can reach by consuming
3213 transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens);
3215 s1 = state_index (d, &d->mb_follows, wchar_context (wc));
3216 realloc_trans_if_necessary (d, s1);
3218 while (*pp - p1 < maxlen)
3220 mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3221 transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL);
3223 for (i = 0; i < nelem; i++)
3225 if (match_lens[i] == *pp - p1)
3227 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
3228 insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
3232 s1 = state_index (d, &d->mb_follows, wchar_context (wc));
3233 realloc_trans_if_necessary (d, s1);
3238 /* Search through a buffer looking for a match to the given struct dfa.
3239 Find the first occurrence of a string matching the regexp in the
3240 buffer, and the shortest possible version thereof. Return a pointer to
3241 the first character after the match, or NULL if none is found. BEGIN
3242 points to the beginning of the buffer, and END points to the first byte
3243 after its end. Note however that we store a sentinel byte (usually
3244 newline) in *END, so the actual buffer must be one byte longer.
3245 When ALLOW_NL is nonzero, newlines may appear in the matching string.
3246 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3247 Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
3248 encountered a back-reference (1) or not (0). The caller may use this
3249 to decide whether to fall back on a backtracking matcher. */
3251 dfaexec (struct dfa *d, char const *begin, char *end,
3252 int allow_nl, size_t *count, int *backref)
3254 state_num s, s1; /* Current state. */
3255 unsigned char const *p, *mbp; /* Current input character. */
3256 state_num **trans, *t; /* Copy of d->trans so it can be optimized
3258 unsigned char eol = eolbyte; /* Likewise for eolbyte. */
3259 unsigned char saved_end;
3264 realloc_trans_if_necessary (d, 1);
3269 p = mbp = (unsigned char const *) begin;
3271 saved_end = *(unsigned char *) end;
3276 memset (&d->mbs, 0, sizeof d->mbs);
3277 if (! d->mb_match_lens)
3279 d->mb_match_lens = xnmalloc (d->nleaves, sizeof *d->mb_match_lens);
3280 alloc_position_set (&d->mb_follows, d->nleaves);
3288 while ((t = trans[s]) != NULL)
3294 /* The initial state may encounter a byte which is not
3295 a single byte character nor the first byte of a
3296 multibyte character. But it is incorrect for the
3297 initial state to accept such a byte. For example,
3298 in Shift JIS the regular expression "\\" accepts
3299 the codepoint 0x5c, but should not accept the second
3300 byte of the codepoint 0x815c. Then the initial
3301 state must skip the bytes that are not a single
3302 byte character nor the first byte of a multibyte
3306 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3307 end - (char const *) mbp, d);
3310 if ((char *) p > end)
3317 if (d->states[s].mbps.nelem == 0)
3323 /* Falling back to the glibc matcher in this case gives
3324 better performance (up to 25% better on [a-z], for
3325 example) and enables support for collating symbols and
3326 equivalence classes. */
3327 if (d->states[s].has_mbcset && backref)
3333 /* Can match with a multibyte character (and multi character
3334 collating element). Transition table might be updated. */
3335 s = transit_state (d, s, &p, (unsigned char *) end);
3342 while ((t = trans[s]) != NULL)
3345 if ((t = trans[s1]) == NULL)
3349 s1 = tmp; /* swap */
3356 if ((char *) p > end)
3362 if (s >= 0 && d->fails[s])
3364 if (d->success[s] & sbit[*p])
3367 *backref = d->states[s].has_backref;
3374 /* Can match with a multibyte character (and multicharacter
3375 collating element). Transition table might be updated. */
3376 s = transit_state (d, s, &p, (unsigned char *) end);
3381 s = d->fails[s][*p++];
3385 /* If the previous character was a newline, count it, and skip
3386 checking of multibyte character boundary until here. */
3401 if (p[-1] == eol && allow_nl)
3403 s = d->newlines[s1];
3418 dfasuperset (struct dfa const *d)
3424 dfaisfast (struct dfa const *d)
3430 free_mbdata (struct dfa *d)
3434 free (d->multibyte_prop);
3436 for (i = 0; i < d->nmbcsets; ++i)
3439 struct mb_char_classes *p = &(d->mbcsets[i]);
3441 free (p->ch_classes);
3444 for (j = 0; j < p->nequivs; ++j)
3445 free (p->equivs[j]);
3448 for (j = 0; j < p->ncoll_elems; ++j)
3449 free (p->coll_elems[j]);
3450 free (p->coll_elems);
3454 free (d->mb_follows.elems);
3455 free (d->mb_match_lens);
3456 d->mb_match_lens = NULL;
3459 /* Initialize the components of a dfa that the other routines don't
3460 initialize for themselves. */
3462 dfainit (struct dfa *d)
3464 memset (d, 0, sizeof *d);
3465 d->multibyte = MB_CUR_MAX > 1;
3466 d->fast = !d->multibyte;
3470 dfaoptimize (struct dfa *d)
3473 bool have_backref = false;
3478 for (i = 0; i < d->tindex; ++i)
3480 switch (d->tokens[i])
3486 have_backref = true;
3489 /* Requires multi-byte algorithm. */
3496 if (!have_backref && d->superset)
3498 /* The superset DFA is not likely to be much faster, so remove it. */
3499 dfafree (d->superset);
3505 d->multibyte = false;
3509 dfassbuild (struct dfa *d)
3513 bool have_achar = false;
3514 bool have_nchar = false;
3515 struct dfa *sup = dfaalloc ();
3518 sup->multibyte = false;
3519 sup->multibyte_prop = NULL;
3520 sup->mbcsets = NULL;
3521 sup->superset = NULL;
3524 sup->follows = NULL;
3528 sup->success = NULL;
3529 sup->newlines = NULL;
3532 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3533 memcpy (sup->charclasses, d->charclasses,
3534 d->cindex * sizeof *sup->charclasses);
3536 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3537 sup->talloc = d->tindex * 2;
3539 for (i = j = 0; i < d->tindex; i++)
3541 switch (d->tokens[i])
3548 sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
3549 sup->tokens[j++] = STAR;
3550 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3551 || d->tokens[i + 1] == PLUS)
3561 /* These constraints aren't supported in a multibyte locale.
3562 Ignore them in the superset DFA, and treat them as
3563 backreferences in the main DFA. */
3564 sup->tokens[j++] = EMPTY;
3565 d->tokens[i] = BACKREF;
3569 sup->tokens[j++] = d->tokens[i];
3570 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3571 || d->tokens[i] >= CSET)
3578 if (have_nchar && (have_achar || d->multibyte))
3587 /* Parse and analyze a single string of the given length. */
3589 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
3593 dfaparse (s, len, d);
3597 dfaanalyze (d, searchflag);
3601 dfaanalyze (d->superset, searchflag);
3605 /* Free the storage held by the components of a dfa. */
3607 dfafree (struct dfa *d)
3610 struct dfamust *dm, *ndm;
3612 free (d->charclasses);
3618 for (i = 0; i < d->sindex; ++i)
3620 free (d->states[i].elems.elems);
3621 free (d->states[i].mbps.elems);
3627 for (i = 0; i < d->tindex; ++i)
3628 free (d->follows[i].elems);
3634 for (i = 0; i < d->tralloc; ++i)
3640 free (d->trans - 1);
3646 for (dm = d->musts; dm; dm = ndm)
3654 dfafree (d->superset);
3657 /* Having found the postfix representation of the regular expression,
3658 try to find a long sequence of characters that must appear in any line
3660 Finding a "longest" sequence is beyond the scope here;
3661 we take an easy way out and hope for the best.
3662 (Take "(ab|a)b"--please.)
3664 We do a bottom-up calculation of sequences of characters that must appear
3665 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3667 sequences that must appear at the left of the match ("left")
3668 sequences that must appear at the right of the match ("right")
3669 lists of sequences that must appear somewhere in the match ("in")
3670 sequences that must constitute the match ("is")
3672 When we get to the root of the tree, we use one of the longest of its
3673 calculated "in" sequences as our answer. The sequence we find is returned in
3674 d->must (where "d" is the single argument passed to "dfamust");
3675 the length of the sequence is returned in d->mustn.
3677 The sequences calculated for the various types of node (in pseudo ANSI c)
3678 are shown below. "p" is the operand of unary operators (and the left-hand
3679 operand of binary operators); "q" is the right-hand operand of binary
3682 "ZERO" means "a zero-length sequence" below.
3684 Type left right is in
3685 ---- ---- ----- -- --
3686 char c # c # c # c # c
3688 ANYCHAR ZERO ZERO ZERO ZERO
3690 MBCSET ZERO ZERO ZERO ZERO
3692 CSET ZERO ZERO ZERO ZERO
3694 STAR ZERO ZERO ZERO ZERO
3696 QMARK ZERO ZERO ZERO ZERO
3698 PLUS p->left p->right ZERO p->in
3700 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3701 p->left : q->right : q->is!=ZERO) ? q->in plus
3702 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3705 OR longest common longest common (do p->is and substrings common to
3706 leading trailing q->is have same p->in and q->in
3707 (sub)sequence (sub)sequence length and
3708 of p->left of p->right content) ?
3709 and q->left and q->right p->is : NULL
3711 If there's anything else we recognize in the tree, all four sequences get set
3712 to zero-length sequences. If there's something we don't recognize in the
3713 tree, we just return a zero-length sequence.
3715 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3718 And ... is it here or someplace that we might ponder "optimizations" such as
3719 egrep 'psi|epsilon' -> egrep 'psi'
3720 egrep 'pepsi|epsilon' -> egrep 'epsi'
3721 (Yes, we now find "epsi" as a "string
3722 that must occur", but we might also
3723 simplify the *entire* r.e. being sought)
3724 grep '[c]' -> grep 'c'
3725 grep '(ab|a)b' -> grep 'ab'
3726 grep 'ab*' -> grep 'a'
3727 grep 'a*b' -> grep 'b'
3729 There are several issues:
3731 Is optimization easy (enough)?
3733 Does optimization actually accomplish anything,
3734 or is the automaton you get from "psi|epsilon" (for example)
3735 the same as the one you get from "psi" (for example)?
3737 Are optimizable r.e.'s likely to be used in real-life situations
3738 (something like 'ab*' is probably unlikely; something like is
3739 'psi|epsilon' is likelier)? */
3742 icatalloc (char *old, char const *new)
3746 size_t newsize = strlen (new);
3749 oldsize = strlen (old);
3750 result = xrealloc (old, oldsize + newsize + 1);
3751 memcpy (result + oldsize, new, newsize + 1);
3755 static char *_GL_ATTRIBUTE_PURE
3756 istrstr (char const *lookin, char const *lookfor)
3761 len = strlen (lookfor);
3762 for (cp = lookin; *cp != '\0'; ++cp)
3763 if (strncmp (cp, lookfor, len) == 0)
3769 freelist (char **cpp)
3776 enlist (char **cpp, char *new, size_t len)
3779 new = memcpy (xmalloc (len + 1), new, len);
3781 /* Is there already something in the list that's new (or longer)? */
3782 for (i = 0; cpp[i] != NULL; ++i)
3783 if (istrstr (cpp[i], new) != NULL)
3788 /* Eliminate any obsoleted strings. */
3790 while (cpp[j] != NULL)
3791 if (istrstr (new, cpp[j]) == NULL)
3801 /* Add the new string. */
3802 cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3808 /* Given pointers to two strings, return a pointer to an allocated
3809 list of their distinct common substrings. */
3811 comsubs (char *left, char const *right)
3813 char **cpp = xzalloc (sizeof *cpp);
3816 for (lcp = left; *lcp != '\0'; ++lcp)
3819 char *rcp = strchr (right, *lcp);
3823 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3827 rcp = strchr (rcp + 1, *lcp);
3830 cpp = enlist (cpp, lcp, len);
3836 addlists (char **old, char **new)
3839 old = enlist (old, *new, strlen (*new));
3843 /* Given two lists of substrings, return a new list giving substrings
3846 inboth (char **left, char **right)
3848 char **both = xzalloc (sizeof *both);
3851 for (lnum = 0; left[lnum] != NULL; ++lnum)
3853 for (rnum = 0; right[rnum] != NULL; ++rnum)
3855 char **temp = comsubs (left[lnum], right[rnum]);
3856 both = addlists (both, temp);
3864 typedef struct must must;
3878 allocmust (must *mp)
3880 must *new_mp = xmalloc (sizeof *new_mp);
3881 new_mp->in = xzalloc (sizeof *new_mp->in);
3882 new_mp->left = xzalloc (2);
3883 new_mp->right = xzalloc (2);
3884 new_mp->is = xzalloc (2);
3885 new_mp->begline = false;
3886 new_mp->endline = false;
3892 resetmust (must *mp)
3896 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3897 mp->begline = false;
3898 mp->endline = false;
3913 dfamust (struct dfa *d)
3916 char const *result = "";
3920 bool begline = false;
3921 bool endline = false;
3924 for (ri = 0; ri < d->tindex; ++ri)
3926 token t = d->tokens[ri];
3930 mp = allocmust (mp);
3934 mp = allocmust (mp);
3939 assert (!"neither LPAREN nor RPAREN may appear here");
3949 mp = allocmust (mp);
3961 must *lmp = mp = mp->prev;
3962 size_t j, ln, rn, n;
3964 /* Guaranteed to be. Unlikely, but ... */
3965 if (STREQ (lmp->is, rmp->is))
3967 lmp->begline &= rmp->begline;
3968 lmp->endline &= rmp->endline;
3973 lmp->begline = false;
3974 lmp->endline = false;
3976 /* Left side--easy */
3978 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3980 lmp->left[i] = '\0';
3982 ln = strlen (lmp->right);
3983 rn = strlen (rmp->right);
3987 for (i = 0; i < n; ++i)
3988 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3990 for (j = 0; j < i; ++j)
3991 lmp->right[j] = lmp->right[(ln - i) + j];
3992 lmp->right[j] = '\0';
3993 new = inboth (lmp->in, rmp->in);
4007 for (i = 0; mp->in[i] != NULL; ++i)
4008 if (strlen (mp->in[i]) > strlen (result))
4010 if (STREQ (result, mp->is))
4013 begline = mp->begline;
4014 endline = mp->endline;
4021 must *lmp = mp = mp->prev;
4023 /* In. Everything in left, plus everything in
4024 right, plus concatenation of
4025 left's right and right's left. */
4026 lmp->in = addlists (lmp->in, rmp->in);
4027 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4029 size_t lrlen = strlen (lmp->right);
4030 size_t rllen = strlen (rmp->left);
4031 char *tp = xmalloc (lrlen + rllen);
4032 memcpy (tp, lmp->right, lrlen);
4033 memcpy (tp + lrlen, rmp->left, rllen);
4034 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
4038 if (lmp->is[0] != '\0')
4039 lmp->left = icatalloc (lmp->left, rmp->left);
4041 if (rmp->is[0] == '\0')
4042 lmp->right[0] = '\0';
4043 lmp->right = icatalloc (lmp->right, rmp->right);
4044 /* Guaranteed to be */
4045 if ((lmp->is[0] != '\0' || lmp->begline)
4046 && (rmp->is[0] != '\0' || rmp->endline))
4048 lmp->is = icatalloc (lmp->is, rmp->is);
4049 lmp->endline = rmp->endline;
4054 lmp->begline = false;
4055 lmp->endline = false;
4062 /* Not on *my* shift. */
4066 mp = allocmust (mp);
4069 /* If T is a singleton, or if case-folding in a unibyte
4070 locale and T's members all case-fold to the same char,
4071 convert T to one of its members. Otherwise, do
4072 nothing further with T. */
4073 charclass *ccl = &d->charclasses[t - CSET];
4075 for (j = 0; j < NOTCHAR; j++)
4076 if (tstbit (j, *ccl))
4078 if (! (j < NOTCHAR))
4081 while (++j < NOTCHAR)
4082 if (tstbit (j, *ccl)
4083 && ! (case_fold && !d->multibyte
4084 && toupper (j) == toupper (t)))
4089 mp->is[0] = mp->left[0] = mp->right[0]
4090 = case_fold && !d->multibyte ? toupper (t) : t;
4091 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
4092 mp->in = enlist (mp->in, mp->is, 1);
4099 dm = xmalloc (sizeof *dm);
4101 dm->begline = begline;
4102 dm->endline = endline;
4103 dm->must = xstrdup (result);
4104 dm->next = d->musts;
4110 must *prev = mp->prev;
4119 return xmalloc (sizeof (struct dfa));
4122 struct dfamust *_GL_ATTRIBUTE_PURE
4123 dfamusts (struct dfa const *d)
4128 /* vim:set shiftwidth=2: */