1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2012 Free Software
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
27 #include <sys/types.h>
34 #define STREQ(a, b) (strcmp (a, b) == 0)
36 /* ISASCIIDIGIT differs from isdigit, as follows:
37 - Its arg may be any int or unsigned int; it need not be an unsigned char.
38 - It's guaranteed to evaluate its argument exactly once.
39 - It's typically faster.
40 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
41 only '0' through '9' are digits. Prefer ISASCIIDIGIT to isdigit unless
42 it's important to use the locale's definition of "digit" even when the
43 host does not conform to Posix. */
44 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
46 /* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
48 #define _(str) gettext (str)
50 #include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
54 #if HAVE_LANGINFO_CODESET
55 # include <langinfo.h>
62 /* HPUX, define those as macros in sys/param.h */
70 /* Number of bits in an unsigned char. */
75 /* First integer value that is greater than any character code. */
76 #define NOTCHAR (1 << CHARBITS)
78 /* INTBITS need not be exact, just a lower bound. */
80 # define INTBITS (CHARBITS * sizeof (int))
83 /* Number of ints required to hold a bit for every character. */
84 #define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
86 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
87 typedef int charclass[CHARCLASS_INTS];
89 /* Convert a possibly-signed character to an unsigned character. This is
90 a bit safer than casting to unsigned char, since it catches some type
91 errors that the cast doesn't. */
92 static inline unsigned char
98 /* Contexts tell us whether a character is a newline or a word constituent.
99 Word-constituent characters are those that satisfy iswalnum(), plus '_'.
100 Each character has a single CTX_* value; bitmasks of CTX_* values denote
101 a particular character class.
103 A state also stores a context value, which is a bitmask of CTX_* values.
104 A state's context represents a set of characters that the state's
105 predecessors must match. For example, a state whose context does not
106 include CTX_LETTER will never have transitions where the previous
107 character is a word constituent. A state whose context is CTX_ANY
108 might have transitions from any character. */
112 #define CTX_NEWLINE 4
115 /* Sometimes characters can only be matched depending on the surrounding
116 context. Such context decisions depend on what the previous character
117 was, and the value of the current (lookahead) character. Context
118 dependent constraints are encoded as 8 bit integers. Each bit that
119 is set indicates that the constraint succeeds in the corresponding
122 bit 8-11 - valid contexts when next character is CTX_NEWLINE
123 bit 4-7 - valid contexts when next character is CTX_LETTER
124 bit 0-3 - valid contexts when next character is CTX_NONE
126 The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
127 succeeds in a particular context. Prev is a bitmask of possible
128 context values for the previous character, curr is the (single-bit)
129 context value for the lookahead character. */
130 #define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)
131 #define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf)
132 #define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf)
134 #define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
135 ((((curr) & CTX_NONE ? OTHER_CONSTRAINT(constraint) : 0) \
136 | ((curr) & CTX_LETTER ? LETTER_CONSTRAINT(constraint) : 0) \
137 | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT(constraint) : 0)) & (prev))
139 /* The following macros give information about what a constraint depends on. */
140 #define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
141 #define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111)
142 #define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111)
144 #define PREV_NEWLINE_DEPENDENT(constraint) \
145 (PREV_NEWLINE_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
146 #define PREV_LETTER_DEPENDENT(constraint) \
147 (PREV_LETTER_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
149 /* Tokens that match the empty string subject to some constraint actually
150 work by applying that constraint to determine what may follow them,
151 taking into account what has gone before. The following values are
152 the constraints corresponding to the special tokens previously defined. */
153 #define NO_CONSTRAINT 0x777
154 #define BEGLINE_CONSTRAINT 0x444
155 #define ENDLINE_CONSTRAINT 0x700
156 #define BEGWORD_CONSTRAINT 0x050
157 #define ENDWORD_CONSTRAINT 0x202
158 #define LIMWORD_CONSTRAINT 0x252
159 #define NOTLIMWORD_CONSTRAINT 0x525
161 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
162 are operators and others are terminal symbols. Most (but not all) of these
163 codes are returned by the lexical analyzer. */
165 typedef ptrdiff_t token;
167 /* Predefined token values. */
170 END = -1, /* END is a terminal symbol that matches the
171 end of input; any value of END or less in
172 the parse tree is such a symbol. Accepting
173 states of the DFA are those that would have
174 a transition on END. */
176 /* Ordinary character values are terminal symbols that match themselves. */
178 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
181 BACKREF, /* BACKREF is generated by \<digit>; it
182 is not completely handled. If the scanner
183 detects a transition on backref, it returns
184 a kind of "semi-success" indicating that
185 the match will have to be verified with
186 a backtracking matcher. */
188 BEGLINE, /* BEGLINE is a terminal symbol that matches
189 the empty string if it is at the beginning
192 ENDLINE, /* ENDLINE is a terminal symbol that matches
193 the empty string if it is at the end of
196 BEGWORD, /* BEGWORD is a terminal symbol that matches
197 the empty string if it is at the beginning
200 ENDWORD, /* ENDWORD is a terminal symbol that matches
201 the empty string if it is at the end of
204 LIMWORD, /* LIMWORD is a terminal symbol that matches
205 the empty string if it is at the beginning
206 or the end of a word. */
208 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
209 matches the empty string if it is not at
210 the beginning or end of a word. */
212 QMARK, /* QMARK is an operator of one argument that
213 matches zero or one occurrences of its
216 STAR, /* STAR is an operator of one argument that
217 matches the Kleene closure (zero or more
218 occurrences) of its argument. */
220 PLUS, /* PLUS is an operator of one argument that
221 matches the positive closure (one or more
222 occurrences) of its argument. */
224 REPMN, /* REPMN is a lexical token corresponding
225 to the {m,n} construct. REPMN never
226 appears in the compiled token vector. */
228 CAT, /* CAT is an operator of two arguments that
229 matches the concatenation of its
230 arguments. CAT is never returned by the
233 OR, /* OR is an operator of two arguments that
234 matches either of its arguments. */
236 LPAREN, /* LPAREN never appears in the parse tree,
237 it is only a lexeme. */
239 RPAREN, /* RPAREN never appears in the parse tree. */
241 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
242 any multibyte (or single byte) characters.
243 It is used only if MB_CUR_MAX > 1. */
245 MBCSET, /* MBCSET is similar to CSET, but for
246 multibyte characters. */
248 WCHAR, /* Only returned by lex. wctok contains
249 the wide character representation. */
251 CSET /* CSET and (and any value greater) is a
252 terminal symbol that matches any of a
253 class of characters. */
257 /* States of the recognizer correspond to sets of positions in the parse
258 tree, together with the constraints under which they may be matched.
259 So a position is encoded as an index into the parse tree together with
263 size_t index; /* Index into the parse array. */
264 unsigned int constraint; /* Constraint for matching this position. */
267 /* Sets of positions are stored as arrays. */
270 position *elems; /* Elements of this position set. */
271 size_t nelem; /* Number of elements in this set. */
272 size_t alloc; /* Number of elements allocated in ELEMS. */
275 /* Sets of leaves are also stored as arrays. */
278 size_t *elems; /* Elements of this position set. */
279 size_t nelem; /* Number of elements in this set. */
282 /* A state of the dfa consists of a set of positions, some flags,
283 and the token value of the lowest-numbered position of the state that
284 contains an END token. */
287 size_t hash; /* Hash of the positions of this state. */
288 position_set elems; /* Positions this state could match. */
289 unsigned char context; /* Context from previous state. */
290 char backref; /* True if this state matches a \<digit>. */
291 unsigned short constraint; /* Constraint for this state to accept. */
292 token first_end; /* Token value of the first END in elems. */
293 position_set mbps; /* Positions which can match multibyte
294 characters. e.g. period.
295 These staff are used only if
299 /* States are indexed by state_num values. These are normally
300 nonnegative but -1 is used as a special value. */
301 typedef ptrdiff_t state_num;
303 /* A bracket operator.
304 e.g. [a-c], [[:alpha:]], etc. */
305 struct mb_char_classes
309 wchar_t *chars; /* Normal characters. */
311 wctype_t *ch_classes; /* Character classes. */
313 wchar_t *range_sts; /* Range characters (start of the range). */
314 wchar_t *range_ends; /* Range characters (end of the range). */
316 char **equivs; /* Equivalence classes. */
319 size_t ncoll_elems; /* Collating elements. */
322 /* A compiled regular expression. */
325 /* Fields filled by the scanner. */
326 charclass *charclasses; /* Array of character sets for CSET tokens. */
327 size_t cindex; /* Index for adding new charclasses. */
328 size_t calloc; /* Number of charclasses currently allocated. */
330 /* Fields filled by the parser. */
331 token *tokens; /* Postfix parse array. */
332 size_t tindex; /* Index for adding new tokens. */
333 size_t talloc; /* Number of tokens currently allocated. */
334 size_t depth; /* Depth required of an evaluation stack
335 used for depth-first traversal of the
337 size_t nleaves; /* Number of leaves on the parse tree. */
338 size_t nregexps; /* Count of parallel regexps being built
340 unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
341 token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
343 /* The following are used only if MB_CUR_MAX > 1. */
345 /* The value of multibyte_prop[i] is defined by following rule.
346 if tokens[i] < NOTCHAR
347 bit 0 : tokens[i] is the first byte of a character, including
348 single-byte characters.
349 bit 1 : tokens[i] is the last byte of a character, including
350 single-byte characters.
352 if tokens[i] = MBCSET
353 ("the index of mbcsets corresponding to this operator" << 2) + 3
357 = 'single_byte_a', 'multi_byte_A', single_byte_b'
358 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
362 size_t nmultibyte_prop;
365 /* Array of the bracket expression in the DFA. */
366 struct mb_char_classes *mbcsets;
368 size_t mbcsets_alloc;
370 /* Fields filled by the state builder. */
371 dfa_state *states; /* States of the dfa. */
372 state_num sindex; /* Index for adding new states. */
373 state_num salloc; /* Number of states currently allocated. */
375 /* Fields filled by the parse tree->NFA conversion. */
376 position_set *follows; /* Array of follow sets, indexed by position
377 index. The follow of a position is the set
378 of positions containing characters that
379 could conceivably follow a character
380 matching the given position in a string
381 matching the regexp. Allocated to the
382 maximum possible position index. */
383 int searchflag; /* True if we are supposed to build a searching
384 as opposed to an exact matcher. A searching
385 matcher finds the first and shortest string
386 matching a regexp anywhere in the buffer,
387 whereas an exact matcher finds the longest
388 string matching, but anchored to the
389 beginning of the buffer. */
391 /* Fields filled by dfaexec. */
392 state_num tralloc; /* Number of transition tables that have
394 int trcount; /* Number of transition tables that have
395 actually been built. */
396 state_num **trans; /* Transition tables for states that can
397 never accept. If the transitions for a
398 state have not yet been computed, or the
399 state could possibly accept, its entry in
400 this table is NULL. */
401 state_num **realtrans; /* Trans always points to realtrans + 1; this
402 is so trans[-1] can contain NULL. */
403 state_num **fails; /* Transition tables after failing to accept
404 on a state that potentially could do so. */
405 int *success; /* Table of acceptance conditions used in
406 dfaexec and computed in build_state. */
407 state_num *newlines; /* Transitions on newlines. The entry for a
408 newline in any transition table is always
409 -1 so we can count lines without wasting
410 too many cycles. The transition for a
411 newline is stored separately and handled
412 as a special case. Newline is also used
413 as a sentinel at the end of the buffer. */
414 struct dfamust *musts; /* List of strings, at least one of which
415 is known to appear in any r.e. matching
419 /* Some macros for user access to dfa internals. */
421 /* ACCEPTING returns true if s could possibly be an accepting state of r. */
422 #define ACCEPTING(s, r) ((r).states[s].constraint)
424 /* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
425 specified context. */
426 #define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
427 SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
429 static void dfamust (struct dfa *dfa);
430 static void regexp (void);
432 /* These two macros are identical to the ones in gnulib's xalloc.h,
433 except that they not to case the result to "(t *)", and thus may
434 be used via type-free CALLOC and MALLOC macros. */
438 /* Allocate memory for N elements of type T, with error checking. */
439 /* extern t *XNMALLOC (size_t n, typename t); */
440 # define XNMALLOC(n, t) \
441 (sizeof (t) == 1 ? xmalloc (n) : xnmalloc (n, sizeof (t)))
443 /* Allocate memory for N elements of type T, with error checking,
445 /* extern t *XCALLOC (size_t n, typename t); */
446 # define XCALLOC(n, t) \
447 (sizeof (t) == 1 ? xzalloc (n) : xcalloc (n, sizeof (t)))
449 #define CALLOC(p, n) do { (p) = XCALLOC (n, *(p)); } while (0)
450 #define MALLOC(p, n) do { (p) = XNMALLOC (n, *(p)); } while (0)
451 #define REALLOC(p, n) do {(p) = xnrealloc (p, n, sizeof (*(p))); } while (0)
453 /* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED. */
454 #define REALLOC_IF_NECESSARY(p, n_alloc, n_required) \
457 if ((n_alloc) <= (n_required)) \
459 size_t new_n_alloc = (n_required) + !(p); \
460 (p) = x2nrealloc (p, &new_n_alloc, sizeof (*(p))); \
461 (n_alloc) = new_n_alloc; \
475 fprintf (stderr, "END");
476 else if (t < NOTCHAR)
479 fprintf (stderr, "%c", ch);
540 fprintf (stderr, "%s", s);
545 /* Stuff pertaining to charclasses. */
548 tstbit (unsigned int b, charclass const c)
550 return c[b / INTBITS] & 1 << b % INTBITS;
554 setbit (unsigned int b, charclass c)
556 c[b / INTBITS] |= 1 << b % INTBITS;
560 clrbit (unsigned int b, charclass c)
562 c[b / INTBITS] &= ~(1 << b % INTBITS);
566 copyset (charclass const src, charclass dst)
568 memcpy (dst, src, sizeof (charclass));
572 zeroset (charclass s)
574 memset (s, 0, sizeof (charclass));
582 for (i = 0; i < CHARCLASS_INTS; ++i)
587 equal (charclass const s1, charclass const s2)
589 return memcmp (s1, s2, sizeof (charclass)) == 0;
592 /* A pointer to the current dfa is kept here during parsing. */
593 static struct dfa *dfa;
595 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
597 charclass_index (charclass const s)
601 for (i = 0; i < dfa->cindex; ++i)
602 if (equal (s, dfa->charclasses[i]))
604 REALLOC_IF_NECESSARY (dfa->charclasses, dfa->calloc, dfa->cindex + 1);
606 copyset (s, dfa->charclasses[i]);
610 /* Syntax bits controlling the behavior of the lexical analyzer. */
611 static reg_syntax_t syntax_bits, syntax_bits_set;
613 /* Flag for case-folding letters into sets. */
614 static int case_fold;
616 /* End-of-line byte in data. */
617 static unsigned char eolbyte;
619 /* Cache of char-context values. */
620 static int sbit[NOTCHAR];
622 /* Set of characters considered letters. */
623 static charclass letters;
625 /* Set of characters that are newline. */
626 static charclass newline;
628 /* Add this to the test for whether a byte is word-constituent, since on
629 BSD-based systems, many values in the 128..255 range are classified as
630 alphabetic, while on glibc-based systems, they are not. */
632 # define is_valid_unibyte_character(c) 1
634 # define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF))
637 /* Return non-zero if C is a "word-constituent" byte; zero otherwise. */
638 #define IS_WORD_CONSTITUENT(C) \
639 (is_valid_unibyte_character (C) && (isalnum (C) || (C) == '_'))
642 char_context (unsigned char c)
644 if (c == eolbyte || c == 0)
646 if (IS_WORD_CONSTITUENT (c))
652 wchar_context (wint_t wc)
654 if (wc == (wchar_t) eolbyte || wc == 0)
656 if (wc == L'_' || iswalnum (wc))
661 /* Entry point to set syntax options. */
663 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
672 for (i = 0; i < NOTCHAR; ++i)
674 sbit[i] = char_context (i);
687 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
688 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
689 this may happen when folding case in weird Turkish locales where
690 dotless i/dotted I are not included in the chosen character set.
691 Return whether a bit was set in the charclass. */
694 setbit_wc (wint_t wc, charclass c)
704 /* Set a bit in the charclass for the given single byte character,
705 if it is valid in the current character set. */
707 setbit_c (int b, charclass c)
709 /* Do nothing if b is invalid in this character set. */
710 if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
715 # define setbit_c setbit
717 setbit_wc (wint_t wc, charclass c)
720 /*NOTREACHED*/ return false;
724 /* Like setbit_c, but if case is folded, set both cases of a letter. For
725 MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
726 and the caller takes care of setting the appropriate field of struct
729 setbit_case_fold_c (int b, charclass c)
733 wint_t wc = btowc (b);
737 if (case_fold && iswalpha (wc))
738 setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
743 if (case_fold && isalpha (b))
744 setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
750 /* UTF-8 encoding allows some optimizations that we can't otherwise
751 assume in a multibyte encoding. */
755 static int utf8 = -1;
758 #if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
759 utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
768 /* Lexical analyzer. All the dross that deals with the obnoxious
769 GNU Regex syntax bits is located here. The poor, suffering
770 reader is referred to the GNU Regex documentation for the
771 meaning of the @#%!@#%^!@ syntax bits. */
773 static char const *lexptr; /* Pointer to next input character. */
774 static size_t lexleft; /* Number of characters remaining. */
775 static token lasttok; /* Previous token returned; initially END. */
776 static int laststart; /* True if we're separated from beginning or (, |
777 only by zero-width characters. */
778 static size_t parens; /* Count of outstanding left parens. */
779 static int minrep, maxrep; /* Repeat counts for {m,n}. */
781 static int cur_mb_len = 1; /* Length of the multibyte representation of
783 /* These variables are used only if (MB_CUR_MAX > 1). */
784 static mbstate_t mbs; /* Mbstate for mbrlen(). */
785 static wchar_t wctok; /* Wide character representation of the current
786 multibyte character. */
787 static unsigned char *mblen_buf; /* Correspond to the input buffer in dfaexec().
788 Each element store the amount of remain
789 byte of corresponding multibyte character
790 in the input string. A element's value
791 is 0 if corresponding character is a
792 single byte character.
793 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
794 mblen_buf : 0, 3, 2, 1
796 static wchar_t *inputwcs; /* Wide character representation of input
798 The length of this array is same as
799 the length of input string(char array).
800 inputstring[i] is a single-byte char,
801 or 1st byte of a multibyte char.
802 And inputwcs[i] is the codepoint. */
803 static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
804 static unsigned char const *buf_end; /* reference to end in dfaexec(). */
808 /* Note that characters become unsigned here. */
809 # define FETCH_WC(c, wc, eoferr) \
816 return lasttok = END; \
821 cur_mb_len = mbrtowc (&_wc, lexptr, lexleft, &mbs); \
822 if (cur_mb_len <= 0) \
826 (wc) = (c) = to_uchar (*lexptr++); \
830 lexptr += cur_mb_len; \
831 lexleft -= cur_mb_len; \
838 # define FETCH(c, eoferr) \
841 FETCH_WC (c, wc, eoferr); \
845 /* Note that characters become unsigned here. */
846 # define FETCH(c, eoferr) \
853 return lasttok = END; \
855 (c) = to_uchar (*lexptr++); \
859 # define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
861 #endif /* MBS_SUPPORT */
864 # define MIN(a,b) ((a) < (b) ? (a) : (b))
867 typedef int predicate (int);
869 /* The following list maps the names of the Posix named character classes
870 to predicate functions that determine whether a given character is in
871 the class. The leading [ has already been eaten by the lexical analyzer. */
876 bool single_byte_only;
879 static const struct dfa_ctype prednames[] = {
880 {"alpha", isalpha, false},
881 {"upper", isupper, false},
882 {"lower", islower, false},
883 {"digit", isdigit, true},
884 {"xdigit", isxdigit, true},
885 {"space", isspace, false},
886 {"punct", ispunct, false},
887 {"alnum", isalnum, false},
888 {"print", isprint, false},
889 {"graph", isgraph, false},
890 {"cntrl", iscntrl, false},
891 {"blank", isblank, false},
895 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
896 find_pred (const char *str)
899 for (i = 0; prednames[i].name; ++i)
900 if (STREQ (str, prednames[i].name))
903 return &prednames[i];
906 /* Multibyte character handling sub-routine for lex.
907 This function parse a bracket expression and build a struct
910 parse_bracket_exp (void)
916 /* Used to warn about [:space:].
917 Bit 0 = first character is a colon.
918 Bit 1 = last character is a colon.
919 Bit 2 = includes any other character but a colon.
920 Bit 3 = includes ranges, char/equiv classes or collation elements. */
921 int colon_warning_state;
927 /* Work area to build a mb_char_classes. */
928 struct mb_char_classes *work_mbc;
929 size_t chars_al, range_sts_al, range_ends_al, ch_classes_al,
930 equivs_al, coll_elems_al;
933 range_sts_al = range_ends_al = 0;
934 ch_classes_al = equivs_al = coll_elems_al = 0;
937 REALLOC_IF_NECESSARY (dfa->mbcsets, dfa->mbcsets_alloc,
940 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
941 We will update dfa->multibyte_prop[] in addtok(), because we can't
942 decide the index in dfa->tokens[]. */
944 /* Initialize work area. */
945 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
946 memset (work_mbc, 0, sizeof *work_mbc);
951 memset (ccl, 0, sizeof ccl);
952 FETCH_WC (c, wc, _("unbalanced ["));
955 FETCH_WC (c, wc, _("unbalanced ["));
961 colon_warning_state = (c == ':');
964 c1 = EOF; /* mark c1 is not initialized". */
965 colon_warning_state &= ~2;
967 /* Note that if we're looking at some other [:...:] construct,
968 we just treat it as a bunch of ordinary characters. We can do
969 this because we assume regex has checked for syntax errors before
970 dfa is ever called. */
971 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
973 #define BRACKET_BUFFER_SIZE 128
974 char str[BRACKET_BUFFER_SIZE];
975 FETCH_WC (c1, wc1, _("unbalanced ["));
977 /* If pattern contains '[[:', '[[.', or '[[='. */
979 /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1. */
980 || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '=')))
985 FETCH_WC (c, wc, _("unbalanced ["));
986 if ((c == c1 && *lexptr == ']') || lexleft == 0)
988 if (len < BRACKET_BUFFER_SIZE)
991 /* This is in any case an invalid class name. */
997 FETCH_WC (c, wc, _("unbalanced ["));
999 /* build character class. */
1002 = (case_fold && (STREQ (str, "upper")
1003 || STREQ (str, "lower")) ? "alpha" : str);
1004 const struct dfa_ctype *pred = find_pred (class);
1006 dfaerror (_("invalid character class"));
1008 if (MB_CUR_MAX > 1 && !pred->single_byte_only)
1010 /* Store the character class as wctype_t. */
1011 wctype_t wt = wctype (class);
1013 REALLOC_IF_NECESSARY (work_mbc->ch_classes,
1015 work_mbc->nch_classes + 1);
1016 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
1019 for (c2 = 0; c2 < NOTCHAR; ++c2)
1020 if (pred->func (c2))
1021 setbit_case_fold_c (c2, ccl);
1024 else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
1026 char *elem = xmemdup (str, len + 1);
1029 /* build equivalence class. */
1031 REALLOC_IF_NECESSARY (work_mbc->equivs,
1032 equivs_al, work_mbc->nequivs + 1);
1033 work_mbc->equivs[work_mbc->nequivs++] = elem;
1037 /* build collating element. */
1039 REALLOC_IF_NECESSARY (work_mbc->coll_elems,
1041 work_mbc->ncoll_elems + 1);
1042 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
1045 colon_warning_state |= 8;
1047 /* Fetch new lookahead character. */
1048 FETCH_WC (c1, wc1, _("unbalanced ["));
1052 /* We treat '[' as a normal character here. c/c1/wc/wc1
1053 are already set up. */
1056 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1057 FETCH_WC (c, wc, _("unbalanced ["));
1060 FETCH_WC (c1, wc1, _("unbalanced ["));
1063 /* build range characters. */
1065 FETCH_WC (c2, wc2, _("unbalanced ["));
1068 /* In the case [x-], the - is an ordinary hyphen,
1069 which is left in c1, the lookahead character. */
1070 lexptr -= cur_mb_len;
1071 lexleft += cur_mb_len;
1075 if (c1 == '-' && c2 != ']')
1077 if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1078 FETCH_WC (c2, wc2, _("unbalanced ["));
1082 /* When case folding map a range, say [m-z] (or even [M-z])
1083 to the pair of ranges, [m-z] [M-Z]. */
1084 REALLOC_IF_NECESSARY (work_mbc->range_sts,
1085 range_sts_al, work_mbc->nranges + 1);
1086 REALLOC_IF_NECESSARY (work_mbc->range_ends,
1087 range_ends_al, work_mbc->nranges + 1);
1088 work_mbc->range_sts[work_mbc->nranges] =
1089 case_fold ? towlower (wc) : (wchar_t) wc;
1090 work_mbc->range_ends[work_mbc->nranges++] =
1091 case_fold ? towlower (wc2) : (wchar_t) wc2;
1094 if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
1096 REALLOC_IF_NECESSARY (work_mbc->range_sts,
1097 range_sts_al, work_mbc->nranges + 1);
1098 work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
1099 REALLOC_IF_NECESSARY (work_mbc->range_ends,
1100 range_ends_al, work_mbc->nranges + 1);
1101 work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
1107 /* Defer to the system regex library about the meaning
1108 of range expressions. */
1110 char pattern[6] = { '[', 0, '-', 0, ']', 0 };
1111 char subject[2] = { 0, 0 };
1121 regcomp (&re, pattern, REG_NOSUB);
1122 for (c = 0; c < NOTCHAR; ++c)
1124 if ((case_fold && isupper (c))
1125 || (MB_CUR_MAX > 1 && btowc (c) == WEOF))
1128 if (regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
1129 setbit_case_fold_c (c, ccl);
1134 colon_warning_state |= 8;
1135 FETCH_WC (c1, wc1, _("unbalanced ["));
1139 colon_warning_state |= (c == ':') ? 2 : 4;
1141 if (MB_CUR_MAX == 1)
1143 setbit_case_fold_c (c, ccl);
1147 if (case_fold && iswalpha (wc))
1150 if (!setbit_wc (wc, ccl))
1152 REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
1153 work_mbc->nchars + 1);
1154 work_mbc->chars[work_mbc->nchars++] = wc;
1162 if (!setbit_wc (wc, ccl))
1164 REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
1165 work_mbc->nchars + 1);
1166 work_mbc->chars[work_mbc->nchars++] = wc;
1169 while ((wc = wc1, (c = c1) != ']'));
1171 if (colon_warning_state == 7)
1172 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1176 static charclass zeroclass;
1177 work_mbc->invert = invert;
1178 work_mbc->cset = equal (ccl, zeroclass) ? -1 : charclass_index (ccl);
1184 assert (MB_CUR_MAX == 1);
1186 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1187 clrbit (eolbyte, ccl);
1190 return CSET + charclass_index (ccl);
1201 /* Basic plan: We fetch a character. If it's a backslash,
1202 we set the backslash flag and go through the loop again.
1203 On the plus side, this avoids having a duplicate of the
1204 main switch inside the backslash case. On the minus side,
1205 it means that just about every case begins with
1206 "if (backslash) ...". */
1207 for (i = 0; i < 2; ++i)
1211 FETCH_WC (c, wctok, NULL);
1224 dfaerror (_("unfinished \\ escape"));
1231 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1232 || lasttok == END || lasttok == LPAREN || lasttok == OR)
1233 return lasttok = BEGLINE;
1239 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1241 || (syntax_bits & RE_NO_BK_PARENS
1242 ? lexleft > 0 && *lexptr == ')'
1243 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
1244 || (syntax_bits & RE_NO_BK_VBAR
1245 ? lexleft > 0 && *lexptr == '|'
1246 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
1247 || ((syntax_bits & RE_NEWLINE_ALT)
1248 && lexleft > 0 && *lexptr == '\n'))
1249 return lasttok = ENDLINE;
1261 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
1264 return lasttok = BACKREF;
1269 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1270 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
1274 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1275 return lasttok = ENDLINE; /* FIXME: should be end of string */
1279 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1280 return lasttok = BEGWORD;
1284 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1285 return lasttok = ENDWORD;
1289 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1290 return lasttok = LIMWORD;
1294 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1295 return lasttok = NOTLIMWORD;
1299 if (syntax_bits & RE_LIMITED_OPS)
1301 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1303 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1305 return lasttok = QMARK;
1310 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1312 return lasttok = STAR;
1315 if (syntax_bits & RE_LIMITED_OPS)
1317 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1319 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1321 return lasttok = PLUS;
1324 if (!(syntax_bits & RE_INTERVALS))
1326 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
1328 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1333 {M,} - minimum count, maximum is infinity
1335 {,} - 0 to infinity (same as '*')
1336 {M,N} - M through N */
1338 char const *p = lexptr;
1339 char const *lim = p + lexleft;
1340 minrep = maxrep = -1;
1341 for (; p != lim && ISASCIIDIGIT (*p); p++)
1346 minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
1356 while (++p != lim && ISASCIIDIGIT (*p))
1361 maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0');
1365 if (! ((! backslash || (p != lim && *p++ == '\\'))
1366 && p != lim && *p++ == '}'
1367 && 0 <= minrep && (maxrep < 0 || minrep <= maxrep)))
1369 if (syntax_bits & RE_INVALID_INTERVAL_ORD)
1371 dfaerror (_("Invalid content of \\{\\}"));
1373 if (RE_DUP_MAX < maxrep)
1374 dfaerror (_("Regular expression too big"));
1379 return lasttok = REPMN;
1382 if (syntax_bits & RE_LIMITED_OPS)
1384 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
1387 return lasttok = OR;
1390 if (syntax_bits & RE_LIMITED_OPS
1391 || backslash || !(syntax_bits & RE_NEWLINE_ALT))
1394 return lasttok = OR;
1397 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1401 return lasttok = LPAREN;
1404 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1406 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1410 return lasttok = RPAREN;
1417 /* In multibyte environment period must match with a single
1418 character not a byte. So we use ANYCHAR. */
1420 return lasttok = ANYCHAR;
1424 if (!(syntax_bits & RE_DOT_NEWLINE))
1425 clrbit (eolbyte, ccl);
1426 if (syntax_bits & RE_DOT_NOT_NULL)
1429 return lasttok = CSET + charclass_index (ccl);
1433 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1436 for (c2 = 0; c2 < NOTCHAR; ++c2)
1442 return lasttok = CSET + charclass_index (ccl);
1446 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1449 for (c2 = 0; c2 < NOTCHAR; ++c2)
1450 if (IS_WORD_CONSTITUENT (c2))
1455 return lasttok = CSET + charclass_index (ccl);
1461 return lasttok = parse_bracket_exp ();
1466 /* For multibyte character sets, folding is done in atom. Always
1469 return lasttok = WCHAR;
1471 if (case_fold && isalpha (c))
1474 setbit_case_fold_c (c, ccl);
1475 return lasttok = CSET + charclass_index (ccl);
1482 /* The above loop should consume at most a backslash
1483 and some other character. */
1485 return END; /* keeps pedantic compilers happy. */
1488 /* Recursive descent parser for regular expressions. */
1490 static token tok; /* Lookahead token. */
1491 static size_t depth; /* Current depth of a hypothetical stack
1492 holding deferred productions. This is
1493 used to determine the depth that will be
1494 required of the real stack later on in
1498 addtok_mb (token t, int mbprop)
1502 REALLOC_IF_NECESSARY (dfa->multibyte_prop, dfa->nmultibyte_prop,
1504 dfa->multibyte_prop[dfa->tindex] = mbprop;
1507 REALLOC_IF_NECESSARY (dfa->tokens, dfa->talloc, dfa->tindex + 1);
1508 dfa->tokens[dfa->tindex++] = t;
1528 if (depth > dfa->depth)
1532 static void addtok_wc (wint_t wc);
1534 /* Add the given token to the parse tree, maintaining the depth count and
1535 updating the maximum depth if necessary. */
1539 if (MB_CUR_MAX > 1 && t == MBCSET)
1541 bool need_or = false;
1542 struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
1544 /* Extract wide characters into alternations for better performance.
1545 This does not require UTF-8. */
1546 if (!work_mbc->invert)
1549 for (i = 0; i < work_mbc->nchars; i++)
1551 addtok_wc (work_mbc->chars[i]);
1556 work_mbc->nchars = 0;
1559 /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */
1560 if (work_mbc->invert
1561 || (!using_utf8 () && work_mbc->cset != -1)
1562 || work_mbc->nchars != 0
1563 || work_mbc->nch_classes != 0
1564 || work_mbc->nranges != 0
1565 || work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0)
1567 addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
1573 /* Characters have been handled above, so it is possible
1574 that the mbcset is empty now. Do nothing in that case. */
1575 if (work_mbc->cset != -1)
1577 assert (using_utf8 ());
1578 addtok (CSET + work_mbc->cset);
1591 /* We treat a multibyte character as a single atom, so that DFA
1592 can treat a multibyte character as a single expression.
1594 e.g. We construct following tree from "<mb1><mb2>".
1595 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1596 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1598 addtok_wc (wint_t wc)
1600 unsigned char buf[MB_LEN_MAX];
1603 memset (&s, 0, sizeof s);
1604 cur_mb_len = wcrtomb ((char *) buf, wc, &s);
1606 /* This is merely stop-gap. When cur_mb_len is 0 or negative,
1607 buf[0] is undefined, yet skipping the addtok_mb call altogether
1608 can result in heap corruption. */
1609 if (cur_mb_len <= 0)
1612 addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
1613 for (i = 1; i < cur_mb_len; i++)
1615 addtok_mb (buf[i], i == cur_mb_len - 1 ? 2 : 0);
1621 addtok_wc (wint_t wc)
1627 add_utf8_anychar (void)
1630 static const charclass utf8_classes[5] = {
1631 {0, 0, 0, 0, ~0, ~0, 0, 0}, /* 80-bf: non-lead bytes */
1632 {~0, ~0, ~0, ~0, 0, 0, 0, 0}, /* 00-7f: 1-byte sequence */
1633 {0, 0, 0, 0, 0, 0, ~3, 0}, /* c2-df: 2-byte sequence */
1634 {0, 0, 0, 0, 0, 0, 0, 0xffff}, /* e0-ef: 3-byte sequence */
1635 {0, 0, 0, 0, 0, 0, 0, 0xff0000} /* f0-f7: 4-byte sequence */
1637 const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1640 /* Define the five character classes that are needed below. */
1641 if (dfa->utf8_anychar_classes[0] == 0)
1642 for (i = 0; i < n; i++)
1645 copyset (utf8_classes[i], c);
1648 if (!(syntax_bits & RE_DOT_NEWLINE))
1649 clrbit (eolbyte, c);
1650 if (syntax_bits & RE_DOT_NOT_NULL)
1653 dfa->utf8_anychar_classes[i] = CSET + charclass_index (c);
1656 /* A valid UTF-8 character is
1659 |[0xc2-0xdf][0x80-0xbf]
1660 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1661 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1663 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1664 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1665 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1666 for (i = 1; i < n; i++)
1667 addtok (dfa->utf8_anychar_classes[i]);
1670 addtok (dfa->utf8_anychar_classes[0]);
1677 /* The grammar understood by the parser is as follows.
1696 <multibyte character>
1707 LPAREN regexp RPAREN
1710 The parser builds a parse tree in postfix form in an array of tokens. */
1719 else if (MBS_SUPPORT && tok == WCHAR)
1721 addtok_wc (case_fold ? towlower (wctok) : wctok);
1723 if (case_fold && iswalpha (wctok))
1725 addtok_wc (towupper (wctok));
1732 else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ())
1734 /* For UTF-8 expand the period to a series of CSETs that define a valid
1735 UTF-8 character. This avoids using the slow multibyte path. I'm
1736 pretty sure it would be both profitable and correct to do it for
1737 any encoding; however, the optimization must be done manually as
1738 it is done above in add_utf8_anychar. So, let's start with
1739 UTF-8: it is the most used, and the structure of the encoding
1740 makes the correctness more obvious. */
1741 add_utf8_anychar ();
1744 else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1745 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1747 || tok == ANYCHAR || tok == MBCSET
1748 #endif /* MBS_SUPPORT */
1749 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1754 else if (tok == LPAREN)
1759 dfaerror (_("unbalanced ("));
1766 /* Return the number of tokens in the given subexpression. */
1767 static size_t _GL_ATTRIBUTE_PURE
1768 nsubtoks (size_t tindex)
1772 switch (dfa->tokens[tindex - 1])
1779 return 1 + nsubtoks (tindex - 1);
1782 ntoks1 = nsubtoks (tindex - 1);
1783 return 1 + ntoks1 + nsubtoks (tindex - 1 - ntoks1);
1787 /* Copy the given subexpression to the top of the tree. */
1789 copytoks (size_t tindex, size_t ntokens)
1793 for (i = 0; i < ntokens; ++i)
1795 addtok (dfa->tokens[tindex + i]);
1796 /* Update index into multibyte csets. */
1797 if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
1798 dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
1806 size_t tindex, ntokens;
1809 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1810 if (tok == REPMN && (minrep || maxrep))
1812 ntokens = nsubtoks (dfa->tindex);
1813 tindex = dfa->tindex - ntokens;
1818 for (i = 1; i < minrep; ++i)
1820 copytoks (tindex, ntokens);
1823 for (; i < maxrep; ++i)
1825 copytoks (tindex, ntokens);
1831 else if (tok == REPMN)
1833 dfa->tindex -= nsubtoks (dfa->tindex);
1848 while (tok != RPAREN && tok != OR && tok >= 0)
1867 /* Main entry point for the parser. S is a string to be parsed, len is the
1868 length of the string, so s can include NUL characters. D is a pointer to
1869 the struct dfa to parse into. */
1871 dfaparse (char const *s, size_t len, struct dfa *d)
1882 memset (&mbs, 0, sizeof mbs);
1885 if (!syntax_bits_set)
1886 dfaerror (_("no syntax specified"));
1894 dfaerror (_("unbalanced )"));
1896 addtok (END - d->nregexps);
1905 /* Some primitives for operating on sets of positions. */
1907 /* Copy one set to another; the destination must be large enough. */
1909 copy (position_set const *src, position_set * dst)
1911 REALLOC_IF_NECESSARY (dst->elems, dst->alloc, src->nelem);
1912 memcpy (dst->elems, src->elems, sizeof (dst->elems[0]) * src->nelem);
1913 dst->nelem = src->nelem;
1917 alloc_position_set (position_set * s, size_t size)
1919 MALLOC (s->elems, size);
1924 /* Insert position P in set S. S is maintained in sorted order on
1925 decreasing index. If there is already an entry in S with P.index
1926 then merge (logically-OR) P's constraints into the one in S.
1927 S->elems must point to an array large enough to hold the resulting set. */
1929 insert (position p, position_set * s)
1931 size_t count = s->nelem;
1932 size_t lo = 0, hi = count;
1936 size_t mid = (lo + hi) >> 1;
1937 if (s->elems[mid].index > p.index)
1943 if (lo < count && p.index == s->elems[lo].index)
1945 s->elems[lo].constraint |= p.constraint;
1949 REALLOC_IF_NECESSARY (s->elems, s->alloc, count + 1);
1950 for (i = count; i > lo; i--)
1951 s->elems[i] = s->elems[i - 1];
1956 /* Merge two sets of positions into a third. The result is exactly as if
1957 the positions of both sets were inserted into an initially empty set. */
1959 merge (position_set const *s1, position_set const *s2, position_set * m)
1961 size_t i = 0, j = 0;
1963 REALLOC_IF_NECESSARY (m->elems, m->alloc, s1->nelem + s2->nelem);
1965 while (i < s1->nelem && j < s2->nelem)
1966 if (s1->elems[i].index > s2->elems[j].index)
1967 m->elems[m->nelem++] = s1->elems[i++];
1968 else if (s1->elems[i].index < s2->elems[j].index)
1969 m->elems[m->nelem++] = s2->elems[j++];
1972 m->elems[m->nelem] = s1->elems[i++];
1973 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1975 while (i < s1->nelem)
1976 m->elems[m->nelem++] = s1->elems[i++];
1977 while (j < s2->nelem)
1978 m->elems[m->nelem++] = s2->elems[j++];
1981 /* Delete a position from a set. */
1983 delete (position p, position_set * s)
1987 for (i = 0; i < s->nelem; ++i)
1988 if (p.index == s->elems[i].index)
1991 for (--s->nelem; i < s->nelem; ++i)
1992 s->elems[i] = s->elems[i + 1];
1995 /* Find the index of the state corresponding to the given position set with
1996 the given preceding context, or create a new state if there is no such
1997 state. Context tells whether we got here on a newline or letter. */
1999 state_index (struct dfa *d, position_set const *s, int context)
2005 for (i = 0; i < s->nelem; ++i)
2006 hash ^= s->elems[i].index + s->elems[i].constraint;
2008 /* Try to find a state that exactly matches the proposed one. */
2009 for (i = 0; i < d->sindex; ++i)
2011 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2012 || context != d->states[i].context)
2014 for (j = 0; j < s->nelem; ++j)
2015 if (s->elems[j].constraint
2016 != d->states[i].elems.elems[j].constraint
2017 || s->elems[j].index != d->states[i].elems.elems[j].index)
2023 /* We'll have to create a new state. */
2024 REALLOC_IF_NECESSARY (d->states, d->salloc, d->sindex + 1);
2025 d->states[i].hash = hash;
2026 alloc_position_set (&d->states[i].elems, s->nelem);
2027 copy (s, &d->states[i].elems);
2028 d->states[i].context = context;
2029 d->states[i].backref = 0;
2030 d->states[i].constraint = 0;
2031 d->states[i].first_end = 0;
2034 d->states[i].mbps.nelem = 0;
2035 d->states[i].mbps.elems = NULL;
2037 for (j = 0; j < s->nelem; ++j)
2038 if (d->tokens[s->elems[j].index] < 0)
2040 constraint = s->elems[j].constraint;
2041 if (SUCCEEDS_IN_CONTEXT (constraint, context, CTX_ANY))
2042 d->states[i].constraint |= constraint;
2043 if (!d->states[i].first_end)
2044 d->states[i].first_end = d->tokens[s->elems[j].index];
2046 else if (d->tokens[s->elems[j].index] == BACKREF)
2048 d->states[i].constraint = NO_CONSTRAINT;
2049 d->states[i].backref = 1;
2057 /* Find the epsilon closure of a set of positions. If any position of the set
2058 contains a symbol that matches the empty string in some context, replace
2059 that position with the elements of its follow labeled with an appropriate
2060 constraint. Repeat exhaustively until no funny positions are left.
2061 S->elems must be large enough to hold the result. */
2063 epsclosure (position_set * s, struct dfa const *d)
2066 char *visited; /* array of booleans, enough to use char, not int */
2069 CALLOC (visited, d->tindex);
2071 for (i = 0; i < s->nelem; ++i)
2072 if (d->tokens[s->elems[i].index] >= NOTCHAR
2073 && d->tokens[s->elems[i].index] != BACKREF
2075 && d->tokens[s->elems[i].index] != ANYCHAR
2076 && d->tokens[s->elems[i].index] != MBCSET
2078 && d->tokens[s->elems[i].index] < CSET)
2081 p.constraint = old.constraint;
2082 delete (s->elems[i], s);
2083 if (visited[old.index])
2088 visited[old.index] = 1;
2089 switch (d->tokens[old.index])
2092 p.constraint &= BEGLINE_CONSTRAINT;
2095 p.constraint &= ENDLINE_CONSTRAINT;
2098 p.constraint &= BEGWORD_CONSTRAINT;
2101 p.constraint &= ENDWORD_CONSTRAINT;
2104 p.constraint &= LIMWORD_CONSTRAINT;
2107 p.constraint &= NOTLIMWORD_CONSTRAINT;
2112 for (j = 0; j < d->follows[old.index].nelem; ++j)
2114 p.index = d->follows[old.index].elems[j].index;
2117 /* Force rescan to start at the beginning. */
2124 /* Returns the set of contexts for which there is at least one
2125 character included in C. */
2128 charclass_context (charclass c)
2133 if (tstbit (eolbyte, c))
2134 context |= CTX_NEWLINE;
2136 for (j = 0; j < CHARCLASS_INTS; ++j)
2138 if (c[j] & letters[j])
2139 context |= CTX_LETTER;
2140 if (c[j] & ~(letters[j] | newline[j]))
2141 context |= CTX_NONE;
2147 /* Returns the contexts on which the position set S depends. Each context
2148 in the set of returned contexts (let's call it SC) may have a different
2149 follow set than other contexts in SC, and also different from the
2150 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2151 in the complement set will have the same follow set. */
2153 static int _GL_ATTRIBUTE_PURE
2154 state_separate_contexts (position_set const *s)
2156 int separate_contexts = 0;
2159 for (j = 0; j < s->nelem; ++j)
2161 if (PREV_NEWLINE_DEPENDENT (s->elems[j].constraint))
2162 separate_contexts |= CTX_NEWLINE;
2163 if (PREV_LETTER_DEPENDENT (s->elems[j].constraint))
2164 separate_contexts |= CTX_LETTER;
2167 return separate_contexts;
2171 /* Perform bottom-up analysis on the parse tree, computing various functions.
2172 Note that at this point, we're pretending constructs like \< are real
2173 characters rather than constraints on what can follow them.
2175 Nullable: A node is nullable if it is at the root of a regexp that can
2176 match the empty string.
2177 * EMPTY leaves are nullable.
2178 * No other leaf is nullable.
2179 * A QMARK or STAR node is nullable.
2180 * A PLUS node is nullable if its argument is nullable.
2181 * A CAT node is nullable if both its arguments are nullable.
2182 * An OR node is nullable if either argument is nullable.
2184 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2185 that could correspond to the first character of a string matching the
2186 regexp rooted at the given node.
2187 * EMPTY leaves have empty firstpos.
2188 * The firstpos of a nonempty leaf is that leaf itself.
2189 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2191 * The firstpos of a CAT node is the firstpos of the left argument, union
2192 the firstpos of the right if the left argument is nullable.
2193 * The firstpos of an OR node is the union of firstpos of each argument.
2195 Lastpos: The lastpos of a node is the set of positions that could
2196 correspond to the last character of a string matching the regexp at
2198 * EMPTY leaves have empty lastpos.
2199 * The lastpos of a nonempty leaf is that leaf itself.
2200 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2202 * The lastpos of a CAT node is the lastpos of its right argument, union
2203 the lastpos of the left if the right argument is nullable.
2204 * The lastpos of an OR node is the union of the lastpos of each argument.
2206 Follow: The follow of a position is the set of positions that could
2207 correspond to the character following a character matching the node in
2208 a string matching the regexp. At this point we consider special symbols
2209 that match the empty string in some context to be just normal characters.
2210 Later, if we find that a special symbol is in a follow set, we will
2211 replace it with the elements of its follow, labeled with an appropriate
2213 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2214 the follow of every node in the lastpos.
2215 * Every node in the firstpos of the second argument of a CAT node is in
2216 the follow of every node in the lastpos of the first argument.
2218 Because of the postfix representation of the parse tree, the depth-first
2219 analysis is conveniently done by a linear scan with the aid of a stack.
2220 Sets are stored as arrays of the elements, obeying a stack-like allocation
2221 scheme; the number of elements in each set deeper in the stack can be
2222 used to determine the address of a particular set's array. */
2224 dfaanalyze (struct dfa *d, int searchflag)
2226 int *nullable; /* Nullable stack. */
2227 size_t *nfirstpos; /* Element count stack for firstpos sets. */
2228 position *firstpos; /* Array where firstpos elements are stored. */
2229 size_t *nlastpos; /* Element count stack for lastpos sets. */
2230 position *lastpos; /* Array where lastpos elements are stored. */
2231 position_set tmp; /* Temporary set for merging sets. */
2232 position_set merged; /* Result of merging sets. */
2233 int separate_contexts; /* Context wanted by some position. */
2235 size_t *o_nfirst, *o_nlast;
2236 position *o_firstpos, *o_lastpos;
2241 fprintf (stderr, "dfaanalyze:\n");
2242 for (i = 0; i < d->tindex; ++i)
2244 fprintf (stderr, " %zd:", i);
2245 prtok (d->tokens[i]);
2247 putc ('\n', stderr);
2250 d->searchflag = searchflag;
2252 MALLOC (nullable, d->depth);
2253 o_nullable = nullable;
2254 MALLOC (nfirstpos, d->depth);
2255 o_nfirst = nfirstpos;
2256 MALLOC (firstpos, d->nleaves);
2257 o_firstpos = firstpos, firstpos += d->nleaves;
2258 MALLOC (nlastpos, d->depth);
2260 MALLOC (lastpos, d->nleaves);
2261 o_lastpos = lastpos, lastpos += d->nleaves;
2262 alloc_position_set (&merged, d->nleaves);
2264 CALLOC (d->follows, d->tindex);
2266 for (i = 0; i < d->tindex; ++i)
2268 switch (d->tokens[i])
2271 /* The empty set is nullable. */
2274 /* The firstpos and lastpos of the empty leaf are both empty. */
2275 *nfirstpos++ = *nlastpos++ = 0;
2280 /* Every element in the firstpos of the argument is in the follow
2281 of every element in the lastpos. */
2282 tmp.nelem = nfirstpos[-1];
2283 tmp.elems = firstpos;
2285 for (j = 0; j < nlastpos[-1]; ++j)
2287 merge (&tmp, &d->follows[pos[j].index], &merged);
2288 copy (&merged, &d->follows[pos[j].index]);
2292 /* A QMARK or STAR node is automatically nullable. */
2293 if (d->tokens[i] != PLUS)
2298 /* Every element in the firstpos of the second argument is in the
2299 follow of every element in the lastpos of the first argument. */
2300 tmp.nelem = nfirstpos[-1];
2301 tmp.elems = firstpos;
2302 pos = lastpos + nlastpos[-1];
2303 for (j = 0; j < nlastpos[-2]; ++j)
2305 merge (&tmp, &d->follows[pos[j].index], &merged);
2306 copy (&merged, &d->follows[pos[j].index]);
2309 /* The firstpos of a CAT node is the firstpos of the first argument,
2310 union that of the second argument if the first is nullable. */
2312 nfirstpos[-2] += nfirstpos[-1];
2314 firstpos += nfirstpos[-1];
2317 /* The lastpos of a CAT node is the lastpos of the second argument,
2318 union that of the first argument if the second is nullable. */
2320 nlastpos[-2] += nlastpos[-1];
2323 pos = lastpos + nlastpos[-2];
2324 for (j = nlastpos[-1]; j-- > 0;)
2325 pos[j] = lastpos[j];
2326 lastpos += nlastpos[-2];
2327 nlastpos[-2] = nlastpos[-1];
2331 /* A CAT node is nullable if both arguments are nullable. */
2332 nullable[-2] = nullable[-1] && nullable[-2];
2337 /* The firstpos is the union of the firstpos of each argument. */
2338 nfirstpos[-2] += nfirstpos[-1];
2341 /* The lastpos is the union of the lastpos of each argument. */
2342 nlastpos[-2] += nlastpos[-1];
2345 /* An OR node is nullable if either argument is nullable. */
2346 nullable[-2] = nullable[-1] || nullable[-2];
2351 /* Anything else is a nonempty position. (Note that special
2352 constructs like \< are treated as nonempty strings here;
2353 an "epsilon closure" effectively makes them nullable later.
2354 Backreferences have to get a real position so we can detect
2355 transitions on them later. But they are nullable. */
2356 *nullable++ = d->tokens[i] == BACKREF;
2358 /* This position is in its own firstpos and lastpos. */
2359 *nfirstpos++ = *nlastpos++ = 1;
2360 --firstpos, --lastpos;
2361 firstpos->index = lastpos->index = i;
2362 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2364 /* Allocate the follow set for this position. */
2365 alloc_position_set (&d->follows[i], 1);
2369 /* ... balance the above nonsyntactic #ifdef goo... */
2370 fprintf (stderr, "node %zd:", i);
2371 prtok (d->tokens[i]);
2372 putc ('\n', stderr);
2373 fprintf (stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
2374 fprintf (stderr, " firstpos:");
2375 for (j = nfirstpos[-1]; j-- > 0;)
2377 fprintf (stderr, " %zd:", firstpos[j].index);
2378 prtok (d->tokens[firstpos[j].index]);
2380 fprintf (stderr, "\n lastpos:");
2381 for (j = nlastpos[-1]; j-- > 0;)
2383 fprintf (stderr, " %zd:", lastpos[j].index);
2384 prtok (d->tokens[lastpos[j].index]);
2386 putc ('\n', stderr);
2390 /* For each follow set that is the follow set of a real position, replace
2391 it with its epsilon closure. */
2392 for (i = 0; i < d->tindex; ++i)
2393 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
2395 || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
2397 || d->tokens[i] >= CSET)
2400 fprintf (stderr, "follows(%zd:", i);
2401 prtok (d->tokens[i]);
2402 fprintf (stderr, "):");
2403 for (j = d->follows[i].nelem; j-- > 0;)
2405 fprintf (stderr, " %zd:", d->follows[i].elems[j].index);
2406 prtok (d->tokens[d->follows[i].elems[j].index]);
2408 putc ('\n', stderr);
2410 copy (&d->follows[i], &merged);
2411 epsclosure (&merged, d);
2412 copy (&merged, &d->follows[i]);
2415 /* Get the epsilon closure of the firstpos of the regexp. The result will
2416 be the set of positions of state 0. */
2418 for (i = 0; i < nfirstpos[-1]; ++i)
2419 insert (firstpos[i], &merged);
2420 epsclosure (&merged, d);
2422 /* Build the initial state. */
2425 MALLOC (d->states, d->salloc);
2427 separate_contexts = state_separate_contexts (&merged);
2428 state_index (d, &merged,
2429 (separate_contexts & CTX_NEWLINE
2430 ? CTX_NEWLINE : separate_contexts ^ CTX_ANY));
2437 free (merged.elems);
2441 /* Find, for each character, the transition out of state s of d, and store
2442 it in the appropriate slot of trans.
2444 We divide the positions of s into groups (positions can appear in more
2445 than one group). Each group is labeled with a set of characters that
2446 every position in the group matches (taking into account, if necessary,
2447 preceding context information of s). For each group, find the union
2448 of the its elements' follows. This set is the set of positions of the
2449 new state. For each character in the group's label, set the transition
2450 on this character to be to a state corresponding to the set's positions,
2451 and its associated backward context information, if necessary.
2453 If we are building a searching matcher, we include the positions of state
2456 The collection of groups is constructed by building an equivalence-class
2457 partition of the positions of s.
2459 For each position, find the set of characters C that it matches. Eliminate
2460 any characters from C that fail on grounds of backward context.
2462 Search through the groups, looking for a group whose label L has nonempty
2463 intersection with C. If L - C is nonempty, create a new group labeled
2464 L - C and having the same positions as the current group, and set L to
2465 the intersection of L and C. Insert the position in this group, set
2466 C = C - L, and resume scanning.
2468 If after comparing with every group there are characters remaining in C,
2469 create a new group labeled with the characters of C and insert this
2470 position in that group. */
2472 dfastate (state_num s, struct dfa *d, state_num trans[])
2474 leaf_set *grps; /* As many as will ever be needed. */
2475 charclass *labels; /* Labels corresponding to the groups. */
2476 size_t ngrps = 0; /* Number of groups actually used. */
2477 position pos; /* Current position being considered. */
2478 charclass matches; /* Set of matching characters. */
2479 int matchesf; /* True if matches is nonempty. */
2480 charclass intersect; /* Intersection with some label set. */
2481 int intersectf; /* True if intersect is nonempty. */
2482 charclass leftovers; /* Stuff in the label that didn't match. */
2483 int leftoversf; /* True if leftovers is nonempty. */
2484 position_set follows; /* Union of the follows of some group. */
2485 position_set tmp; /* Temporary space for merging sets. */
2486 int possible_contexts; /* Contexts that this group can match. */
2487 int separate_contexts; /* Context that new state wants to know. */
2488 state_num state; /* New state. */
2489 state_num state_newline; /* New state on a newline transition. */
2490 state_num state_letter; /* New state on a letter transition. */
2491 int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
2494 MALLOC (grps, NOTCHAR);
2495 MALLOC (labels, NOTCHAR);
2499 for (i = 0; i < d->states[s].elems.nelem; ++i)
2501 pos = d->states[s].elems.elems[i];
2502 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2503 setbit (d->tokens[pos.index], matches);
2504 else if (d->tokens[pos.index] >= CSET)
2505 copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
2506 else if (MBS_SUPPORT
2507 && (d->tokens[pos.index] == ANYCHAR
2508 || d->tokens[pos.index] == MBCSET))
2509 /* MB_CUR_MAX > 1 */
2511 /* ANYCHAR and MBCSET must match with a single character, so we
2512 must put it to d->states[s].mbps, which contains the positions
2513 which can match with a single character not a byte. */
2514 if (d->states[s].mbps.nelem == 0)
2515 alloc_position_set (&d->states[s].mbps, 1);
2516 insert (pos, &(d->states[s].mbps));
2522 /* Some characters may need to be eliminated from matches because
2523 they fail in the current context. */
2524 if (pos.constraint != NO_CONSTRAINT)
2526 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2527 d->states[s].context, CTX_NEWLINE))
2528 for (j = 0; j < CHARCLASS_INTS; ++j)
2529 matches[j] &= ~newline[j];
2530 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2531 d->states[s].context, CTX_LETTER))
2532 for (j = 0; j < CHARCLASS_INTS; ++j)
2533 matches[j] &= ~letters[j];
2534 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2535 d->states[s].context, CTX_NONE))
2536 for (j = 0; j < CHARCLASS_INTS; ++j)
2537 matches[j] &= letters[j] | newline[j];
2539 /* If there are no characters left, there's no point in going on. */
2540 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2542 if (j == CHARCLASS_INTS)
2546 for (j = 0; j < ngrps; ++j)
2548 /* If matches contains a single character only, and the current
2549 group's label doesn't contain that character, go on to the
2551 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2552 && !tstbit (d->tokens[pos.index], labels[j]))
2555 /* Check if this group's label has a nonempty intersection with
2558 for (k = 0; k < CHARCLASS_INTS; ++k)
2559 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2563 /* It does; now find the set differences both ways. */
2564 leftoversf = matchesf = 0;
2565 for (k = 0; k < CHARCLASS_INTS; ++k)
2567 /* Even an optimizing compiler can't know this for sure. */
2568 int match = matches[k], label = labels[j][k];
2570 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2571 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2574 /* If there were leftovers, create a new group labeled with them. */
2577 copyset (leftovers, labels[ngrps]);
2578 copyset (intersect, labels[j]);
2579 MALLOC (grps[ngrps].elems, d->nleaves);
2580 memcpy (grps[ngrps].elems, grps[j].elems,
2581 sizeof (grps[j].elems[0]) * grps[j].nelem);
2582 grps[ngrps].nelem = grps[j].nelem;
2586 /* Put the position in the current group. The constraint is
2588 grps[j].elems[grps[j].nelem++] = pos.index;
2590 /* If every character matching the current position has been
2591 accounted for, we're done. */
2596 /* If we've passed the last group, and there are still characters
2597 unaccounted for, then we'll have to create a new group. */
2600 copyset (matches, labels[ngrps]);
2602 MALLOC (grps[ngrps].elems, d->nleaves);
2603 grps[ngrps].nelem = 1;
2604 grps[ngrps].elems[0] = pos.index;
2609 alloc_position_set (&follows, d->nleaves);
2610 alloc_position_set (&tmp, d->nleaves);
2612 /* If we are a searching matcher, the default transition is to a state
2613 containing the positions of state 0, otherwise the default transition
2614 is to fail miserably. */
2617 /* Find the state(s) corresponding to the positions of state 0. */
2618 copy (&d->states[0].elems, &follows);
2619 separate_contexts = state_separate_contexts (&follows);
2620 state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2621 if (separate_contexts & CTX_NEWLINE)
2622 state_newline = state_index (d, &follows, CTX_NEWLINE);
2624 state_newline = state;
2625 if (separate_contexts & CTX_LETTER)
2626 state_letter = state_index (d, &follows, CTX_LETTER);
2628 state_letter = state;
2630 for (i = 0; i < NOTCHAR; ++i)
2631 trans[i] = (IS_WORD_CONSTITUENT (i)) ? state_letter : state;
2632 trans[eolbyte] = state_newline;
2635 for (i = 0; i < NOTCHAR; ++i)
2638 for (i = 0; i < ngrps; ++i)
2642 /* Find the union of the follows of the positions of the group.
2643 This is a hideously inefficient loop. Fix it someday. */
2644 for (j = 0; j < grps[i].nelem; ++j)
2645 for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
2646 insert (d->follows[grps[i].elems[j]].elems[k], &follows);
2648 if (d->mb_cur_max > 1)
2650 /* If a token in follows.elems is not 1st byte of a multibyte
2651 character, or the states of follows must accept the bytes
2652 which are not 1st byte of the multibyte character.
2653 Then, if a state of follows encounter a byte, it must not be
2654 a 1st byte of a multibyte character nor single byte character.
2655 We cansel to add state[0].follows to next state, because
2656 state[0] must accept 1st-byte
2658 For example, we assume <sb a> is a certain single byte
2659 character, <mb A> is a certain multibyte character, and the
2660 codepoint of <sb a> equals the 2nd byte of the codepoint of
2662 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2663 by accepting accepts 1st byte of <mb A>, and state[i+1]
2664 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2665 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2666 <mb A>, so we cannot add state[0]. */
2668 next_isnt_1st_byte = 0;
2669 for (j = 0; j < follows.nelem; ++j)
2671 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2673 next_isnt_1st_byte = 1;
2679 /* If we are building a searching matcher, throw in the positions
2680 of state 0 as well. */
2682 && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
2683 for (j = 0; j < d->states[0].elems.nelem; ++j)
2684 insert (d->states[0].elems.elems[j], &follows);
2686 /* Find out if the new state will want any context information. */
2687 possible_contexts = charclass_context (labels[i]);
2688 separate_contexts = state_separate_contexts (&follows);
2690 /* Find the state(s) corresponding to the union of the follows. */
2691 if ((separate_contexts & possible_contexts) != possible_contexts)
2692 state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2695 if (separate_contexts & possible_contexts & CTX_NEWLINE)
2696 state_newline = state_index (d, &follows, CTX_NEWLINE);
2698 state_newline = state;
2699 if (separate_contexts & possible_contexts & CTX_LETTER)
2700 state_letter = state_index (d, &follows, CTX_LETTER);
2702 state_letter = state;
2704 /* Set the transitions for each character in the current label. */
2705 for (j = 0; j < CHARCLASS_INTS; ++j)
2706 for (k = 0; k < INTBITS; ++k)
2707 if (labels[i][j] & 1 << k)
2709 int c = j * INTBITS + k;
2712 trans[c] = state_newline;
2713 else if (IS_WORD_CONSTITUENT (c))
2714 trans[c] = state_letter;
2715 else if (c < NOTCHAR)
2720 for (i = 0; i < ngrps; ++i)
2721 free (grps[i].elems);
2722 free (follows.elems);
2728 /* Some routines for manipulating a compiled dfa's transition tables.
2729 Each state may or may not have a transition table; if it does, and it
2730 is a non-accepting state, then d->trans[state] points to its table.
2731 If it is an accepting state then d->fails[state] points to its table.
2732 If it has no table at all, then d->trans[state] is NULL.
2733 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2736 build_state (state_num s, struct dfa *d)
2738 state_num *trans; /* The new transition table. */
2741 /* Set an upper limit on the number of transition tables that will ever
2742 exist at once. 1024 is arbitrary. The idea is that the frequently
2743 used transition tables will be quickly rebuilt, whereas the ones that
2744 were only needed once or twice will be cleared away. */
2745 if (d->trcount >= 1024)
2747 for (i = 0; i < d->tralloc; ++i)
2751 d->trans[i] = d->fails[i] = NULL;
2758 /* Set up the success bits for this state. */
2760 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, *d))
2761 d->success[s] |= CTX_NEWLINE;
2762 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_LETTER, s, *d))
2763 d->success[s] |= CTX_LETTER;
2764 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d))
2765 d->success[s] |= CTX_NONE;
2767 MALLOC (trans, NOTCHAR);
2768 dfastate (s, d, trans);
2770 /* Now go through the new transition table, and make sure that the trans
2771 and fail arrays are allocated large enough to hold a pointer for the
2772 largest state mentioned in the table. */
2773 for (i = 0; i < NOTCHAR; ++i)
2774 if (trans[i] >= d->tralloc)
2776 state_num oldalloc = d->tralloc;
2778 while (trans[i] >= d->tralloc)
2780 REALLOC (d->realtrans, d->tralloc + 1);
2781 d->trans = d->realtrans + 1;
2782 REALLOC (d->fails, d->tralloc);
2783 REALLOC (d->success, d->tralloc);
2784 REALLOC (d->newlines, d->tralloc);
2785 while (oldalloc < d->tralloc)
2787 d->trans[oldalloc] = NULL;
2788 d->fails[oldalloc++] = NULL;
2792 /* Keep the newline transition in a special place so we can use it as
2794 d->newlines[s] = trans[eolbyte];
2795 trans[eolbyte] = -1;
2797 if (ACCEPTING (s, *d))
2798 d->fails[s] = trans;
2800 d->trans[s] = trans;
2804 build_state_zero (struct dfa *d)
2808 CALLOC (d->realtrans, d->tralloc + 1);
2809 d->trans = d->realtrans + 1;
2810 CALLOC (d->fails, d->tralloc);
2811 MALLOC (d->success, d->tralloc);
2812 MALLOC (d->newlines, d->tralloc);
2816 /* Multibyte character handling sub-routines for dfaexec. */
2818 /* Initial state may encounter the byte which is not a single byte character
2819 nor 1st byte of a multibyte character. But it is incorrect for initial
2820 state to accept such a byte.
2821 For example, in sjis encoding the regular expression like "\\" accepts
2822 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2823 0x815c. Then Initial state must skip the bytes which are not a single byte
2824 character nor 1st byte of a multibyte character. */
2825 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2828 while (inputwcs[p - buf_begin] == 0 \
2829 && mblen_buf[p - buf_begin] > 0 \
2830 && (unsigned char const *) p < buf_end) \
2832 if ((char *) p >= end) \
2842 realloc_trans_if_necessary (struct dfa *d, state_num new_state)
2844 /* Make sure that the trans and fail arrays are allocated large enough
2845 to hold a pointer for the new state. */
2846 if (new_state >= d->tralloc)
2848 state_num oldalloc = d->tralloc;
2850 while (new_state >= d->tralloc)
2852 REALLOC (d->realtrans, d->tralloc + 1);
2853 d->trans = d->realtrans + 1;
2854 REALLOC (d->fails, d->tralloc);
2855 REALLOC (d->success, d->tralloc);
2856 REALLOC (d->newlines, d->tralloc);
2857 while (oldalloc < d->tralloc)
2859 d->trans[oldalloc] = NULL;
2860 d->fails[oldalloc++] = NULL;
2865 /* Return values of transit_state_singlebyte(), and
2866 transit_state_consume_1char. */
2869 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2870 TRANSIT_STATE_DONE, /* State transition has finished. */
2871 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2872 } status_transit_state;
2874 /* Consume a single byte and transit state from 's' to '*next_state'.
2875 This function is almost same as the state transition routin in dfaexec().
2876 But state transition is done just once, otherwise matching succeed or
2877 reach the end of the buffer. */
2878 static status_transit_state
2879 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
2880 state_num * next_state)
2883 state_num works = s;
2885 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2887 while (rval == TRANSIT_STATE_IN_PROGRESS)
2889 if ((t = d->trans[works]) != NULL)
2892 rval = TRANSIT_STATE_DONE;
2900 /* At the moment, it must not happen. */
2905 else if (d->fails[works])
2907 works = d->fails[works][*p];
2908 rval = TRANSIT_STATE_DONE;
2912 build_state (works, d);
2915 *next_state = works;
2919 /* Match a "." against the current context. buf_begin[IDX] is the
2920 current position. Return the length of the match, in bytes.
2921 POS is the position of the ".". */
2923 match_anychar (struct dfa *d, state_num s, position pos, size_t idx)
2930 mbclen = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
2932 /* Check syntax bits. */
2933 if (wc == (wchar_t) eolbyte)
2935 if (!(syntax_bits & RE_DOT_NEWLINE))
2938 else if (wc == (wchar_t) '\0')
2940 if (syntax_bits & RE_DOT_NOT_NULL)
2944 context = wchar_context (wc);
2945 if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
2951 /* Match a bracket expression against the current context.
2952 buf_begin[IDX] is the current position.
2953 Return the length of the match, in bytes.
2954 POS is the position of the bracket expression. */
2956 match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
2959 int match; /* Flag which represent that matching succeed. */
2960 int match_len; /* Length of the character (or collating element)
2961 with which this operator match. */
2962 int op_len; /* Length of the operator. */
2965 /* Pointer to the structure to which we are currently referring. */
2966 struct mb_char_classes *work_mbc;
2969 wchar_t wc; /* Current referring character. */
2973 /* Check syntax bits. */
2974 if (wc == (wchar_t) eolbyte)
2976 if (!(syntax_bits & RE_DOT_NEWLINE))
2979 else if (wc == (wchar_t) '\0')
2981 if (syntax_bits & RE_DOT_NOT_NULL)
2985 context = wchar_context (wc);
2986 if (!SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context, context))
2989 /* Assign the current referring operator to work_mbc. */
2990 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2991 match = !work_mbc->invert;
2992 match_len = (mblen_buf[idx] == 0) ? 1 : mblen_buf[idx];
2994 /* Match in range 0-255? */
2995 if (wc < NOTCHAR && work_mbc->cset != -1
2996 && tstbit ((unsigned char) wc, d->charclasses[work_mbc->cset]))
2997 goto charset_matched;
2999 /* match with a character class? */
3000 for (i = 0; i < work_mbc->nch_classes; i++)
3002 if (iswctype ((wint_t) wc, work_mbc->ch_classes[i]))
3003 goto charset_matched;
3006 strncpy (buffer, (char const *) buf_begin + idx, match_len);
3007 buffer[match_len] = '\0';
3009 /* match with an equivalence class? */
3010 for (i = 0; i < work_mbc->nequivs; i++)
3012 op_len = strlen (work_mbc->equivs[i]);
3013 strncpy (buffer, (char const *) buf_begin + idx, op_len);
3014 buffer[op_len] = '\0';
3015 if (strcoll (work_mbc->equivs[i], buffer) == 0)
3018 goto charset_matched;
3022 /* match with a collating element? */
3023 for (i = 0; i < work_mbc->ncoll_elems; i++)
3025 op_len = strlen (work_mbc->coll_elems[i]);
3026 strncpy (buffer, (char const *) buf_begin + idx, op_len);
3027 buffer[op_len] = '\0';
3029 if (strcoll (work_mbc->coll_elems[i], buffer) == 0)
3032 goto charset_matched;
3036 /* match with a range? */
3037 for (i = 0; i < work_mbc->nranges; i++)
3039 if (work_mbc->range_sts[i] <= wc && wc <= work_mbc->range_ends[i])
3040 goto charset_matched;
3043 /* match with a character? */
3044 for (i = 0; i < work_mbc->nchars; i++)
3046 if (wc == work_mbc->chars[i])
3047 goto charset_matched;
3053 return match ? match_len : 0;
3056 /* Check each of 'd->states[s].mbps.elem' can match or not. Then return the
3057 array which corresponds to 'd->states[s].mbps.elem' and each element of
3058 the array contains the amount of the bytes with which the element can
3060 'idx' is the index from the buf_begin, and it is the current position
3062 Caller MUST free the array which this function return. */
3064 check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)
3069 MALLOC (rarray, d->states[s].mbps.nelem);
3070 for (i = 0; i < d->states[s].mbps.nelem; ++i)
3072 position pos = d->states[s].mbps.elems[i];
3073 switch (d->tokens[pos.index])
3076 rarray[i] = match_anychar (d, s, pos, idx);
3079 rarray[i] = match_mb_charset (d, s, pos, idx);
3082 break; /* cannot happen. */
3088 /* Consume a single character and enumerate all of the positions which can
3089 be next position from the state 's'.
3090 'match_lens' is the input. It can be NULL, but it can also be the output
3091 of check_matching_with_multibyte_ops() for optimization.
3092 'mbclen' and 'pps' are the output. 'mbclen' is the length of the
3093 character consumed, and 'pps' is the set this function enumerate. */
3094 static status_transit_state
3095 transit_state_consume_1char (struct dfa *d, state_num s,
3096 unsigned char const **pp,
3097 int *match_lens, int *mbclen, position_set * pps)
3103 status_transit_state rs = TRANSIT_STATE_DONE;
3105 /* Calculate the length of the (single/multi byte) character
3106 to which p points. */
3107 *mbclen = (mblen_buf[*pp - buf_begin] == 0) ? 1 : mblen_buf[*pp - buf_begin];
3109 /* Calculate the state which can be reached from the state 's' by
3110 consuming '*mbclen' single bytes from the buffer. */
3112 for (k = 0; k < *mbclen; k++)
3115 rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
3117 /* Copy the positions contained by 's1' to the set 'pps'. */
3118 copy (&(d->states[s1].elems), pps);
3120 /* Check (input) match_lens, and initialize if it is NULL. */
3121 if (match_lens == NULL && d->states[s].mbps.nelem != 0)
3122 work_mbls = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
3124 work_mbls = match_lens;
3126 /* Add all of the positions which can be reached from 's' by consuming
3127 a single character. */
3128 for (i = 0; i < d->states[s].mbps.nelem; i++)
3130 if (work_mbls[i] == *mbclen)
3131 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
3133 insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], pps);
3136 if (match_lens == NULL && work_mbls != NULL)
3139 /* FIXME: this return value is always ignored. */
3143 /* Transit state from s, then return new state and update the pointer of the
3144 buffer. This function is for some operator which can match with a multi-
3145 byte character or a collating element (which may be multi characters). */
3147 transit_state (struct dfa *d, state_num s, unsigned char const **pp)
3150 int mbclen; /* The length of current input multibyte character. */
3153 int *match_lens = NULL;
3154 size_t nelem = d->states[s].mbps.nelem; /* Just a alias. */
3155 position_set follows;
3156 unsigned char const *p1 = *pp;
3160 /* This state has (a) multibyte operator(s).
3161 We check whether each of them can match or not. */
3163 /* Note: caller must free the return value of this function. */
3164 match_lens = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
3166 for (i = 0; i < nelem; i++)
3167 /* Search the operator which match the longest string,
3170 if (match_lens[i] > maxlen)
3171 maxlen = match_lens[i];
3175 if (nelem == 0 || maxlen == 0)
3176 /* This state has no multibyte operator which can match.
3177 We need to check only one single byte character. */
3179 status_transit_state rs;
3180 rs = transit_state_singlebyte (d, s, *pp, &s1);
3182 /* We must update the pointer if state transition succeeded. */
3183 if (rs == TRANSIT_STATE_DONE)
3190 /* This state has some operators which can match a multibyte character. */
3191 alloc_position_set (&follows, d->nleaves);
3193 /* 'maxlen' may be longer than the length of a character, because it may
3194 not be a character but a (multi character) collating element.
3195 We enumerate all of the positions which 's' can reach by consuming
3197 transit_state_consume_1char (d, s, pp, match_lens, &mbclen, &follows);
3199 wc = inputwcs[*pp - mbclen - buf_begin];
3200 s1 = state_index (d, &follows, wchar_context (wc));
3201 realloc_trans_if_necessary (d, s1);
3203 while (*pp - p1 < maxlen)
3205 transit_state_consume_1char (d, s1, pp, NULL, &mbclen, &follows);
3207 for (i = 0; i < nelem; i++)
3209 if (match_lens[i] == *pp - p1)
3211 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
3212 insert (d->follows[d->states[s1].mbps.elems[i].index].elems[j],
3216 wc = inputwcs[*pp - mbclen - buf_begin];
3217 s1 = state_index (d, &follows, wchar_context (wc));
3218 realloc_trans_if_necessary (d, s1);
3221 free (follows.elems);
3226 /* Initialize mblen_buf and inputwcs with data from the next line. */
3229 prepare_wc_buf (const char *begin, const char *end)
3232 unsigned char eol = eolbyte;
3233 size_t remain_bytes, i;
3235 buf_begin = (unsigned char *) begin;
3238 for (i = 0; i < end - begin + 1; i++)
3240 if (remain_bytes == 0)
3243 = mbrtowc (inputwcs + i, begin + i, end - begin - i + 1, &mbs);
3244 if (remain_bytes < 1
3245 || remain_bytes == (size_t) -1
3246 || remain_bytes == (size_t) -2
3247 || (remain_bytes == 1 && inputwcs[i] == (wchar_t) begin[i]))
3250 inputwcs[i] = (wchar_t) begin[i];
3252 if (begin[i] == eol)
3257 mblen_buf[i] = remain_bytes;
3263 mblen_buf[i] = remain_bytes;
3269 buf_end = (unsigned char *) (begin + i);
3271 inputwcs[i] = 0; /* sentinel */
3272 #endif /* MBS_SUPPORT */
3275 /* Search through a buffer looking for a match to the given struct dfa.
3276 Find the first occurrence of a string matching the regexp in the
3277 buffer, and the shortest possible version thereof. Return a pointer to
3278 the first character after the match, or NULL if none is found. BEGIN
3279 points to the beginning of the buffer, and END points to the first byte
3280 after its end. Note however that we store a sentinel byte (usually
3281 newline) in *END, so the actual buffer must be one byte longer.
3282 When ALLOW_NL is nonzero, newlines may appear in the matching string.
3283 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3284 Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
3285 encountered a back-reference (1) or not (0). The caller may use this
3286 to decide whether to fall back on a backtracking matcher. */
3288 dfaexec (struct dfa *d, char const *begin, char *end,
3289 int allow_nl, size_t *count, int *backref)
3291 state_num s, s1; /* Current state. */
3292 unsigned char const *p; /* Current input character. */
3293 state_num **trans, *t; /* Copy of d->trans so it can be optimized
3295 unsigned char eol = eolbyte; /* Likewise for eolbyte. */
3296 unsigned char saved_end;
3299 build_state_zero (d);
3302 p = (unsigned char const *) begin;
3304 saved_end = *(unsigned char *) end;
3307 if (d->mb_cur_max > 1)
3309 MALLOC (mblen_buf, end - begin + 2);
3310 MALLOC (inputwcs, end - begin + 2);
3311 memset (&mbs, 0, sizeof (mbstate_t));
3312 prepare_wc_buf ((const char *) p, end);
3317 if (d->mb_cur_max > 1)
3318 while ((t = trans[s]) != NULL)
3323 SKIP_REMAINS_MB_IF_INITIAL_STATE (s, p);
3325 if (d->states[s].mbps.nelem == 0)
3331 /* Falling back to the glibc matcher in this case gives
3332 better performance (up to 25% better on [a-z], for
3333 example) and enables support for collating symbols and
3334 equivalence classes. */
3344 /* Can match with a multibyte character (and multi character
3345 collating element). Transition table might be updated. */
3346 s = transit_state (d, s, &p);
3351 while ((t = trans[s]) != NULL)
3354 if ((t = trans[s1]) == NULL)
3358 s1 = tmp; /* swap */
3365 if (s >= 0 && (char *) p <= end && d->fails[s])
3367 if (d->success[s] & sbit[*p])
3370 *backref = (d->states[s].backref != 0);
3371 if (d->mb_cur_max > 1)
3381 if (d->mb_cur_max > 1)
3383 /* Can match with a multibyte character (and multicharacter
3384 collating element). Transition table might be updated. */
3385 s = transit_state (d, s, &p);
3389 s = d->fails[s][*p++];
3393 /* If the previous character was a newline, count it. */
3394 if ((char *) p <= end && p[-1] == eol)
3399 if (d->mb_cur_max > 1)
3400 prepare_wc_buf ((const char *) p, end);
3403 /* Check if we've run off the end of the buffer. */
3404 if ((char *) p > end)
3406 if (d->mb_cur_max > 1)
3422 if (p[-1] == eol && allow_nl)
3424 s = d->newlines[s1];
3433 free_mbdata (struct dfa *d)
3437 free (d->multibyte_prop);
3438 d->multibyte_prop = NULL;
3440 for (i = 0; i < d->nmbcsets; ++i)
3443 struct mb_char_classes *p = &(d->mbcsets[i]);
3445 free (p->ch_classes);
3446 free (p->range_sts);
3447 free (p->range_ends);
3449 for (j = 0; j < p->nequivs; ++j)
3450 free (p->equivs[j]);
3453 for (j = 0; j < p->ncoll_elems; ++j)
3454 free (p->coll_elems[j]);
3455 free (p->coll_elems);
3463 /* Initialize the components of a dfa that the other routines don't
3464 initialize for themselves. */
3466 dfainit (struct dfa *d)
3468 memset (d, 0, sizeof *d);
3471 MALLOC (d->charclasses, d->calloc);
3474 MALLOC (d->tokens, d->talloc);
3476 d->mb_cur_max = MB_CUR_MAX;
3478 if (d->mb_cur_max > 1)
3480 d->nmultibyte_prop = 1;
3481 MALLOC (d->multibyte_prop, d->nmultibyte_prop);
3482 d->mbcsets_alloc = 1;
3483 MALLOC (d->mbcsets, d->mbcsets_alloc);
3488 dfaoptimize (struct dfa *d)
3492 if (!MBS_SUPPORT || !using_utf8 ())
3495 for (i = 0; i < d->tindex; ++i)
3497 switch (d->tokens[i])
3503 /* Requires multi-byte algorithm. */
3514 /* Parse and analyze a single string of the given length. */
3516 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
3519 dfaparse (s, len, d);
3522 dfaanalyze (d, searchflag);
3525 /* Free the storage held by the components of a dfa. */
3527 dfafree (struct dfa *d)
3530 struct dfamust *dm, *ndm;
3532 free (d->charclasses);
3535 if (d->mb_cur_max > 1)
3538 for (i = 0; i < d->sindex; ++i)
3540 free (d->states[i].elems.elems);
3542 free (d->states[i].mbps.elems);
3545 for (i = 0; i < d->tindex; ++i)
3546 free (d->follows[i].elems);
3548 for (i = 0; i < d->tralloc; ++i)
3553 free (d->realtrans);
3557 for (dm = d->musts; dm; dm = ndm)
3565 /* Having found the postfix representation of the regular expression,
3566 try to find a long sequence of characters that must appear in any line
3568 Finding a "longest" sequence is beyond the scope here;
3569 we take an easy way out and hope for the best.
3570 (Take "(ab|a)b"--please.)
3572 We do a bottom-up calculation of sequences of characters that must appear
3573 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3575 sequences that must appear at the left of the match ("left")
3576 sequences that must appear at the right of the match ("right")
3577 lists of sequences that must appear somewhere in the match ("in")
3578 sequences that must constitute the match ("is")
3580 When we get to the root of the tree, we use one of the longest of its
3581 calculated "in" sequences as our answer. The sequence we find is returned in
3582 d->must (where "d" is the single argument passed to "dfamust");
3583 the length of the sequence is returned in d->mustn.
3585 The sequences calculated for the various types of node (in pseudo ANSI c)
3586 are shown below. "p" is the operand of unary operators (and the left-hand
3587 operand of binary operators); "q" is the right-hand operand of binary
3590 "ZERO" means "a zero-length sequence" below.
3592 Type left right is in
3593 ---- ---- ----- -- --
3594 char c # c # c # c # c
3596 ANYCHAR ZERO ZERO ZERO ZERO
3598 MBCSET ZERO ZERO ZERO ZERO
3600 CSET ZERO ZERO ZERO ZERO
3602 STAR ZERO ZERO ZERO ZERO
3604 QMARK ZERO ZERO ZERO ZERO
3606 PLUS p->left p->right ZERO p->in
3608 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3609 p->left : q->right : q->is!=ZERO) ? q->in plus
3610 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3613 OR longest common longest common (do p->is and substrings common to
3614 leading trailing q->is have same p->in and q->in
3615 (sub)sequence (sub)sequence length and
3616 of p->left of p->right content) ?
3617 and q->left and q->right p->is : NULL
3619 If there's anything else we recognize in the tree, all four sequences get set
3620 to zero-length sequences. If there's something we don't recognize in the tree,
3621 we just return a zero-length sequence.
3623 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3626 And. . .is it here or someplace that we might ponder "optimizations" such as
3627 egrep 'psi|epsilon' -> egrep 'psi'
3628 egrep 'pepsi|epsilon' -> egrep 'epsi'
3629 (Yes, we now find "epsi" as a "string
3630 that must occur", but we might also
3631 simplify the *entire* r.e. being sought)
3632 grep '[c]' -> grep 'c'
3633 grep '(ab|a)b' -> grep 'ab'
3634 grep 'ab*' -> grep 'a'
3635 grep 'a*b' -> grep 'b'
3637 There are several issues:
3639 Is optimization easy (enough)?
3641 Does optimization actually accomplish anything,
3642 or is the automaton you get from "psi|epsilon" (for example)
3643 the same as the one you get from "psi" (for example)?
3645 Are optimizable r.e.'s likely to be used in real-life situations
3646 (something like 'ab*' is probably unlikely; something like is
3647 'psi|epsilon' is likelier)? */
3650 icatalloc (char *old, char const *new)
3653 size_t oldsize = old == NULL ? 0 : strlen (old);
3654 size_t newsize = new == NULL ? 0 : strlen (new);
3657 result = xrealloc (old, oldsize + newsize + 1);
3658 memcpy (result + oldsize, new, newsize + 1);
3663 icpyalloc (char const *string)
3665 return icatalloc (NULL, string);
3668 static char *_GL_ATTRIBUTE_PURE
3669 istrstr (char const *lookin, char const *lookfor)
3674 len = strlen (lookfor);
3675 for (cp = lookin; *cp != '\0'; ++cp)
3676 if (strncmp (cp, lookfor, len) == 0)
3682 freelist (char **cpp)
3688 for (i = 0; cpp[i] != NULL; ++i)
3696 enlist (char **cpp, char *new, size_t len)
3702 if ((new = icpyalloc (new)) == NULL)
3708 /* Is there already something in the list that's new (or longer)? */
3709 for (i = 0; cpp[i] != NULL; ++i)
3710 if (istrstr (cpp[i], new) != NULL)
3715 /* Eliminate any obsoleted strings. */
3717 while (cpp[j] != NULL)
3718 if (istrstr (new, cpp[j]) == NULL)
3728 /* Add the new string. */
3729 REALLOC (cpp, i + 2);
3735 /* Given pointers to two strings, return a pointer to an allocated
3736 list of their distinct common substrings. Return NULL if something
3739 comsubs (char *left, char const *right)
3746 if (left == NULL || right == NULL)
3748 cpp = malloc (sizeof *cpp);
3752 for (lcp = left; *lcp != '\0'; ++lcp)
3755 rcp = strchr (right, *lcp);
3758 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3762 rcp = strchr (rcp + 1, *lcp);
3767 char **p = enlist (cpp, lcp, len);
3781 addlists (char **old, char **new)
3785 if (old == NULL || new == NULL)
3787 for (i = 0; new[i] != NULL; ++i)
3789 old = enlist (old, new[i], strlen (new[i]));
3796 /* Given two lists of substrings, return a new list giving substrings
3799 inboth (char **left, char **right)
3805 if (left == NULL || right == NULL)
3807 both = malloc (sizeof *both);
3811 for (lnum = 0; left[lnum] != NULL; ++lnum)
3813 for (rnum = 0; right[rnum] != NULL; ++rnum)
3815 temp = comsubs (left[lnum], right[rnum]);
3821 both = addlists (both, temp);
3840 resetmust (must * mp)
3842 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3847 dfamust (struct dfa *d)
3858 static char empty_string[] = "";
3860 result = empty_string;
3862 MALLOC (musts, d->tindex + 1);
3864 for (i = 0; i <= d->tindex; ++i)
3866 for (i = 0; i <= d->tindex; ++i)
3868 mp[i].in = xmalloc (sizeof *mp[i].in);
3869 mp[i].left = xmalloc (2);
3870 mp[i].right = xmalloc (2);
3871 mp[i].is = xmalloc (2);
3872 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3876 fprintf (stderr, "dfamust:\n");
3877 for (i = 0; i < d->tindex; ++i)
3879 fprintf (stderr, " %zd:", i);
3880 prtok (d->tokens[i]);
3882 putc ('\n', stderr);
3884 for (ri = 0; ri < d->tindex; ++ri)
3886 switch (t = d->tokens[ri])
3890 assert (!"neither LPAREN nor RPAREN may appear here");
3903 assert (musts < mp);
3908 assert (&musts[2] <= mp);
3913 size_t j, ln, rn, n;
3917 /* Guaranteed to be. Unlikely, but. . . */
3918 if (!STREQ (lmp->is, rmp->is))
3920 /* Left side--easy */
3922 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3924 lmp->left[i] = '\0';
3926 ln = strlen (lmp->right);
3927 rn = strlen (rmp->right);
3931 for (i = 0; i < n; ++i)
3932 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3934 for (j = 0; j < i; ++j)
3935 lmp->right[j] = lmp->right[(ln - i) + j];
3936 lmp->right[j] = '\0';
3937 new = inboth (lmp->in, rmp->in);
3946 assert (musts < mp);
3951 assert (mp == &musts[1]);
3952 for (i = 0; musts[0].in[i] != NULL; ++i)
3953 if (strlen (musts[0].in[i]) > strlen (result))
3954 result = musts[0].in[i];
3955 if (STREQ (result, musts[0].is))
3959 assert (&musts[2] <= mp);
3966 /* In. Everything in left, plus everything in
3967 right, plus concatenation of
3968 left's right and right's left. */
3969 lmp->in = addlists (lmp->in, rmp->in);
3970 if (lmp->in == NULL)
3972 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
3976 tp = icpyalloc (lmp->right);
3977 tp = icatalloc (tp, rmp->left);
3978 lmp->in = enlist (lmp->in, tp, strlen (tp));
3980 if (lmp->in == NULL)
3984 if (lmp->is[0] != '\0')
3986 lmp->left = icatalloc (lmp->left, rmp->left);
3987 if (lmp->left == NULL)
3991 if (rmp->is[0] == '\0')
3992 lmp->right[0] = '\0';
3993 lmp->right = icatalloc (lmp->right, rmp->right);
3994 if (lmp->right == NULL)
3996 /* Guaranteed to be */
3997 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3999 lmp->is = icatalloc (lmp->is, rmp->is);
4000 if (lmp->is == NULL)
4010 assert (!"oops! t >= END");
4014 /* not on *my* shift */
4017 else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET)
4024 /* plain character */
4026 mp->is[0] = mp->left[0] = mp->right[0] = t;
4027 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
4028 mp->in = enlist (mp->in, mp->is, (size_t) 1);
4035 fprintf (stderr, " node: %zd:", ri);
4036 prtok (d->tokens[ri]);
4037 fprintf (stderr, "\n in:");
4038 for (i = 0; mp->in[i]; ++i)
4039 fprintf (stderr, " \"%s\"", mp->in[i]);
4040 fprintf (stderr, "\n is: \"%s\"\n", mp->is);
4041 fprintf (stderr, " left: \"%s\"\n", mp->left);
4042 fprintf (stderr, " right: \"%s\"\n", mp->right);
4047 if (strlen (result))
4051 dm->must = xmemdup (result, strlen (result) + 1);
4052 dm->next = d->musts;
4056 for (i = 0; i <= d->tindex; ++i)
4058 freelist (mp[i].in);
4070 return xmalloc (sizeof (struct dfa));
4073 struct dfamust *_GL_ATTRIBUTE_PURE
4074 dfamusts (struct dfa const *d)
4079 /* vim:set shiftwidth=2: */