1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004, 2005, 2007-2010 Free Software
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
27 #include <sys/types.h>
34 #define STREQ(a, b) (strcmp (a, b) == 0)
36 /* ISASCIIDIGIT differs from isdigit, as follows:
37 - Its arg may be any int or unsigned int; it need not be an unsigned char.
38 - It's guaranteed to evaluate its argument exactly once.
39 - It's typically faster.
40 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
41 only '0' through '9' are digits. Prefer ISASCIIDIGIT to isdigit unless
42 it's important to use the locale's definition of `digit' even when the
43 host does not conform to Posix. */
44 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
46 /* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
48 #define _(str) gettext (str)
50 #include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
54 #if HAVE_LANGINFO_CODESET
55 # include <langinfo.h>
60 #include "hard-locale.h"
63 /* HPUX, define those as macros in sys/param.h */
71 /* Number of bits in an unsigned char. */
76 /* First integer value that is greater than any character code. */
77 #define NOTCHAR (1 << CHARBITS)
79 /* INTBITS need not be exact, just a lower bound. */
81 # define INTBITS (CHARBITS * sizeof (int))
84 /* Number of ints required to hold a bit for every character. */
85 #define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
87 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
88 typedef int charclass[CHARCLASS_INTS];
90 /* Sometimes characters can only be matched depending on the surrounding
91 context. Such context decisions depend on what the previous character
92 was, and the value of the current (lookahead) character. Context
93 dependent constraints are encoded as 8 bit integers. Each bit that
94 is set indicates that the constraint succeeds in the corresponding
97 bit 7 - previous and current are newlines
98 bit 6 - previous was newline, current isn't
99 bit 5 - previous wasn't newline, current is
100 bit 4 - neither previous nor current is a newline
101 bit 3 - previous and current are word-constituents
102 bit 2 - previous was word-constituent, current isn't
103 bit 1 - previous wasn't word-constituent, current is
104 bit 0 - neither previous nor current is word-constituent
106 Word-constituent characters are those that satisfy isalnum().
108 The macro SUCCEEDS_IN_CONTEXT determines whether a a given constraint
109 succeeds in a particular context. Prevn is true if the previous character
110 was a newline, currn is true if the lookahead character is a newline.
111 Prevl and currl similarly depend upon whether the previous and current
112 characters are word-constituent letters. */
113 #define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
114 ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4))
115 #define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \
116 ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0)))
117 #define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \
118 (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
119 && MATCHES_LETTER_CONTEXT(constraint, prevl, currl))
121 /* The following macros give information about what a constraint depends on. */
122 #define PREV_NEWLINE_DEPENDENT(constraint) \
123 (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
124 #define PREV_LETTER_DEPENDENT(constraint) \
125 (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
127 /* Tokens that match the empty string subject to some constraint actually
128 work by applying that constraint to determine what may follow them,
129 taking into account what has gone before. The following values are
130 the constraints corresponding to the special tokens previously defined. */
131 #define NO_CONSTRAINT 0xff
132 #define BEGLINE_CONSTRAINT 0xcf
133 #define ENDLINE_CONSTRAINT 0xaf
134 #define BEGWORD_CONSTRAINT 0xf2
135 #define ENDWORD_CONSTRAINT 0xf4
136 #define LIMWORD_CONSTRAINT 0xf6
137 #define NOTLIMWORD_CONSTRAINT 0xf9
139 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
140 are operators and others are terminal symbols. Most (but not all) of these
141 codes are returned by the lexical analyzer. */
144 END = -1, /* END is a terminal symbol that matches the
145 end of input; any value of END or less in
146 the parse tree is such a symbol. Accepting
147 states of the DFA are those that would have
148 a transition on END. */
150 /* Ordinary character values are terminal symbols that match themselves. */
152 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
155 BACKREF, /* BACKREF is generated by \<digit>; it
156 it not completely handled. If the scanner
157 detects a transition on backref, it returns
158 a kind of "semi-success" indicating that
159 the match will have to be verified with
160 a backtracking matcher. */
162 BEGLINE, /* BEGLINE is a terminal symbol that matches
163 the empty string if it is at the beginning
166 ENDLINE, /* ENDLINE is a terminal symbol that matches
167 the empty string if it is at the end of
170 BEGWORD, /* BEGWORD is a terminal symbol that matches
171 the empty string if it is at the beginning
174 ENDWORD, /* ENDWORD is a terminal symbol that matches
175 the empty string if it is at the end of
178 LIMWORD, /* LIMWORD is a terminal symbol that matches
179 the empty string if it is at the beginning
180 or the end of a word. */
182 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
183 matches the empty string if it is not at
184 the beginning or end of a word. */
186 QMARK, /* QMARK is an operator of one argument that
187 matches zero or one occurences of its
190 STAR, /* STAR is an operator of one argument that
191 matches the Kleene closure (zero or more
192 occurrences) of its argument. */
194 PLUS, /* PLUS is an operator of one argument that
195 matches the positive closure (one or more
196 occurrences) of its argument. */
198 REPMN, /* REPMN is a lexical token corresponding
199 to the {m,n} construct. REPMN never
200 appears in the compiled token vector. */
202 CAT, /* CAT is an operator of two arguments that
203 matches the concatenation of its
204 arguments. CAT is never returned by the
207 OR, /* OR is an operator of two arguments that
208 matches either of its arguments. */
210 LPAREN, /* LPAREN never appears in the parse tree,
211 it is only a lexeme. */
213 RPAREN, /* RPAREN never appears in the parse tree. */
215 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
216 any multibyte (or single byte) characters.
217 It is used only if MB_CUR_MAX > 1. */
219 MBCSET, /* MBCSET is similar to CSET, but for
220 multibyte characters. */
222 WCHAR, /* Only returned by lex. wctok contains
223 the wide character representation. */
225 CSET /* CSET and (and any value greater) is a
226 terminal symbol that matches any of a
227 class of characters. */
231 /* States of the recognizer correspond to sets of positions in the parse
232 tree, together with the constraints under which they may be matched.
233 So a position is encoded as an index into the parse tree together with
237 unsigned int index; /* Index into the parse array. */
238 unsigned int constraint; /* Constraint for matching this position. */
241 /* Sets of positions are stored as arrays. */
244 position *elems; /* Elements of this position set. */
245 int nelem; /* Number of elements in this set. */
248 /* A state of the dfa consists of a set of positions, some flags,
249 and the token value of the lowest-numbered position of the state that
250 contains an END token. */
253 int hash; /* Hash of the positions of this state. */
254 position_set elems; /* Positions this state could match. */
255 char newline; /* True if previous state matched newline. */
256 char letter; /* True if previous state matched a letter. */
257 char backref; /* True if this state matches a \<digit>. */
258 unsigned char constraint; /* Constraint for this state to accept. */
259 int first_end; /* Token value of the first END in elems. */
261 position_set mbps; /* Positions which can match multibyte
262 characters. e.g. period.
263 These staff are used only if
269 /* A bracket operator.
270 e.g. [a-c], [[:alpha:]], etc. */
271 struct mb_char_classes
275 wchar_t *chars; /* Normal characters. */
277 wctype_t *ch_classes; /* Character classes. */
279 wchar_t *range_sts; /* Range characters (start of the range). */
280 wchar_t *range_ends; /* Range characters (end of the range). */
282 char **equivs; /* Equivalent classes. */
285 int ncoll_elems; /* Collating elements. */
289 /* A compiled regular expression. */
292 /* Fields filled by the scanner. */
293 charclass *charclasses; /* Array of character sets for CSET tokens. */
294 int cindex; /* Index for adding new charclasses. */
295 int calloc; /* Number of charclasses currently allocated. */
297 /* Fields filled by the parser. */
298 token *tokens; /* Postfix parse array. */
299 int tindex; /* Index for adding new tokens. */
300 int talloc; /* Number of tokens currently allocated. */
301 int depth; /* Depth required of an evaluation stack
302 used for depth-first traversal of the
304 int nleaves; /* Number of leaves on the parse tree. */
305 int nregexps; /* Count of parallel regexps being built
307 unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
308 int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
310 /* The following are used only if MB_CUR_MAX > 1. */
312 /* The value of multibyte_prop[i] is defined by following rule.
313 if tokens[i] < NOTCHAR
314 bit 0 : tokens[i] is the first byte of a character, including
315 single-byte characters.
316 bit 1 : tokens[i] is the last byte of a character, including
317 single-byte characters.
319 if tokens[i] = MBCSET
320 ("the index of mbcsets correspnd to this operator" << 2) + 3
324 = 'single_byte_a', 'multi_byte_A', single_byte_b'
325 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
332 /* Array of the bracket expression in the DFA. */
333 struct mb_char_classes *mbcsets;
337 /* Fields filled by the state builder. */
338 dfa_state *states; /* States of the dfa. */
339 int sindex; /* Index for adding new states. */
340 int salloc; /* Number of states currently allocated. */
342 /* Fields filled by the parse tree->NFA conversion. */
343 position_set *follows; /* Array of follow sets, indexed by position
344 index. The follow of a position is the set
345 of positions containing characters that
346 could conceivably follow a character
347 matching the given position in a string
348 matching the regexp. Allocated to the
349 maximum possible position index. */
350 int searchflag; /* True if we are supposed to build a searching
351 as opposed to an exact matcher. A searching
352 matcher finds the first and shortest string
353 matching a regexp anywhere in the buffer,
354 whereas an exact matcher finds the longest
355 string matching, but anchored to the
356 beginning of the buffer. */
358 /* Fields filled by dfaexec. */
359 int tralloc; /* Number of transition tables that have
361 int trcount; /* Number of transition tables that have
362 actually been built. */
363 int **trans; /* Transition tables for states that can
364 never accept. If the transitions for a
365 state have not yet been computed, or the
366 state could possibly accept, its entry in
367 this table is NULL. */
368 int **realtrans; /* Trans always points to realtrans + 1; this
369 is so trans[-1] can contain NULL. */
370 int **fails; /* Transition tables after failing to accept
371 on a state that potentially could do so. */
372 int *success; /* Table of acceptance conditions used in
373 dfaexec and computed in build_state. */
374 int *newlines; /* Transitions on newlines. The entry for a
375 newline in any transition table is always
376 -1 so we can count lines without wasting
377 too many cycles. The transition for a
378 newline is stored separately and handled
379 as a special case. Newline is also used
380 as a sentinel at the end of the buffer. */
381 struct dfamust *musts; /* List of strings, at least one of which
382 is known to appear in any r.e. matching
386 /* Some macros for user access to dfa internals. */
388 /* ACCEPTING returns true if s could possibly be an accepting state of r. */
389 #define ACCEPTING(s, r) ((r).states[s].constraint)
391 /* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
392 specified context. */
393 #define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, dfa) \
394 SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint, \
395 prevn, currn, prevl, currl)
397 static void dfamust (struct dfa *dfa);
398 static void regexp (void);
400 #define CALLOC(p, t, n) ((p) = xcalloc((size_t)(n), sizeof (t)))
401 #define MALLOC(p, t, n) ((p) = xmalloc((n) * sizeof (t)))
402 #define REALLOC(p, t, n) ((p) = xrealloc((p), (n) * sizeof (t)))
404 /* Reallocate an array of type t if nalloc is too small for index. */
405 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
406 if ((index) >= (nalloc)) \
410 while ((index) >= (nalloc)); \
411 REALLOC(p, t, nalloc); \
423 fprintf(stderr, "END");
424 else if (t < NOTCHAR)
425 fprintf(stderr, "%c", t);
430 case EMPTY: s = "EMPTY"; break;
431 case BACKREF: s = "BACKREF"; break;
432 case BEGLINE: s = "BEGLINE"; break;
433 case ENDLINE: s = "ENDLINE"; break;
434 case BEGWORD: s = "BEGWORD"; break;
435 case ENDWORD: s = "ENDWORD"; break;
436 case LIMWORD: s = "LIMWORD"; break;
437 case NOTLIMWORD: s = "NOTLIMWORD"; break;
438 case QMARK: s = "QMARK"; break;
439 case STAR: s = "STAR"; break;
440 case PLUS: s = "PLUS"; break;
441 case CAT: s = "CAT"; break;
442 case OR: s = "OR"; break;
443 case LPAREN: s = "LPAREN"; break;
444 case RPAREN: s = "RPAREN"; break;
446 case ANYCHAR: s = "ANYCHAR"; break;
447 case MBCSET: s = "MBCSET"; break;
448 #endif /* MBS_SUPPORT */
449 default: s = "CSET"; break;
451 fprintf(stderr, "%s", s);
456 /* Stuff pertaining to charclasses. */
459 tstbit (unsigned int b, charclass const c)
461 return c[b / INTBITS] & 1 << b % INTBITS;
465 setbit (unsigned int b, charclass c)
467 c[b / INTBITS] |= 1 << b % INTBITS;
471 clrbit (unsigned int b, charclass c)
473 c[b / INTBITS] &= ~(1 << b % INTBITS);
477 copyset (charclass const src, charclass dst)
479 memcpy (dst, src, sizeof (charclass));
483 zeroset (charclass s)
485 memset (s, 0, sizeof (charclass));
493 for (i = 0; i < CHARCLASS_INTS; ++i)
498 equal (charclass const s1, charclass const s2)
500 return memcmp (s1, s2, sizeof (charclass)) == 0;
503 /* A pointer to the current dfa is kept here during parsing. */
504 static struct dfa *dfa;
506 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
508 charclass_index (charclass const s)
512 for (i = 0; i < dfa->cindex; ++i)
513 if (equal(s, dfa->charclasses[i]))
515 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
517 copyset(s, dfa->charclasses[i]);
521 /* Syntax bits controlling the behavior of the lexical analyzer. */
522 static reg_syntax_t syntax_bits, syntax_bits_set;
524 /* Flag for case-folding letters into sets. */
525 static int case_fold;
527 /* End-of-line byte in data. */
528 static unsigned char eolbyte;
530 /* Entry point to set syntax options. */
532 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
540 /* Like setbit, but if case is folded, set both cases of a letter.
541 For MB_CUR_MAX > 1, one or both of the two cases may not be set,
542 so the resulting charset may only be used as an optimization. */
557 wint_t b1 = iswupper(b) ? towlower(b) : b;
558 wint_t b2 = iswlower(b) ? towupper(b) : b;
559 if (wctob ((unsigned char)b1) == b1)
561 if (b2 != b1 && wctob ((unsigned char)b2) == b2)
567 unsigned char b1 = isupper(b) ? tolower(b) : b;
568 unsigned char b2 = islower(b) ? toupper(b) : b;
577 if (wctob ((unsigned char)b) == b)
584 /* UTF-8 encoding allows some optimizations that we can't otherwise
585 assume in a multibyte encoding. */
589 static int utf8 = -1;
592 #if defined HAVE_LANGINFO_CODESET && defined MBS_SUPPORT
593 utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
602 /* Lexical analyzer. All the dross that deals with the obnoxious
603 GNU Regex syntax bits is located here. The poor, suffering
604 reader is referred to the GNU Regex documentation for the
605 meaning of the @#%!@#%^!@ syntax bits. */
607 static char const *lexptr; /* Pointer to next input character. */
608 static int lexleft; /* Number of characters remaining. */
609 static token lasttok; /* Previous token returned; initially END. */
610 static int laststart; /* True if we're separated from beginning or (, |
611 only by zero-width characters. */
612 static int parens; /* Count of outstanding left parens. */
613 static int minrep, maxrep; /* Repeat counts for {m,n}. */
614 static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
616 static int cur_mb_len = 1; /* Length of the multibyte representation of
619 /* These variables are used only if (MB_CUR_MAX > 1). */
620 static mbstate_t mbs; /* Mbstate for mbrlen(). */
621 static wchar_t wctok; /* Wide character representation of the current
622 multibyte character. */
623 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
624 Each element store the amount of remain
625 byte of corresponding multibyte character
626 in the input string. A element's value
627 is 0 if corresponding character is a
628 single byte chracter.
629 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
630 mblen_buf : 0, 3, 2, 1
632 static wchar_t *inputwcs; /* Wide character representation of input
634 The length of this array is same as
635 the length of input string(char array).
636 inputstring[i] is a single-byte char,
637 or 1st byte of a multibyte char.
638 And inputwcs[i] is the codepoint. */
639 static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
640 static unsigned char const *buf_end; /* reference to end in dfaexec(). */
641 #endif /* MBS_SUPPORT */
645 /* Note that characters become unsigned here. */
646 # define FETCH_WC(c, wc, eoferr) \
653 return lasttok = END; \
658 cur_mb_len = mbrtowc(&_wc, lexptr, lexleft, &mbs); \
659 if (cur_mb_len <= 0) \
663 (wc) = (c) = (unsigned char) *lexptr++; \
667 lexptr += cur_mb_len; \
668 lexleft -= cur_mb_len; \
675 # define FETCH(c, eoferr) \
678 FETCH_WC(c, wc, eoferr); \
682 /* Note that characters become unsigned here. */
683 # define FETCH(c, eoferr) \
690 return lasttok = END; \
692 (c) = (unsigned char) *lexptr++; \
696 # define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
698 #endif /* MBS_SUPPORT */
701 in_coll_range (char ch, char from, char to)
703 char c[6] = { from, 0, ch, 0, to, 0 };
704 return strcoll (&c[0], &c[2]) <= 0 && strcoll (&c[2], &c[4]) <= 0;
707 typedef int predicate (int);
709 /* The following list maps the names of the Posix named character classes
710 to predicate functions that determine whether a given character is in
711 the class. The leading [ has already been eaten by the lexical analyzer. */
715 bool single_byte_only;
718 static const struct dfa_ctype prednames[] = {
719 { "alpha", isalpha, false },
720 { "upper", isupper, false },
721 { "lower", islower, false },
722 { "digit", isdigit, true },
723 { "xdigit", isxdigit, true },
724 { "space", isspace, false },
725 { "punct", ispunct, false },
726 { "alnum", isalnum, false },
727 { "print", isprint, false },
728 { "graph", isgraph, false },
729 { "cntrl", iscntrl, false },
730 { "blank", isblank, false },
731 { NULL, NULL, false }
734 static const struct dfa_ctype *
735 find_pred (const char *str)
738 for (i = 0; prednames[i].name; ++i)
739 if (STREQ (str, prednames[i].name))
742 return &prednames[i];
745 /* Multibyte character handling sub-routine for lex.
746 This function parse a bracket expression and build a struct
749 parse_bracket_exp (void)
755 /* Used to warn about [:space:].
756 Bit 0 = first character is a colon.
757 Bit 1 = last character is a colon.
758 Bit 2 = includes any other character but a colon.
759 Bit 3 = includes ranges, char/equiv classes or collation elements. */
760 int colon_warning_state;
765 /* Work area to build a mb_char_classes. */
766 struct mb_char_classes *work_mbc;
767 int chars_al, range_sts_al, range_ends_al, ch_classes_al,
768 equivs_al, coll_elems_al;
771 range_sts_al = range_ends_al = 0;
772 ch_classes_al = equivs_al = coll_elems_al = 0;
775 REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
776 dfa->mbcsets_alloc, dfa->nmbcsets + 1);
778 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
779 We will update dfa->multibyte_prop[] in addtok(), because we can't
780 decide the index in dfa->tokens[]. */
782 /* Initialize work area. */
783 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
784 memset (work_mbc, 0, sizeof *work_mbc);
790 memset (ccl, 0, sizeof ccl);
791 FETCH_WC (c, wc, _("unbalanced ["));
794 FETCH_WC (c, wc, _("unbalanced ["));
800 colon_warning_state = (c == ':');
803 c1 = EOF; /* mark c1 is not initialized". */
804 colon_warning_state &= ~2;
806 /* Note that if we're looking at some other [:...:] construct,
807 we just treat it as a bunch of ordinary characters. We can do
808 this because we assume regex has checked for syntax errors before
809 dfa is ever called. */
810 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
812 #define BRACKET_BUFFER_SIZE 128
813 char str[BRACKET_BUFFER_SIZE];
814 FETCH_WC (c1, wc1, _("unbalanced ["));
816 /* If pattern contains `[[:', `[[.', or `[[='. */
819 /* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1. */
820 || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))
827 FETCH_WC (c, wc, _("unbalanced ["));
828 if ((c == c1 && *lexptr == ']') || lexleft == 0)
830 if (len < BRACKET_BUFFER_SIZE)
833 /* This is in any case an invalid class name. */
839 FETCH_WC (c, wc, _("unbalanced ["));
841 /* build character class. */
844 = (case_fold && (STREQ (str, "upper")
845 || STREQ (str, "lower"))
848 const struct dfa_ctype *pred = find_pred (class);
850 dfaerror(_("invalid character class"));
853 if (MB_CUR_MAX > 1 && !pred->single_byte_only)
855 /* Store the character class as wctype_t. */
856 wctype_t wt = wctype (class);
858 if (ch_classes_al == 0)
859 MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
860 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
862 work_mbc->nch_classes + 1);
863 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
867 for (c2 = 0; c2 < NOTCHAR; ++c2)
869 setbit_case_fold (c2, ccl);
873 else if (c1 == '=' || c1 == '.')
876 MALLOC(elem, char, len + 1);
877 strncpy(elem, str, len + 1);
880 /* build equivalent class. */
883 MALLOC(work_mbc->equivs, char*, ++equivs_al);
884 REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
886 work_mbc->nequivs + 1);
887 work_mbc->equivs[work_mbc->nequivs++] = elem;
891 /* build collating element. */
893 if (coll_elems_al == 0)
894 MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
895 REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
897 work_mbc->ncoll_elems + 1);
898 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
902 colon_warning_state |= 8;
904 /* Fetch new lookahead character. */
905 FETCH_WC (c1, wc1, _("unbalanced ["));
909 /* We treat '[' as a normal character here. c/c1/wc/wc1
910 are already set up. */
913 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
914 FETCH_WC(c, wc, _("unbalanced ["));
917 FETCH_WC(c1, wc1, _("unbalanced ["));
920 /* build range characters. */
922 FETCH_WC(c2, wc2, _("unbalanced ["));
925 /* In the case [x-], the - is an ordinary hyphen,
926 which is left in c1, the lookahead character. */
927 lexptr -= cur_mb_len;
928 lexleft += cur_mb_len;
932 if (c1 == '-' && c2 != ']')
935 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
936 FETCH_WC(c2, wc2, _("unbalanced ["));
941 /* When case folding map a range, say [m-z] (or even [M-z])
942 to the pair of ranges, [m-z] [M-Z]. */
943 if (range_sts_al == 0)
945 MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
946 MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
948 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
949 range_sts_al, work_mbc->nranges + 1);
950 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
951 range_ends_al, work_mbc->nranges + 1);
952 work_mbc->range_sts[work_mbc->nranges] =
953 case_fold ? towlower(wc) : (wchar_t)wc;
954 work_mbc->range_ends[work_mbc->nranges++] =
955 case_fold ? towlower(wc2) : (wchar_t)wc2;
958 if (case_fold && (iswalpha(wc) || iswalpha(wc2)))
960 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
961 range_sts_al, work_mbc->nranges + 1);
962 work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
963 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
964 range_ends_al, work_mbc->nranges + 1);
965 work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
978 if (!hard_LC_COLLATE)
979 for (c = c1; c <= c2; c++)
980 setbit_case_fold (c, ccl);
982 for (c = 0; c < NOTCHAR; ++c)
983 if (!(case_fold && isupper (c))
984 && in_coll_range (c, c1, c2))
985 setbit_case_fold (c, ccl);
988 colon_warning_state |= 8;
989 FETCH_WC(c1, wc1, _("unbalanced ["));
993 colon_warning_state |= (c == ':') ? 2 : 4;
996 /* Build normal characters. */
997 setbit_case_fold (wc, ccl);
1000 if (case_fold && iswalpha(wc))
1004 if (c == EOF || (wint_t)c != (wint_t)wc)
1006 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
1007 work_mbc->nchars + 1);
1008 work_mbc->chars[work_mbc->nchars++] = wc;
1017 if (c == EOF || (wint_t)c != (wint_t)wc)
1019 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
1020 work_mbc->nchars + 1);
1021 work_mbc->chars[work_mbc->nchars++] = wc;
1025 setbit_case_fold (c, ccl);
1034 if (colon_warning_state == 7)
1035 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1041 || work_mbc->nchars != 0
1042 || work_mbc->nch_classes != 0
1043 || work_mbc->nranges != 0
1044 || work_mbc->nequivs != 0
1045 || work_mbc->ncoll_elems != 0))
1047 static charclass zeroclass;
1048 work_mbc->invert = invert;
1049 work_mbc->cset = equal(ccl, zeroclass) ? -1 : charclass_index(ccl);
1057 assert(MB_CUR_MAX == 1);
1060 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1061 clrbit(eolbyte, ccl);
1064 return CSET + charclass_index(ccl);
1067 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
1068 #define IS_WORD_CONSTITUENT(C) (isalnum(C) || (C) == '_')
1078 /* Basic plan: We fetch a character. If it's a backslash,
1079 we set the backslash flag and go through the loop again.
1080 On the plus side, this avoids having a duplicate of the
1081 main switch inside the backslash case. On the minus side,
1082 it means that just about every case begins with
1083 "if (backslash) ...". */
1084 for (i = 0; i < 2; ++i)
1089 FETCH_WC (c, wctok, NULL);
1094 #endif /* MBS_SUPPORT */
1103 dfaerror(_("unfinished \\ escape"));
1110 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1112 || lasttok == LPAREN
1114 return lasttok = BEGLINE;
1120 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1122 || (syntax_bits & RE_NO_BK_PARENS
1123 ? lexleft > 0 && *lexptr == ')'
1124 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
1125 || (syntax_bits & RE_NO_BK_VBAR
1126 ? lexleft > 0 && *lexptr == '|'
1127 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
1128 || ((syntax_bits & RE_NEWLINE_ALT)
1129 && lexleft > 0 && *lexptr == '\n'))
1130 return lasttok = ENDLINE;
1142 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
1145 return lasttok = BACKREF;
1150 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1151 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
1155 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1156 return lasttok = ENDLINE; /* FIXME: should be end of string */
1160 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1161 return lasttok = BEGWORD;
1165 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1166 return lasttok = ENDWORD;
1170 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1171 return lasttok = LIMWORD;
1175 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1176 return lasttok = NOTLIMWORD;
1180 if (syntax_bits & RE_LIMITED_OPS)
1182 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1184 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1186 return lasttok = QMARK;
1191 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1193 return lasttok = STAR;
1196 if (syntax_bits & RE_LIMITED_OPS)
1198 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1200 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1202 return lasttok = PLUS;
1205 if (!(syntax_bits & RE_INTERVALS))
1207 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
1209 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1212 if (syntax_bits & RE_NO_BK_BRACES)
1214 /* Scan ahead for a valid interval; if it's not valid,
1215 treat it as a literal '{'. */
1216 int lo = -1, hi = -1;
1217 char const *p = lexptr;
1218 char const *lim = p + lexleft;
1219 for (; p != lim && ISASCIIDIGIT (*p); p++)
1220 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
1221 if (p != lim && *p == ',')
1222 while (++p != lim && ISASCIIDIGIT (*p))
1223 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
1226 if (p == lim || *p != '}'
1227 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
1234 {M,} - minimum count, maximum is infinity
1235 {M,N} - M through N */
1236 FETCH(c, _("unfinished repeat count"));
1237 if (ISASCIIDIGIT (c))
1242 FETCH(c, _("unfinished repeat count"));
1243 if (! ISASCIIDIGIT (c))
1245 minrep = 10 * minrep + c - '0';
1249 dfaerror(_("malformed repeat count"));
1252 FETCH (c, _("unfinished repeat count"));
1253 if (! ISASCIIDIGIT (c))
1260 FETCH (c, _("unfinished repeat count"));
1261 if (! ISASCIIDIGIT (c))
1263 maxrep = 10 * maxrep + c - '0';
1265 if (0 <= maxrep && maxrep < minrep)
1266 dfaerror (_("malformed repeat count"));
1271 if (!(syntax_bits & RE_NO_BK_BRACES))
1274 dfaerror(_("malformed repeat count"));
1275 FETCH(c, _("unfinished repeat count"));
1278 dfaerror(_("malformed repeat count"));
1280 return lasttok = REPMN;
1283 if (syntax_bits & RE_LIMITED_OPS)
1285 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
1288 return lasttok = OR;
1291 if (syntax_bits & RE_LIMITED_OPS
1293 || !(syntax_bits & RE_NEWLINE_ALT))
1296 return lasttok = OR;
1299 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1303 return lasttok = LPAREN;
1306 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1308 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1312 return lasttok = RPAREN;
1320 /* In multibyte environment period must match with a single
1321 character not a byte. So we use ANYCHAR. */
1323 return lasttok = ANYCHAR;
1325 #endif /* MBS_SUPPORT */
1328 if (!(syntax_bits & RE_DOT_NEWLINE))
1329 clrbit(eolbyte, ccl);
1330 if (syntax_bits & RE_DOT_NOT_NULL)
1333 return lasttok = CSET + charclass_index(ccl);
1338 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1341 for (c2 = 0; c2 < NOTCHAR; ++c2)
1347 return lasttok = CSET + charclass_index(ccl);
1352 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1355 for (c2 = 0; c2 < NOTCHAR; ++c2)
1356 if (IS_WORD_CONSTITUENT(c2))
1361 return lasttok = CSET + charclass_index(ccl);
1367 return lasttok = parse_bracket_exp();
1373 /* For multibyte character sets, folding is done in atom. Always
1376 return lasttok = WCHAR;
1379 if (case_fold && isalpha(c))
1382 setbit_case_fold (c, ccl);
1383 return lasttok = CSET + charclass_index(ccl);
1390 /* The above loop should consume at most a backslash
1391 and some other character. */
1393 return END; /* keeps pedantic compilers happy. */
1396 /* Recursive descent parser for regular expressions. */
1398 static token tok; /* Lookahead token. */
1399 static int depth; /* Current depth of a hypothetical stack
1400 holding deferred productions. This is
1401 used to determine the depth that will be
1402 required of the real stack later on in
1406 addtok_mb (token t, int mbprop)
1411 REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1413 dfa->multibyte_prop[dfa->tindex] = mbprop;
1419 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1420 dfa->tokens[dfa->tindex++] = t;
1440 if (depth > dfa->depth)
1444 /* Add the given token to the parse tree, maintaining the depth count and
1445 updating the maximum depth if necessary. */
1450 if (MB_CUR_MAX > 1 && t == MBCSET)
1451 addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
1458 /* We treat a multibyte character as a single atom, so that DFA
1459 can treat a multibyte character as a single expression.
1461 e.g. We construct following tree from "<mb1><mb2>".
1462 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1463 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1465 addtok_wc (wint_t wc)
1467 unsigned char buf[MB_LEN_MAX];
1470 memset (&s, 0, sizeof s);
1471 cur_mb_len = wcrtomb ((char *) buf, wc, &s);
1473 /* This is merely stop-gap. When cur_mb_len is 0 or negative,
1474 buf[0] is undefined, yet skipping the addtok_mb call altogether
1475 can result in heap corruption. */
1476 if (cur_mb_len <= 0)
1479 addtok_mb(buf[0], cur_mb_len == 1 ? 3 : 1);
1480 for (i = 1; i < cur_mb_len; i++)
1482 addtok_mb(buf[i], i == cur_mb_len - 1 ? 2 : 0);
1489 add_utf8_anychar (void)
1491 static const charclass utf8_classes[5] = {
1492 { 0, 0, 0, 0, ~0, ~0, 0, 0 }, /* 80-bf: non-lead bytes */
1493 { ~0, ~0, ~0, ~0, 0, 0, 0, 0 }, /* 00-7f: 1-byte sequence */
1494 { 0, 0, 0, 0, 0, 0, 0xfffffffcU, 0 }, /* c2-df: 2-byte sequence */
1495 { 0, 0, 0, 0, 0, 0, 0, 0xffff }, /* e0-ef: 3-byte sequence */
1496 { 0, 0, 0, 0, 0, 0, 0, 0xff0000 } /* f0-f7: 4-byte sequence */
1498 const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1501 /* Define the five character classes that are needed below. */
1502 if (dfa->utf8_anychar_classes[0] == 0)
1503 for (i = 0; i < n; i++)
1506 memcpy (c, utf8_classes[i], sizeof c);
1509 if (!(syntax_bits & RE_DOT_NEWLINE))
1510 clrbit (eolbyte, c);
1511 if (syntax_bits & RE_DOT_NOT_NULL)
1514 dfa->utf8_anychar_classes[i] = CSET + charclass_index(c);
1517 /* A valid UTF-8 character is
1520 |[0xc2-0xdf][0x80-0xbf]
1521 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1522 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1524 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1525 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1526 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1527 for (i = 1; i < n; i++)
1528 addtok (dfa->utf8_anychar_classes[i]);
1531 addtok (dfa->utf8_anychar_classes[0]);
1537 /* The grammar understood by the parser is as follows.
1556 <multibyte character>
1567 LPAREN regexp RPAREN
1570 The parser builds a parse tree in postfix form in an array of tokens. */
1580 else if (tok == WCHAR)
1582 addtok_wc (case_fold ? towlower(wctok) : wctok);
1584 if (case_fold && iswalpha(wctok))
1586 addtok_wc (towupper(wctok));
1594 else if (tok == ANYCHAR && using_utf8())
1596 /* For UTF-8 expand the period to a series of CSETs that define a valid
1597 UTF-8 character. This avoids using the slow multibyte path. I'm
1598 pretty sure it would be both profitable and correct to do it for
1599 any encoding; however, the optimization must be done manually as
1600 it is done above in add_utf8_anychar. So, let's start with
1601 UTF-8: it is the most used, and the structure of the encoding
1602 makes the correctness more obvious. */
1606 #endif /* MBS_SUPPORT */
1608 else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1609 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1611 || tok == ANYCHAR || tok == MBCSET
1612 #endif /* MBS_SUPPORT */
1613 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1618 else if (tok == LPAREN)
1623 dfaerror(_("unbalanced ("));
1630 /* Return the number of tokens in the given subexpression. */
1632 nsubtoks (int tindex)
1636 switch (dfa->tokens[tindex - 1])
1643 return 1 + nsubtoks(tindex - 1);
1646 ntoks1 = nsubtoks(tindex - 1);
1647 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1651 /* Copy the given subexpression to the top of the tree. */
1653 copytoks (int tindex, int ntokens)
1657 for (i = 0; i < ntokens; ++i)
1659 addtok(dfa->tokens[tindex + i]);
1661 /* Update index into multibyte csets. */
1662 if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
1663 dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
1671 int tindex, ntokens, i;
1674 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1675 if (tok == REPMN && (minrep || maxrep))
1677 ntokens = nsubtoks(dfa->tindex);
1678 tindex = dfa->tindex - ntokens;
1683 for (i = 1; i < minrep; ++i)
1685 copytoks(tindex, ntokens);
1688 for (; i < maxrep; ++i)
1690 copytoks(tindex, ntokens);
1696 else if (tok == REPMN)
1698 dfa->tindex -= nsubtoks(dfa->tindex);
1713 while (tok != RPAREN && tok != OR && tok >= 0)
1732 /* Main entry point for the parser. S is a string to be parsed, len is the
1733 length of the string, so s can include NUL characters. D is a pointer to
1734 the struct dfa to parse into. */
1736 dfaparse (char const *s, size_t len, struct dfa *d)
1745 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1751 memset(&mbs, 0, sizeof mbs);
1753 #endif /* MBS_SUPPORT */
1755 if (! syntax_bits_set)
1756 dfaerror(_("no syntax specified"));
1764 dfaerror(_("unbalanced )"));
1766 addtok(END - d->nregexps);
1775 /* Some primitives for operating on sets of positions. */
1777 /* Copy one set to another; the destination must be large enough. */
1779 copy (position_set const *src, position_set *dst)
1783 for (i = 0; i < src->nelem; ++i)
1784 dst->elems[i] = src->elems[i];
1785 dst->nelem = src->nelem;
1788 /* Insert a position in a set. Position sets are maintained in sorted
1789 order according to index. If position already exists in the set with
1790 the same index then their constraints are logically or'd together.
1791 S->elems must point to an array large enough to hold the resulting set. */
1793 insert (position p, position_set *s)
1795 int count = s->nelem;
1796 int lo = 0, hi = count;
1799 int mid = ((unsigned) lo + (unsigned) hi) >> 1;
1800 if (s->elems[mid].index < p.index)
1806 if (lo < count && p.index == s->elems[lo].index)
1807 s->elems[lo].constraint |= p.constraint;
1811 for (i = count; i > lo; i--)
1812 s->elems[i] = s->elems[i - 1];
1818 /* Merge two sets of positions into a third. The result is exactly as if
1819 the positions of both sets were inserted into an initially empty set. */
1821 merge (position_set const *s1, position_set const *s2, position_set *m)
1826 while (i < s1->nelem && j < s2->nelem)
1827 if (s1->elems[i].index > s2->elems[j].index)
1828 m->elems[m->nelem++] = s1->elems[i++];
1829 else if (s1->elems[i].index < s2->elems[j].index)
1830 m->elems[m->nelem++] = s2->elems[j++];
1833 m->elems[m->nelem] = s1->elems[i++];
1834 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1836 while (i < s1->nelem)
1837 m->elems[m->nelem++] = s1->elems[i++];
1838 while (j < s2->nelem)
1839 m->elems[m->nelem++] = s2->elems[j++];
1842 /* Delete a position from a set. */
1844 delete (position p, position_set *s)
1848 for (i = 0; i < s->nelem; ++i)
1849 if (p.index == s->elems[i].index)
1852 for (--s->nelem; i < s->nelem; ++i)
1853 s->elems[i] = s->elems[i + 1];
1856 /* Find the index of the state corresponding to the given position set with
1857 the given preceding context, or create a new state if there is no such
1858 state. Newline and letter tell whether we got here on a newline or
1859 letter, respectively. */
1861 state_index (struct dfa *d, position_set const *s, int newline, int letter)
1867 newline = newline ? 1 : 0;
1868 letter = letter ? 1 : 0;
1870 for (i = 0; i < s->nelem; ++i)
1871 hash ^= s->elems[i].index + s->elems[i].constraint;
1873 /* Try to find a state that exactly matches the proposed one. */
1874 for (i = 0; i < d->sindex; ++i)
1876 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1877 || newline != d->states[i].newline || letter != d->states[i].letter)
1879 for (j = 0; j < s->nelem; ++j)
1880 if (s->elems[j].constraint
1881 != d->states[i].elems.elems[j].constraint
1882 || s->elems[j].index != d->states[i].elems.elems[j].index)
1888 /* We'll have to create a new state. */
1889 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1890 d->states[i].hash = hash;
1891 MALLOC(d->states[i].elems.elems, position, s->nelem);
1892 copy(s, &d->states[i].elems);
1893 d->states[i].newline = newline;
1894 d->states[i].letter = letter;
1895 d->states[i].backref = 0;
1896 d->states[i].constraint = 0;
1897 d->states[i].first_end = 0;
1899 d->states[i].mbps.nelem = 0;
1900 d->states[i].mbps.elems = NULL;
1902 for (j = 0; j < s->nelem; ++j)
1903 if (d->tokens[s->elems[j].index] < 0)
1905 constraint = s->elems[j].constraint;
1906 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1907 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1908 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1909 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1910 d->states[i].constraint |= constraint;
1911 if (! d->states[i].first_end)
1912 d->states[i].first_end = d->tokens[s->elems[j].index];
1914 else if (d->tokens[s->elems[j].index] == BACKREF)
1916 d->states[i].constraint = NO_CONSTRAINT;
1917 d->states[i].backref = 1;
1925 /* Find the epsilon closure of a set of positions. If any position of the set
1926 contains a symbol that matches the empty string in some context, replace
1927 that position with the elements of its follow labeled with an appropriate
1928 constraint. Repeat exhaustively until no funny positions are left.
1929 S->elems must be large enough to hold the result. */
1931 epsclosure (position_set *s, struct dfa const *d)
1934 char *visited; /* array of booleans, enough to use char, not int */
1937 CALLOC(visited, char, d->tindex);
1939 for (i = 0; i < s->nelem; ++i)
1940 if (d->tokens[s->elems[i].index] >= NOTCHAR
1941 && d->tokens[s->elems[i].index] != BACKREF
1943 && d->tokens[s->elems[i].index] != ANYCHAR
1944 && d->tokens[s->elems[i].index] != MBCSET
1946 && d->tokens[s->elems[i].index] < CSET)
1949 p.constraint = old.constraint;
1950 delete(s->elems[i], s);
1951 if (visited[old.index])
1956 visited[old.index] = 1;
1957 switch (d->tokens[old.index])
1960 p.constraint &= BEGLINE_CONSTRAINT;
1963 p.constraint &= ENDLINE_CONSTRAINT;
1966 p.constraint &= BEGWORD_CONSTRAINT;
1969 p.constraint &= ENDWORD_CONSTRAINT;
1972 p.constraint &= LIMWORD_CONSTRAINT;
1975 p.constraint &= NOTLIMWORD_CONSTRAINT;
1980 for (j = 0; j < d->follows[old.index].nelem; ++j)
1982 p.index = d->follows[old.index].elems[j].index;
1985 /* Force rescan to start at the beginning. */
1992 /* Perform bottom-up analysis on the parse tree, computing various functions.
1993 Note that at this point, we're pretending constructs like \< are real
1994 characters rather than constraints on what can follow them.
1996 Nullable: A node is nullable if it is at the root of a regexp that can
1997 match the empty string.
1998 * EMPTY leaves are nullable.
1999 * No other leaf is nullable.
2000 * A QMARK or STAR node is nullable.
2001 * A PLUS node is nullable if its argument is nullable.
2002 * A CAT node is nullable if both its arguments are nullable.
2003 * An OR node is nullable if either argument is nullable.
2005 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2006 that could correspond to the first character of a string matching the
2007 regexp rooted at the given node.
2008 * EMPTY leaves have empty firstpos.
2009 * The firstpos of a nonempty leaf is that leaf itself.
2010 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2012 * The firstpos of a CAT node is the firstpos of the left argument, union
2013 the firstpos of the right if the left argument is nullable.
2014 * The firstpos of an OR node is the union of firstpos of each argument.
2016 Lastpos: The lastpos of a node is the set of positions that could
2017 correspond to the last character of a string matching the regexp at
2019 * EMPTY leaves have empty lastpos.
2020 * The lastpos of a nonempty leaf is that leaf itself.
2021 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2023 * The lastpos of a CAT node is the lastpos of its right argument, union
2024 the lastpos of the left if the right argument is nullable.
2025 * The lastpos of an OR node is the union of the lastpos of each argument.
2027 Follow: The follow of a position is the set of positions that could
2028 correspond to the character following a character matching the node in
2029 a string matching the regexp. At this point we consider special symbols
2030 that match the empty string in some context to be just normal characters.
2031 Later, if we find that a special symbol is in a follow set, we will
2032 replace it with the elements of its follow, labeled with an appropriate
2034 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2035 the follow of every node in the lastpos.
2036 * Every node in the firstpos of the second argument of a CAT node is in
2037 the follow of every node in the lastpos of the first argument.
2039 Because of the postfix representation of the parse tree, the depth-first
2040 analysis is conveniently done by a linear scan with the aid of a stack.
2041 Sets are stored as arrays of the elements, obeying a stack-like allocation
2042 scheme; the number of elements in each set deeper in the stack can be
2043 used to determine the address of a particular set's array. */
2045 dfaanalyze (struct dfa *d, int searchflag)
2047 int *nullable; /* Nullable stack. */
2048 int *nfirstpos; /* Element count stack for firstpos sets. */
2049 position *firstpos; /* Array where firstpos elements are stored. */
2050 int *nlastpos; /* Element count stack for lastpos sets. */
2051 position *lastpos; /* Array where lastpos elements are stored. */
2052 int *nalloc; /* Sizes of arrays allocated to follow sets. */
2053 position_set tmp; /* Temporary set for merging sets. */
2054 position_set merged; /* Result of merging sets. */
2055 int wants_newline; /* True if some position wants newline info. */
2057 int *o_nfirst, *o_nlast;
2058 position *o_firstpos, *o_lastpos;
2063 fprintf(stderr, "dfaanalyze:\n");
2064 for (i = 0; i < d->tindex; ++i)
2066 fprintf(stderr, " %d:", i);
2067 prtok(d->tokens[i]);
2072 d->searchflag = searchflag;
2074 MALLOC(nullable, int, d->depth);
2075 o_nullable = nullable;
2076 MALLOC(nfirstpos, int, d->depth);
2077 o_nfirst = nfirstpos;
2078 MALLOC(firstpos, position, d->nleaves);
2079 o_firstpos = firstpos, firstpos += d->nleaves;
2080 MALLOC(nlastpos, int, d->depth);
2082 MALLOC(lastpos, position, d->nleaves);
2083 o_lastpos = lastpos, lastpos += d->nleaves;
2084 CALLOC(nalloc, int, d->tindex);
2085 MALLOC(merged.elems, position, d->nleaves);
2087 CALLOC(d->follows, position_set, d->tindex);
2089 for (i = 0; i < d->tindex; ++i)
2091 { /* Nonsyntactic #ifdef goo... */
2093 switch (d->tokens[i])
2096 /* The empty set is nullable. */
2099 /* The firstpos and lastpos of the empty leaf are both empty. */
2100 *nfirstpos++ = *nlastpos++ = 0;
2105 /* Every element in the firstpos of the argument is in the follow
2106 of every element in the lastpos. */
2107 tmp.nelem = nfirstpos[-1];
2108 tmp.elems = firstpos;
2110 for (j = 0; j < nlastpos[-1]; ++j)
2112 merge(&tmp, &d->follows[pos[j].index], &merged);
2113 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
2114 nalloc[pos[j].index], merged.nelem - 1);
2115 copy(&merged, &d->follows[pos[j].index]);
2119 /* A QMARK or STAR node is automatically nullable. */
2120 if (d->tokens[i] != PLUS)
2125 /* Every element in the firstpos of the second argument is in the
2126 follow of every element in the lastpos of the first argument. */
2127 tmp.nelem = nfirstpos[-1];
2128 tmp.elems = firstpos;
2129 pos = lastpos + nlastpos[-1];
2130 for (j = 0; j < nlastpos[-2]; ++j)
2132 merge(&tmp, &d->follows[pos[j].index], &merged);
2133 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
2134 nalloc[pos[j].index], merged.nelem - 1);
2135 copy(&merged, &d->follows[pos[j].index]);
2138 /* The firstpos of a CAT node is the firstpos of the first argument,
2139 union that of the second argument if the first is nullable. */
2141 nfirstpos[-2] += nfirstpos[-1];
2143 firstpos += nfirstpos[-1];
2146 /* The lastpos of a CAT node is the lastpos of the second argument,
2147 union that of the first argument if the second is nullable. */
2149 nlastpos[-2] += nlastpos[-1];
2152 pos = lastpos + nlastpos[-2];
2153 for (j = nlastpos[-1] - 1; j >= 0; --j)
2154 pos[j] = lastpos[j];
2155 lastpos += nlastpos[-2];
2156 nlastpos[-2] = nlastpos[-1];
2160 /* A CAT node is nullable if both arguments are nullable. */
2161 nullable[-2] = nullable[-1] && nullable[-2];
2166 /* The firstpos is the union of the firstpos of each argument. */
2167 nfirstpos[-2] += nfirstpos[-1];
2170 /* The lastpos is the union of the lastpos of each argument. */
2171 nlastpos[-2] += nlastpos[-1];
2174 /* An OR node is nullable if either argument is nullable. */
2175 nullable[-2] = nullable[-1] || nullable[-2];
2180 /* Anything else is a nonempty position. (Note that special
2181 constructs like \< are treated as nonempty strings here;
2182 an "epsilon closure" effectively makes them nullable later.
2183 Backreferences have to get a real position so we can detect
2184 transitions on them later. But they are nullable. */
2185 *nullable++ = d->tokens[i] == BACKREF;
2187 /* This position is in its own firstpos and lastpos. */
2188 *nfirstpos++ = *nlastpos++ = 1;
2189 --firstpos, --lastpos;
2190 firstpos->index = lastpos->index = i;
2191 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2193 /* Allocate the follow set for this position. */
2195 MALLOC(d->follows[i].elems, position, nalloc[i]);
2199 /* ... balance the above nonsyntactic #ifdef goo... */
2200 fprintf(stderr, "node %d:", i);
2201 prtok(d->tokens[i]);
2203 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
2204 fprintf(stderr, " firstpos:");
2205 for (j = nfirstpos[-1] - 1; j >= 0; --j)
2207 fprintf(stderr, " %d:", firstpos[j].index);
2208 prtok(d->tokens[firstpos[j].index]);
2210 fprintf(stderr, "\n lastpos:");
2211 for (j = nlastpos[-1] - 1; j >= 0; --j)
2213 fprintf(stderr, " %d:", lastpos[j].index);
2214 prtok(d->tokens[lastpos[j].index]);
2220 /* For each follow set that is the follow set of a real position, replace
2221 it with its epsilon closure. */
2222 for (i = 0; i < d->tindex; ++i)
2223 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
2225 || d->tokens[i] == ANYCHAR
2226 || d->tokens[i] == MBCSET
2228 || d->tokens[i] >= CSET)
2231 fprintf(stderr, "follows(%d:", i);
2232 prtok(d->tokens[i]);
2233 fprintf(stderr, "):");
2234 for (j = d->follows[i].nelem - 1; j >= 0; --j)
2236 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
2237 prtok(d->tokens[d->follows[i].elems[j].index]);
2241 copy(&d->follows[i], &merged);
2242 epsclosure(&merged, d);
2243 if (d->follows[i].nelem < merged.nelem)
2244 REALLOC(d->follows[i].elems, position, merged.nelem);
2245 copy(&merged, &d->follows[i]);
2248 /* Get the epsilon closure of the firstpos of the regexp. The result will
2249 be the set of positions of state 0. */
2251 for (i = 0; i < nfirstpos[-1]; ++i)
2252 insert(firstpos[i], &merged);
2253 epsclosure(&merged, d);
2255 /* Check if any of the positions of state 0 will want newline context. */
2257 for (i = 0; i < merged.nelem; ++i)
2258 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
2261 /* Build the initial state. */
2264 MALLOC(d->states, dfa_state, d->salloc);
2265 state_index(d, &merged, wants_newline, 0);
2276 /* Find, for each character, the transition out of state s of d, and store
2277 it in the appropriate slot of trans.
2279 We divide the positions of s into groups (positions can appear in more
2280 than one group). Each group is labeled with a set of characters that
2281 every position in the group matches (taking into account, if necessary,
2282 preceding context information of s). For each group, find the union
2283 of the its elements' follows. This set is the set of positions of the
2284 new state. For each character in the group's label, set the transition
2285 on this character to be to a state corresponding to the set's positions,
2286 and its associated backward context information, if necessary.
2288 If we are building a searching matcher, we include the positions of state
2291 The collection of groups is constructed by building an equivalence-class
2292 partition of the positions of s.
2294 For each position, find the set of characters C that it matches. Eliminate
2295 any characters from C that fail on grounds of backward context.
2297 Search through the groups, looking for a group whose label L has nonempty
2298 intersection with C. If L - C is nonempty, create a new group labeled
2299 L - C and having the same positions as the current group, and set L to
2300 the intersection of L and C. Insert the position in this group, set
2301 C = C - L, and resume scanning.
2303 If after comparing with every group there are characters remaining in C,
2304 create a new group labeled with the characters of C and insert this
2305 position in that group. */
2307 dfastate (int s, struct dfa *d, int trans[])
2309 position_set *grps; /* As many as will ever be needed. */
2310 charclass *labels; /* Labels corresponding to the groups. */
2311 int ngrps = 0; /* Number of groups actually used. */
2312 position pos; /* Current position being considered. */
2313 charclass matches; /* Set of matching characters. */
2314 int matchesf; /* True if matches is nonempty. */
2315 charclass intersect; /* Intersection with some label set. */
2316 int intersectf; /* True if intersect is nonempty. */
2317 charclass leftovers; /* Stuff in the label that didn't match. */
2318 int leftoversf; /* True if leftovers is nonempty. */
2319 static charclass letters; /* Set of characters considered letters. */
2320 static charclass newline; /* Set of characters that aren't newline. */
2321 position_set follows; /* Union of the follows of some group. */
2322 position_set tmp; /* Temporary space for merging sets. */
2323 int state; /* New state. */
2324 int wants_newline; /* New state wants to know newline context. */
2325 int state_newline; /* New state on a newline transition. */
2326 int wants_letter; /* New state wants to know letter context. */
2327 int state_letter; /* New state on a letter transition. */
2328 static int initialized; /* Flag for static initialization. */
2330 int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
2334 grps = xnmalloc (NOTCHAR, sizeof *grps);
2335 labels = xnmalloc (NOTCHAR, sizeof *labels);
2337 /* Initialize the set of letters, if necessary. */
2341 for (i = 0; i < NOTCHAR; ++i)
2342 if (IS_WORD_CONSTITUENT(i))
2344 setbit(eolbyte, newline);
2349 for (i = 0; i < d->states[s].elems.nelem; ++i)
2351 pos = d->states[s].elems.elems[i];
2352 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2353 setbit(d->tokens[pos.index], matches);
2354 else if (d->tokens[pos.index] >= CSET)
2355 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
2357 else if (d->tokens[pos.index] == ANYCHAR
2358 || d->tokens[pos.index] == MBCSET)
2359 /* MB_CUR_MAX > 1 */
2361 /* ANYCHAR and MBCSET must match with a single character, so we
2362 must put it to d->states[s].mbps, which contains the positions
2363 which can match with a single character not a byte. */
2364 if (d->states[s].mbps.nelem == 0)
2366 MALLOC(d->states[s].mbps.elems, position,
2367 d->states[s].elems.nelem);
2369 insert(pos, &(d->states[s].mbps));
2372 #endif /* MBS_SUPPORT */
2376 /* Some characters may need to be eliminated from matches because
2377 they fail in the current context. */
2378 if (pos.constraint != 0xFF)
2380 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2381 d->states[s].newline, 1))
2382 clrbit(eolbyte, matches);
2383 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2384 d->states[s].newline, 0))
2385 for (j = 0; j < CHARCLASS_INTS; ++j)
2386 matches[j] &= newline[j];
2387 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2388 d->states[s].letter, 1))
2389 for (j = 0; j < CHARCLASS_INTS; ++j)
2390 matches[j] &= ~letters[j];
2391 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2392 d->states[s].letter, 0))
2393 for (j = 0; j < CHARCLASS_INTS; ++j)
2394 matches[j] &= letters[j];
2396 /* If there are no characters left, there's no point in going on. */
2397 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2399 if (j == CHARCLASS_INTS)
2403 for (j = 0; j < ngrps; ++j)
2405 /* If matches contains a single character only, and the current
2406 group's label doesn't contain that character, go on to the
2408 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2409 && !tstbit(d->tokens[pos.index], labels[j]))
2412 /* Check if this group's label has a nonempty intersection with
2415 for (k = 0; k < CHARCLASS_INTS; ++k)
2416 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2420 /* It does; now find the set differences both ways. */
2421 leftoversf = matchesf = 0;
2422 for (k = 0; k < CHARCLASS_INTS; ++k)
2424 /* Even an optimizing compiler can't know this for sure. */
2425 int match = matches[k], label = labels[j][k];
2427 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2428 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2431 /* If there were leftovers, create a new group labeled with them. */
2434 copyset(leftovers, labels[ngrps]);
2435 copyset(intersect, labels[j]);
2436 MALLOC(grps[ngrps].elems, position, d->nleaves);
2437 copy(&grps[j], &grps[ngrps]);
2441 /* Put the position in the current group. Note that there is no
2442 reason to call insert() here. */
2443 grps[j].elems[grps[j].nelem++] = pos;
2445 /* If every character matching the current position has been
2446 accounted for, we're done. */
2451 /* If we've passed the last group, and there are still characters
2452 unaccounted for, then we'll have to create a new group. */
2455 copyset(matches, labels[ngrps]);
2457 MALLOC(grps[ngrps].elems, position, d->nleaves);
2458 grps[ngrps].nelem = 1;
2459 grps[ngrps].elems[0] = pos;
2464 MALLOC(follows.elems, position, d->nleaves);
2465 MALLOC(tmp.elems, position, d->nleaves);
2467 /* If we are a searching matcher, the default transition is to a state
2468 containing the positions of state 0, otherwise the default transition
2469 is to fail miserably. */
2474 for (i = 0; i < d->states[0].elems.nelem; ++i)
2476 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2478 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2481 copy(&d->states[0].elems, &follows);
2482 state = state_index(d, &follows, 0, 0);
2484 state_newline = state_index(d, &follows, 1, 0);
2486 state_newline = state;
2488 state_letter = state_index(d, &follows, 0, 1);
2490 state_letter = state;
2491 for (i = 0; i < NOTCHAR; ++i)
2492 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2493 trans[eolbyte] = state_newline;
2496 for (i = 0; i < NOTCHAR; ++i)
2499 for (i = 0; i < ngrps; ++i)
2503 /* Find the union of the follows of the positions of the group.
2504 This is a hideously inefficient loop. Fix it someday. */
2505 for (j = 0; j < grps[i].nelem; ++j)
2506 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2507 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2510 if (d->mb_cur_max > 1)
2512 /* If a token in follows.elems is not 1st byte of a multibyte
2513 character, or the states of follows must accept the bytes
2514 which are not 1st byte of the multibyte character.
2515 Then, if a state of follows encounter a byte, it must not be
2516 a 1st byte of a multibyte character nor single byte character.
2517 We cansel to add state[0].follows to next state, because
2518 state[0] must accept 1st-byte
2520 For example, we assume <sb a> is a certain single byte
2521 character, <mb A> is a certain multibyte character, and the
2522 codepoint of <sb a> equals the 2nd byte of the codepoint of
2524 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2525 by accepting accepts 1st byte of <mb A>, and state[i+1]
2526 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2527 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2528 <mb A>, so we can not add state[0]. */
2530 next_isnt_1st_byte = 0;
2531 for (j = 0; j < follows.nelem; ++j)
2533 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2535 next_isnt_1st_byte = 1;
2542 /* If we are building a searching matcher, throw in the positions
2543 of state 0 as well. */
2545 if (d->searchflag && (d->mb_cur_max == 1 || !next_isnt_1st_byte))
2549 for (j = 0; j < d->states[0].elems.nelem; ++j)
2550 insert(d->states[0].elems.elems[j], &follows);
2552 /* Find out if the new state will want any context information. */
2554 if (tstbit(eolbyte, labels[i]))
2555 for (j = 0; j < follows.nelem; ++j)
2556 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2560 for (j = 0; j < CHARCLASS_INTS; ++j)
2561 if (labels[i][j] & letters[j])
2563 if (j < CHARCLASS_INTS)
2564 for (j = 0; j < follows.nelem; ++j)
2565 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2568 /* Find the state(s) corresponding to the union of the follows. */
2569 state = state_index(d, &follows, 0, 0);
2571 state_newline = state_index(d, &follows, 1, 0);
2573 state_newline = state;
2575 state_letter = state_index(d, &follows, 0, 1);
2577 state_letter = state;
2579 /* Set the transitions for each character in the current label. */
2580 for (j = 0; j < CHARCLASS_INTS; ++j)
2581 for (k = 0; k < INTBITS; ++k)
2582 if (labels[i][j] & 1 << k)
2584 int c = j * INTBITS + k;
2587 trans[c] = state_newline;
2588 else if (IS_WORD_CONSTITUENT(c))
2589 trans[c] = state_letter;
2590 else if (c < NOTCHAR)
2595 for (i = 0; i < ngrps; ++i)
2596 free(grps[i].elems);
2597 free(follows.elems);
2603 /* Some routines for manipulating a compiled dfa's transition tables.
2604 Each state may or may not have a transition table; if it does, and it
2605 is a non-accepting state, then d->trans[state] points to its table.
2606 If it is an accepting state then d->fails[state] points to its table.
2607 If it has no table at all, then d->trans[state] is NULL.
2608 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2611 build_state (int s, struct dfa *d)
2613 int *trans; /* The new transition table. */
2616 /* Set an upper limit on the number of transition tables that will ever
2617 exist at once. 1024 is arbitrary. The idea is that the frequently
2618 used transition tables will be quickly rebuilt, whereas the ones that
2619 were only needed once or twice will be cleared away. */
2620 if (d->trcount >= 1024)
2622 for (i = 0; i < d->tralloc; ++i)
2626 d->trans[i] = d->fails[i] = NULL;
2633 /* Set up the success bits for this state. */
2635 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2638 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2641 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2645 MALLOC(trans, int, NOTCHAR);
2646 dfastate(s, d, trans);
2648 /* Now go through the new transition table, and make sure that the trans
2649 and fail arrays are allocated large enough to hold a pointer for the
2650 largest state mentioned in the table. */
2651 for (i = 0; i < NOTCHAR; ++i)
2652 if (trans[i] >= d->tralloc)
2654 int oldalloc = d->tralloc;
2656 while (trans[i] >= d->tralloc)
2658 REALLOC(d->realtrans, int *, d->tralloc + 1);
2659 d->trans = d->realtrans + 1;
2660 REALLOC(d->fails, int *, d->tralloc);
2661 REALLOC(d->success, int, d->tralloc);
2662 REALLOC(d->newlines, int, d->tralloc);
2663 while (oldalloc < d->tralloc)
2665 d->trans[oldalloc] = NULL;
2666 d->fails[oldalloc++] = NULL;
2670 /* Keep the newline transition in a special place so we can use it as
2672 d->newlines[s] = trans[eolbyte];
2673 trans[eolbyte] = -1;
2675 if (ACCEPTING(s, *d))
2676 d->fails[s] = trans;
2678 d->trans[s] = trans;
2682 build_state_zero (struct dfa *d)
2686 CALLOC(d->realtrans, int *, d->tralloc + 1);
2687 d->trans = d->realtrans + 1;
2688 CALLOC(d->fails, int *, d->tralloc);
2689 MALLOC(d->success, int, d->tralloc);
2690 MALLOC(d->newlines, int, d->tralloc);
2695 /* Multibyte character handling sub-routines for dfaexec. */
2697 /* Initial state may encounter the byte which is not a single byte character
2698 nor 1st byte of a multibyte character. But it is incorrect for initial
2699 state to accept such a byte.
2700 For example, in sjis encoding the regular expression like "\\" accepts
2701 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2702 0x815c. Then Initial state must skip the bytes which are not a single byte
2703 character nor 1st byte of a multibyte character. */
2704 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2707 while (inputwcs[p - buf_begin] == 0 \
2708 && mblen_buf[p - buf_begin] > 0 \
2709 && (unsigned char const *) p < buf_end) \
2711 if ((char *) p >= end) \
2721 realloc_trans_if_necessary(struct dfa *d, int new_state)
2723 /* Make sure that the trans and fail arrays are allocated large enough
2724 to hold a pointer for the new state. */
2725 if (new_state >= d->tralloc)
2727 int oldalloc = d->tralloc;
2729 while (new_state >= d->tralloc)
2731 REALLOC(d->realtrans, int *, d->tralloc + 1);
2732 d->trans = d->realtrans + 1;
2733 REALLOC(d->fails, int *, d->tralloc);
2734 REALLOC(d->success, int, d->tralloc);
2735 REALLOC(d->newlines, int, d->tralloc);
2736 while (oldalloc < d->tralloc)
2738 d->trans[oldalloc] = NULL;
2739 d->fails[oldalloc++] = NULL;
2744 /* Return values of transit_state_singlebyte(), and
2745 transit_state_consume_1char. */
2748 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2749 TRANSIT_STATE_DONE, /* State transition has finished. */
2750 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2751 } status_transit_state;
2753 /* Consume a single byte and transit state from 's' to '*next_state'.
2754 This function is almost same as the state transition routin in dfaexec().
2755 But state transition is done just once, otherwise matching succeed or
2756 reach the end of the buffer. */
2757 static status_transit_state
2758 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2764 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2766 while (rval == TRANSIT_STATE_IN_PROGRESS)
2768 if ((t = d->trans[works]) != NULL)
2771 rval = TRANSIT_STATE_DONE;
2779 /* At the moment, it must not happen. */
2784 else if (d->fails[works])
2786 works = d->fails[works][*p];
2787 rval = TRANSIT_STATE_DONE;
2791 build_state(works, d);
2794 *next_state = works;
2798 /* Check whether period can match or not in the current context. If it can,
2799 return the amount of the bytes with which period can match, otherwise
2801 `pos' is the position of the period. `idx' is the index from the
2802 buf_begin, and it is the current position in the buffer. */
2804 match_anychar (struct dfa *d, int s, position pos, int idx)
2812 mbclen = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
2814 /* Check context. */
2815 if (wc == (wchar_t)eolbyte)
2817 if (!(syntax_bits & RE_DOT_NEWLINE))
2821 else if (wc == (wchar_t)'\0')
2823 if (syntax_bits & RE_DOT_NOT_NULL)
2828 if (iswalnum(wc) || wc == L'_')
2831 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2832 newline, d->states[s].letter, letter))
2838 /* Check whether bracket expression can match or not in the current context.
2839 If it can, return the amount of the bytes with which expression can match,
2841 `pos' is the position of the bracket expression. `idx' is the index
2842 from the buf_begin, and it is the current position in the buffer. */
2844 match_mb_charset (struct dfa *d, int s, position pos, int idx)
2847 int match; /* Flag which represent that matching succeed. */
2848 int match_len; /* Length of the character (or collating element)
2849 with which this operator match. */
2850 int op_len; /* Length of the operator. */
2854 /* Pointer to the structure to which we are currently refering. */
2855 struct mb_char_classes *work_mbc;
2859 wchar_t wc; /* Current refering character. */
2863 /* Check context. */
2864 if (wc == (wchar_t)eolbyte)
2866 if (!(syntax_bits & RE_DOT_NEWLINE))
2870 else if (wc == (wchar_t)'\0')
2872 if (syntax_bits & RE_DOT_NOT_NULL)
2876 if (iswalnum(wc) || wc == L'_')
2878 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2879 newline, d->states[s].letter, letter))
2882 /* Assign the current refering operator to work_mbc. */
2883 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2884 match = !work_mbc->invert;
2885 match_len = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
2887 /* Match in range 0-255? */
2888 if (wc < NOTCHAR && work_mbc->cset != -1
2889 && tstbit((unsigned char)wc, d->charclasses[work_mbc->cset]))
2890 goto charset_matched;
2892 /* match with a character class? */
2893 for (i = 0; i<work_mbc->nch_classes; i++)
2895 if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2896 goto charset_matched;
2899 strncpy(buffer, (char const *) buf_begin + idx, match_len);
2900 buffer[match_len] = '\0';
2902 /* match with an equivalent class? */
2903 for (i = 0; i<work_mbc->nequivs; i++)
2905 op_len = strlen(work_mbc->equivs[i]);
2906 strncpy(buffer, (char const *) buf_begin + idx, op_len);
2907 buffer[op_len] = '\0';
2908 if (strcoll(work_mbc->equivs[i], buffer) == 0)
2911 goto charset_matched;
2915 /* match with a collating element? */
2916 for (i = 0; i<work_mbc->ncoll_elems; i++)
2918 op_len = strlen(work_mbc->coll_elems[i]);
2919 strncpy(buffer, (char const *) buf_begin + idx, op_len);
2920 buffer[op_len] = '\0';
2922 if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2925 goto charset_matched;
2930 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2932 /* match with a range? */
2933 for (i = 0; i<work_mbc->nranges; i++)
2935 wcbuf[2] = work_mbc->range_sts[i];
2936 wcbuf[4] = work_mbc->range_ends[i];
2938 if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2939 wcscoll(wcbuf+4, wcbuf) >= 0)
2940 goto charset_matched;
2943 /* match with a character? */
2944 for (i = 0; i<work_mbc->nchars; i++)
2946 if (wc == work_mbc->chars[i])
2947 goto charset_matched;
2953 return match ? match_len : 0;
2956 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2957 array which corresponds to `d->states[s].mbps.elem' and each element of
2958 the array contains the amount of the bytes with which the element can
2960 `idx' is the index from the buf_begin, and it is the current position
2962 Caller MUST free the array which this function return. */
2964 check_matching_with_multibyte_ops (struct dfa *d, int s, int idx)
2969 MALLOC(rarray, int, d->states[s].mbps.nelem);
2970 for (i = 0; i < d->states[s].mbps.nelem; ++i)
2972 position pos = d->states[s].mbps.elems[i];
2973 switch(d->tokens[pos.index])
2976 rarray[i] = match_anychar(d, s, pos, idx);
2979 rarray[i] = match_mb_charset(d, s, pos, idx);
2982 break; /* can not happen. */
2988 /* Consume a single character and enumerate all of the positions which can
2989 be next position from the state `s'.
2990 `match_lens' is the input. It can be NULL, but it can also be the output
2991 of check_matching_with_multibyte_ops() for optimization.
2992 `mbclen' and `pps' are the output. `mbclen' is the length of the
2993 character consumed, and `pps' is the set this function enumerate. */
2994 static status_transit_state
2995 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2996 int *match_lens, int *mbclen, position_set *pps)
3001 status_transit_state rs = TRANSIT_STATE_DONE;
3003 /* Calculate the length of the (single/multi byte) character
3004 to which p points. */
3005 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
3006 : mblen_buf[*pp - buf_begin];
3008 /* Calculate the state which can be reached from the state `s' by
3009 consuming `*mbclen' single bytes from the buffer. */
3011 for (i = 0; i < *mbclen; i++)
3014 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
3016 /* Copy the positions contained by `s1' to the set `pps'. */
3017 copy(&(d->states[s1].elems), pps);
3019 /* Check (inputed)match_lens, and initialize if it is NULL. */
3020 if (match_lens == NULL && d->states[s].mbps.nelem != 0)
3021 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
3023 work_mbls = match_lens;
3025 /* Add all of the positions which can be reached from `s' by consuming
3026 a single character. */
3027 for (i = 0; i < d->states[s].mbps.nelem ; i++)
3029 if (work_mbls[i] == *mbclen)
3030 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
3032 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
3036 if (match_lens == NULL && work_mbls != NULL)
3041 /* Transit state from s, then return new state and update the pointer of the
3042 buffer. This function is for some operator which can match with a multi-
3043 byte character or a collating element (which may be multi characters). */
3045 transit_state (struct dfa *d, int s, unsigned char const **pp)
3048 int mbclen; /* The length of current input multibyte character. */
3051 int *match_lens = NULL;
3052 int nelem = d->states[s].mbps.nelem; /* Just a alias. */
3053 position_set follows;
3054 unsigned char const *p1 = *pp;
3058 /* This state has (a) multibyte operator(s).
3059 We check whether each of them can match or not. */
3061 /* Note: caller must free the return value of this function. */
3062 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
3064 for (i = 0; i < nelem; i++)
3065 /* Search the operator which match the longest string,
3068 if (match_lens[i] > maxlen)
3069 maxlen = match_lens[i];
3073 if (nelem == 0 || maxlen == 0)
3074 /* This state has no multibyte operator which can match.
3075 We need to check only one single byte character. */
3077 status_transit_state rs;
3078 rs = transit_state_singlebyte(d, s, *pp, &s1);
3080 /* We must update the pointer if state transition succeeded. */
3081 if (rs == TRANSIT_STATE_DONE)
3088 /* This state has some operators which can match a multibyte character. */
3090 MALLOC(follows.elems, position, d->nleaves);
3092 /* `maxlen' may be longer than the length of a character, because it may
3093 not be a character but a (multi character) collating element.
3094 We enumerate all of the positions which `s' can reach by consuming
3096 transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
3098 wc = inputwcs[*pp - mbclen - buf_begin];
3099 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
3100 realloc_trans_if_necessary(d, s1);
3102 while (*pp - p1 < maxlen)
3105 transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
3107 for (i = 0; i < nelem ; i++)
3109 if (match_lens[i] == *pp - p1)
3111 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
3112 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
3116 wc = inputwcs[*pp - mbclen - buf_begin];
3117 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
3118 realloc_trans_if_necessary(d, s1);
3121 free(follows.elems);
3125 #endif /* MBS_SUPPORT */
3127 /* Initialize mblen_buf and inputwcs with data from the next line. */
3130 prepare_wc_buf (const char *begin, const char *end)
3132 unsigned char eol = eolbyte;
3133 size_t remain_bytes, i;
3135 buf_begin = (unsigned char *) begin;
3138 for (i = 0; i < end - begin + 1; i++)
3140 if (remain_bytes == 0)
3143 = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
3144 if (remain_bytes < 1
3145 || remain_bytes == (size_t) -1
3146 || remain_bytes == (size_t) -2
3147 || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
3150 inputwcs[i] = (wchar_t)begin[i];
3152 if (begin[i] == eol)
3157 mblen_buf[i] = remain_bytes;
3163 mblen_buf[i] = remain_bytes;
3169 buf_end = (unsigned char *) (begin + i);
3171 inputwcs[i] = 0; /* sentinel */
3174 /* Search through a buffer looking for a match to the given struct dfa.
3175 Find the first occurrence of a string matching the regexp in the
3176 buffer, and the shortest possible version thereof. Return a pointer to
3177 the first character after the match, or NULL if none is found. BEGIN
3178 points to the beginning of the buffer, and END points to the first byte
3179 after its end. Note however that we store a sentinel byte (usually
3180 newline) in *END, so the actual buffer must be one byte longer.
3181 When NEWLINE is nonzero, newlines may appear in the matching string.
3182 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3183 Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
3184 encountered a back-reference (1) or not (0). The caller may use this
3185 to decide whether to fall back on a backtracking matcher. */
3187 dfaexec (struct dfa *d, char const *begin, char *end,
3188 int newline, int *count, int *backref)
3190 int s, s1, tmp; /* Current state. */
3191 unsigned char const *p; /* Current input character. */
3192 int **trans, *t; /* Copy of d->trans so it can be optimized
3194 unsigned char eol = eolbyte; /* Likewise for eolbyte. */
3195 unsigned char saved_end;
3196 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
3197 static int sbit_init;
3204 for (i = 0; i < NOTCHAR; ++i)
3205 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
3210 build_state_zero(d);
3213 p = (unsigned char const *) begin;
3215 saved_end = *(unsigned char *) end;
3219 if (d->mb_cur_max > 1)
3221 MALLOC(mblen_buf, unsigned char, end - begin + 2);
3222 MALLOC(inputwcs, wchar_t, end - begin + 2);
3223 memset(&mbs, 0, sizeof(mbstate_t));
3224 prepare_wc_buf ((const char *) p, end);
3226 #endif /* MBS_SUPPORT */
3231 if (d->mb_cur_max > 1)
3232 while ((t = trans[s]))
3237 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
3239 if (d->states[s].mbps.nelem == 0)
3245 /* Falling back to the glibc matcher in this case gives
3246 better performance (up to 25% better on [a-z], for
3247 example) and enables support for collating symbols and
3248 equivalence classes. */
3258 /* Can match with a multibyte character (and multi character
3259 collating element). Transition table might be updated. */
3260 s = transit_state(d, s, &p);
3264 #endif /* MBS_SUPPORT */
3265 while ((t = trans[s]) != 0) { /* hand-optimized loop */
3267 if ((t = trans[s1]) == 0) {
3268 tmp = s ; s = s1 ; s1 = tmp ; /* swap */
3274 if (s >= 0 && (char *) p <= end && d->fails[s])
3276 if (d->success[s] & sbit[*p])
3279 *backref = (d->states[s].backref != 0);
3281 if (d->mb_cur_max > 1)
3286 #endif /* MBS_SUPPORT */
3293 if (d->mb_cur_max > 1)
3295 /* Can match with a multibyte character (and multicharacter
3296 collating element). Transition table might be updated. */
3297 s = transit_state(d, s, &p);
3301 #endif /* MBS_SUPPORT */
3302 s = d->fails[s][*p++];
3306 /* If the previous character was a newline, count it. */
3307 if ((char *) p <= end && p[-1] == eol)
3313 if (d->mb_cur_max > 1)
3314 prepare_wc_buf ((const char *) p, end);
3318 /* Check if we've run off the end of the buffer. */
3319 if ((char *) p > end)
3322 if (d->mb_cur_max > 1)
3327 #endif /* MBS_SUPPORT */
3339 if (p[-1] == eol && newline)
3341 s = d->newlines[s1];
3351 free_mbdata (struct dfa *d)
3355 free(d->multibyte_prop);
3356 d->multibyte_prop = NULL;
3358 for (i = 0; i < d->nmbcsets; ++i)
3361 struct mb_char_classes *p = &(d->mbcsets[i]);
3363 free(p->ch_classes);
3365 free(p->range_ends);
3367 for (j = 0; j < p->nequivs; ++j)
3371 for (j = 0; j < p->ncoll_elems; ++j)
3372 free(p->coll_elems[j]);
3373 free(p->coll_elems);
3382 /* Initialize the components of a dfa that the other routines don't
3383 initialize for themselves. */
3385 dfainit (struct dfa *d)
3387 memset (d, 0, sizeof *d);
3390 MALLOC(d->charclasses, charclass, d->calloc);
3393 MALLOC(d->tokens, token, d->talloc);
3396 d->mb_cur_max = MB_CUR_MAX;
3397 if (d->mb_cur_max > 1)
3399 d->nmultibyte_prop = 1;
3400 MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
3401 d->mbcsets_alloc = 1;
3402 MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
3409 dfaoptimize (struct dfa *d)
3415 for (i = 0; i < d->tindex; ++i)
3417 switch(d->tokens[i])
3423 /* Requires multi-byte algorithm. */
3435 /* Parse and analyze a single string of the given length. */
3437 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
3440 dfaparse(s, len, d);
3445 dfaanalyze(d, searchflag);
3448 /* Free the storage held by the components of a dfa. */
3450 dfafree (struct dfa *d)
3453 struct dfamust *dm, *ndm;
3455 free(d->charclasses);
3459 if (d->mb_cur_max > 1)
3461 #endif /* MBS_SUPPORT */
3463 for (i = 0; i < d->sindex; ++i) {
3464 free(d->states[i].elems.elems);
3466 free(d->states[i].mbps.elems);
3467 #endif /* MBS_SUPPORT */
3470 for (i = 0; i < d->tindex; ++i)
3471 free(d->follows[i].elems);
3473 for (i = 0; i < d->tralloc; ++i)
3482 for (dm = d->musts; dm; dm = ndm)
3490 /* Having found the postfix representation of the regular expression,
3491 try to find a long sequence of characters that must appear in any line
3493 Finding a "longest" sequence is beyond the scope here;
3494 we take an easy way out and hope for the best.
3495 (Take "(ab|a)b"--please.)
3497 We do a bottom-up calculation of sequences of characters that must appear
3498 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3500 sequences that must appear at the left of the match ("left")
3501 sequences that must appear at the right of the match ("right")
3502 lists of sequences that must appear somewhere in the match ("in")
3503 sequences that must constitute the match ("is")
3505 When we get to the root of the tree, we use one of the longest of its
3506 calculated "in" sequences as our answer. The sequence we find is returned in
3507 d->must (where "d" is the single argument passed to "dfamust");
3508 the length of the sequence is returned in d->mustn.
3510 The sequences calculated for the various types of node (in pseudo ANSI c)
3511 are shown below. "p" is the operand of unary operators (and the left-hand
3512 operand of binary operators); "q" is the right-hand operand of binary
3515 "ZERO" means "a zero-length sequence" below.
3517 Type left right is in
3518 ---- ---- ----- -- --
3519 char c # c # c # c # c
3521 ANYCHAR ZERO ZERO ZERO ZERO
3523 MBCSET ZERO ZERO ZERO ZERO
3525 CSET ZERO ZERO ZERO ZERO
3527 STAR ZERO ZERO ZERO ZERO
3529 QMARK ZERO ZERO ZERO ZERO
3531 PLUS p->left p->right ZERO p->in
3533 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3534 p->left : q->right : q->is!=ZERO) ? q->in plus
3535 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3538 OR longest common longest common (do p->is and substrings common to
3539 leading trailing q->is have same p->in and q->in
3540 (sub)sequence (sub)sequence length and
3541 of p->left of p->right content) ?
3542 and q->left and q->right p->is : NULL
3544 If there's anything else we recognize in the tree, all four sequences get set
3545 to zero-length sequences. If there's something we don't recognize in the tree,
3546 we just return a zero-length sequence.
3548 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3551 And. . .is it here or someplace that we might ponder "optimizations" such as
3552 egrep 'psi|epsilon' -> egrep 'psi'
3553 egrep 'pepsi|epsilon' -> egrep 'epsi'
3554 (Yes, we now find "epsi" as a "string
3555 that must occur", but we might also
3556 simplify the *entire* r.e. being sought)
3557 grep '[c]' -> grep 'c'
3558 grep '(ab|a)b' -> grep 'ab'
3559 grep 'ab*' -> grep 'a'
3560 grep 'a*b' -> grep 'b'
3562 There are several issues:
3564 Is optimization easy (enough)?
3566 Does optimization actually accomplish anything,
3567 or is the automaton you get from "psi|epsilon" (for example)
3568 the same as the one you get from "psi" (for example)?
3570 Are optimizable r.e.'s likely to be used in real-life situations
3571 (something like 'ab*' is probably unlikely; something like is
3572 'psi|epsilon' is likelier)? */
3575 icatalloc (char const *old, char const *new)
3578 size_t oldsize, newsize;
3580 newsize = (new == NULL) ? 0 : strlen(new);
3583 else if (newsize == 0)
3584 return (char *) old;
3585 else oldsize = strlen(old);
3587 result = malloc(newsize + 1);
3589 result = realloc((void *) old, oldsize + newsize + 1);
3590 if (result != NULL && new != NULL)
3591 (void) strcpy(result + oldsize, new);
3596 icpyalloc (char const *string)
3598 return icatalloc (NULL, string);
3602 istrstr (char const *lookin, char const *lookfor)
3607 len = strlen(lookfor);
3608 for (cp = lookin; *cp != '\0'; ++cp)
3609 if (strncmp(cp, lookfor, len) == 0)
3615 freelist (char **cpp)
3621 for (i = 0; cpp[i] != NULL; ++i)
3629 enlist (char **cpp, char *new, size_t len)
3635 if ((new = icpyalloc(new)) == NULL)
3641 /* Is there already something in the list that's new (or longer)? */
3642 for (i = 0; cpp[i] != NULL; ++i)
3643 if (istrstr(cpp[i], new) != NULL)
3648 /* Eliminate any obsoleted strings. */
3650 while (cpp[j] != NULL)
3651 if (istrstr(new, cpp[j]) == NULL)
3661 /* Add the new string. */
3662 cpp = realloc((char *) cpp, (i + 2) * sizeof *cpp);
3670 /* Given pointers to two strings, return a pointer to an allocated
3671 list of their distinct common substrings. Return NULL if something
3674 comsubs (char *left, char const *right)
3681 if (left == NULL || right == NULL)
3683 cpp = malloc(sizeof *cpp);
3687 for (lcp = left; *lcp != '\0'; ++lcp)
3690 rcp = strchr (right, *lcp);
3693 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3697 rcp = strchr (rcp + 1, *lcp);
3701 if ((cpp = enlist(cpp, lcp, len)) == NULL)
3708 addlists (char **old, char **new)
3712 if (old == NULL || new == NULL)
3714 for (i = 0; new[i] != NULL; ++i)
3716 old = enlist(old, new[i], strlen(new[i]));
3723 /* Given two lists of substrings, return a new list giving substrings
3726 inboth (char **left, char **right)
3732 if (left == NULL || right == NULL)
3734 both = malloc(sizeof *both);
3738 for (lnum = 0; left[lnum] != NULL; ++lnum)
3740 for (rnum = 0; right[rnum] != NULL; ++rnum)
3742 temp = comsubs(left[lnum], right[rnum]);
3748 both = addlists(both, temp);
3767 resetmust (must *mp)
3769 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3774 dfamust (struct dfa *d)
3785 static char empty_string[] = "";
3787 result = empty_string;
3789 musts = malloc((d->tindex + 1) * sizeof *musts);
3793 for (i = 0; i <= d->tindex; ++i)
3795 for (i = 0; i <= d->tindex; ++i)
3797 mp[i].in = malloc(sizeof *mp[i].in);
3798 mp[i].left = malloc(2);
3799 mp[i].right = malloc(2);
3800 mp[i].is = malloc(2);
3801 if (mp[i].in == NULL || mp[i].left == NULL ||
3802 mp[i].right == NULL || mp[i].is == NULL)
3804 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3808 fprintf(stderr, "dfamust:\n");
3809 for (i = 0; i < d->tindex; ++i)
3811 fprintf(stderr, " %d:", i);
3812 prtok(d->tokens[i]);
3816 for (ri = 0; ri < d->tindex; ++ri)
3818 switch (t = d->tokens[ri])
3822 assert (!"neither LPAREN nor RPAREN may appear here");
3835 assert (musts < mp);
3840 assert (&musts[2] <= mp);
3849 /* Guaranteed to be. Unlikely, but. . . */
3850 if (!STREQ (lmp->is, rmp->is))
3852 /* Left side--easy */
3854 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3856 lmp->left[i] = '\0';
3858 ln = strlen(lmp->right);
3859 rn = strlen(rmp->right);
3863 for (i = 0; i < n; ++i)
3864 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3866 for (j = 0; j < i; ++j)
3867 lmp->right[j] = lmp->right[(ln - i) + j];
3868 lmp->right[j] = '\0';
3869 new = inboth(lmp->in, rmp->in);
3878 assert (musts < mp);
3883 assert (mp == &musts[1]);
3884 for (i = 0; musts[0].in[i] != NULL; ++i)
3885 if (strlen(musts[0].in[i]) > strlen(result))
3886 result = musts[0].in[i];
3887 if (STREQ (result, musts[0].is))
3891 assert (&musts[2] <= mp);
3898 /* In. Everything in left, plus everything in
3899 right, plus catenation of
3900 left's right and right's left. */
3901 lmp->in = addlists(lmp->in, rmp->in);
3902 if (lmp->in == NULL)
3904 if (lmp->right[0] != '\0' &&
3905 rmp->left[0] != '\0')
3909 tp = icpyalloc(lmp->right);
3912 tp = icatalloc(tp, rmp->left);
3915 lmp->in = enlist(lmp->in, tp,
3918 if (lmp->in == NULL)
3922 if (lmp->is[0] != '\0')
3924 lmp->left = icatalloc(lmp->left,
3926 if (lmp->left == NULL)
3930 if (rmp->is[0] == '\0')
3931 lmp->right[0] = '\0';
3932 lmp->right = icatalloc(lmp->right, rmp->right);
3933 if (lmp->right == NULL)
3935 /* Guaranteed to be */
3936 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3938 lmp->is = icatalloc(lmp->is, rmp->is);
3939 if (lmp->is == NULL)
3949 assert (!"oops! t >= END");
3953 /* not on *my* shift */
3960 #endif /* MBS_SUPPORT */
3968 /* plain character */
3970 mp->is[0] = mp->left[0] = mp->right[0] = t;
3971 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3972 mp->in = enlist(mp->in, mp->is, (size_t)1);
3979 fprintf(stderr, " node: %d:", ri);
3980 prtok(d->tokens[ri]);
3981 fprintf(stderr, "\n in:");
3982 for (i = 0; mp->in[i]; ++i)
3983 fprintf(stderr, " \"%s\"", mp->in[i]);
3984 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
3985 fprintf(stderr, " left: \"%s\"\n", mp->left);
3986 fprintf(stderr, " right: \"%s\"\n", mp->right);
3993 MALLOC(dm, struct dfamust, 1);
3995 MALLOC(dm->must, char, strlen(result) + 1);
3996 strcpy(dm->must, result);
3997 dm->next = d->musts;
4001 for (i = 0; i <= d->tindex; ++i)
4015 return xmalloc (sizeof (struct dfa));
4019 dfamusts (struct dfa const *d)
4024 /* vim:set shiftwidth=2: */