1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
18 /* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
32 #include <sys/types.h>
33 extern char *calloc(), *malloc(), *realloc();
37 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
45 #ifndef DEBUG /* use the same approach as regex.c */
51 #define isgraph(C) (isprint(C) && !isspace(C))
54 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
55 #define ISALPHA(C) isalpha(C)
56 #define ISUPPER(C) isupper(C)
57 #define ISLOWER(C) islower(C)
58 #define ISDIGIT(C) isdigit(C)
59 #define ISXDIGIT(C) isxdigit(C)
60 #define ISSPACE(C) isspace(C)
61 #define ISPUNCT(C) ispunct(C)
62 #define ISALNUM(C) isalnum(C)
63 #define ISPRINT(C) isprint(C)
64 #define ISGRAPH(C) isgraph(C)
65 #define ISCNTRL(C) iscntrl(C)
67 #define ISALPHA(C) (isascii(C) && isalpha(C))
68 #define ISUPPER(C) (isascii(C) && isupper(C))
69 #define ISLOWER(C) (isascii(C) && islower(C))
70 #define ISDIGIT(C) (isascii(C) && isdigit(C))
71 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
72 #define ISSPACE(C) (isascii(C) && isspace(C))
73 #define ISPUNCT(C) (isascii(C) && ispunct(C))
74 #define ISALNUM(C) (isascii(C) && isalnum(C))
75 #define ISPRINT(C) (isascii(C) && isprint(C))
76 #define ISGRAPH(C) (isascii(C) && isgraph(C))
77 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
96 static void dfamust _RE_ARGS((struct dfa *dfa));
98 static ptr_t xcalloc _RE_ARGS((size_t n, size_t s));
99 static ptr_t xmalloc _RE_ARGS((size_t n));
100 static ptr_t xrealloc _RE_ARGS((ptr_t p, size_t n));
102 static void prtok _RE_ARGS((token t));
104 static int tstbit _RE_ARGS((int b, charclass c));
105 static void setbit _RE_ARGS((int b, charclass c));
106 static void clrbit _RE_ARGS((int b, charclass c));
107 static void copyset _RE_ARGS((charclass src, charclass dst));
108 static void zeroset _RE_ARGS((charclass s));
109 static void notset _RE_ARGS((charclass s));
110 static int equal _RE_ARGS((charclass s1, charclass s2));
111 static int charclass_index _RE_ARGS((charclass s));
112 static int looking_at _RE_ARGS((const char *s));
113 static token lex _RE_ARGS((void));
114 static void addtok _RE_ARGS((token t));
115 static void atom _RE_ARGS((void));
116 static int nsubtoks _RE_ARGS((int tindex));
117 static void copytoks _RE_ARGS((int tindex, int ntokens));
118 static void closure _RE_ARGS((void));
119 static void branch _RE_ARGS((void));
120 static void regexp _RE_ARGS((int toplevel));
121 static void copy _RE_ARGS((position_set *src, position_set *dst));
122 static void insert _RE_ARGS((position p, position_set *s));
123 static void merge _RE_ARGS((position_set *s1, position_set *s2, position_set *m));
124 static void delete _RE_ARGS((position p, position_set *s));
125 static int state_index _RE_ARGS((struct dfa *d, position_set *s,
126 int newline, int letter));
127 static void build_state _RE_ARGS((int s, struct dfa *d));
128 static void build_state_zero _RE_ARGS((struct dfa *d));
129 static char *icatalloc _RE_ARGS((char *old, char *new));
130 static char *icpyalloc _RE_ARGS((char *string));
131 static char *istrstr _RE_ARGS((char *lookin, char *lookfor));
132 static void ifree _RE_ARGS((char *cp));
133 static void freelist _RE_ARGS((char **cpp));
134 static char **enlist _RE_ARGS((char **cpp, char *new, size_t len));
135 static char **comsubs _RE_ARGS((char *left, char *right));
136 static char **addlists _RE_ARGS((char **old, char **new));
137 static char **inboth _RE_ARGS((char **left, char **right));
140 static int collate_range_cmp (a, b)
146 if ((unsigned char)a == (unsigned char)b)
150 if ((r = strcoll(s[0], s[1])) == 0)
151 r = (unsigned char)a - (unsigned char)b;
161 ptr_t r = calloc(n, s);
164 dfaerror("Memory exhausted");
176 dfaerror("Memory exhausted");
185 ptr_t r = realloc(p, n);
189 dfaerror("Memory exhausted");
193 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
194 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
195 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
197 /* Reallocate an array of type t if nalloc is too small for index. */
198 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
199 if ((index) >= (nalloc)) \
201 while ((index) >= (nalloc)) \
203 REALLOC(p, t, nalloc); \
215 fprintf(stderr, "END");
216 else if (t < NOTCHAR)
217 fprintf(stderr, "%c", t);
222 case EMPTY: s = "EMPTY"; break;
223 case BACKREF: s = "BACKREF"; break;
224 case BEGLINE: s = "BEGLINE"; break;
225 case ENDLINE: s = "ENDLINE"; break;
226 case BEGWORD: s = "BEGWORD"; break;
227 case ENDWORD: s = "ENDWORD"; break;
228 case LIMWORD: s = "LIMWORD"; break;
229 case NOTLIMWORD: s = "NOTLIMWORD"; break;
230 case QMARK: s = "QMARK"; break;
231 case STAR: s = "STAR"; break;
232 case PLUS: s = "PLUS"; break;
233 case CAT: s = "CAT"; break;
234 case OR: s = "OR"; break;
235 case ORTOP: s = "ORTOP"; break;
236 case LPAREN: s = "LPAREN"; break;
237 case RPAREN: s = "RPAREN"; break;
238 default: s = "CSET"; break;
240 fprintf(stderr, "%s", s);
245 /* Stuff pertaining to charclasses. */
252 return c[b / INTBITS] & 1 << b % INTBITS;
260 c[b / INTBITS] |= 1 << b % INTBITS;
268 c[b / INTBITS] &= ~(1 << b % INTBITS);
278 for (i = 0; i < CHARCLASS_INTS; ++i)
288 for (i = 0; i < CHARCLASS_INTS; ++i)
298 for (i = 0; i < CHARCLASS_INTS; ++i)
309 for (i = 0; i < CHARCLASS_INTS; ++i)
315 /* A pointer to the current dfa is kept here during parsing. */
316 static struct dfa *dfa;
318 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
325 for (i = 0; i < dfa->cindex; ++i)
326 if (equal(s, dfa->charclasses[i]))
328 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
330 copyset(s, dfa->charclasses[i]);
334 /* Syntax bits controlling the behavior of the lexical analyzer. */
335 static reg_syntax_t syntax_bits, syntax_bits_set;
337 /* Flag for case-folding letters into sets. */
338 static int case_fold;
340 /* Entry point to set syntax options. */
342 dfasyntax(bits, fold)
351 /* Lexical analyzer. All the dross that deals with the obnoxious
352 GNU Regex syntax bits is located here. The poor, suffering
353 reader is referred to the GNU Regex documentation for the
354 meaning of the @#%!@#%^!@ syntax bits. */
356 static char *lexstart; /* Pointer to beginning of input string. */
357 static char *lexptr; /* Pointer to next input character. */
358 static int lexleft; /* Number of characters remaining. */
359 static token lasttok; /* Previous token returned; initially END. */
360 static int laststart; /* True if we're separated from beginning or (, |
361 only by zero-width characters. */
362 static int parens; /* Count of outstanding left parens. */
363 static int minrep, maxrep; /* Repeat counts for {m,n}. */
365 /* Note that characters become unsigned here. */
366 #define FETCH(c, eoferr) \
372 return lasttok = END; \
373 (c) = (unsigned char) *lexptr++; \
378 #define FUNC(F, P) static int F(int c) { return P(c); }
380 #define FUNC(F, P) static int F(c) int c; { return P(c); }
383 FUNC(is_alpha, ISALPHA)
384 FUNC(is_upper, ISUPPER)
385 FUNC(is_lower, ISLOWER)
386 FUNC(is_digit, ISDIGIT)
387 FUNC(is_xdigit, ISXDIGIT)
388 FUNC(is_space, ISSPACE)
389 FUNC(is_punct, ISPUNCT)
390 FUNC(is_alnum, ISALNUM)
391 FUNC(is_print, ISPRINT)
392 FUNC(is_graph, ISGRAPH)
393 FUNC(is_cntrl, ISCNTRL)
395 static int is_blank(c)
398 return (c == ' ' || c == '\t');
401 /* The following list maps the names of the Posix named character classes
402 to predicate functions that determine whether a given character is in
403 the class. The leading [ has already been eaten by the lexical analyzer. */
406 int (*pred) _RE_ARGS((int));
408 { ":alpha:]", is_alpha },
409 { ":upper:]", is_upper },
410 { ":lower:]", is_lower },
411 { ":digit:]", is_digit },
412 { ":xdigit:]", is_xdigit },
413 { ":space:]", is_space },
414 { ":punct:]", is_punct },
415 { ":alnum:]", is_alnum },
416 { ":print:]", is_print },
417 { ":graph:]", is_graph },
418 { ":cntrl:]", is_cntrl },
419 { ":blank:]", is_blank },
432 return strncmp(s, lexptr, len) == 0;
439 int backslash = 0, invert;
443 /* Basic plan: We fetch a character. If it's a backslash,
444 we set the backslash flag and go through the loop again.
445 On the plus side, this avoids having a duplicate of the
446 main switch inside the backslash case. On the minus side,
447 it means that just about every case begins with
448 "if (backslash) ...". */
449 for (i = 0; i < 2; ++i)
458 dfaerror("Unfinished \\ escape");
465 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
469 return lasttok = BEGLINE;
475 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
477 || (syntax_bits & RE_NO_BK_PARENS
478 ? lexleft > 0 && *lexptr == ')'
479 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
480 || (syntax_bits & RE_NO_BK_VBAR
481 ? lexleft > 0 && *lexptr == '|'
482 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
483 || ((syntax_bits & RE_NEWLINE_ALT)
484 && lexleft > 0 && *lexptr == '\n'))
485 return lasttok = ENDLINE;
497 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
500 return lasttok = BACKREF;
505 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
506 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
510 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
511 return lasttok = ENDLINE; /* FIXME: should be end of string */
515 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
516 return lasttok = BEGWORD;
520 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
521 return lasttok = ENDWORD;
525 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
526 return lasttok = LIMWORD;
530 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
531 return lasttok = NOTLIMWORD;
535 if (syntax_bits & RE_LIMITED_OPS)
537 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
539 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
541 return lasttok = QMARK;
546 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
548 return lasttok = STAR;
551 if (syntax_bits & RE_LIMITED_OPS)
553 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
555 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
557 return lasttok = PLUS;
560 if (!(syntax_bits & RE_INTERVALS))
562 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
567 {M,} - minimum count, maximum is infinity
569 {M,N} - M through N */
570 FETCH(c, "unfinished repeat count");
576 FETCH(c, "unfinished repeat count");
579 minrep = 10 * minrep + c - '0';
583 dfaerror("malformed repeat count");
587 FETCH(c, "unfinished repeat count");
590 maxrep = 10 * maxrep + c - '0';
594 if (!(syntax_bits & RE_NO_BK_BRACES))
597 dfaerror("malformed repeat count");
598 FETCH(c, "unfinished repeat count");
601 dfaerror("malformed repeat count");
603 return lasttok = REPMN;
606 if (syntax_bits & RE_LIMITED_OPS)
608 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
614 if (syntax_bits & RE_LIMITED_OPS
616 || !(syntax_bits & RE_NEWLINE_ALT))
622 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
626 return lasttok = LPAREN;
629 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
631 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
635 return lasttok = RPAREN;
642 if (!(syntax_bits & RE_DOT_NEWLINE))
644 if (syntax_bits & RE_DOT_NOT_NULL)
647 return lasttok = CSET + charclass_index(ccl);
651 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
654 for (c2 = 0; c2 < NOTCHAR; ++c2)
661 return lasttok = CSET + charclass_index(ccl);
667 FETCH(c, "Unbalanced [");
670 FETCH(c, "Unbalanced [");
677 /* Nobody ever said this had to be fast. :-)
678 Note that if we're looking at some other [:...:]
679 construct, we just treat it as a bunch of ordinary
680 characters. We can do this because we assume
681 regex has checked for syntax errors before
682 dfa is ever called. */
683 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
684 for (c1 = 0; prednames[c1].name; ++c1)
685 if (looking_at(prednames[c1].name))
687 int (*pred)() = prednames[c1].pred;
689 && (pred == is_upper || pred == is_lower))
692 for (c2 = 0; c2 < NOTCHAR; ++c2)
695 lexptr += strlen(prednames[c1].name);
696 lexleft -= strlen(prednames[c1].name);
697 FETCH(c1, "Unbalanced [");
700 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
701 FETCH(c, "Unbalanced [");
702 FETCH(c1, "Unbalanced [");
705 FETCH(c2, "Unbalanced [");
708 /* In the case [x-], the - is an ordinary hyphen,
709 which is left in c1, the lookahead character. */
717 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
718 FETCH(c2, "Unbalanced [");
719 FETCH(c1, "Unbalanced [");
727 if (collate_range_cmp(c, c2) > 0) {
728 FETCH(c2, "Invalid range");
732 for (c3 = 0; c3 < NOTCHAR; ++c3)
733 if ( collate_range_cmp(c, c3) <= 0
734 && collate_range_cmp(c3, c2) <= 0
739 setbit(tolower(c3), ccl);
740 else if (ISLOWER(c3))
741 setbit(toupper(c3), ccl);
750 setbit(tolower(c), ccl);
752 setbit(toupper(c), ccl);
759 while ((c = c1) != ']');
763 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
767 return lasttok = CSET + charclass_index(ccl);
772 if (case_fold && ISALPHA(c))
777 setbit(tolower(c), ccl);
779 setbit(toupper(c), ccl);
780 return lasttok = CSET + charclass_index(ccl);
786 /* The above loop should consume at most a backslash
787 and some other character. */
789 return END; /* keeps pedantic compilers happy. */
792 /* Recursive descent parser for regular expressions. */
794 static token tok; /* Lookahead token. */
795 static int depth; /* Current depth of a hypothetical stack
796 holding deferred productions. This is
797 used to determine the depth that will be
798 required of the real stack later on in
801 /* Add the given token to the parse tree, maintaining the depth count and
802 updating the maximum depth if necessary. */
807 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
808 dfa->tokens[dfa->tindex++] = t;
829 if (depth > dfa->depth)
833 /* The grammar understood by the parser is as follows.
861 The parser builds a parse tree in postfix form in an array of tokens. */
866 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
867 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
868 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
873 else if (tok == LPAREN)
878 dfaerror("Unbalanced (");
885 /* Return the number of tokens in the given subexpression. */
892 switch (dfa->tokens[tindex - 1])
899 return 1 + nsubtoks(tindex - 1);
903 ntoks1 = nsubtoks(tindex - 1);
904 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
908 /* Copy the given subexpression to the top of the tree. */
910 copytoks(tindex, ntokens)
915 for (i = 0; i < ntokens; ++i)
916 addtok(dfa->tokens[tindex + i]);
922 int tindex, ntokens, i;
925 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
928 ntokens = nsubtoks(dfa->tindex);
929 tindex = dfa->tindex - ntokens;
934 for (i = 1; i < minrep; ++i)
936 copytoks(tindex, ntokens);
939 for (; i < maxrep; ++i)
941 copytoks(tindex, ntokens);
958 while (tok != RPAREN && tok != OR && tok >= 0)
981 /* Main entry point for the parser. S is a string to be parsed, len is the
982 length of the string, so s can include NUL characters. D is a pointer to
983 the struct dfa to parse into. */
992 lexstart = lexptr = s;
998 if (! syntax_bits_set)
999 dfaerror("No syntax specified");
1007 dfaerror("Unbalanced )");
1009 addtok(END - d->nregexps);
1018 /* Some primitives for operating on sets of positions. */
1020 /* Copy one set to another; the destination must be large enough. */
1028 for (i = 0; i < src->nelem; ++i)
1029 dst->elems[i] = src->elems[i];
1030 dst->nelem = src->nelem;
1033 /* Insert a position in a set. Position sets are maintained in sorted
1034 order according to index. If position already exists in the set with
1035 the same index then their constraints are logically or'd together.
1036 S->elems must point to an array large enough to hold the resulting set. */
1045 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1047 if (i < s->nelem && p.index == s->elems[i].index)
1048 s->elems[i].constraint |= p.constraint;
1053 while (i < s->nelem)
1062 /* Merge two sets of positions into a third. The result is exactly as if
1063 the positions of both sets were inserted into an initially empty set. */
1073 while (i < s1->nelem && j < s2->nelem)
1074 if (s1->elems[i].index > s2->elems[j].index)
1075 m->elems[m->nelem++] = s1->elems[i++];
1076 else if (s1->elems[i].index < s2->elems[j].index)
1077 m->elems[m->nelem++] = s2->elems[j++];
1080 m->elems[m->nelem] = s1->elems[i++];
1081 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1083 while (i < s1->nelem)
1084 m->elems[m->nelem++] = s1->elems[i++];
1085 while (j < s2->nelem)
1086 m->elems[m->nelem++] = s2->elems[j++];
1089 /* Delete a position from a set. */
1097 for (i = 0; i < s->nelem; ++i)
1098 if (p.index == s->elems[i].index)
1101 for (--s->nelem; i < s->nelem; ++i)
1102 s->elems[i] = s->elems[i + 1];
1105 /* Find the index of the state corresponding to the given position set with
1106 the given preceding context, or create a new state if there is no such
1107 state. Newline and letter tell whether we got here on a newline or
1108 letter, respectively. */
1110 state_index(d, s, newline, letter)
1120 newline = newline ? 1 : 0;
1121 letter = letter ? 1 : 0;
1123 for (i = 0; i < s->nelem; ++i)
1124 hash ^= s->elems[i].index + s->elems[i].constraint;
1126 /* Try to find a state that exactly matches the proposed one. */
1127 for (i = 0; i < d->sindex; ++i)
1129 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1130 || newline != d->states[i].newline || letter != d->states[i].letter)
1132 for (j = 0; j < s->nelem; ++j)
1133 if (s->elems[j].constraint
1134 != d->states[i].elems.elems[j].constraint
1135 || s->elems[j].index != d->states[i].elems.elems[j].index)
1141 /* We'll have to create a new state. */
1142 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1143 d->states[i].hash = hash;
1144 MALLOC(d->states[i].elems.elems, position, s->nelem);
1145 copy(s, &d->states[i].elems);
1146 d->states[i].newline = newline;
1147 d->states[i].letter = letter;
1148 d->states[i].backref = 0;
1149 d->states[i].constraint = 0;
1150 d->states[i].first_end = 0;
1151 for (j = 0; j < s->nelem; ++j)
1152 if (d->tokens[s->elems[j].index] < 0)
1154 constraint = s->elems[j].constraint;
1155 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1156 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1157 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1158 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1159 d->states[i].constraint |= constraint;
1160 if (! d->states[i].first_end)
1161 d->states[i].first_end = d->tokens[s->elems[j].index];
1163 else if (d->tokens[s->elems[j].index] == BACKREF)
1165 d->states[i].constraint = NO_CONSTRAINT;
1166 d->states[i].backref = 1;
1174 /* Find the epsilon closure of a set of positions. If any position of the set
1175 contains a symbol that matches the empty string in some context, replace
1176 that position with the elements of its follow labeled with an appropriate
1177 constraint. Repeat exhaustively until no funny positions are left.
1178 S->elems must be large enough to hold the result. */
1179 static void epsclosure _RE_ARGS((position_set *s, struct dfa *d));
1190 MALLOC(visited, int, d->tindex);
1191 for (i = 0; i < d->tindex; ++i)
1194 for (i = 0; i < s->nelem; ++i)
1195 if (d->tokens[s->elems[i].index] >= NOTCHAR
1196 && d->tokens[s->elems[i].index] != BACKREF
1197 && d->tokens[s->elems[i].index] < CSET)
1200 p.constraint = old.constraint;
1201 delete(s->elems[i], s);
1202 if (visited[old.index])
1207 visited[old.index] = 1;
1208 switch (d->tokens[old.index])
1211 p.constraint &= BEGLINE_CONSTRAINT;
1214 p.constraint &= ENDLINE_CONSTRAINT;
1217 p.constraint &= BEGWORD_CONSTRAINT;
1220 p.constraint &= ENDWORD_CONSTRAINT;
1223 p.constraint &= LIMWORD_CONSTRAINT;
1226 p.constraint &= NOTLIMWORD_CONSTRAINT;
1231 for (j = 0; j < d->follows[old.index].nelem; ++j)
1233 p.index = d->follows[old.index].elems[j].index;
1236 /* Force rescan to start at the beginning. */
1243 /* Perform bottom-up analysis on the parse tree, computing various functions.
1244 Note that at this point, we're pretending constructs like \< are real
1245 characters rather than constraints on what can follow them.
1247 Nullable: A node is nullable if it is at the root of a regexp that can
1248 match the empty string.
1249 * EMPTY leaves are nullable.
1250 * No other leaf is nullable.
1251 * A QMARK or STAR node is nullable.
1252 * A PLUS node is nullable if its argument is nullable.
1253 * A CAT node is nullable if both its arguments are nullable.
1254 * An OR node is nullable if either argument is nullable.
1256 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1257 that could correspond to the first character of a string matching the
1258 regexp rooted at the given node.
1259 * EMPTY leaves have empty firstpos.
1260 * The firstpos of a nonempty leaf is that leaf itself.
1261 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1263 * The firstpos of a CAT node is the firstpos of the left argument, union
1264 the firstpos of the right if the left argument is nullable.
1265 * The firstpos of an OR node is the union of firstpos of each argument.
1267 Lastpos: The lastpos of a node is the set of positions that could
1268 correspond to the last character of a string matching the regexp at
1270 * EMPTY leaves have empty lastpos.
1271 * The lastpos of a nonempty leaf is that leaf itself.
1272 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1274 * The lastpos of a CAT node is the lastpos of its right argument, union
1275 the lastpos of the left if the right argument is nullable.
1276 * The lastpos of an OR node is the union of the lastpos of each argument.
1278 Follow: The follow of a position is the set of positions that could
1279 correspond to the character following a character matching the node in
1280 a string matching the regexp. At this point we consider special symbols
1281 that match the empty string in some context to be just normal characters.
1282 Later, if we find that a special symbol is in a follow set, we will
1283 replace it with the elements of its follow, labeled with an appropriate
1285 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1286 the follow of every node in the lastpos.
1287 * Every node in the firstpos of the second argument of a CAT node is in
1288 the follow of every node in the lastpos of the first argument.
1290 Because of the postfix representation of the parse tree, the depth-first
1291 analysis is conveniently done by a linear scan with the aid of a stack.
1292 Sets are stored as arrays of the elements, obeying a stack-like allocation
1293 scheme; the number of elements in each set deeper in the stack can be
1294 used to determine the address of a particular set's array. */
1296 dfaanalyze(d, searchflag)
1300 int *nullable; /* Nullable stack. */
1301 int *nfirstpos; /* Element count stack for firstpos sets. */
1302 position *firstpos; /* Array where firstpos elements are stored. */
1303 int *nlastpos; /* Element count stack for lastpos sets. */
1304 position *lastpos; /* Array where lastpos elements are stored. */
1305 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1306 position_set tmp; /* Temporary set for merging sets. */
1307 position_set merged; /* Result of merging sets. */
1308 int wants_newline; /* True if some position wants newline info. */
1310 int *o_nfirst, *o_nlast;
1311 position *o_firstpos, *o_lastpos;
1316 fprintf(stderr, "dfaanalyze:\n");
1317 for (i = 0; i < d->tindex; ++i)
1319 fprintf(stderr, " %d:", i);
1320 prtok(d->tokens[i]);
1325 d->searchflag = searchflag;
1327 MALLOC(nullable, int, d->depth);
1328 o_nullable = nullable;
1329 MALLOC(nfirstpos, int, d->depth);
1330 o_nfirst = nfirstpos;
1331 MALLOC(firstpos, position, d->nleaves);
1332 o_firstpos = firstpos, firstpos += d->nleaves;
1333 MALLOC(nlastpos, int, d->depth);
1335 MALLOC(lastpos, position, d->nleaves);
1336 o_lastpos = lastpos, lastpos += d->nleaves;
1337 MALLOC(nalloc, int, d->tindex);
1338 for (i = 0; i < d->tindex; ++i)
1340 MALLOC(merged.elems, position, d->nleaves);
1342 CALLOC(d->follows, position_set, d->tindex);
1344 for (i = 0; i < d->tindex; ++i)
1346 { /* Nonsyntactic #ifdef goo... */
1348 switch (d->tokens[i])
1351 /* The empty set is nullable. */
1354 /* The firstpos and lastpos of the empty leaf are both empty. */
1355 *nfirstpos++ = *nlastpos++ = 0;
1360 /* Every element in the firstpos of the argument is in the follow
1361 of every element in the lastpos. */
1362 tmp.nelem = nfirstpos[-1];
1363 tmp.elems = firstpos;
1365 for (j = 0; j < nlastpos[-1]; ++j)
1367 merge(&tmp, &d->follows[pos[j].index], &merged);
1368 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1369 nalloc[pos[j].index], merged.nelem - 1);
1370 copy(&merged, &d->follows[pos[j].index]);
1374 /* A QMARK or STAR node is automatically nullable. */
1375 if (d->tokens[i] != PLUS)
1380 /* Every element in the firstpos of the second argument is in the
1381 follow of every element in the lastpos of the first argument. */
1382 tmp.nelem = nfirstpos[-1];
1383 tmp.elems = firstpos;
1384 pos = lastpos + nlastpos[-1];
1385 for (j = 0; j < nlastpos[-2]; ++j)
1387 merge(&tmp, &d->follows[pos[j].index], &merged);
1388 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1389 nalloc[pos[j].index], merged.nelem - 1);
1390 copy(&merged, &d->follows[pos[j].index]);
1393 /* The firstpos of a CAT node is the firstpos of the first argument,
1394 union that of the second argument if the first is nullable. */
1396 nfirstpos[-2] += nfirstpos[-1];
1398 firstpos += nfirstpos[-1];
1401 /* The lastpos of a CAT node is the lastpos of the second argument,
1402 union that of the first argument if the second is nullable. */
1404 nlastpos[-2] += nlastpos[-1];
1407 pos = lastpos + nlastpos[-2];
1408 for (j = nlastpos[-1] - 1; j >= 0; --j)
1409 pos[j] = lastpos[j];
1410 lastpos += nlastpos[-2];
1411 nlastpos[-2] = nlastpos[-1];
1415 /* A CAT node is nullable if both arguments are nullable. */
1416 nullable[-2] = nullable[-1] && nullable[-2];
1422 /* The firstpos is the union of the firstpos of each argument. */
1423 nfirstpos[-2] += nfirstpos[-1];
1426 /* The lastpos is the union of the lastpos of each argument. */
1427 nlastpos[-2] += nlastpos[-1];
1430 /* An OR node is nullable if either argument is nullable. */
1431 nullable[-2] = nullable[-1] || nullable[-2];
1436 /* Anything else is a nonempty position. (Note that special
1437 constructs like \< are treated as nonempty strings here;
1438 an "epsilon closure" effectively makes them nullable later.
1439 Backreferences have to get a real position so we can detect
1440 transitions on them later. But they are nullable. */
1441 *nullable++ = d->tokens[i] == BACKREF;
1443 /* This position is in its own firstpos and lastpos. */
1444 *nfirstpos++ = *nlastpos++ = 1;
1445 --firstpos, --lastpos;
1446 firstpos->index = lastpos->index = i;
1447 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1449 /* Allocate the follow set for this position. */
1451 MALLOC(d->follows[i].elems, position, nalloc[i]);
1455 /* ... balance the above nonsyntactic #ifdef goo... */
1456 fprintf(stderr, "node %d:", i);
1457 prtok(d->tokens[i]);
1459 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1460 fprintf(stderr, " firstpos:");
1461 for (j = nfirstpos[-1] - 1; j >= 0; --j)
1463 fprintf(stderr, " %d:", firstpos[j].index);
1464 prtok(d->tokens[firstpos[j].index]);
1466 fprintf(stderr, "\n lastpos:");
1467 for (j = nlastpos[-1] - 1; j >= 0; --j)
1469 fprintf(stderr, " %d:", lastpos[j].index);
1470 prtok(d->tokens[lastpos[j].index]);
1476 /* For each follow set that is the follow set of a real position, replace
1477 it with its epsilon closure. */
1478 for (i = 0; i < d->tindex; ++i)
1479 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1480 || d->tokens[i] >= CSET)
1483 fprintf(stderr, "follows(%d:", i);
1484 prtok(d->tokens[i]);
1485 fprintf(stderr, "):");
1486 for (j = d->follows[i].nelem - 1; j >= 0; --j)
1488 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1489 prtok(d->tokens[d->follows[i].elems[j].index]);
1493 copy(&d->follows[i], &merged);
1494 epsclosure(&merged, d);
1495 if (d->follows[i].nelem < merged.nelem)
1496 REALLOC(d->follows[i].elems, position, merged.nelem);
1497 copy(&merged, &d->follows[i]);
1500 /* Get the epsilon closure of the firstpos of the regexp. The result will
1501 be the set of positions of state 0. */
1503 for (i = 0; i < nfirstpos[-1]; ++i)
1504 insert(firstpos[i], &merged);
1505 epsclosure(&merged, d);
1507 /* Check if any of the positions of state 0 will want newline context. */
1509 for (i = 0; i < merged.nelem; ++i)
1510 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1513 /* Build the initial state. */
1516 MALLOC(d->states, dfa_state, d->salloc);
1517 state_index(d, &merged, wants_newline, 0);
1528 /* Find, for each character, the transition out of state s of d, and store
1529 it in the appropriate slot of trans.
1531 We divide the positions of s into groups (positions can appear in more
1532 than one group). Each group is labeled with a set of characters that
1533 every position in the group matches (taking into account, if necessary,
1534 preceding context information of s). For each group, find the union
1535 of the its elements' follows. This set is the set of positions of the
1536 new state. For each character in the group's label, set the transition
1537 on this character to be to a state corresponding to the set's positions,
1538 and its associated backward context information, if necessary.
1540 If we are building a searching matcher, we include the positions of state
1543 The collection of groups is constructed by building an equivalence-class
1544 partition of the positions of s.
1546 For each position, find the set of characters C that it matches. Eliminate
1547 any characters from C that fail on grounds of backward context.
1549 Search through the groups, looking for a group whose label L has nonempty
1550 intersection with C. If L - C is nonempty, create a new group labeled
1551 L - C and having the same positions as the current group, and set L to
1552 the intersection of L and C. Insert the position in this group, set
1553 C = C - L, and resume scanning.
1555 If after comparing with every group there are characters remaining in C,
1556 create a new group labeled with the characters of C and insert this
1557 position in that group. */
1559 dfastate(s, d, trans)
1564 position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1565 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1566 int ngrps = 0; /* Number of groups actually used. */
1567 position pos; /* Current position being considered. */
1568 charclass matches; /* Set of matching characters. */
1569 int matchesf; /* True if matches is nonempty. */
1570 charclass intersect; /* Intersection with some label set. */
1571 int intersectf; /* True if intersect is nonempty. */
1572 charclass leftovers; /* Stuff in the label that didn't match. */
1573 int leftoversf; /* True if leftovers is nonempty. */
1574 static charclass letters; /* Set of characters considered letters. */
1575 static charclass newline; /* Set of characters that aren't newline. */
1576 position_set follows; /* Union of the follows of some group. */
1577 position_set tmp; /* Temporary space for merging sets. */
1578 int state; /* New state. */
1579 int wants_newline; /* New state wants to know newline context. */
1580 int state_newline; /* New state on a newline transition. */
1581 int wants_letter; /* New state wants to know letter context. */
1582 int state_letter; /* New state on a letter transition. */
1583 static int initialized; /* Flag for static initialization. */
1586 /* Initialize the set of letters, if necessary. */
1590 for (i = 0; i < NOTCHAR; ++i)
1593 setbit('\n', newline);
1598 for (i = 0; i < d->states[s].elems.nelem; ++i)
1600 pos = d->states[s].elems.elems[i];
1601 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1602 setbit(d->tokens[pos.index], matches);
1603 else if (d->tokens[pos.index] >= CSET)
1604 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1608 /* Some characters may need to be eliminated from matches because
1609 they fail in the current context. */
1610 if (pos.constraint != 0xFF)
1612 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1613 d->states[s].newline, 1))
1614 clrbit('\n', matches);
1615 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1616 d->states[s].newline, 0))
1617 for (j = 0; j < CHARCLASS_INTS; ++j)
1618 matches[j] &= newline[j];
1619 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1620 d->states[s].letter, 1))
1621 for (j = 0; j < CHARCLASS_INTS; ++j)
1622 matches[j] &= ~letters[j];
1623 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1624 d->states[s].letter, 0))
1625 for (j = 0; j < CHARCLASS_INTS; ++j)
1626 matches[j] &= letters[j];
1628 /* If there are no characters left, there's no point in going on. */
1629 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
1631 if (j == CHARCLASS_INTS)
1635 for (j = 0; j < ngrps; ++j)
1637 /* If matches contains a single character only, and the current
1638 group's label doesn't contain that character, go on to the
1640 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
1641 && !tstbit(d->tokens[pos.index], labels[j]))
1644 /* Check if this group's label has a nonempty intersection with
1647 for (k = 0; k < CHARCLASS_INTS; ++k)
1648 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
1652 /* It does; now find the set differences both ways. */
1653 leftoversf = matchesf = 0;
1654 for (k = 0; k < CHARCLASS_INTS; ++k)
1656 /* Even an optimizing compiler can't know this for sure. */
1657 int match = matches[k], label = labels[j][k];
1659 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
1660 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
1663 /* If there were leftovers, create a new group labeled with them. */
1666 copyset(leftovers, labels[ngrps]);
1667 copyset(intersect, labels[j]);
1668 MALLOC(grps[ngrps].elems, position, d->nleaves);
1669 copy(&grps[j], &grps[ngrps]);
1673 /* Put the position in the current group. Note that there is no
1674 reason to call insert() here. */
1675 grps[j].elems[grps[j].nelem++] = pos;
1677 /* If every character matching the current position has been
1678 accounted for, we're done. */
1683 /* If we've passed the last group, and there are still characters
1684 unaccounted for, then we'll have to create a new group. */
1687 copyset(matches, labels[ngrps]);
1689 MALLOC(grps[ngrps].elems, position, d->nleaves);
1690 grps[ngrps].nelem = 1;
1691 grps[ngrps].elems[0] = pos;
1696 MALLOC(follows.elems, position, d->nleaves);
1697 MALLOC(tmp.elems, position, d->nleaves);
1699 /* If we are a searching matcher, the default transition is to a state
1700 containing the positions of state 0, otherwise the default transition
1701 is to fail miserably. */
1706 for (i = 0; i < d->states[0].elems.nelem; ++i)
1708 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
1710 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
1713 copy(&d->states[0].elems, &follows);
1714 state = state_index(d, &follows, 0, 0);
1716 state_newline = state_index(d, &follows, 1, 0);
1718 state_newline = state;
1720 state_letter = state_index(d, &follows, 0, 1);
1722 state_letter = state;
1723 for (i = 0; i < NOTCHAR; ++i)
1725 trans[i] = state_newline;
1726 else if (ISALNUM(i))
1727 trans[i] = state_letter;
1732 for (i = 0; i < NOTCHAR; ++i)
1735 for (i = 0; i < ngrps; ++i)
1739 /* Find the union of the follows of the positions of the group.
1740 This is a hideously inefficient loop. Fix it someday. */
1741 for (j = 0; j < grps[i].nelem; ++j)
1742 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
1743 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
1745 /* If we are building a searching matcher, throw in the positions
1746 of state 0 as well. */
1748 for (j = 0; j < d->states[0].elems.nelem; ++j)
1749 insert(d->states[0].elems.elems[j], &follows);
1751 /* Find out if the new state will want any context information. */
1753 if (tstbit('\n', labels[i]))
1754 for (j = 0; j < follows.nelem; ++j)
1755 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1759 for (j = 0; j < CHARCLASS_INTS; ++j)
1760 if (labels[i][j] & letters[j])
1762 if (j < CHARCLASS_INTS)
1763 for (j = 0; j < follows.nelem; ++j)
1764 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1767 /* Find the state(s) corresponding to the union of the follows. */
1768 state = state_index(d, &follows, 0, 0);
1770 state_newline = state_index(d, &follows, 1, 0);
1772 state_newline = state;
1774 state_letter = state_index(d, &follows, 0, 1);
1776 state_letter = state;
1778 /* Set the transitions for each character in the current label. */
1779 for (j = 0; j < CHARCLASS_INTS; ++j)
1780 for (k = 0; k < INTBITS; ++k)
1781 if (labels[i][j] & 1 << k)
1783 int c = j * INTBITS + k;
1786 trans[c] = state_newline;
1787 else if (ISALNUM(c))
1788 trans[c] = state_letter;
1789 else if (c < NOTCHAR)
1794 for (i = 0; i < ngrps; ++i)
1795 free(grps[i].elems);
1796 free(follows.elems);
1800 /* Some routines for manipulating a compiled dfa's transition tables.
1801 Each state may or may not have a transition table; if it does, and it
1802 is a non-accepting state, then d->trans[state] points to its table.
1803 If it is an accepting state then d->fails[state] points to its table.
1804 If it has no table at all, then d->trans[state] is NULL.
1805 TODO: Improve this comment, get rid of the unnecessary redundancy. */
1812 int *trans; /* The new transition table. */
1815 /* Set an upper limit on the number of transition tables that will ever
1816 exist at once. 1024 is arbitrary. The idea is that the frequently
1817 used transition tables will be quickly rebuilt, whereas the ones that
1818 were only needed once or twice will be cleared away. */
1819 if (d->trcount >= 1024)
1821 for (i = 0; i < d->tralloc; ++i)
1824 free((ptr_t) d->trans[i]);
1827 else if (d->fails[i])
1829 free((ptr_t) d->fails[i]);
1837 /* Set up the success bits for this state. */
1839 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
1842 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
1845 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
1849 MALLOC(trans, int, NOTCHAR);
1850 dfastate(s, d, trans);
1852 /* Now go through the new transition table, and make sure that the trans
1853 and fail arrays are allocated large enough to hold a pointer for the
1854 largest state mentioned in the table. */
1855 for (i = 0; i < NOTCHAR; ++i)
1856 if (trans[i] >= d->tralloc)
1858 int oldalloc = d->tralloc;
1860 while (trans[i] >= d->tralloc)
1862 REALLOC(d->realtrans, int *, d->tralloc + 1);
1863 d->trans = d->realtrans + 1;
1864 REALLOC(d->fails, int *, d->tralloc);
1865 REALLOC(d->success, int, d->tralloc);
1866 REALLOC(d->newlines, int, d->tralloc);
1867 while (oldalloc < d->tralloc)
1869 d->trans[oldalloc] = NULL;
1870 d->fails[oldalloc++] = NULL;
1874 /* Keep the newline transition in a special place so we can use it as
1876 d->newlines[s] = trans['\n'];
1879 if (ACCEPTING(s, *d))
1880 d->fails[s] = trans;
1882 d->trans[s] = trans;
1891 CALLOC(d->realtrans, int *, d->tralloc + 1);
1892 d->trans = d->realtrans + 1;
1893 CALLOC(d->fails, int *, d->tralloc);
1894 MALLOC(d->success, int, d->tralloc);
1895 MALLOC(d->newlines, int, d->tralloc);
1899 /* Search through a buffer looking for a match to the given struct dfa.
1900 Find the first occurrence of a string matching the regexp in the buffer,
1901 and the shortest possible version thereof. Return a pointer to the first
1902 character after the match, or NULL if none is found. Begin points to
1903 the beginning of the buffer, and end points to the first character after
1904 its end. We store a newline in *end to act as a sentinel, so end had
1905 better point somewhere valid. Newline is a flag indicating whether to
1906 allow newlines to be in the matching string. If count is non-
1907 NULL it points to a place we're supposed to increment every time we
1908 see a newline. Finally, if backref is non-NULL it points to a place
1909 where we're supposed to store a 1 if backreferencing happened and the
1910 match needs to be verified by a backtracking matcher. Otherwise
1911 we store a 0 in *backref. */
1913 dfaexec(d, begin, end, newline, count, backref)
1921 register int s, s1, tmp; /* Current state. */
1922 register unsigned char *p; /* Current input character. */
1923 register int **trans, *t; /* Copy of d->trans so it can be optimized
1925 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
1926 static int sbit_init;
1933 for (i = 0; i < NOTCHAR; ++i)
1936 else if (ISALNUM(i))
1943 build_state_zero(d);
1946 p = (unsigned char *) begin;
1952 /* The dreaded inner loop. */
1953 if ((t = trans[s]) != 0)
1957 if (! (t = trans[s1]))
1961 while ((t = trans[s]) != 0);
1964 tmp = s, s = s1, s1 = tmp;
1967 if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
1969 if (d->success[s] & sbit[*p])
1972 if (d->states[s].backref)
1980 s = d->fails[s][*p++];
1984 /* If the previous character was a newline, count it. */
1985 if (count && (char *) p <= end && p[-1] == '\n')
1988 /* Check if we've run off the end of the buffer. */
1989 if ((char *) p > end)
1999 if (p[-1] == '\n' && newline)
2001 s = d->newlines[s1];
2009 /* Initialize the components of a dfa that the other routines don't
2010 initialize for themselves. */
2016 MALLOC(d->charclasses, charclass, d->calloc);
2020 MALLOC(d->tokens, token, d->talloc);
2021 d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2029 /* Parse and analyze a single string of the given length. */
2031 dfacomp(s, len, d, searchflag)
2037 if (case_fold) /* dummy folding in service of dfamust() */
2042 lcopy = malloc(len);
2044 dfaerror("out of memory");
2046 /* This is a kludge. */
2048 for (i = 0; i < len; ++i)
2050 lcopy[i] = tolower(s[i]);
2055 dfaparse(lcopy, len, d);
2058 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2060 dfaparse(s, len, d);
2061 dfaanalyze(d, searchflag);
2066 dfaparse(s, len, d);
2068 dfaanalyze(d, searchflag);
2072 /* Free the storage held by the components of a dfa. */
2078 struct dfamust *dm, *ndm;
2080 free((ptr_t) d->charclasses);
2081 free((ptr_t) d->tokens);
2082 for (i = 0; i < d->sindex; ++i)
2083 free((ptr_t) d->states[i].elems.elems);
2084 free((ptr_t) d->states);
2085 for (i = 0; i < d->tindex; ++i)
2086 if (d->follows[i].elems)
2087 free((ptr_t) d->follows[i].elems);
2088 free((ptr_t) d->follows);
2089 for (i = 0; i < d->tralloc; ++i)
2091 free((ptr_t) d->trans[i]);
2092 else if (d->fails[i])
2093 free((ptr_t) d->fails[i]);
2094 if (d->realtrans) free((ptr_t) d->realtrans);
2095 if (d->fails) free((ptr_t) d->fails);
2096 if (d->newlines) free((ptr_t) d->newlines);
2097 if (d->success) free((ptr_t) d->success);
2098 for (dm = d->musts; dm; dm = ndm)
2106 /* Having found the postfix representation of the regular expression,
2107 try to find a long sequence of characters that must appear in any line
2109 Finding a "longest" sequence is beyond the scope here;
2110 we take an easy way out and hope for the best.
2111 (Take "(ab|a)b"--please.)
2113 We do a bottom-up calculation of sequences of characters that must appear
2114 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
2116 sequences that must appear at the left of the match ("left")
2117 sequences that must appear at the right of the match ("right")
2118 lists of sequences that must appear somewhere in the match ("in")
2119 sequences that must constitute the match ("is")
2121 When we get to the root of the tree, we use one of the longest of its
2122 calculated "in" sequences as our answer. The sequence we find is returned in
2123 d->must (where "d" is the single argument passed to "dfamust");
2124 the length of the sequence is returned in d->mustn.
2126 The sequences calculated for the various types of node (in pseudo ANSI c)
2127 are shown below. "p" is the operand of unary operators (and the left-hand
2128 operand of binary operators); "q" is the right-hand operand of binary
2131 "ZERO" means "a zero-length sequence" below.
2133 Type left right is in
2134 ---- ---- ----- -- --
2135 char c # c # c # c # c
2137 CSET ZERO ZERO ZERO ZERO
2139 STAR ZERO ZERO ZERO ZERO
2141 QMARK ZERO ZERO ZERO ZERO
2143 PLUS p->left p->right ZERO p->in
2145 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
2146 p->left : q->right : q->is!=ZERO) ? q->in plus
2147 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
2150 OR longest common longest common (do p->is and substrings common to
2151 leading trailing q->is have same p->in and q->in
2152 (sub)sequence (sub)sequence length and
2153 of p->left of p->right content) ?
2154 and q->left and q->right p->is : NULL
2156 If there's anything else we recognize in the tree, all four sequences get set
2157 to zero-length sequences. If there's something we don't recognize in the tree,
2158 we just return a zero-length sequence.
2160 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
2163 And. . .is it here or someplace that we might ponder "optimizations" such as
2164 egrep 'psi|epsilon' -> egrep 'psi'
2165 egrep 'pepsi|epsilon' -> egrep 'epsi'
2166 (Yes, we now find "epsi" as a "string
2167 that must occur", but we might also
2168 simplify the *entire* r.e. being sought)
2169 grep '[c]' -> grep 'c'
2170 grep '(ab|a)b' -> grep 'ab'
2171 grep 'ab*' -> grep 'a'
2172 grep 'a*b' -> grep 'b'
2174 There are several issues:
2176 Is optimization easy (enough)?
2178 Does optimization actually accomplish anything,
2179 or is the automaton you get from "psi|epsilon" (for example)
2180 the same as the one you get from "psi" (for example)?
2182 Are optimizable r.e.'s likely to be used in real-life situations
2183 (something like 'ab*' is probably unlikely; something like is
2184 'psi|epsilon' is likelier)? */
2192 size_t oldsize, newsize;
2194 newsize = (new == NULL) ? 0 : strlen(new);
2197 else if (newsize == 0)
2199 else oldsize = strlen(old);
2201 result = (char *) malloc(newsize + 1);
2203 result = (char *) realloc((void *) old, oldsize + newsize + 1);
2204 if (result != NULL && new != NULL)
2205 (void) strcpy(result + oldsize, new);
2213 return icatalloc((char *) NULL, string);
2217 istrstr(lookin, lookfor)
2224 len = strlen(lookfor);
2225 for (cp = lookin; *cp != '\0'; ++cp)
2226 if (strncmp(cp, lookfor, len) == 0)
2247 for (i = 0; cpp[i] != NULL; ++i)
2255 enlist(cpp, new, len)
2264 if ((new = icpyalloc(new)) == NULL)
2270 /* Is there already something in the list that's new (or longer)? */
2271 for (i = 0; cpp[i] != NULL; ++i)
2272 if (istrstr(cpp[i], new) != NULL)
2277 /* Eliminate any obsoleted strings. */
2279 while (cpp[j] != NULL)
2280 if (istrstr(new, cpp[j]) == NULL)
2290 /* Add the new string. */
2291 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
2299 /* Given pointers to two strings, return a pointer to an allocated
2300 list of their distinct common substrings. Return NULL if something
2303 comsubs(left, right)
2312 if (left == NULL || right == NULL)
2314 cpp = (char **) malloc(sizeof *cpp);
2318 for (lcp = left; *lcp != '\0'; ++lcp)
2321 rcp = index(right, *lcp);
2324 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
2328 rcp = index(rcp + 1, *lcp);
2332 if ((cpp = enlist(cpp, lcp, len)) == NULL)
2345 if (old == NULL || new == NULL)
2347 for (i = 0; new[i] != NULL; ++i)
2349 old = enlist(old, new[i], strlen(new[i]));
2356 /* Given two lists of substrings, return a new list giving substrings
2367 if (left == NULL || right == NULL)
2369 both = (char **) malloc(sizeof *both);
2373 for (lnum = 0; left[lnum] != NULL; ++lnum)
2375 for (rnum = 0; right[rnum] != NULL; ++rnum)
2377 temp = comsubs(left[lnum], right[rnum]);
2383 both = addlists(both, temp);
2405 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2422 static char empty_string[] = "";
2424 result = empty_string;
2426 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
2430 for (i = 0; i <= dfa->tindex; ++i)
2432 for (i = 0; i <= dfa->tindex; ++i)
2434 mp[i].in = (char **) malloc(sizeof *mp[i].in);
2435 mp[i].left = malloc(2);
2436 mp[i].right = malloc(2);
2437 mp[i].is = malloc(2);
2438 if (mp[i].in == NULL || mp[i].left == NULL ||
2439 mp[i].right == NULL || mp[i].is == NULL)
2441 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2445 fprintf(stderr, "dfamust:\n");
2446 for (i = 0; i < dfa->tindex; ++i)
2448 fprintf(stderr, " %d:", i);
2449 prtok(dfa->tokens[i]);
2453 for (ri = 0; ri < dfa->tindex; ++ri)
2455 switch (t = dfa->tokens[ri])
2459 goto done; /* "cannot happen" */
2473 goto done; /* "cannot happen" */
2480 goto done; /* "cannot happen" */
2489 /* Guaranteed to be. Unlikely, but. . . */
2490 if (strcmp(lmp->is, rmp->is) != 0)
2492 /* Left side--easy */
2494 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
2496 lmp->left[i] = '\0';
2498 ln = strlen(lmp->right);
2499 rn = strlen(rmp->right);
2503 for (i = 0; i < n; ++i)
2504 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
2506 for (j = 0; j < i; ++j)
2507 lmp->right[j] = lmp->right[(ln - i) + j];
2508 lmp->right[j] = '\0';
2509 new = inboth(lmp->in, rmp->in);
2513 free((char *) lmp->in);
2519 goto done; /* "cannot happen" */
2524 if (mp != &musts[1])
2525 goto done; /* "cannot happen" */
2526 for (i = 0; musts[0].in[i] != NULL; ++i)
2527 if (strlen(musts[0].in[i]) > strlen(result))
2528 result = musts[0].in[i];
2529 if (strcmp(result, musts[0].is) == 0)
2534 goto done; /* "cannot happen" */
2541 /* In. Everything in left, plus everything in
2542 right, plus catenation of
2543 left's right and right's left. */
2544 lmp->in = addlists(lmp->in, rmp->in);
2545 if (lmp->in == NULL)
2547 if (lmp->right[0] != '\0' &&
2548 rmp->left[0] != '\0')
2552 tp = icpyalloc(lmp->right);
2555 tp = icatalloc(tp, rmp->left);
2558 lmp->in = enlist(lmp->in, tp,
2561 if (lmp->in == NULL)
2565 if (lmp->is[0] != '\0')
2567 lmp->left = icatalloc(lmp->left,
2569 if (lmp->left == NULL)
2573 if (rmp->is[0] == '\0')
2574 lmp->right[0] = '\0';
2575 lmp->right = icatalloc(lmp->right, rmp->right);
2576 if (lmp->right == NULL)
2578 /* Guaranteed to be */
2579 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
2581 lmp->is = icatalloc(lmp->is, rmp->is);
2582 if (lmp->is == NULL)
2592 /* "cannot happen" */
2597 /* not on *my* shift */
2607 /* plain character */
2609 mp->is[0] = mp->left[0] = mp->right[0] = t;
2610 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2611 mp->in = enlist(mp->in, mp->is, (size_t)1);
2618 fprintf(stderr, " node: %d:", ri);
2619 prtok(dfa->tokens[ri]);
2620 fprintf(stderr, "\n in:");
2621 for (i = 0; mp->in[i]; ++i)
2622 fprintf(stderr, " \"%s\"", mp->in[i]);
2623 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
2624 fprintf(stderr, " left: \"%s\"\n", mp->left);
2625 fprintf(stderr, " right: \"%s\"\n", mp->right);
2632 dm = (struct dfamust *) malloc(sizeof (struct dfamust));
2634 dm->must = malloc(strlen(result) + 1);
2635 strcpy(dm->must, result);
2636 dm->next = dfa->musts;
2640 for (i = 0; i <= dfa->tindex; ++i)
2643 ifree((char *) mp[i].in);