1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
18 /* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
21 /* $FreeBSD: src/gnu/usr.bin/grep/dfa.c,v 1.12 2000/01/31 13:28:08 ru Exp $ */
31 #include <sys/types.h>
35 extern char *calloc(), *malloc(), *realloc();
39 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
47 #ifndef DEBUG /* use the same approach as regex.c */
53 #define isgraph(C) (isprint(C) && !isspace(C))
56 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
57 #define ISALPHA(C) isalpha(C)
58 #define ISUPPER(C) isupper(C)
59 #define ISLOWER(C) islower(C)
60 #define ISDIGIT(C) isdigit(C)
61 #define ISXDIGIT(C) isxdigit(C)
62 #define ISSPACE(C) isspace(C)
63 #define ISPUNCT(C) ispunct(C)
64 #define ISALNUM(C) isalnum(C)
65 #define ISPRINT(C) isprint(C)
66 #define ISGRAPH(C) isgraph(C)
67 #define ISCNTRL(C) iscntrl(C)
69 #define ISALPHA(C) (isascii(C) && isalpha(C))
70 #define ISUPPER(C) (isascii(C) && isupper(C))
71 #define ISLOWER(C) (isascii(C) && islower(C))
72 #define ISDIGIT(C) (isascii(C) && isdigit(C))
73 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
74 #define ISSPACE(C) (isascii(C) && isspace(C))
75 #define ISPUNCT(C) (isascii(C) && ispunct(C))
76 #define ISALNUM(C) (isascii(C) && isalnum(C))
77 #define ISPRINT(C) (isascii(C) && isprint(C))
78 #define ISGRAPH(C) (isascii(C) && isgraph(C))
79 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
82 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
83 - Its arg may be any int or unsigned int; it need not be an unsigned char.
84 - It's guaranteed to evaluate its argument exactly once.
85 - It's typically faster.
86 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
87 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
88 it's important to use the locale's definition of `digit' even when the
89 host does not conform to Posix. */
90 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
92 /* If we (don't) have I18N. */
95 # ifdef HAVE_LIBINTL_H
98 # define _(Str) gettext (Str)
101 # define _(Str) (Str)
106 #include <gnuregex.h>
112 /* HPUX, define those as macros in sys/param.h */
120 static void dfamust PARAMS ((struct dfa *dfa));
122 static ptr_t xcalloc PARAMS ((size_t n, size_t s));
123 static ptr_t xmalloc PARAMS ((size_t n));
124 static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
126 static void prtok PARAMS ((token t));
128 static int tstbit PARAMS ((int b, charclass c));
129 static void setbit PARAMS ((int b, charclass c));
130 static void clrbit PARAMS ((int b, charclass c));
131 static void copyset PARAMS ((charclass src, charclass dst));
132 static void zeroset PARAMS ((charclass s));
133 static void notset PARAMS ((charclass s));
134 static int equal PARAMS ((charclass s1, charclass s2));
135 static int charclass_index PARAMS ((charclass s));
136 static int looking_at PARAMS ((const char *s));
137 static token lex PARAMS ((void));
138 static void addtok PARAMS ((token t));
139 static void atom PARAMS ((void));
140 static int nsubtoks PARAMS ((int tindex));
141 static void copytoks PARAMS ((int tindex, int ntokens));
142 static void closure PARAMS ((void));
143 static void branch PARAMS ((void));
144 static void regexp PARAMS ((int toplevel));
145 static void copy PARAMS ((position_set *src, position_set *dst));
146 static void insert PARAMS ((position p, position_set *s));
147 static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m));
148 static void delete PARAMS ((position p, position_set *s));
149 static int state_index PARAMS ((struct dfa *d, position_set *s,
150 int newline, int letter));
151 static void build_state PARAMS ((int s, struct dfa *d));
152 static void build_state_zero PARAMS ((struct dfa *d));
153 static char *icatalloc PARAMS ((char *old, char *new));
154 static char *icpyalloc PARAMS ((char *string));
155 static char *istrstr PARAMS ((char *lookin, char *lookfor));
156 static void ifree PARAMS ((char *cp));
157 static void freelist PARAMS ((char **cpp));
158 static char **enlist PARAMS ((char **cpp, char *new, size_t len));
159 static char **comsubs PARAMS ((char *left, char *right));
160 static char **addlists PARAMS ((char **old, char **new));
161 static char **inboth PARAMS ((char **left, char **right));
164 xcalloc (size_t n, size_t s)
166 ptr_t r = calloc(n, s);
169 dfaerror(_("Memory exhausted"));
180 dfaerror(_("Memory exhausted"));
185 xrealloc (ptr_t p, size_t n)
187 ptr_t r = realloc(p, n);
191 dfaerror(_("Memory exhausted"));
195 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
196 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
197 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
199 /* Reallocate an array of type t if nalloc is too small for index. */
200 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
201 if ((index) >= (nalloc)) \
203 while ((index) >= (nalloc)) \
205 REALLOC(p, t, nalloc); \
216 fprintf(stderr, "END");
217 else if (t < NOTCHAR)
218 fprintf(stderr, "%c", t);
223 case EMPTY: s = "EMPTY"; break;
224 case BACKREF: s = "BACKREF"; break;
225 case BEGLINE: s = "BEGLINE"; break;
226 case ENDLINE: s = "ENDLINE"; break;
227 case BEGWORD: s = "BEGWORD"; break;
228 case ENDWORD: s = "ENDWORD"; break;
229 case LIMWORD: s = "LIMWORD"; break;
230 case NOTLIMWORD: s = "NOTLIMWORD"; break;
231 case QMARK: s = "QMARK"; break;
232 case STAR: s = "STAR"; break;
233 case PLUS: s = "PLUS"; break;
234 case CAT: s = "CAT"; break;
235 case OR: s = "OR"; break;
236 case ORTOP: s = "ORTOP"; break;
237 case LPAREN: s = "LPAREN"; break;
238 case RPAREN: s = "RPAREN"; break;
239 default: s = "CSET"; break;
241 fprintf(stderr, "%s", s);
246 /* Stuff pertaining to charclasses. */
249 tstbit (int b, charclass c)
251 return c[b / INTBITS] & 1 << b % INTBITS;
255 setbit (int b, charclass c)
257 c[b / INTBITS] |= 1 << b % INTBITS;
261 clrbit (int b, charclass c)
263 c[b / INTBITS] &= ~(1 << b % INTBITS);
267 copyset (charclass src, charclass dst)
271 for (i = 0; i < CHARCLASS_INTS; ++i)
276 zeroset (charclass s)
280 for (i = 0; i < CHARCLASS_INTS; ++i)
289 for (i = 0; i < CHARCLASS_INTS; ++i)
294 equal (charclass s1, charclass s2)
298 for (i = 0; i < CHARCLASS_INTS; ++i)
304 /* A pointer to the current dfa is kept here during parsing. */
305 static struct dfa *dfa;
307 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
309 charclass_index (charclass s)
313 for (i = 0; i < dfa->cindex; ++i)
314 if (equal(s, dfa->charclasses[i]))
316 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
318 copyset(s, dfa->charclasses[i]);
322 /* Syntax bits controlling the behavior of the lexical analyzer. */
323 static reg_syntax_t syntax_bits, syntax_bits_set;
325 /* Flag for case-folding letters into sets. */
326 static int case_fold;
328 /* End-of-line byte in data. */
329 static unsigned char eolbyte;
331 /* Entry point to set syntax options. */
333 dfasyntax (reg_syntax_t bits, int fold, int eol)
341 /* Lexical analyzer. All the dross that deals with the obnoxious
342 GNU Regex syntax bits is located here. The poor, suffering
343 reader is referred to the GNU Regex documentation for the
344 meaning of the @#%!@#%^!@ syntax bits. */
346 static char *lexstart; /* Pointer to beginning of input string. */
347 static char *lexptr; /* Pointer to next input character. */
348 static int lexleft; /* Number of characters remaining. */
349 static token lasttok; /* Previous token returned; initially END. */
350 static int laststart; /* True if we're separated from beginning or (, |
351 only by zero-width characters. */
352 static int parens; /* Count of outstanding left parens. */
353 static int minrep, maxrep; /* Repeat counts for {m,n}. */
355 /* Note that characters become unsigned here. */
356 #define FETCH(c, eoferr) \
363 return lasttok = END; \
365 (c) = (unsigned char) *lexptr++; \
370 #define FUNC(F, P) static int F(int c) { return P(c); }
372 #define FUNC(F, P) static int F(c) int c; { return P(c); }
375 FUNC(is_alpha, ISALPHA)
376 FUNC(is_upper, ISUPPER)
377 FUNC(is_lower, ISLOWER)
378 FUNC(is_digit, ISDIGIT)
379 FUNC(is_xdigit, ISXDIGIT)
380 FUNC(is_space, ISSPACE)
381 FUNC(is_punct, ISPUNCT)
382 FUNC(is_alnum, ISALNUM)
383 FUNC(is_print, ISPRINT)
384 FUNC(is_graph, ISGRAPH)
385 FUNC(is_cntrl, ISCNTRL)
390 return (c == ' ' || c == '\t');
393 /* The following list maps the names of the Posix named character classes
394 to predicate functions that determine whether a given character is in
395 the class. The leading [ has already been eaten by the lexical analyzer. */
398 int (*pred) PARAMS ((int));
400 { ":alpha:]", is_alpha },
401 { ":upper:]", is_upper },
402 { ":lower:]", is_lower },
403 { ":digit:]", is_digit },
404 { ":xdigit:]", is_xdigit },
405 { ":space:]", is_space },
406 { ":punct:]", is_punct },
407 { ":alnum:]", is_alnum },
408 { ":print:]", is_print },
409 { ":graph:]", is_graph },
410 { ":cntrl:]", is_cntrl },
411 { ":blank:]", is_blank },
415 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
416 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
419 looking_at (char const *s)
426 return strncmp(s, lexptr, len) == 0;
433 int backslash = 0, invert;
439 /* Basic plan: We fetch a character. If it's a backslash,
440 we set the backslash flag and go through the loop again.
441 On the plus side, this avoids having a duplicate of the
442 main switch inside the backslash case. On the minus side,
443 it means that just about every case begins with
444 "if (backslash) ...". */
445 for (i = 0; i < 2; ++i)
454 dfaerror(_("Unfinished \\ escape"));
461 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
465 return lasttok = BEGLINE;
471 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
473 || (syntax_bits & RE_NO_BK_PARENS
474 ? lexleft > 0 && *lexptr == ')'
475 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
476 || (syntax_bits & RE_NO_BK_VBAR
477 ? lexleft > 0 && *lexptr == '|'
478 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
479 || ((syntax_bits & RE_NEWLINE_ALT)
480 && lexleft > 0 && *lexptr == '\n'))
481 return lasttok = ENDLINE;
493 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
496 return lasttok = BACKREF;
501 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
502 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
506 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
507 return lasttok = ENDLINE; /* FIXME: should be end of string */
511 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
512 return lasttok = BEGWORD;
516 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
517 return lasttok = ENDWORD;
521 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
522 return lasttok = LIMWORD;
526 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
527 return lasttok = NOTLIMWORD;
531 if (syntax_bits & RE_LIMITED_OPS)
533 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
535 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
537 return lasttok = QMARK;
542 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
544 return lasttok = STAR;
547 if (syntax_bits & RE_LIMITED_OPS)
549 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
551 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
553 return lasttok = PLUS;
556 if (!(syntax_bits & RE_INTERVALS))
558 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
560 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
563 if (syntax_bits & RE_NO_BK_BRACES)
565 /* Scan ahead for a valid interval; if it's not valid,
566 treat it as a literal '{'. */
567 int lo = -1, hi = -1;
568 char const *p = lexptr;
569 char const *lim = p + lexleft;
570 for (; p != lim && ISASCIIDIGIT (*p); p++)
571 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
572 if (p != lim && *p == ',')
573 while (++p != lim && ISASCIIDIGIT (*p))
574 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
577 if (p == lim || *p != '}'
578 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
585 {M,} - minimum count, maximum is infinity
586 {M,N} - M through N */
587 FETCH(c, _("unfinished repeat count"));
588 if (ISASCIIDIGIT (c))
593 FETCH(c, _("unfinished repeat count"));
594 if (! ISASCIIDIGIT (c))
596 minrep = 10 * minrep + c - '0';
600 dfaerror(_("malformed repeat count"));
603 FETCH (c, _("unfinished repeat count"));
604 if (! ISASCIIDIGIT (c))
611 FETCH (c, _("unfinished repeat count"));
612 if (! ISASCIIDIGIT (c))
614 maxrep = 10 * maxrep + c - '0';
616 if (0 <= maxrep && maxrep < minrep)
617 dfaerror (_("malformed repeat count"));
622 if (!(syntax_bits & RE_NO_BK_BRACES))
625 dfaerror(_("malformed repeat count"));
626 FETCH(c, _("unfinished repeat count"));
629 dfaerror(_("malformed repeat count"));
631 return lasttok = REPMN;
634 if (syntax_bits & RE_LIMITED_OPS)
636 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
642 if (syntax_bits & RE_LIMITED_OPS
644 || !(syntax_bits & RE_NEWLINE_ALT))
650 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
654 return lasttok = LPAREN;
657 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
659 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
663 return lasttok = RPAREN;
670 if (!(syntax_bits & RE_DOT_NEWLINE))
671 clrbit(eolbyte, ccl);
672 if (syntax_bits & RE_DOT_NOT_NULL)
675 return lasttok = CSET + charclass_index(ccl);
679 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
682 for (c2 = 0; c2 < NOTCHAR; ++c2)
683 if (IS_WORD_CONSTITUENT(c2))
688 return lasttok = CSET + charclass_index(ccl);
694 FETCH(c, _("Unbalanced ["));
697 FETCH(c, _("Unbalanced ["));
704 /* Nobody ever said this had to be fast. :-)
705 Note that if we're looking at some other [:...:]
706 construct, we just treat it as a bunch of ordinary
707 characters. We can do this because we assume
708 regex has checked for syntax errors before
709 dfa is ever called. */
710 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
711 for (c1 = 0; prednames[c1].name; ++c1)
712 if (looking_at(prednames[c1].name))
714 int (*pred)() = prednames[c1].pred;
716 && (pred == is_upper || pred == is_lower))
719 for (c2 = 0; c2 < NOTCHAR; ++c2)
722 lexptr += strlen(prednames[c1].name);
723 lexleft -= strlen(prednames[c1].name);
724 FETCH(c1, _("Unbalanced ["));
727 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
728 FETCH(c, _("Unbalanced ["));
729 FETCH(c1, _("Unbalanced ["));
732 FETCH(c2, _("Unbalanced ["));
735 /* In the case [x-], the - is an ordinary hyphen,
736 which is left in c1, the lookahead character. */
744 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
745 FETCH(c2, _("Unbalanced ["));
746 FETCH(c1, _("Unbalanced ["));
752 lo[0] = c; lo[1] = '\0';
753 hi[0] = c2; hi[1] = '\0';
754 for (c = 0; c < NOTCHAR; c++)
757 ch[0] = c; ch[1] = '\0';
758 if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0)
764 setbit (tolower (c), ccl);
765 else if (ISLOWER (c))
766 setbit (toupper (c), ccl);
774 while ((c = c1) != ']');
778 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
779 clrbit(eolbyte, ccl);
782 return lasttok = CSET + charclass_index(ccl);
787 if (case_fold && ISALPHA(c))
792 setbit(tolower(c), ccl);
794 setbit(toupper(c), ccl);
795 return lasttok = CSET + charclass_index(ccl);
801 /* The above loop should consume at most a backslash
802 and some other character. */
804 return END; /* keeps pedantic compilers happy. */
807 /* Recursive descent parser for regular expressions. */
809 static token tok; /* Lookahead token. */
810 static int depth; /* Current depth of a hypothetical stack
811 holding deferred productions. This is
812 used to determine the depth that will be
813 required of the real stack later on in
816 /* Add the given token to the parse tree, maintaining the depth count and
817 updating the maximum depth if necessary. */
821 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
822 dfa->tokens[dfa->tindex++] = t;
843 if (depth > dfa->depth)
847 /* The grammar understood by the parser is as follows.
875 The parser builds a parse tree in postfix form in an array of tokens. */
880 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
881 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
882 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
887 else if (tok == LPAREN)
892 dfaerror(_("Unbalanced ("));
899 /* Return the number of tokens in the given subexpression. */
901 nsubtoks (int tindex)
905 switch (dfa->tokens[tindex - 1])
912 return 1 + nsubtoks(tindex - 1);
916 ntoks1 = nsubtoks(tindex - 1);
917 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
921 /* Copy the given subexpression to the top of the tree. */
923 copytoks (int tindex, int ntokens)
927 for (i = 0; i < ntokens; ++i)
928 addtok(dfa->tokens[tindex + i]);
934 int tindex, ntokens, i;
937 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
940 ntokens = nsubtoks(dfa->tindex);
941 tindex = dfa->tindex - ntokens;
946 for (i = 1; i < minrep; ++i)
948 copytoks(tindex, ntokens);
951 for (; i < maxrep; ++i)
953 copytoks(tindex, ntokens);
970 while (tok != RPAREN && tok != OR && tok >= 0)
978 regexp (int toplevel)
992 /* Main entry point for the parser. S is a string to be parsed, len is the
993 length of the string, so s can include NUL characters. D is a pointer to
994 the struct dfa to parse into. */
996 dfaparse (char *s, size_t len, struct dfa *d)
999 lexstart = lexptr = s;
1005 if (! syntax_bits_set)
1006 dfaerror(_("No syntax specified"));
1014 dfaerror(_("Unbalanced )"));
1016 addtok(END - d->nregexps);
1025 /* Some primitives for operating on sets of positions. */
1027 /* Copy one set to another; the destination must be large enough. */
1029 copy (position_set *src, position_set *dst)
1033 for (i = 0; i < src->nelem; ++i)
1034 dst->elems[i] = src->elems[i];
1035 dst->nelem = src->nelem;
1038 /* Insert a position in a set. Position sets are maintained in sorted
1039 order according to index. If position already exists in the set with
1040 the same index then their constraints are logically or'd together.
1041 S->elems must point to an array large enough to hold the resulting set. */
1043 insert (position p, position_set *s)
1048 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1050 if (i < s->nelem && p.index == s->elems[i].index)
1051 s->elems[i].constraint |= p.constraint;
1056 while (i < s->nelem)
1065 /* Merge two sets of positions into a third. The result is exactly as if
1066 the positions of both sets were inserted into an initially empty set. */
1068 merge (position_set *s1, position_set *s2, position_set *m)
1073 while (i < s1->nelem && j < s2->nelem)
1074 if (s1->elems[i].index > s2->elems[j].index)
1075 m->elems[m->nelem++] = s1->elems[i++];
1076 else if (s1->elems[i].index < s2->elems[j].index)
1077 m->elems[m->nelem++] = s2->elems[j++];
1080 m->elems[m->nelem] = s1->elems[i++];
1081 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1083 while (i < s1->nelem)
1084 m->elems[m->nelem++] = s1->elems[i++];
1085 while (j < s2->nelem)
1086 m->elems[m->nelem++] = s2->elems[j++];
1089 /* Delete a position from a set. */
1091 delete (position p, position_set *s)
1095 for (i = 0; i < s->nelem; ++i)
1096 if (p.index == s->elems[i].index)
1099 for (--s->nelem; i < s->nelem; ++i)
1100 s->elems[i] = s->elems[i + 1];
1103 /* Find the index of the state corresponding to the given position set with
1104 the given preceding context, or create a new state if there is no such
1105 state. Newline and letter tell whether we got here on a newline or
1106 letter, respectively. */
1108 state_index (struct dfa *d, position_set *s, int newline, int letter)
1114 newline = newline ? 1 : 0;
1115 letter = letter ? 1 : 0;
1117 for (i = 0; i < s->nelem; ++i)
1118 hash ^= s->elems[i].index + s->elems[i].constraint;
1120 /* Try to find a state that exactly matches the proposed one. */
1121 for (i = 0; i < d->sindex; ++i)
1123 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1124 || newline != d->states[i].newline || letter != d->states[i].letter)
1126 for (j = 0; j < s->nelem; ++j)
1127 if (s->elems[j].constraint
1128 != d->states[i].elems.elems[j].constraint
1129 || s->elems[j].index != d->states[i].elems.elems[j].index)
1135 /* We'll have to create a new state. */
1136 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1137 d->states[i].hash = hash;
1138 MALLOC(d->states[i].elems.elems, position, s->nelem);
1139 copy(s, &d->states[i].elems);
1140 d->states[i].newline = newline;
1141 d->states[i].letter = letter;
1142 d->states[i].backref = 0;
1143 d->states[i].constraint = 0;
1144 d->states[i].first_end = 0;
1145 for (j = 0; j < s->nelem; ++j)
1146 if (d->tokens[s->elems[j].index] < 0)
1148 constraint = s->elems[j].constraint;
1149 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1150 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1151 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1152 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1153 d->states[i].constraint |= constraint;
1154 if (! d->states[i].first_end)
1155 d->states[i].first_end = d->tokens[s->elems[j].index];
1157 else if (d->tokens[s->elems[j].index] == BACKREF)
1159 d->states[i].constraint = NO_CONSTRAINT;
1160 d->states[i].backref = 1;
1168 /* Find the epsilon closure of a set of positions. If any position of the set
1169 contains a symbol that matches the empty string in some context, replace
1170 that position with the elements of its follow labeled with an appropriate
1171 constraint. Repeat exhaustively until no funny positions are left.
1172 S->elems must be large enough to hold the result. */
1174 epsclosure (position_set *s, struct dfa *d)
1180 MALLOC(visited, int, d->tindex);
1181 for (i = 0; i < d->tindex; ++i)
1184 for (i = 0; i < s->nelem; ++i)
1185 if (d->tokens[s->elems[i].index] >= NOTCHAR
1186 && d->tokens[s->elems[i].index] != BACKREF
1187 && d->tokens[s->elems[i].index] < CSET)
1190 p.constraint = old.constraint;
1191 delete(s->elems[i], s);
1192 if (visited[old.index])
1197 visited[old.index] = 1;
1198 switch (d->tokens[old.index])
1201 p.constraint &= BEGLINE_CONSTRAINT;
1204 p.constraint &= ENDLINE_CONSTRAINT;
1207 p.constraint &= BEGWORD_CONSTRAINT;
1210 p.constraint &= ENDWORD_CONSTRAINT;
1213 p.constraint &= LIMWORD_CONSTRAINT;
1216 p.constraint &= NOTLIMWORD_CONSTRAINT;
1221 for (j = 0; j < d->follows[old.index].nelem; ++j)
1223 p.index = d->follows[old.index].elems[j].index;
1226 /* Force rescan to start at the beginning. */
1233 /* Perform bottom-up analysis on the parse tree, computing various functions.
1234 Note that at this point, we're pretending constructs like \< are real
1235 characters rather than constraints on what can follow them.
1237 Nullable: A node is nullable if it is at the root of a regexp that can
1238 match the empty string.
1239 * EMPTY leaves are nullable.
1240 * No other leaf is nullable.
1241 * A QMARK or STAR node is nullable.
1242 * A PLUS node is nullable if its argument is nullable.
1243 * A CAT node is nullable if both its arguments are nullable.
1244 * An OR node is nullable if either argument is nullable.
1246 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1247 that could correspond to the first character of a string matching the
1248 regexp rooted at the given node.
1249 * EMPTY leaves have empty firstpos.
1250 * The firstpos of a nonempty leaf is that leaf itself.
1251 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1253 * The firstpos of a CAT node is the firstpos of the left argument, union
1254 the firstpos of the right if the left argument is nullable.
1255 * The firstpos of an OR node is the union of firstpos of each argument.
1257 Lastpos: The lastpos of a node is the set of positions that could
1258 correspond to the last character of a string matching the regexp at
1260 * EMPTY leaves have empty lastpos.
1261 * The lastpos of a nonempty leaf is that leaf itself.
1262 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1264 * The lastpos of a CAT node is the lastpos of its right argument, union
1265 the lastpos of the left if the right argument is nullable.
1266 * The lastpos of an OR node is the union of the lastpos of each argument.
1268 Follow: The follow of a position is the set of positions that could
1269 correspond to the character following a character matching the node in
1270 a string matching the regexp. At this point we consider special symbols
1271 that match the empty string in some context to be just normal characters.
1272 Later, if we find that a special symbol is in a follow set, we will
1273 replace it with the elements of its follow, labeled with an appropriate
1275 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1276 the follow of every node in the lastpos.
1277 * Every node in the firstpos of the second argument of a CAT node is in
1278 the follow of every node in the lastpos of the first argument.
1280 Because of the postfix representation of the parse tree, the depth-first
1281 analysis is conveniently done by a linear scan with the aid of a stack.
1282 Sets are stored as arrays of the elements, obeying a stack-like allocation
1283 scheme; the number of elements in each set deeper in the stack can be
1284 used to determine the address of a particular set's array. */
1286 dfaanalyze (struct dfa *d, int searchflag)
1288 int *nullable; /* Nullable stack. */
1289 int *nfirstpos; /* Element count stack for firstpos sets. */
1290 position *firstpos; /* Array where firstpos elements are stored. */
1291 int *nlastpos; /* Element count stack for lastpos sets. */
1292 position *lastpos; /* Array where lastpos elements are stored. */
1293 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1294 position_set tmp; /* Temporary set for merging sets. */
1295 position_set merged; /* Result of merging sets. */
1296 int wants_newline; /* True if some position wants newline info. */
1298 int *o_nfirst, *o_nlast;
1299 position *o_firstpos, *o_lastpos;
1304 fprintf(stderr, "dfaanalyze:\n");
1305 for (i = 0; i < d->tindex; ++i)
1307 fprintf(stderr, " %d:", i);
1308 prtok(d->tokens[i]);
1313 d->searchflag = searchflag;
1315 MALLOC(nullable, int, d->depth);
1316 o_nullable = nullable;
1317 MALLOC(nfirstpos, int, d->depth);
1318 o_nfirst = nfirstpos;
1319 MALLOC(firstpos, position, d->nleaves);
1320 o_firstpos = firstpos, firstpos += d->nleaves;
1321 MALLOC(nlastpos, int, d->depth);
1323 MALLOC(lastpos, position, d->nleaves);
1324 o_lastpos = lastpos, lastpos += d->nleaves;
1325 MALLOC(nalloc, int, d->tindex);
1326 for (i = 0; i < d->tindex; ++i)
1328 MALLOC(merged.elems, position, d->nleaves);
1330 CALLOC(d->follows, position_set, d->tindex);
1332 for (i = 0; i < d->tindex; ++i)
1334 { /* Nonsyntactic #ifdef goo... */
1336 switch (d->tokens[i])
1339 /* The empty set is nullable. */
1342 /* The firstpos and lastpos of the empty leaf are both empty. */
1343 *nfirstpos++ = *nlastpos++ = 0;
1348 /* Every element in the firstpos of the argument is in the follow
1349 of every element in the lastpos. */
1350 tmp.nelem = nfirstpos[-1];
1351 tmp.elems = firstpos;
1353 for (j = 0; j < nlastpos[-1]; ++j)
1355 merge(&tmp, &d->follows[pos[j].index], &merged);
1356 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1357 nalloc[pos[j].index], merged.nelem - 1);
1358 copy(&merged, &d->follows[pos[j].index]);
1362 /* A QMARK or STAR node is automatically nullable. */
1363 if (d->tokens[i] != PLUS)
1368 /* Every element in the firstpos of the second argument is in the
1369 follow of every element in the lastpos of the first argument. */
1370 tmp.nelem = nfirstpos[-1];
1371 tmp.elems = firstpos;
1372 pos = lastpos + nlastpos[-1];
1373 for (j = 0; j < nlastpos[-2]; ++j)
1375 merge(&tmp, &d->follows[pos[j].index], &merged);
1376 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1377 nalloc[pos[j].index], merged.nelem - 1);
1378 copy(&merged, &d->follows[pos[j].index]);
1381 /* The firstpos of a CAT node is the firstpos of the first argument,
1382 union that of the second argument if the first is nullable. */
1384 nfirstpos[-2] += nfirstpos[-1];
1386 firstpos += nfirstpos[-1];
1389 /* The lastpos of a CAT node is the lastpos of the second argument,
1390 union that of the first argument if the second is nullable. */
1392 nlastpos[-2] += nlastpos[-1];
1395 pos = lastpos + nlastpos[-2];
1396 for (j = nlastpos[-1] - 1; j >= 0; --j)
1397 pos[j] = lastpos[j];
1398 lastpos += nlastpos[-2];
1399 nlastpos[-2] = nlastpos[-1];
1403 /* A CAT node is nullable if both arguments are nullable. */
1404 nullable[-2] = nullable[-1] && nullable[-2];
1410 /* The firstpos is the union of the firstpos of each argument. */
1411 nfirstpos[-2] += nfirstpos[-1];
1414 /* The lastpos is the union of the lastpos of each argument. */
1415 nlastpos[-2] += nlastpos[-1];
1418 /* An OR node is nullable if either argument is nullable. */
1419 nullable[-2] = nullable[-1] || nullable[-2];
1424 /* Anything else is a nonempty position. (Note that special
1425 constructs like \< are treated as nonempty strings here;
1426 an "epsilon closure" effectively makes them nullable later.
1427 Backreferences have to get a real position so we can detect
1428 transitions on them later. But they are nullable. */
1429 *nullable++ = d->tokens[i] == BACKREF;
1431 /* This position is in its own firstpos and lastpos. */
1432 *nfirstpos++ = *nlastpos++ = 1;
1433 --firstpos, --lastpos;
1434 firstpos->index = lastpos->index = i;
1435 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1437 /* Allocate the follow set for this position. */
1439 MALLOC(d->follows[i].elems, position, nalloc[i]);
1443 /* ... balance the above nonsyntactic #ifdef goo... */
1444 fprintf(stderr, "node %d:", i);
1445 prtok(d->tokens[i]);
1447 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1448 fprintf(stderr, " firstpos:");
1449 for (j = nfirstpos[-1] - 1; j >= 0; --j)
1451 fprintf(stderr, " %d:", firstpos[j].index);
1452 prtok(d->tokens[firstpos[j].index]);
1454 fprintf(stderr, "\n lastpos:");
1455 for (j = nlastpos[-1] - 1; j >= 0; --j)
1457 fprintf(stderr, " %d:", lastpos[j].index);
1458 prtok(d->tokens[lastpos[j].index]);
1464 /* For each follow set that is the follow set of a real position, replace
1465 it with its epsilon closure. */
1466 for (i = 0; i < d->tindex; ++i)
1467 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1468 || d->tokens[i] >= CSET)
1471 fprintf(stderr, "follows(%d:", i);
1472 prtok(d->tokens[i]);
1473 fprintf(stderr, "):");
1474 for (j = d->follows[i].nelem - 1; j >= 0; --j)
1476 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1477 prtok(d->tokens[d->follows[i].elems[j].index]);
1481 copy(&d->follows[i], &merged);
1482 epsclosure(&merged, d);
1483 if (d->follows[i].nelem < merged.nelem)
1484 REALLOC(d->follows[i].elems, position, merged.nelem);
1485 copy(&merged, &d->follows[i]);
1488 /* Get the epsilon closure of the firstpos of the regexp. The result will
1489 be the set of positions of state 0. */
1491 for (i = 0; i < nfirstpos[-1]; ++i)
1492 insert(firstpos[i], &merged);
1493 epsclosure(&merged, d);
1495 /* Check if any of the positions of state 0 will want newline context. */
1497 for (i = 0; i < merged.nelem; ++i)
1498 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1501 /* Build the initial state. */
1504 MALLOC(d->states, dfa_state, d->salloc);
1505 state_index(d, &merged, wants_newline, 0);
1516 /* Find, for each character, the transition out of state s of d, and store
1517 it in the appropriate slot of trans.
1519 We divide the positions of s into groups (positions can appear in more
1520 than one group). Each group is labeled with a set of characters that
1521 every position in the group matches (taking into account, if necessary,
1522 preceding context information of s). For each group, find the union
1523 of the its elements' follows. This set is the set of positions of the
1524 new state. For each character in the group's label, set the transition
1525 on this character to be to a state corresponding to the set's positions,
1526 and its associated backward context information, if necessary.
1528 If we are building a searching matcher, we include the positions of state
1531 The collection of groups is constructed by building an equivalence-class
1532 partition of the positions of s.
1534 For each position, find the set of characters C that it matches. Eliminate
1535 any characters from C that fail on grounds of backward context.
1537 Search through the groups, looking for a group whose label L has nonempty
1538 intersection with C. If L - C is nonempty, create a new group labeled
1539 L - C and having the same positions as the current group, and set L to
1540 the intersection of L and C. Insert the position in this group, set
1541 C = C - L, and resume scanning.
1543 If after comparing with every group there are characters remaining in C,
1544 create a new group labeled with the characters of C and insert this
1545 position in that group. */
1547 dfastate (int s, struct dfa *d, int trans[])
1549 position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1550 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1551 int ngrps = 0; /* Number of groups actually used. */
1552 position pos; /* Current position being considered. */
1553 charclass matches; /* Set of matching characters. */
1554 int matchesf; /* True if matches is nonempty. */
1555 charclass intersect; /* Intersection with some label set. */
1556 int intersectf; /* True if intersect is nonempty. */
1557 charclass leftovers; /* Stuff in the label that didn't match. */
1558 int leftoversf; /* True if leftovers is nonempty. */
1559 static charclass letters; /* Set of characters considered letters. */
1560 static charclass newline; /* Set of characters that aren't newline. */
1561 position_set follows; /* Union of the follows of some group. */
1562 position_set tmp; /* Temporary space for merging sets. */
1563 int state; /* New state. */
1564 int wants_newline; /* New state wants to know newline context. */
1565 int state_newline; /* New state on a newline transition. */
1566 int wants_letter; /* New state wants to know letter context. */
1567 int state_letter; /* New state on a letter transition. */
1568 static int initialized; /* Flag for static initialization. */
1571 /* Initialize the set of letters, if necessary. */
1575 for (i = 0; i < NOTCHAR; ++i)
1576 if (IS_WORD_CONSTITUENT(i))
1578 setbit(eolbyte, newline);
1583 for (i = 0; i < d->states[s].elems.nelem; ++i)
1585 pos = d->states[s].elems.elems[i];
1586 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1587 setbit(d->tokens[pos.index], matches);
1588 else if (d->tokens[pos.index] >= CSET)
1589 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1593 /* Some characters may need to be eliminated from matches because
1594 they fail in the current context. */
1595 if (pos.constraint != 0xFF)
1597 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1598 d->states[s].newline, 1))
1599 clrbit(eolbyte, matches);
1600 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1601 d->states[s].newline, 0))
1602 for (j = 0; j < CHARCLASS_INTS; ++j)
1603 matches[j] &= newline[j];
1604 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1605 d->states[s].letter, 1))
1606 for (j = 0; j < CHARCLASS_INTS; ++j)
1607 matches[j] &= ~letters[j];
1608 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1609 d->states[s].letter, 0))
1610 for (j = 0; j < CHARCLASS_INTS; ++j)
1611 matches[j] &= letters[j];
1613 /* If there are no characters left, there's no point in going on. */
1614 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
1616 if (j == CHARCLASS_INTS)
1620 for (j = 0; j < ngrps; ++j)
1622 /* If matches contains a single character only, and the current
1623 group's label doesn't contain that character, go on to the
1625 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
1626 && !tstbit(d->tokens[pos.index], labels[j]))
1629 /* Check if this group's label has a nonempty intersection with
1632 for (k = 0; k < CHARCLASS_INTS; ++k)
1633 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
1637 /* It does; now find the set differences both ways. */
1638 leftoversf = matchesf = 0;
1639 for (k = 0; k < CHARCLASS_INTS; ++k)
1641 /* Even an optimizing compiler can't know this for sure. */
1642 int match = matches[k], label = labels[j][k];
1644 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
1645 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
1648 /* If there were leftovers, create a new group labeled with them. */
1651 copyset(leftovers, labels[ngrps]);
1652 copyset(intersect, labels[j]);
1653 MALLOC(grps[ngrps].elems, position, d->nleaves);
1654 copy(&grps[j], &grps[ngrps]);
1658 /* Put the position in the current group. Note that there is no
1659 reason to call insert() here. */
1660 grps[j].elems[grps[j].nelem++] = pos;
1662 /* If every character matching the current position has been
1663 accounted for, we're done. */
1668 /* If we've passed the last group, and there are still characters
1669 unaccounted for, then we'll have to create a new group. */
1672 copyset(matches, labels[ngrps]);
1674 MALLOC(grps[ngrps].elems, position, d->nleaves);
1675 grps[ngrps].nelem = 1;
1676 grps[ngrps].elems[0] = pos;
1681 MALLOC(follows.elems, position, d->nleaves);
1682 MALLOC(tmp.elems, position, d->nleaves);
1684 /* If we are a searching matcher, the default transition is to a state
1685 containing the positions of state 0, otherwise the default transition
1686 is to fail miserably. */
1691 for (i = 0; i < d->states[0].elems.nelem; ++i)
1693 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
1695 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
1698 copy(&d->states[0].elems, &follows);
1699 state = state_index(d, &follows, 0, 0);
1701 state_newline = state_index(d, &follows, 1, 0);
1703 state_newline = state;
1705 state_letter = state_index(d, &follows, 0, 1);
1707 state_letter = state;
1708 for (i = 0; i < NOTCHAR; ++i)
1709 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
1710 trans[eolbyte] = state_newline;
1713 for (i = 0; i < NOTCHAR; ++i)
1716 for (i = 0; i < ngrps; ++i)
1720 /* Find the union of the follows of the positions of the group.
1721 This is a hideously inefficient loop. Fix it someday. */
1722 for (j = 0; j < grps[i].nelem; ++j)
1723 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
1724 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
1726 /* If we are building a searching matcher, throw in the positions
1727 of state 0 as well. */
1729 for (j = 0; j < d->states[0].elems.nelem; ++j)
1730 insert(d->states[0].elems.elems[j], &follows);
1732 /* Find out if the new state will want any context information. */
1734 if (tstbit(eolbyte, labels[i]))
1735 for (j = 0; j < follows.nelem; ++j)
1736 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1740 for (j = 0; j < CHARCLASS_INTS; ++j)
1741 if (labels[i][j] & letters[j])
1743 if (j < CHARCLASS_INTS)
1744 for (j = 0; j < follows.nelem; ++j)
1745 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1748 /* Find the state(s) corresponding to the union of the follows. */
1749 state = state_index(d, &follows, 0, 0);
1751 state_newline = state_index(d, &follows, 1, 0);
1753 state_newline = state;
1755 state_letter = state_index(d, &follows, 0, 1);
1757 state_letter = state;
1759 /* Set the transitions for each character in the current label. */
1760 for (j = 0; j < CHARCLASS_INTS; ++j)
1761 for (k = 0; k < INTBITS; ++k)
1762 if (labels[i][j] & 1 << k)
1764 int c = j * INTBITS + k;
1767 trans[c] = state_newline;
1768 else if (IS_WORD_CONSTITUENT(c))
1769 trans[c] = state_letter;
1770 else if (c < NOTCHAR)
1775 for (i = 0; i < ngrps; ++i)
1776 free(grps[i].elems);
1777 free(follows.elems);
1781 /* Some routines for manipulating a compiled dfa's transition tables.
1782 Each state may or may not have a transition table; if it does, and it
1783 is a non-accepting state, then d->trans[state] points to its table.
1784 If it is an accepting state then d->fails[state] points to its table.
1785 If it has no table at all, then d->trans[state] is NULL.
1786 TODO: Improve this comment, get rid of the unnecessary redundancy. */
1789 build_state (int s, struct dfa *d)
1791 int *trans; /* The new transition table. */
1794 /* Set an upper limit on the number of transition tables that will ever
1795 exist at once. 1024 is arbitrary. The idea is that the frequently
1796 used transition tables will be quickly rebuilt, whereas the ones that
1797 were only needed once or twice will be cleared away. */
1798 if (d->trcount >= 1024)
1800 for (i = 0; i < d->tralloc; ++i)
1803 free((ptr_t) d->trans[i]);
1806 else if (d->fails[i])
1808 free((ptr_t) d->fails[i]);
1816 /* Set up the success bits for this state. */
1818 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
1821 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
1824 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
1828 MALLOC(trans, int, NOTCHAR);
1829 dfastate(s, d, trans);
1831 /* Now go through the new transition table, and make sure that the trans
1832 and fail arrays are allocated large enough to hold a pointer for the
1833 largest state mentioned in the table. */
1834 for (i = 0; i < NOTCHAR; ++i)
1835 if (trans[i] >= d->tralloc)
1837 int oldalloc = d->tralloc;
1839 while (trans[i] >= d->tralloc)
1841 REALLOC(d->realtrans, int *, d->tralloc + 1);
1842 d->trans = d->realtrans + 1;
1843 REALLOC(d->fails, int *, d->tralloc);
1844 REALLOC(d->success, int, d->tralloc);
1845 REALLOC(d->newlines, int, d->tralloc);
1846 while (oldalloc < d->tralloc)
1848 d->trans[oldalloc] = NULL;
1849 d->fails[oldalloc++] = NULL;
1853 /* Keep the newline transition in a special place so we can use it as
1855 d->newlines[s] = trans[eolbyte];
1856 trans[eolbyte] = -1;
1858 if (ACCEPTING(s, *d))
1859 d->fails[s] = trans;
1861 d->trans[s] = trans;
1865 build_state_zero (struct dfa *d)
1869 CALLOC(d->realtrans, int *, d->tralloc + 1);
1870 d->trans = d->realtrans + 1;
1871 CALLOC(d->fails, int *, d->tralloc);
1872 MALLOC(d->success, int, d->tralloc);
1873 MALLOC(d->newlines, int, d->tralloc);
1877 /* Search through a buffer looking for a match to the given struct dfa.
1878 Find the first occurrence of a string matching the regexp in the buffer,
1879 and the shortest possible version thereof. Return a pointer to the first
1880 character after the match, or NULL if none is found. Begin points to
1881 the beginning of the buffer, and end points to the first character after
1882 its end. We store a newline in *end to act as a sentinel, so end had
1883 better point somewhere valid. Newline is a flag indicating whether to
1884 allow newlines to be in the matching string. If count is non-
1885 NULL it points to a place we're supposed to increment every time we
1886 see a newline. Finally, if backref is non-NULL it points to a place
1887 where we're supposed to store a 1 if backreferencing happened and the
1888 match needs to be verified by a backtracking matcher. Otherwise
1889 we store a 0 in *backref. */
1891 dfaexec (struct dfa *d, char *begin, char *end,
1892 int newline, int *count, int *backref)
1894 register int s, s1, tmp; /* Current state. */
1895 register unsigned char *p; /* Current input character. */
1896 register int **trans, *t; /* Copy of d->trans so it can be optimized
1898 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
1899 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
1900 static int sbit_init;
1907 for (i = 0; i < NOTCHAR; ++i)
1908 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
1913 build_state_zero(d);
1916 p = (unsigned char *) begin;
1922 while ((t = trans[s]) != 0) { /* hand-optimized loop */
1924 if ((t = trans[s1]) == 0) {
1925 tmp = s ; s = s1 ; s1 = tmp ; /* swap */
1931 if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
1933 if (d->success[s] & sbit[*p])
1936 *backref = (d->states[s].backref != 0);
1941 s = d->fails[s][*p++];
1945 /* If the previous character was a newline, count it. */
1946 if (count && (char *) p <= end && p[-1] == eol)
1949 /* Check if we've run off the end of the buffer. */
1950 if ((char *) p > end)
1960 if (p[-1] == eol && newline)
1962 s = d->newlines[s1];
1970 /* Initialize the components of a dfa that the other routines don't
1971 initialize for themselves. */
1973 dfainit (struct dfa *d)
1976 MALLOC(d->charclasses, charclass, d->calloc);
1980 MALLOC(d->tokens, token, d->talloc);
1981 d->tindex = d->depth = d->nleaves = d->nregexps = 0;
1989 /* Parse and analyze a single string of the given length. */
1991 dfacomp (char *s, size_t len, struct dfa *d, int searchflag)
1993 if (case_fold) /* dummy folding in service of dfamust() */
1998 lcopy = malloc(len);
2000 dfaerror(_("out of memory"));
2002 /* This is a kludge. */
2004 for (i = 0; i < len; ++i)
2005 if (ISUPPER ((unsigned char) s[i]))
2006 lcopy[i] = tolower ((unsigned char) s[i]);
2011 dfaparse(lcopy, len, d);
2014 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2016 dfaparse(s, len, d);
2017 dfaanalyze(d, searchflag);
2022 dfaparse(s, len, d);
2024 dfaanalyze(d, searchflag);
2028 /* Free the storage held by the components of a dfa. */
2030 dfafree (struct dfa *d)
2033 struct dfamust *dm, *ndm;
2035 free((ptr_t) d->charclasses);
2036 free((ptr_t) d->tokens);
2037 for (i = 0; i < d->sindex; ++i)
2038 free((ptr_t) d->states[i].elems.elems);
2039 free((ptr_t) d->states);
2040 for (i = 0; i < d->tindex; ++i)
2041 if (d->follows[i].elems)
2042 free((ptr_t) d->follows[i].elems);
2043 free((ptr_t) d->follows);
2044 for (i = 0; i < d->tralloc; ++i)
2046 free((ptr_t) d->trans[i]);
2047 else if (d->fails[i])
2048 free((ptr_t) d->fails[i]);
2049 if (d->realtrans) free((ptr_t) d->realtrans);
2050 if (d->fails) free((ptr_t) d->fails);
2051 if (d->newlines) free((ptr_t) d->newlines);
2052 if (d->success) free((ptr_t) d->success);
2053 for (dm = d->musts; dm; dm = ndm)
2061 /* Having found the postfix representation of the regular expression,
2062 try to find a long sequence of characters that must appear in any line
2064 Finding a "longest" sequence is beyond the scope here;
2065 we take an easy way out and hope for the best.
2066 (Take "(ab|a)b"--please.)
2068 We do a bottom-up calculation of sequences of characters that must appear
2069 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
2071 sequences that must appear at the left of the match ("left")
2072 sequences that must appear at the right of the match ("right")
2073 lists of sequences that must appear somewhere in the match ("in")
2074 sequences that must constitute the match ("is")
2076 When we get to the root of the tree, we use one of the longest of its
2077 calculated "in" sequences as our answer. The sequence we find is returned in
2078 d->must (where "d" is the single argument passed to "dfamust");
2079 the length of the sequence is returned in d->mustn.
2081 The sequences calculated for the various types of node (in pseudo ANSI c)
2082 are shown below. "p" is the operand of unary operators (and the left-hand
2083 operand of binary operators); "q" is the right-hand operand of binary
2086 "ZERO" means "a zero-length sequence" below.
2088 Type left right is in
2089 ---- ---- ----- -- --
2090 char c # c # c # c # c
2092 CSET ZERO ZERO ZERO ZERO
2094 STAR ZERO ZERO ZERO ZERO
2096 QMARK ZERO ZERO ZERO ZERO
2098 PLUS p->left p->right ZERO p->in
2100 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
2101 p->left : q->right : q->is!=ZERO) ? q->in plus
2102 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
2105 OR longest common longest common (do p->is and substrings common to
2106 leading trailing q->is have same p->in and q->in
2107 (sub)sequence (sub)sequence length and
2108 of p->left of p->right content) ?
2109 and q->left and q->right p->is : NULL
2111 If there's anything else we recognize in the tree, all four sequences get set
2112 to zero-length sequences. If there's something we don't recognize in the tree,
2113 we just return a zero-length sequence.
2115 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
2118 And. . .is it here or someplace that we might ponder "optimizations" such as
2119 egrep 'psi|epsilon' -> egrep 'psi'
2120 egrep 'pepsi|epsilon' -> egrep 'epsi'
2121 (Yes, we now find "epsi" as a "string
2122 that must occur", but we might also
2123 simplify the *entire* r.e. being sought)
2124 grep '[c]' -> grep 'c'
2125 grep '(ab|a)b' -> grep 'ab'
2126 grep 'ab*' -> grep 'a'
2127 grep 'a*b' -> grep 'b'
2129 There are several issues:
2131 Is optimization easy (enough)?
2133 Does optimization actually accomplish anything,
2134 or is the automaton you get from "psi|epsilon" (for example)
2135 the same as the one you get from "psi" (for example)?
2137 Are optimizable r.e.'s likely to be used in real-life situations
2138 (something like 'ab*' is probably unlikely; something like is
2139 'psi|epsilon' is likelier)? */
2142 icatalloc (char *old, char *new)
2145 size_t oldsize, newsize;
2147 newsize = (new == NULL) ? 0 : strlen(new);
2150 else if (newsize == 0)
2152 else oldsize = strlen(old);
2154 result = (char *) malloc(newsize + 1);
2156 result = (char *) realloc((void *) old, oldsize + newsize + 1);
2157 if (result != NULL && new != NULL)
2158 (void) strcpy(result + oldsize, new);
2163 icpyalloc (char *string)
2165 return icatalloc((char *) NULL, string);
2169 istrstr (char *lookin, char *lookfor)
2174 len = strlen(lookfor);
2175 for (cp = lookin; *cp != '\0'; ++cp)
2176 if (strncmp(cp, lookfor, len) == 0)
2189 freelist (char **cpp)
2195 for (i = 0; cpp[i] != NULL; ++i)
2203 enlist (char **cpp, char *new, size_t len)
2209 if ((new = icpyalloc(new)) == NULL)
2215 /* Is there already something in the list that's new (or longer)? */
2216 for (i = 0; cpp[i] != NULL; ++i)
2217 if (istrstr(cpp[i], new) != NULL)
2222 /* Eliminate any obsoleted strings. */
2224 while (cpp[j] != NULL)
2225 if (istrstr(new, cpp[j]) == NULL)
2235 /* Add the new string. */
2236 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
2244 /* Given pointers to two strings, return a pointer to an allocated
2245 list of their distinct common substrings. Return NULL if something
2248 comsubs (char *left, char *right)
2255 if (left == NULL || right == NULL)
2257 cpp = (char **) malloc(sizeof *cpp);
2261 for (lcp = left; *lcp != '\0'; ++lcp)
2264 rcp = index(right, *lcp);
2267 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
2271 rcp = index(rcp + 1, *lcp);
2275 if ((cpp = enlist(cpp, lcp, len)) == NULL)
2282 addlists (char **old, char **new)
2286 if (old == NULL || new == NULL)
2288 for (i = 0; new[i] != NULL; ++i)
2290 old = enlist(old, new[i], strlen(new[i]));
2297 /* Given two lists of substrings, return a new list giving substrings
2300 inboth (char **left, char **right)
2306 if (left == NULL || right == NULL)
2308 both = (char **) malloc(sizeof *both);
2312 for (lnum = 0; left[lnum] != NULL; ++lnum)
2314 for (rnum = 0; right[rnum] != NULL; ++rnum)
2316 temp = comsubs(left[lnum], right[rnum]);
2322 both = addlists(both, temp);
2341 resetmust (must *mp)
2343 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2348 dfamust (struct dfa *dfa)
2359 static char empty_string[] = "";
2361 result = empty_string;
2363 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
2367 for (i = 0; i <= dfa->tindex; ++i)
2369 for (i = 0; i <= dfa->tindex; ++i)
2371 mp[i].in = (char **) malloc(sizeof *mp[i].in);
2372 mp[i].left = malloc(2);
2373 mp[i].right = malloc(2);
2374 mp[i].is = malloc(2);
2375 if (mp[i].in == NULL || mp[i].left == NULL ||
2376 mp[i].right == NULL || mp[i].is == NULL)
2378 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2382 fprintf(stderr, "dfamust:\n");
2383 for (i = 0; i < dfa->tindex; ++i)
2385 fprintf(stderr, " %d:", i);
2386 prtok(dfa->tokens[i]);
2390 for (ri = 0; ri < dfa->tindex; ++ri)
2392 switch (t = dfa->tokens[ri])
2396 goto done; /* "cannot happen" */
2410 goto done; /* "cannot happen" */
2417 goto done; /* "cannot happen" */
2426 /* Guaranteed to be. Unlikely, but. . . */
2427 if (strcmp(lmp->is, rmp->is) != 0)
2429 /* Left side--easy */
2431 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
2433 lmp->left[i] = '\0';
2435 ln = strlen(lmp->right);
2436 rn = strlen(rmp->right);
2440 for (i = 0; i < n; ++i)
2441 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
2443 for (j = 0; j < i; ++j)
2444 lmp->right[j] = lmp->right[(ln - i) + j];
2445 lmp->right[j] = '\0';
2446 new = inboth(lmp->in, rmp->in);
2450 free((char *) lmp->in);
2456 goto done; /* "cannot happen" */
2461 if (mp != &musts[1])
2462 goto done; /* "cannot happen" */
2463 for (i = 0; musts[0].in[i] != NULL; ++i)
2464 if (strlen(musts[0].in[i]) > strlen(result))
2465 result = musts[0].in[i];
2466 if (strcmp(result, musts[0].is) == 0)
2471 goto done; /* "cannot happen" */
2478 /* In. Everything in left, plus everything in
2479 right, plus catenation of
2480 left's right and right's left. */
2481 lmp->in = addlists(lmp->in, rmp->in);
2482 if (lmp->in == NULL)
2484 if (lmp->right[0] != '\0' &&
2485 rmp->left[0] != '\0')
2489 tp = icpyalloc(lmp->right);
2492 tp = icatalloc(tp, rmp->left);
2495 lmp->in = enlist(lmp->in, tp,
2498 if (lmp->in == NULL)
2502 if (lmp->is[0] != '\0')
2504 lmp->left = icatalloc(lmp->left,
2506 if (lmp->left == NULL)
2510 if (rmp->is[0] == '\0')
2511 lmp->right[0] = '\0';
2512 lmp->right = icatalloc(lmp->right, rmp->right);
2513 if (lmp->right == NULL)
2515 /* Guaranteed to be */
2516 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
2518 lmp->is = icatalloc(lmp->is, rmp->is);
2519 if (lmp->is == NULL)
2529 /* "cannot happen" */
2534 /* not on *my* shift */
2544 /* plain character */
2546 mp->is[0] = mp->left[0] = mp->right[0] = t;
2547 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2548 mp->in = enlist(mp->in, mp->is, (size_t)1);
2555 fprintf(stderr, " node: %d:", ri);
2556 prtok(dfa->tokens[ri]);
2557 fprintf(stderr, "\n in:");
2558 for (i = 0; mp->in[i]; ++i)
2559 fprintf(stderr, " \"%s\"", mp->in[i]);
2560 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
2561 fprintf(stderr, " left: \"%s\"\n", mp->left);
2562 fprintf(stderr, " right: \"%s\"\n", mp->right);
2569 dm = (struct dfamust *) malloc(sizeof (struct dfamust));
2571 dm->must = malloc(strlen(result) + 1);
2572 strcpy(dm->must, result);
2573 dm->next = dfa->musts;
2577 for (i = 0; i <= dfa->tindex; ++i)
2580 ifree((char *) mp[i].in);