1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
18 /* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
21 /* $FreeBSD: src/gnu/usr.bin/grep/dfa.c,v 1.12 2000/01/31 13:28:08 ru Exp $ */
22 /* $DragonFly: src/gnu/usr.bin/grep/dfa.c,v 1.2 2003/06/17 04:25:45 dillon Exp $ */
32 #include <sys/types.h>
36 extern char *calloc(), *malloc(), *realloc();
40 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
48 #ifndef DEBUG /* use the same approach as regex.c */
54 #define isgraph(C) (isprint(C) && !isspace(C))
57 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
58 #define ISALPHA(C) isalpha(C)
59 #define ISUPPER(C) isupper(C)
60 #define ISLOWER(C) islower(C)
61 #define ISDIGIT(C) isdigit(C)
62 #define ISXDIGIT(C) isxdigit(C)
63 #define ISSPACE(C) isspace(C)
64 #define ISPUNCT(C) ispunct(C)
65 #define ISALNUM(C) isalnum(C)
66 #define ISPRINT(C) isprint(C)
67 #define ISGRAPH(C) isgraph(C)
68 #define ISCNTRL(C) iscntrl(C)
70 #define ISALPHA(C) (isascii(C) && isalpha(C))
71 #define ISUPPER(C) (isascii(C) && isupper(C))
72 #define ISLOWER(C) (isascii(C) && islower(C))
73 #define ISDIGIT(C) (isascii(C) && isdigit(C))
74 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
75 #define ISSPACE(C) (isascii(C) && isspace(C))
76 #define ISPUNCT(C) (isascii(C) && ispunct(C))
77 #define ISALNUM(C) (isascii(C) && isalnum(C))
78 #define ISPRINT(C) (isascii(C) && isprint(C))
79 #define ISGRAPH(C) (isascii(C) && isgraph(C))
80 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
83 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
84 - Its arg may be any int or unsigned int; it need not be an unsigned char.
85 - It's guaranteed to evaluate its argument exactly once.
86 - It's typically faster.
87 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
88 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
89 it's important to use the locale's definition of `digit' even when the
90 host does not conform to Posix. */
91 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
93 /* If we (don't) have I18N. */
96 # ifdef HAVE_LIBINTL_H
99 # define _(Str) gettext (Str)
102 # define _(Str) (Str)
107 #include <gnuregex.h>
113 /* HPUX, define those as macros in sys/param.h */
121 static void dfamust PARAMS ((struct dfa *dfa));
123 static ptr_t xcalloc PARAMS ((size_t n, size_t s));
124 static ptr_t xmalloc PARAMS ((size_t n));
125 static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
127 static void prtok PARAMS ((token t));
129 static int tstbit PARAMS ((int b, charclass c));
130 static void setbit PARAMS ((int b, charclass c));
131 static void clrbit PARAMS ((int b, charclass c));
132 static void copyset PARAMS ((charclass src, charclass dst));
133 static void zeroset PARAMS ((charclass s));
134 static void notset PARAMS ((charclass s));
135 static int equal PARAMS ((charclass s1, charclass s2));
136 static int charclass_index PARAMS ((charclass s));
137 static int looking_at PARAMS ((const char *s));
138 static token lex PARAMS ((void));
139 static void addtok PARAMS ((token t));
140 static void atom PARAMS ((void));
141 static int nsubtoks PARAMS ((int tindex));
142 static void copytoks PARAMS ((int tindex, int ntokens));
143 static void closure PARAMS ((void));
144 static void branch PARAMS ((void));
145 static void regexp PARAMS ((int toplevel));
146 static void copy PARAMS ((position_set *src, position_set *dst));
147 static void insert PARAMS ((position p, position_set *s));
148 static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m));
149 static void delete PARAMS ((position p, position_set *s));
150 static int state_index PARAMS ((struct dfa *d, position_set *s,
151 int newline, int letter));
152 static void build_state PARAMS ((int s, struct dfa *d));
153 static void build_state_zero PARAMS ((struct dfa *d));
154 static char *icatalloc PARAMS ((char *old, char *new));
155 static char *icpyalloc PARAMS ((char *string));
156 static char *istrstr PARAMS ((char *lookin, char *lookfor));
157 static void ifree PARAMS ((char *cp));
158 static void freelist PARAMS ((char **cpp));
159 static char **enlist PARAMS ((char **cpp, char *new, size_t len));
160 static char **comsubs PARAMS ((char *left, char *right));
161 static char **addlists PARAMS ((char **old, char **new));
162 static char **inboth PARAMS ((char **left, char **right));
165 xcalloc (size_t n, size_t s)
167 ptr_t r = calloc(n, s);
170 dfaerror(_("Memory exhausted"));
181 dfaerror(_("Memory exhausted"));
186 xrealloc (ptr_t p, size_t n)
188 ptr_t r = realloc(p, n);
192 dfaerror(_("Memory exhausted"));
196 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
197 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
198 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
200 /* Reallocate an array of type t if nalloc is too small for index. */
201 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
202 if ((index) >= (nalloc)) \
204 while ((index) >= (nalloc)) \
206 REALLOC(p, t, nalloc); \
217 fprintf(stderr, "END");
218 else if (t < NOTCHAR)
219 fprintf(stderr, "%c", t);
224 case EMPTY: s = "EMPTY"; break;
225 case BACKREF: s = "BACKREF"; break;
226 case BEGLINE: s = "BEGLINE"; break;
227 case ENDLINE: s = "ENDLINE"; break;
228 case BEGWORD: s = "BEGWORD"; break;
229 case ENDWORD: s = "ENDWORD"; break;
230 case LIMWORD: s = "LIMWORD"; break;
231 case NOTLIMWORD: s = "NOTLIMWORD"; break;
232 case QMARK: s = "QMARK"; break;
233 case STAR: s = "STAR"; break;
234 case PLUS: s = "PLUS"; break;
235 case CAT: s = "CAT"; break;
236 case OR: s = "OR"; break;
237 case ORTOP: s = "ORTOP"; break;
238 case LPAREN: s = "LPAREN"; break;
239 case RPAREN: s = "RPAREN"; break;
240 default: s = "CSET"; break;
242 fprintf(stderr, "%s", s);
247 /* Stuff pertaining to charclasses. */
250 tstbit (int b, charclass c)
252 return c[b / INTBITS] & 1 << b % INTBITS;
256 setbit (int b, charclass c)
258 c[b / INTBITS] |= 1 << b % INTBITS;
262 clrbit (int b, charclass c)
264 c[b / INTBITS] &= ~(1 << b % INTBITS);
268 copyset (charclass src, charclass dst)
272 for (i = 0; i < CHARCLASS_INTS; ++i)
277 zeroset (charclass s)
281 for (i = 0; i < CHARCLASS_INTS; ++i)
290 for (i = 0; i < CHARCLASS_INTS; ++i)
295 equal (charclass s1, charclass s2)
299 for (i = 0; i < CHARCLASS_INTS; ++i)
305 /* A pointer to the current dfa is kept here during parsing. */
306 static struct dfa *dfa;
308 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
310 charclass_index (charclass s)
314 for (i = 0; i < dfa->cindex; ++i)
315 if (equal(s, dfa->charclasses[i]))
317 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
319 copyset(s, dfa->charclasses[i]);
323 /* Syntax bits controlling the behavior of the lexical analyzer. */
324 static reg_syntax_t syntax_bits, syntax_bits_set;
326 /* Flag for case-folding letters into sets. */
327 static int case_fold;
329 /* End-of-line byte in data. */
330 static unsigned char eolbyte;
332 /* Entry point to set syntax options. */
334 dfasyntax (reg_syntax_t bits, int fold, int eol)
342 /* Lexical analyzer. All the dross that deals with the obnoxious
343 GNU Regex syntax bits is located here. The poor, suffering
344 reader is referred to the GNU Regex documentation for the
345 meaning of the @#%!@#%^!@ syntax bits. */
347 static char *lexstart; /* Pointer to beginning of input string. */
348 static char *lexptr; /* Pointer to next input character. */
349 static int lexleft; /* Number of characters remaining. */
350 static token lasttok; /* Previous token returned; initially END. */
351 static int laststart; /* True if we're separated from beginning or (, |
352 only by zero-width characters. */
353 static int parens; /* Count of outstanding left parens. */
354 static int minrep, maxrep; /* Repeat counts for {m,n}. */
356 /* Note that characters become unsigned here. */
357 #define FETCH(c, eoferr) \
364 return lasttok = END; \
366 (c) = (unsigned char) *lexptr++; \
371 #define FUNC(F, P) static int F(int c) { return P(c); }
373 #define FUNC(F, P) static int F(c) int c; { return P(c); }
376 FUNC(is_alpha, ISALPHA)
377 FUNC(is_upper, ISUPPER)
378 FUNC(is_lower, ISLOWER)
379 FUNC(is_digit, ISDIGIT)
380 FUNC(is_xdigit, ISXDIGIT)
381 FUNC(is_space, ISSPACE)
382 FUNC(is_punct, ISPUNCT)
383 FUNC(is_alnum, ISALNUM)
384 FUNC(is_print, ISPRINT)
385 FUNC(is_graph, ISGRAPH)
386 FUNC(is_cntrl, ISCNTRL)
391 return (c == ' ' || c == '\t');
394 /* The following list maps the names of the Posix named character classes
395 to predicate functions that determine whether a given character is in
396 the class. The leading [ has already been eaten by the lexical analyzer. */
399 int (*pred) PARAMS ((int));
401 { ":alpha:]", is_alpha },
402 { ":upper:]", is_upper },
403 { ":lower:]", is_lower },
404 { ":digit:]", is_digit },
405 { ":xdigit:]", is_xdigit },
406 { ":space:]", is_space },
407 { ":punct:]", is_punct },
408 { ":alnum:]", is_alnum },
409 { ":print:]", is_print },
410 { ":graph:]", is_graph },
411 { ":cntrl:]", is_cntrl },
412 { ":blank:]", is_blank },
416 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
417 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
420 looking_at (char const *s)
427 return strncmp(s, lexptr, len) == 0;
434 int backslash = 0, invert;
440 /* Basic plan: We fetch a character. If it's a backslash,
441 we set the backslash flag and go through the loop again.
442 On the plus side, this avoids having a duplicate of the
443 main switch inside the backslash case. On the minus side,
444 it means that just about every case begins with
445 "if (backslash) ...". */
446 for (i = 0; i < 2; ++i)
455 dfaerror(_("Unfinished \\ escape"));
462 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
466 return lasttok = BEGLINE;
472 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
474 || (syntax_bits & RE_NO_BK_PARENS
475 ? lexleft > 0 && *lexptr == ')'
476 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
477 || (syntax_bits & RE_NO_BK_VBAR
478 ? lexleft > 0 && *lexptr == '|'
479 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
480 || ((syntax_bits & RE_NEWLINE_ALT)
481 && lexleft > 0 && *lexptr == '\n'))
482 return lasttok = ENDLINE;
494 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
497 return lasttok = BACKREF;
502 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
503 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
507 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
508 return lasttok = ENDLINE; /* FIXME: should be end of string */
512 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
513 return lasttok = BEGWORD;
517 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
518 return lasttok = ENDWORD;
522 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
523 return lasttok = LIMWORD;
527 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
528 return lasttok = NOTLIMWORD;
532 if (syntax_bits & RE_LIMITED_OPS)
534 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
536 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
538 return lasttok = QMARK;
543 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
545 return lasttok = STAR;
548 if (syntax_bits & RE_LIMITED_OPS)
550 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
552 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
554 return lasttok = PLUS;
557 if (!(syntax_bits & RE_INTERVALS))
559 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
561 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
564 if (syntax_bits & RE_NO_BK_BRACES)
566 /* Scan ahead for a valid interval; if it's not valid,
567 treat it as a literal '{'. */
568 int lo = -1, hi = -1;
569 char const *p = lexptr;
570 char const *lim = p + lexleft;
571 for (; p != lim && ISASCIIDIGIT (*p); p++)
572 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
573 if (p != lim && *p == ',')
574 while (++p != lim && ISASCIIDIGIT (*p))
575 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
578 if (p == lim || *p != '}'
579 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
586 {M,} - minimum count, maximum is infinity
587 {M,N} - M through N */
588 FETCH(c, _("unfinished repeat count"));
589 if (ISASCIIDIGIT (c))
594 FETCH(c, _("unfinished repeat count"));
595 if (! ISASCIIDIGIT (c))
597 minrep = 10 * minrep + c - '0';
601 dfaerror(_("malformed repeat count"));
604 FETCH (c, _("unfinished repeat count"));
605 if (! ISASCIIDIGIT (c))
612 FETCH (c, _("unfinished repeat count"));
613 if (! ISASCIIDIGIT (c))
615 maxrep = 10 * maxrep + c - '0';
617 if (0 <= maxrep && maxrep < minrep)
618 dfaerror (_("malformed repeat count"));
623 if (!(syntax_bits & RE_NO_BK_BRACES))
626 dfaerror(_("malformed repeat count"));
627 FETCH(c, _("unfinished repeat count"));
630 dfaerror(_("malformed repeat count"));
632 return lasttok = REPMN;
635 if (syntax_bits & RE_LIMITED_OPS)
637 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
643 if (syntax_bits & RE_LIMITED_OPS
645 || !(syntax_bits & RE_NEWLINE_ALT))
651 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
655 return lasttok = LPAREN;
658 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
660 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
664 return lasttok = RPAREN;
671 if (!(syntax_bits & RE_DOT_NEWLINE))
672 clrbit(eolbyte, ccl);
673 if (syntax_bits & RE_DOT_NOT_NULL)
676 return lasttok = CSET + charclass_index(ccl);
680 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
683 for (c2 = 0; c2 < NOTCHAR; ++c2)
684 if (IS_WORD_CONSTITUENT(c2))
689 return lasttok = CSET + charclass_index(ccl);
695 FETCH(c, _("Unbalanced ["));
698 FETCH(c, _("Unbalanced ["));
705 /* Nobody ever said this had to be fast. :-)
706 Note that if we're looking at some other [:...:]
707 construct, we just treat it as a bunch of ordinary
708 characters. We can do this because we assume
709 regex has checked for syntax errors before
710 dfa is ever called. */
711 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
712 for (c1 = 0; prednames[c1].name; ++c1)
713 if (looking_at(prednames[c1].name))
715 int (*pred)() = prednames[c1].pred;
717 && (pred == is_upper || pred == is_lower))
720 for (c2 = 0; c2 < NOTCHAR; ++c2)
723 lexptr += strlen(prednames[c1].name);
724 lexleft -= strlen(prednames[c1].name);
725 FETCH(c1, _("Unbalanced ["));
728 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
729 FETCH(c, _("Unbalanced ["));
730 FETCH(c1, _("Unbalanced ["));
733 FETCH(c2, _("Unbalanced ["));
736 /* In the case [x-], the - is an ordinary hyphen,
737 which is left in c1, the lookahead character. */
745 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
746 FETCH(c2, _("Unbalanced ["));
747 FETCH(c1, _("Unbalanced ["));
753 lo[0] = c; lo[1] = '\0';
754 hi[0] = c2; hi[1] = '\0';
755 for (c = 0; c < NOTCHAR; c++)
758 ch[0] = c; ch[1] = '\0';
759 if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0)
765 setbit (tolower (c), ccl);
766 else if (ISLOWER (c))
767 setbit (toupper (c), ccl);
775 while ((c = c1) != ']');
779 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
780 clrbit(eolbyte, ccl);
783 return lasttok = CSET + charclass_index(ccl);
788 if (case_fold && ISALPHA(c))
793 setbit(tolower(c), ccl);
795 setbit(toupper(c), ccl);
796 return lasttok = CSET + charclass_index(ccl);
802 /* The above loop should consume at most a backslash
803 and some other character. */
805 return END; /* keeps pedantic compilers happy. */
808 /* Recursive descent parser for regular expressions. */
810 static token tok; /* Lookahead token. */
811 static int depth; /* Current depth of a hypothetical stack
812 holding deferred productions. This is
813 used to determine the depth that will be
814 required of the real stack later on in
817 /* Add the given token to the parse tree, maintaining the depth count and
818 updating the maximum depth if necessary. */
822 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
823 dfa->tokens[dfa->tindex++] = t;
844 if (depth > dfa->depth)
848 /* The grammar understood by the parser is as follows.
876 The parser builds a parse tree in postfix form in an array of tokens. */
881 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
882 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
883 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
888 else if (tok == LPAREN)
893 dfaerror(_("Unbalanced ("));
900 /* Return the number of tokens in the given subexpression. */
902 nsubtoks (int tindex)
906 switch (dfa->tokens[tindex - 1])
913 return 1 + nsubtoks(tindex - 1);
917 ntoks1 = nsubtoks(tindex - 1);
918 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
922 /* Copy the given subexpression to the top of the tree. */
924 copytoks (int tindex, int ntokens)
928 for (i = 0; i < ntokens; ++i)
929 addtok(dfa->tokens[tindex + i]);
935 int tindex, ntokens, i;
938 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
941 ntokens = nsubtoks(dfa->tindex);
942 tindex = dfa->tindex - ntokens;
947 for (i = 1; i < minrep; ++i)
949 copytoks(tindex, ntokens);
952 for (; i < maxrep; ++i)
954 copytoks(tindex, ntokens);
971 while (tok != RPAREN && tok != OR && tok >= 0)
979 regexp (int toplevel)
993 /* Main entry point for the parser. S is a string to be parsed, len is the
994 length of the string, so s can include NUL characters. D is a pointer to
995 the struct dfa to parse into. */
997 dfaparse (char *s, size_t len, struct dfa *d)
1000 lexstart = lexptr = s;
1006 if (! syntax_bits_set)
1007 dfaerror(_("No syntax specified"));
1015 dfaerror(_("Unbalanced )"));
1017 addtok(END - d->nregexps);
1026 /* Some primitives for operating on sets of positions. */
1028 /* Copy one set to another; the destination must be large enough. */
1030 copy (position_set *src, position_set *dst)
1034 for (i = 0; i < src->nelem; ++i)
1035 dst->elems[i] = src->elems[i];
1036 dst->nelem = src->nelem;
1039 /* Insert a position in a set. Position sets are maintained in sorted
1040 order according to index. If position already exists in the set with
1041 the same index then their constraints are logically or'd together.
1042 S->elems must point to an array large enough to hold the resulting set. */
1044 insert (position p, position_set *s)
1049 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1051 if (i < s->nelem && p.index == s->elems[i].index)
1052 s->elems[i].constraint |= p.constraint;
1057 while (i < s->nelem)
1066 /* Merge two sets of positions into a third. The result is exactly as if
1067 the positions of both sets were inserted into an initially empty set. */
1069 merge (position_set *s1, position_set *s2, position_set *m)
1074 while (i < s1->nelem && j < s2->nelem)
1075 if (s1->elems[i].index > s2->elems[j].index)
1076 m->elems[m->nelem++] = s1->elems[i++];
1077 else if (s1->elems[i].index < s2->elems[j].index)
1078 m->elems[m->nelem++] = s2->elems[j++];
1081 m->elems[m->nelem] = s1->elems[i++];
1082 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1084 while (i < s1->nelem)
1085 m->elems[m->nelem++] = s1->elems[i++];
1086 while (j < s2->nelem)
1087 m->elems[m->nelem++] = s2->elems[j++];
1090 /* Delete a position from a set. */
1092 delete (position p, position_set *s)
1096 for (i = 0; i < s->nelem; ++i)
1097 if (p.index == s->elems[i].index)
1100 for (--s->nelem; i < s->nelem; ++i)
1101 s->elems[i] = s->elems[i + 1];
1104 /* Find the index of the state corresponding to the given position set with
1105 the given preceding context, or create a new state if there is no such
1106 state. Newline and letter tell whether we got here on a newline or
1107 letter, respectively. */
1109 state_index (struct dfa *d, position_set *s, int newline, int letter)
1115 newline = newline ? 1 : 0;
1116 letter = letter ? 1 : 0;
1118 for (i = 0; i < s->nelem; ++i)
1119 hash ^= s->elems[i].index + s->elems[i].constraint;
1121 /* Try to find a state that exactly matches the proposed one. */
1122 for (i = 0; i < d->sindex; ++i)
1124 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1125 || newline != d->states[i].newline || letter != d->states[i].letter)
1127 for (j = 0; j < s->nelem; ++j)
1128 if (s->elems[j].constraint
1129 != d->states[i].elems.elems[j].constraint
1130 || s->elems[j].index != d->states[i].elems.elems[j].index)
1136 /* We'll have to create a new state. */
1137 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1138 d->states[i].hash = hash;
1139 MALLOC(d->states[i].elems.elems, position, s->nelem);
1140 copy(s, &d->states[i].elems);
1141 d->states[i].newline = newline;
1142 d->states[i].letter = letter;
1143 d->states[i].backref = 0;
1144 d->states[i].constraint = 0;
1145 d->states[i].first_end = 0;
1146 for (j = 0; j < s->nelem; ++j)
1147 if (d->tokens[s->elems[j].index] < 0)
1149 constraint = s->elems[j].constraint;
1150 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1151 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1152 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1153 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1154 d->states[i].constraint |= constraint;
1155 if (! d->states[i].first_end)
1156 d->states[i].first_end = d->tokens[s->elems[j].index];
1158 else if (d->tokens[s->elems[j].index] == BACKREF)
1160 d->states[i].constraint = NO_CONSTRAINT;
1161 d->states[i].backref = 1;
1169 /* Find the epsilon closure of a set of positions. If any position of the set
1170 contains a symbol that matches the empty string in some context, replace
1171 that position with the elements of its follow labeled with an appropriate
1172 constraint. Repeat exhaustively until no funny positions are left.
1173 S->elems must be large enough to hold the result. */
1175 epsclosure (position_set *s, struct dfa *d)
1181 MALLOC(visited, int, d->tindex);
1182 for (i = 0; i < d->tindex; ++i)
1185 for (i = 0; i < s->nelem; ++i)
1186 if (d->tokens[s->elems[i].index] >= NOTCHAR
1187 && d->tokens[s->elems[i].index] != BACKREF
1188 && d->tokens[s->elems[i].index] < CSET)
1191 p.constraint = old.constraint;
1192 delete(s->elems[i], s);
1193 if (visited[old.index])
1198 visited[old.index] = 1;
1199 switch (d->tokens[old.index])
1202 p.constraint &= BEGLINE_CONSTRAINT;
1205 p.constraint &= ENDLINE_CONSTRAINT;
1208 p.constraint &= BEGWORD_CONSTRAINT;
1211 p.constraint &= ENDWORD_CONSTRAINT;
1214 p.constraint &= LIMWORD_CONSTRAINT;
1217 p.constraint &= NOTLIMWORD_CONSTRAINT;
1222 for (j = 0; j < d->follows[old.index].nelem; ++j)
1224 p.index = d->follows[old.index].elems[j].index;
1227 /* Force rescan to start at the beginning. */
1234 /* Perform bottom-up analysis on the parse tree, computing various functions.
1235 Note that at this point, we're pretending constructs like \< are real
1236 characters rather than constraints on what can follow them.
1238 Nullable: A node is nullable if it is at the root of a regexp that can
1239 match the empty string.
1240 * EMPTY leaves are nullable.
1241 * No other leaf is nullable.
1242 * A QMARK or STAR node is nullable.
1243 * A PLUS node is nullable if its argument is nullable.
1244 * A CAT node is nullable if both its arguments are nullable.
1245 * An OR node is nullable if either argument is nullable.
1247 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1248 that could correspond to the first character of a string matching the
1249 regexp rooted at the given node.
1250 * EMPTY leaves have empty firstpos.
1251 * The firstpos of a nonempty leaf is that leaf itself.
1252 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1254 * The firstpos of a CAT node is the firstpos of the left argument, union
1255 the firstpos of the right if the left argument is nullable.
1256 * The firstpos of an OR node is the union of firstpos of each argument.
1258 Lastpos: The lastpos of a node is the set of positions that could
1259 correspond to the last character of a string matching the regexp at
1261 * EMPTY leaves have empty lastpos.
1262 * The lastpos of a nonempty leaf is that leaf itself.
1263 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1265 * The lastpos of a CAT node is the lastpos of its right argument, union
1266 the lastpos of the left if the right argument is nullable.
1267 * The lastpos of an OR node is the union of the lastpos of each argument.
1269 Follow: The follow of a position is the set of positions that could
1270 correspond to the character following a character matching the node in
1271 a string matching the regexp. At this point we consider special symbols
1272 that match the empty string in some context to be just normal characters.
1273 Later, if we find that a special symbol is in a follow set, we will
1274 replace it with the elements of its follow, labeled with an appropriate
1276 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1277 the follow of every node in the lastpos.
1278 * Every node in the firstpos of the second argument of a CAT node is in
1279 the follow of every node in the lastpos of the first argument.
1281 Because of the postfix representation of the parse tree, the depth-first
1282 analysis is conveniently done by a linear scan with the aid of a stack.
1283 Sets are stored as arrays of the elements, obeying a stack-like allocation
1284 scheme; the number of elements in each set deeper in the stack can be
1285 used to determine the address of a particular set's array. */
1287 dfaanalyze (struct dfa *d, int searchflag)
1289 int *nullable; /* Nullable stack. */
1290 int *nfirstpos; /* Element count stack for firstpos sets. */
1291 position *firstpos; /* Array where firstpos elements are stored. */
1292 int *nlastpos; /* Element count stack for lastpos sets. */
1293 position *lastpos; /* Array where lastpos elements are stored. */
1294 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1295 position_set tmp; /* Temporary set for merging sets. */
1296 position_set merged; /* Result of merging sets. */
1297 int wants_newline; /* True if some position wants newline info. */
1299 int *o_nfirst, *o_nlast;
1300 position *o_firstpos, *o_lastpos;
1305 fprintf(stderr, "dfaanalyze:\n");
1306 for (i = 0; i < d->tindex; ++i)
1308 fprintf(stderr, " %d:", i);
1309 prtok(d->tokens[i]);
1314 d->searchflag = searchflag;
1316 MALLOC(nullable, int, d->depth);
1317 o_nullable = nullable;
1318 MALLOC(nfirstpos, int, d->depth);
1319 o_nfirst = nfirstpos;
1320 MALLOC(firstpos, position, d->nleaves);
1321 o_firstpos = firstpos, firstpos += d->nleaves;
1322 MALLOC(nlastpos, int, d->depth);
1324 MALLOC(lastpos, position, d->nleaves);
1325 o_lastpos = lastpos, lastpos += d->nleaves;
1326 MALLOC(nalloc, int, d->tindex);
1327 for (i = 0; i < d->tindex; ++i)
1329 MALLOC(merged.elems, position, d->nleaves);
1331 CALLOC(d->follows, position_set, d->tindex);
1333 for (i = 0; i < d->tindex; ++i)
1335 { /* Nonsyntactic #ifdef goo... */
1337 switch (d->tokens[i])
1340 /* The empty set is nullable. */
1343 /* The firstpos and lastpos of the empty leaf are both empty. */
1344 *nfirstpos++ = *nlastpos++ = 0;
1349 /* Every element in the firstpos of the argument is in the follow
1350 of every element in the lastpos. */
1351 tmp.nelem = nfirstpos[-1];
1352 tmp.elems = firstpos;
1354 for (j = 0; j < nlastpos[-1]; ++j)
1356 merge(&tmp, &d->follows[pos[j].index], &merged);
1357 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1358 nalloc[pos[j].index], merged.nelem - 1);
1359 copy(&merged, &d->follows[pos[j].index]);
1363 /* A QMARK or STAR node is automatically nullable. */
1364 if (d->tokens[i] != PLUS)
1369 /* Every element in the firstpos of the second argument is in the
1370 follow of every element in the lastpos of the first argument. */
1371 tmp.nelem = nfirstpos[-1];
1372 tmp.elems = firstpos;
1373 pos = lastpos + nlastpos[-1];
1374 for (j = 0; j < nlastpos[-2]; ++j)
1376 merge(&tmp, &d->follows[pos[j].index], &merged);
1377 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1378 nalloc[pos[j].index], merged.nelem - 1);
1379 copy(&merged, &d->follows[pos[j].index]);
1382 /* The firstpos of a CAT node is the firstpos of the first argument,
1383 union that of the second argument if the first is nullable. */
1385 nfirstpos[-2] += nfirstpos[-1];
1387 firstpos += nfirstpos[-1];
1390 /* The lastpos of a CAT node is the lastpos of the second argument,
1391 union that of the first argument if the second is nullable. */
1393 nlastpos[-2] += nlastpos[-1];
1396 pos = lastpos + nlastpos[-2];
1397 for (j = nlastpos[-1] - 1; j >= 0; --j)
1398 pos[j] = lastpos[j];
1399 lastpos += nlastpos[-2];
1400 nlastpos[-2] = nlastpos[-1];
1404 /* A CAT node is nullable if both arguments are nullable. */
1405 nullable[-2] = nullable[-1] && nullable[-2];
1411 /* The firstpos is the union of the firstpos of each argument. */
1412 nfirstpos[-2] += nfirstpos[-1];
1415 /* The lastpos is the union of the lastpos of each argument. */
1416 nlastpos[-2] += nlastpos[-1];
1419 /* An OR node is nullable if either argument is nullable. */
1420 nullable[-2] = nullable[-1] || nullable[-2];
1425 /* Anything else is a nonempty position. (Note that special
1426 constructs like \< are treated as nonempty strings here;
1427 an "epsilon closure" effectively makes them nullable later.
1428 Backreferences have to get a real position so we can detect
1429 transitions on them later. But they are nullable. */
1430 *nullable++ = d->tokens[i] == BACKREF;
1432 /* This position is in its own firstpos and lastpos. */
1433 *nfirstpos++ = *nlastpos++ = 1;
1434 --firstpos, --lastpos;
1435 firstpos->index = lastpos->index = i;
1436 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1438 /* Allocate the follow set for this position. */
1440 MALLOC(d->follows[i].elems, position, nalloc[i]);
1444 /* ... balance the above nonsyntactic #ifdef goo... */
1445 fprintf(stderr, "node %d:", i);
1446 prtok(d->tokens[i]);
1448 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1449 fprintf(stderr, " firstpos:");
1450 for (j = nfirstpos[-1] - 1; j >= 0; --j)
1452 fprintf(stderr, " %d:", firstpos[j].index);
1453 prtok(d->tokens[firstpos[j].index]);
1455 fprintf(stderr, "\n lastpos:");
1456 for (j = nlastpos[-1] - 1; j >= 0; --j)
1458 fprintf(stderr, " %d:", lastpos[j].index);
1459 prtok(d->tokens[lastpos[j].index]);
1465 /* For each follow set that is the follow set of a real position, replace
1466 it with its epsilon closure. */
1467 for (i = 0; i < d->tindex; ++i)
1468 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1469 || d->tokens[i] >= CSET)
1472 fprintf(stderr, "follows(%d:", i);
1473 prtok(d->tokens[i]);
1474 fprintf(stderr, "):");
1475 for (j = d->follows[i].nelem - 1; j >= 0; --j)
1477 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1478 prtok(d->tokens[d->follows[i].elems[j].index]);
1482 copy(&d->follows[i], &merged);
1483 epsclosure(&merged, d);
1484 if (d->follows[i].nelem < merged.nelem)
1485 REALLOC(d->follows[i].elems, position, merged.nelem);
1486 copy(&merged, &d->follows[i]);
1489 /* Get the epsilon closure of the firstpos of the regexp. The result will
1490 be the set of positions of state 0. */
1492 for (i = 0; i < nfirstpos[-1]; ++i)
1493 insert(firstpos[i], &merged);
1494 epsclosure(&merged, d);
1496 /* Check if any of the positions of state 0 will want newline context. */
1498 for (i = 0; i < merged.nelem; ++i)
1499 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1502 /* Build the initial state. */
1505 MALLOC(d->states, dfa_state, d->salloc);
1506 state_index(d, &merged, wants_newline, 0);
1517 /* Find, for each character, the transition out of state s of d, and store
1518 it in the appropriate slot of trans.
1520 We divide the positions of s into groups (positions can appear in more
1521 than one group). Each group is labeled with a set of characters that
1522 every position in the group matches (taking into account, if necessary,
1523 preceding context information of s). For each group, find the union
1524 of the its elements' follows. This set is the set of positions of the
1525 new state. For each character in the group's label, set the transition
1526 on this character to be to a state corresponding to the set's positions,
1527 and its associated backward context information, if necessary.
1529 If we are building a searching matcher, we include the positions of state
1532 The collection of groups is constructed by building an equivalence-class
1533 partition of the positions of s.
1535 For each position, find the set of characters C that it matches. Eliminate
1536 any characters from C that fail on grounds of backward context.
1538 Search through the groups, looking for a group whose label L has nonempty
1539 intersection with C. If L - C is nonempty, create a new group labeled
1540 L - C and having the same positions as the current group, and set L to
1541 the intersection of L and C. Insert the position in this group, set
1542 C = C - L, and resume scanning.
1544 If after comparing with every group there are characters remaining in C,
1545 create a new group labeled with the characters of C and insert this
1546 position in that group. */
1548 dfastate (int s, struct dfa *d, int trans[])
1550 position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1551 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1552 int ngrps = 0; /* Number of groups actually used. */
1553 position pos; /* Current position being considered. */
1554 charclass matches; /* Set of matching characters. */
1555 int matchesf; /* True if matches is nonempty. */
1556 charclass intersect; /* Intersection with some label set. */
1557 int intersectf; /* True if intersect is nonempty. */
1558 charclass leftovers; /* Stuff in the label that didn't match. */
1559 int leftoversf; /* True if leftovers is nonempty. */
1560 static charclass letters; /* Set of characters considered letters. */
1561 static charclass newline; /* Set of characters that aren't newline. */
1562 position_set follows; /* Union of the follows of some group. */
1563 position_set tmp; /* Temporary space for merging sets. */
1564 int state; /* New state. */
1565 int wants_newline; /* New state wants to know newline context. */
1566 int state_newline; /* New state on a newline transition. */
1567 int wants_letter; /* New state wants to know letter context. */
1568 int state_letter; /* New state on a letter transition. */
1569 static int initialized; /* Flag for static initialization. */
1572 /* Initialize the set of letters, if necessary. */
1576 for (i = 0; i < NOTCHAR; ++i)
1577 if (IS_WORD_CONSTITUENT(i))
1579 setbit(eolbyte, newline);
1584 for (i = 0; i < d->states[s].elems.nelem; ++i)
1586 pos = d->states[s].elems.elems[i];
1587 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1588 setbit(d->tokens[pos.index], matches);
1589 else if (d->tokens[pos.index] >= CSET)
1590 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1594 /* Some characters may need to be eliminated from matches because
1595 they fail in the current context. */
1596 if (pos.constraint != 0xFF)
1598 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1599 d->states[s].newline, 1))
1600 clrbit(eolbyte, matches);
1601 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1602 d->states[s].newline, 0))
1603 for (j = 0; j < CHARCLASS_INTS; ++j)
1604 matches[j] &= newline[j];
1605 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1606 d->states[s].letter, 1))
1607 for (j = 0; j < CHARCLASS_INTS; ++j)
1608 matches[j] &= ~letters[j];
1609 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1610 d->states[s].letter, 0))
1611 for (j = 0; j < CHARCLASS_INTS; ++j)
1612 matches[j] &= letters[j];
1614 /* If there are no characters left, there's no point in going on. */
1615 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
1617 if (j == CHARCLASS_INTS)
1621 for (j = 0; j < ngrps; ++j)
1623 /* If matches contains a single character only, and the current
1624 group's label doesn't contain that character, go on to the
1626 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
1627 && !tstbit(d->tokens[pos.index], labels[j]))
1630 /* Check if this group's label has a nonempty intersection with
1633 for (k = 0; k < CHARCLASS_INTS; ++k)
1634 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
1638 /* It does; now find the set differences both ways. */
1639 leftoversf = matchesf = 0;
1640 for (k = 0; k < CHARCLASS_INTS; ++k)
1642 /* Even an optimizing compiler can't know this for sure. */
1643 int match = matches[k], label = labels[j][k];
1645 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
1646 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
1649 /* If there were leftovers, create a new group labeled with them. */
1652 copyset(leftovers, labels[ngrps]);
1653 copyset(intersect, labels[j]);
1654 MALLOC(grps[ngrps].elems, position, d->nleaves);
1655 copy(&grps[j], &grps[ngrps]);
1659 /* Put the position in the current group. Note that there is no
1660 reason to call insert() here. */
1661 grps[j].elems[grps[j].nelem++] = pos;
1663 /* If every character matching the current position has been
1664 accounted for, we're done. */
1669 /* If we've passed the last group, and there are still characters
1670 unaccounted for, then we'll have to create a new group. */
1673 copyset(matches, labels[ngrps]);
1675 MALLOC(grps[ngrps].elems, position, d->nleaves);
1676 grps[ngrps].nelem = 1;
1677 grps[ngrps].elems[0] = pos;
1682 MALLOC(follows.elems, position, d->nleaves);
1683 MALLOC(tmp.elems, position, d->nleaves);
1685 /* If we are a searching matcher, the default transition is to a state
1686 containing the positions of state 0, otherwise the default transition
1687 is to fail miserably. */
1692 for (i = 0; i < d->states[0].elems.nelem; ++i)
1694 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
1696 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
1699 copy(&d->states[0].elems, &follows);
1700 state = state_index(d, &follows, 0, 0);
1702 state_newline = state_index(d, &follows, 1, 0);
1704 state_newline = state;
1706 state_letter = state_index(d, &follows, 0, 1);
1708 state_letter = state;
1709 for (i = 0; i < NOTCHAR; ++i)
1710 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
1711 trans[eolbyte] = state_newline;
1714 for (i = 0; i < NOTCHAR; ++i)
1717 for (i = 0; i < ngrps; ++i)
1721 /* Find the union of the follows of the positions of the group.
1722 This is a hideously inefficient loop. Fix it someday. */
1723 for (j = 0; j < grps[i].nelem; ++j)
1724 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
1725 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
1727 /* If we are building a searching matcher, throw in the positions
1728 of state 0 as well. */
1730 for (j = 0; j < d->states[0].elems.nelem; ++j)
1731 insert(d->states[0].elems.elems[j], &follows);
1733 /* Find out if the new state will want any context information. */
1735 if (tstbit(eolbyte, labels[i]))
1736 for (j = 0; j < follows.nelem; ++j)
1737 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1741 for (j = 0; j < CHARCLASS_INTS; ++j)
1742 if (labels[i][j] & letters[j])
1744 if (j < CHARCLASS_INTS)
1745 for (j = 0; j < follows.nelem; ++j)
1746 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1749 /* Find the state(s) corresponding to the union of the follows. */
1750 state = state_index(d, &follows, 0, 0);
1752 state_newline = state_index(d, &follows, 1, 0);
1754 state_newline = state;
1756 state_letter = state_index(d, &follows, 0, 1);
1758 state_letter = state;
1760 /* Set the transitions for each character in the current label. */
1761 for (j = 0; j < CHARCLASS_INTS; ++j)
1762 for (k = 0; k < INTBITS; ++k)
1763 if (labels[i][j] & 1 << k)
1765 int c = j * INTBITS + k;
1768 trans[c] = state_newline;
1769 else if (IS_WORD_CONSTITUENT(c))
1770 trans[c] = state_letter;
1771 else if (c < NOTCHAR)
1776 for (i = 0; i < ngrps; ++i)
1777 free(grps[i].elems);
1778 free(follows.elems);
1782 /* Some routines for manipulating a compiled dfa's transition tables.
1783 Each state may or may not have a transition table; if it does, and it
1784 is a non-accepting state, then d->trans[state] points to its table.
1785 If it is an accepting state then d->fails[state] points to its table.
1786 If it has no table at all, then d->trans[state] is NULL.
1787 TODO: Improve this comment, get rid of the unnecessary redundancy. */
1790 build_state (int s, struct dfa *d)
1792 int *trans; /* The new transition table. */
1795 /* Set an upper limit on the number of transition tables that will ever
1796 exist at once. 1024 is arbitrary. The idea is that the frequently
1797 used transition tables will be quickly rebuilt, whereas the ones that
1798 were only needed once or twice will be cleared away. */
1799 if (d->trcount >= 1024)
1801 for (i = 0; i < d->tralloc; ++i)
1804 free((ptr_t) d->trans[i]);
1807 else if (d->fails[i])
1809 free((ptr_t) d->fails[i]);
1817 /* Set up the success bits for this state. */
1819 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
1822 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
1825 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
1829 MALLOC(trans, int, NOTCHAR);
1830 dfastate(s, d, trans);
1832 /* Now go through the new transition table, and make sure that the trans
1833 and fail arrays are allocated large enough to hold a pointer for the
1834 largest state mentioned in the table. */
1835 for (i = 0; i < NOTCHAR; ++i)
1836 if (trans[i] >= d->tralloc)
1838 int oldalloc = d->tralloc;
1840 while (trans[i] >= d->tralloc)
1842 REALLOC(d->realtrans, int *, d->tralloc + 1);
1843 d->trans = d->realtrans + 1;
1844 REALLOC(d->fails, int *, d->tralloc);
1845 REALLOC(d->success, int, d->tralloc);
1846 REALLOC(d->newlines, int, d->tralloc);
1847 while (oldalloc < d->tralloc)
1849 d->trans[oldalloc] = NULL;
1850 d->fails[oldalloc++] = NULL;
1854 /* Keep the newline transition in a special place so we can use it as
1856 d->newlines[s] = trans[eolbyte];
1857 trans[eolbyte] = -1;
1859 if (ACCEPTING(s, *d))
1860 d->fails[s] = trans;
1862 d->trans[s] = trans;
1866 build_state_zero (struct dfa *d)
1870 CALLOC(d->realtrans, int *, d->tralloc + 1);
1871 d->trans = d->realtrans + 1;
1872 CALLOC(d->fails, int *, d->tralloc);
1873 MALLOC(d->success, int, d->tralloc);
1874 MALLOC(d->newlines, int, d->tralloc);
1878 /* Search through a buffer looking for a match to the given struct dfa.
1879 Find the first occurrence of a string matching the regexp in the buffer,
1880 and the shortest possible version thereof. Return a pointer to the first
1881 character after the match, or NULL if none is found. Begin points to
1882 the beginning of the buffer, and end points to the first character after
1883 its end. We store a newline in *end to act as a sentinel, so end had
1884 better point somewhere valid. Newline is a flag indicating whether to
1885 allow newlines to be in the matching string. If count is non-
1886 NULL it points to a place we're supposed to increment every time we
1887 see a newline. Finally, if backref is non-NULL it points to a place
1888 where we're supposed to store a 1 if backreferencing happened and the
1889 match needs to be verified by a backtracking matcher. Otherwise
1890 we store a 0 in *backref. */
1892 dfaexec (struct dfa *d, char *begin, char *end,
1893 int newline, int *count, int *backref)
1895 register int s, s1, tmp; /* Current state. */
1896 register unsigned char *p; /* Current input character. */
1897 register int **trans, *t; /* Copy of d->trans so it can be optimized
1899 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
1900 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
1901 static int sbit_init;
1908 for (i = 0; i < NOTCHAR; ++i)
1909 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
1914 build_state_zero(d);
1917 p = (unsigned char *) begin;
1923 while ((t = trans[s]) != 0) { /* hand-optimized loop */
1925 if ((t = trans[s1]) == 0) {
1926 tmp = s ; s = s1 ; s1 = tmp ; /* swap */
1932 if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
1934 if (d->success[s] & sbit[*p])
1937 *backref = (d->states[s].backref != 0);
1942 s = d->fails[s][*p++];
1946 /* If the previous character was a newline, count it. */
1947 if (count && (char *) p <= end && p[-1] == eol)
1950 /* Check if we've run off the end of the buffer. */
1951 if ((char *) p > end)
1961 if (p[-1] == eol && newline)
1963 s = d->newlines[s1];
1971 /* Initialize the components of a dfa that the other routines don't
1972 initialize for themselves. */
1974 dfainit (struct dfa *d)
1977 MALLOC(d->charclasses, charclass, d->calloc);
1981 MALLOC(d->tokens, token, d->talloc);
1982 d->tindex = d->depth = d->nleaves = d->nregexps = 0;
1990 /* Parse and analyze a single string of the given length. */
1992 dfacomp (char *s, size_t len, struct dfa *d, int searchflag)
1994 if (case_fold) /* dummy folding in service of dfamust() */
1999 lcopy = malloc(len);
2001 dfaerror(_("out of memory"));
2003 /* This is a kludge. */
2005 for (i = 0; i < len; ++i)
2006 if (ISUPPER ((unsigned char) s[i]))
2007 lcopy[i] = tolower ((unsigned char) s[i]);
2012 dfaparse(lcopy, len, d);
2015 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2017 dfaparse(s, len, d);
2018 dfaanalyze(d, searchflag);
2023 dfaparse(s, len, d);
2025 dfaanalyze(d, searchflag);
2029 /* Free the storage held by the components of a dfa. */
2031 dfafree (struct dfa *d)
2034 struct dfamust *dm, *ndm;
2036 free((ptr_t) d->charclasses);
2037 free((ptr_t) d->tokens);
2038 for (i = 0; i < d->sindex; ++i)
2039 free((ptr_t) d->states[i].elems.elems);
2040 free((ptr_t) d->states);
2041 for (i = 0; i < d->tindex; ++i)
2042 if (d->follows[i].elems)
2043 free((ptr_t) d->follows[i].elems);
2044 free((ptr_t) d->follows);
2045 for (i = 0; i < d->tralloc; ++i)
2047 free((ptr_t) d->trans[i]);
2048 else if (d->fails[i])
2049 free((ptr_t) d->fails[i]);
2050 if (d->realtrans) free((ptr_t) d->realtrans);
2051 if (d->fails) free((ptr_t) d->fails);
2052 if (d->newlines) free((ptr_t) d->newlines);
2053 if (d->success) free((ptr_t) d->success);
2054 for (dm = d->musts; dm; dm = ndm)
2062 /* Having found the postfix representation of the regular expression,
2063 try to find a long sequence of characters that must appear in any line
2065 Finding a "longest" sequence is beyond the scope here;
2066 we take an easy way out and hope for the best.
2067 (Take "(ab|a)b"--please.)
2069 We do a bottom-up calculation of sequences of characters that must appear
2070 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
2072 sequences that must appear at the left of the match ("left")
2073 sequences that must appear at the right of the match ("right")
2074 lists of sequences that must appear somewhere in the match ("in")
2075 sequences that must constitute the match ("is")
2077 When we get to the root of the tree, we use one of the longest of its
2078 calculated "in" sequences as our answer. The sequence we find is returned in
2079 d->must (where "d" is the single argument passed to "dfamust");
2080 the length of the sequence is returned in d->mustn.
2082 The sequences calculated for the various types of node (in pseudo ANSI c)
2083 are shown below. "p" is the operand of unary operators (and the left-hand
2084 operand of binary operators); "q" is the right-hand operand of binary
2087 "ZERO" means "a zero-length sequence" below.
2089 Type left right is in
2090 ---- ---- ----- -- --
2091 char c # c # c # c # c
2093 CSET ZERO ZERO ZERO ZERO
2095 STAR ZERO ZERO ZERO ZERO
2097 QMARK ZERO ZERO ZERO ZERO
2099 PLUS p->left p->right ZERO p->in
2101 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
2102 p->left : q->right : q->is!=ZERO) ? q->in plus
2103 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
2106 OR longest common longest common (do p->is and substrings common to
2107 leading trailing q->is have same p->in and q->in
2108 (sub)sequence (sub)sequence length and
2109 of p->left of p->right content) ?
2110 and q->left and q->right p->is : NULL
2112 If there's anything else we recognize in the tree, all four sequences get set
2113 to zero-length sequences. If there's something we don't recognize in the tree,
2114 we just return a zero-length sequence.
2116 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
2119 And. . .is it here or someplace that we might ponder "optimizations" such as
2120 egrep 'psi|epsilon' -> egrep 'psi'
2121 egrep 'pepsi|epsilon' -> egrep 'epsi'
2122 (Yes, we now find "epsi" as a "string
2123 that must occur", but we might also
2124 simplify the *entire* r.e. being sought)
2125 grep '[c]' -> grep 'c'
2126 grep '(ab|a)b' -> grep 'ab'
2127 grep 'ab*' -> grep 'a'
2128 grep 'a*b' -> grep 'b'
2130 There are several issues:
2132 Is optimization easy (enough)?
2134 Does optimization actually accomplish anything,
2135 or is the automaton you get from "psi|epsilon" (for example)
2136 the same as the one you get from "psi" (for example)?
2138 Are optimizable r.e.'s likely to be used in real-life situations
2139 (something like 'ab*' is probably unlikely; something like is
2140 'psi|epsilon' is likelier)? */
2143 icatalloc (char *old, char *new)
2146 size_t oldsize, newsize;
2148 newsize = (new == NULL) ? 0 : strlen(new);
2151 else if (newsize == 0)
2153 else oldsize = strlen(old);
2155 result = (char *) malloc(newsize + 1);
2157 result = (char *) realloc((void *) old, oldsize + newsize + 1);
2158 if (result != NULL && new != NULL)
2159 (void) strcpy(result + oldsize, new);
2164 icpyalloc (char *string)
2166 return icatalloc((char *) NULL, string);
2170 istrstr (char *lookin, char *lookfor)
2175 len = strlen(lookfor);
2176 for (cp = lookin; *cp != '\0'; ++cp)
2177 if (strncmp(cp, lookfor, len) == 0)
2190 freelist (char **cpp)
2196 for (i = 0; cpp[i] != NULL; ++i)
2204 enlist (char **cpp, char *new, size_t len)
2210 if ((new = icpyalloc(new)) == NULL)
2216 /* Is there already something in the list that's new (or longer)? */
2217 for (i = 0; cpp[i] != NULL; ++i)
2218 if (istrstr(cpp[i], new) != NULL)
2223 /* Eliminate any obsoleted strings. */
2225 while (cpp[j] != NULL)
2226 if (istrstr(new, cpp[j]) == NULL)
2236 /* Add the new string. */
2237 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
2245 /* Given pointers to two strings, return a pointer to an allocated
2246 list of their distinct common substrings. Return NULL if something
2249 comsubs (char *left, char *right)
2256 if (left == NULL || right == NULL)
2258 cpp = (char **) malloc(sizeof *cpp);
2262 for (lcp = left; *lcp != '\0'; ++lcp)
2265 rcp = index(right, *lcp);
2268 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
2272 rcp = index(rcp + 1, *lcp);
2276 if ((cpp = enlist(cpp, lcp, len)) == NULL)
2283 addlists (char **old, char **new)
2287 if (old == NULL || new == NULL)
2289 for (i = 0; new[i] != NULL; ++i)
2291 old = enlist(old, new[i], strlen(new[i]));
2298 /* Given two lists of substrings, return a new list giving substrings
2301 inboth (char **left, char **right)
2307 if (left == NULL || right == NULL)
2309 both = (char **) malloc(sizeof *both);
2313 for (lnum = 0; left[lnum] != NULL; ++lnum)
2315 for (rnum = 0; right[rnum] != NULL; ++rnum)
2317 temp = comsubs(left[lnum], right[rnum]);
2323 both = addlists(both, temp);
2342 resetmust (must *mp)
2344 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2349 dfamust (struct dfa *dfa)
2360 static char empty_string[] = "";
2362 result = empty_string;
2364 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
2368 for (i = 0; i <= dfa->tindex; ++i)
2370 for (i = 0; i <= dfa->tindex; ++i)
2372 mp[i].in = (char **) malloc(sizeof *mp[i].in);
2373 mp[i].left = malloc(2);
2374 mp[i].right = malloc(2);
2375 mp[i].is = malloc(2);
2376 if (mp[i].in == NULL || mp[i].left == NULL ||
2377 mp[i].right == NULL || mp[i].is == NULL)
2379 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2383 fprintf(stderr, "dfamust:\n");
2384 for (i = 0; i < dfa->tindex; ++i)
2386 fprintf(stderr, " %d:", i);
2387 prtok(dfa->tokens[i]);
2391 for (ri = 0; ri < dfa->tindex; ++ri)
2393 switch (t = dfa->tokens[ri])
2397 goto done; /* "cannot happen" */
2411 goto done; /* "cannot happen" */
2418 goto done; /* "cannot happen" */
2427 /* Guaranteed to be. Unlikely, but. . . */
2428 if (strcmp(lmp->is, rmp->is) != 0)
2430 /* Left side--easy */
2432 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
2434 lmp->left[i] = '\0';
2436 ln = strlen(lmp->right);
2437 rn = strlen(rmp->right);
2441 for (i = 0; i < n; ++i)
2442 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
2444 for (j = 0; j < i; ++j)
2445 lmp->right[j] = lmp->right[(ln - i) + j];
2446 lmp->right[j] = '\0';
2447 new = inboth(lmp->in, rmp->in);
2451 free((char *) lmp->in);
2457 goto done; /* "cannot happen" */
2462 if (mp != &musts[1])
2463 goto done; /* "cannot happen" */
2464 for (i = 0; musts[0].in[i] != NULL; ++i)
2465 if (strlen(musts[0].in[i]) > strlen(result))
2466 result = musts[0].in[i];
2467 if (strcmp(result, musts[0].is) == 0)
2472 goto done; /* "cannot happen" */
2479 /* In. Everything in left, plus everything in
2480 right, plus catenation of
2481 left's right and right's left. */
2482 lmp->in = addlists(lmp->in, rmp->in);
2483 if (lmp->in == NULL)
2485 if (lmp->right[0] != '\0' &&
2486 rmp->left[0] != '\0')
2490 tp = icpyalloc(lmp->right);
2493 tp = icatalloc(tp, rmp->left);
2496 lmp->in = enlist(lmp->in, tp,
2499 if (lmp->in == NULL)
2503 if (lmp->is[0] != '\0')
2505 lmp->left = icatalloc(lmp->left,
2507 if (lmp->left == NULL)
2511 if (rmp->is[0] == '\0')
2512 lmp->right[0] = '\0';
2513 lmp->right = icatalloc(lmp->right, rmp->right);
2514 if (lmp->right == NULL)
2516 /* Guaranteed to be */
2517 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
2519 lmp->is = icatalloc(lmp->is, rmp->is);
2520 if (lmp->is == NULL)
2530 /* "cannot happen" */
2535 /* not on *my* shift */
2545 /* plain character */
2547 mp->is[0] = mp->left[0] = mp->right[0] = t;
2548 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2549 mp->in = enlist(mp->in, mp->is, (size_t)1);
2556 fprintf(stderr, " node: %d:", ri);
2557 prtok(dfa->tokens[ri]);
2558 fprintf(stderr, "\n in:");
2559 for (i = 0; mp->in[i]; ++i)
2560 fprintf(stderr, " \"%s\"", mp->in[i]);
2561 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
2562 fprintf(stderr, " left: \"%s\"\n", mp->left);
2563 fprintf(stderr, " right: \"%s\"\n", mp->right);
2570 dm = (struct dfamust *) malloc(sizeof (struct dfamust));
2572 dm->must = malloc(strlen(result) + 1);
2573 strcpy(dm->must, result);
2574 dm->next = dfa->musts;
2578 for (i = 0; i <= dfa->tindex; ++i)
2581 ifree((char *) mp[i].in);