Grep: Add DragonFly README files
[dragonfly.git] / gnu / usr.bin / grep / dfa.c
CommitLineData
984263bc
MD
1/* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
17
18/* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
20
21/* $FreeBSD: src/gnu/usr.bin/grep/dfa.c,v 1.12 2000/01/31 13:28:08 ru Exp $ */
0e387216 22/* $DragonFly: src/gnu/usr.bin/grep/dfa.c,v 1.3 2004/02/03 19:22:57 dillon Exp $ */
984263bc
MD
23
24#ifdef HAVE_CONFIG_H
25#include <config.h>
26#endif
27
28#include <assert.h>
29#include <ctype.h>
30#include <stdio.h>
31
32#include <sys/types.h>
33#ifdef STDC_HEADERS
34#include <stdlib.h>
35#else
36extern char *calloc(), *malloc(), *realloc();
37extern void free();
38#endif
39
40#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
41#include <string.h>
42#undef index
43#define index strchr
44#else
45#include <strings.h>
46#endif
47
48#ifndef DEBUG /* use the same approach as regex.c */
49#undef assert
50#define assert(e)
51#endif /* DEBUG */
52
53#ifndef isgraph
54#define isgraph(C) (isprint(C) && !isspace(C))
55#endif
56
57#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
58#define ISALPHA(C) isalpha(C)
59#define ISUPPER(C) isupper(C)
60#define ISLOWER(C) islower(C)
61#define ISDIGIT(C) isdigit(C)
62#define ISXDIGIT(C) isxdigit(C)
63#define ISSPACE(C) isspace(C)
64#define ISPUNCT(C) ispunct(C)
65#define ISALNUM(C) isalnum(C)
66#define ISPRINT(C) isprint(C)
67#define ISGRAPH(C) isgraph(C)
68#define ISCNTRL(C) iscntrl(C)
69#else
70#define ISALPHA(C) (isascii(C) && isalpha(C))
71#define ISUPPER(C) (isascii(C) && isupper(C))
72#define ISLOWER(C) (isascii(C) && islower(C))
73#define ISDIGIT(C) (isascii(C) && isdigit(C))
74#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
75#define ISSPACE(C) (isascii(C) && isspace(C))
76#define ISPUNCT(C) (isascii(C) && ispunct(C))
77#define ISALNUM(C) (isascii(C) && isalnum(C))
78#define ISPRINT(C) (isascii(C) && isprint(C))
79#define ISGRAPH(C) (isascii(C) && isgraph(C))
80#define ISCNTRL(C) (isascii(C) && iscntrl(C))
81#endif
82
83/* ISASCIIDIGIT differs from ISDIGIT, as follows:
84 - Its arg may be any int or unsigned int; it need not be an unsigned char.
85 - It's guaranteed to evaluate its argument exactly once.
86 - It's typically faster.
87 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
88 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
89 it's important to use the locale's definition of `digit' even when the
90 host does not conform to Posix. */
91#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
92
93/* If we (don't) have I18N. */
94/* glibc defines _ */
95#ifndef _
96# ifdef HAVE_LIBINTL_H
97# include <libintl.h>
98# ifndef _
99# define _(Str) gettext (Str)
100# endif
101# else
102# define _(Str) (Str)
103# endif
104#endif
105
984263bc 106#include <gnuregex.h>
984263bc
MD
107#include "dfa.h"
108
109/* HPUX, define those as macros in sys/param.h */
110#ifdef setbit
111# undef setbit
112#endif
113#ifdef clrbit
114# undef clrbit
115#endif
116
117static void dfamust PARAMS ((struct dfa *dfa));
118
119static ptr_t xcalloc PARAMS ((size_t n, size_t s));
120static ptr_t xmalloc PARAMS ((size_t n));
121static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
122#ifdef DEBUG
123static void prtok PARAMS ((token t));
124#endif
125static int tstbit PARAMS ((int b, charclass c));
126static void setbit PARAMS ((int b, charclass c));
127static void clrbit PARAMS ((int b, charclass c));
128static void copyset PARAMS ((charclass src, charclass dst));
129static void zeroset PARAMS ((charclass s));
130static void notset PARAMS ((charclass s));
131static int equal PARAMS ((charclass s1, charclass s2));
132static int charclass_index PARAMS ((charclass s));
133static int looking_at PARAMS ((const char *s));
134static token lex PARAMS ((void));
135static void addtok PARAMS ((token t));
136static void atom PARAMS ((void));
137static int nsubtoks PARAMS ((int tindex));
138static void copytoks PARAMS ((int tindex, int ntokens));
139static void closure PARAMS ((void));
140static void branch PARAMS ((void));
141static void regexp PARAMS ((int toplevel));
142static void copy PARAMS ((position_set *src, position_set *dst));
143static void insert PARAMS ((position p, position_set *s));
144static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m));
145static void delete PARAMS ((position p, position_set *s));
146static int state_index PARAMS ((struct dfa *d, position_set *s,
147 int newline, int letter));
148static void build_state PARAMS ((int s, struct dfa *d));
149static void build_state_zero PARAMS ((struct dfa *d));
150static char *icatalloc PARAMS ((char *old, char *new));
151static char *icpyalloc PARAMS ((char *string));
152static char *istrstr PARAMS ((char *lookin, char *lookfor));
153static void ifree PARAMS ((char *cp));
154static void freelist PARAMS ((char **cpp));
155static char **enlist PARAMS ((char **cpp, char *new, size_t len));
156static char **comsubs PARAMS ((char *left, char *right));
157static char **addlists PARAMS ((char **old, char **new));
158static char **inboth PARAMS ((char **left, char **right));
159
160static ptr_t
161xcalloc (size_t n, size_t s)
162{
163 ptr_t r = calloc(n, s);
164
165 if (!r)
166 dfaerror(_("Memory exhausted"));
167 return r;
168}
169
170static ptr_t
171xmalloc (size_t n)
172{
173 ptr_t r = malloc(n);
174
175 assert(n != 0);
176 if (!r)
177 dfaerror(_("Memory exhausted"));
178 return r;
179}
180
181static ptr_t
182xrealloc (ptr_t p, size_t n)
183{
184 ptr_t r = realloc(p, n);
185
186 assert(n != 0);
187 if (!r)
188 dfaerror(_("Memory exhausted"));
189 return r;
190}
191
192#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
193#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
194#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
195
196/* Reallocate an array of type t if nalloc is too small for index. */
197#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
198 if ((index) >= (nalloc)) \
199 { \
200 while ((index) >= (nalloc)) \
201 (nalloc) *= 2; \
202 REALLOC(p, t, nalloc); \
203 }
204
205#ifdef DEBUG
206
207static void
208prtok (token t)
209{
210 char *s;
211
212 if (t < 0)
213 fprintf(stderr, "END");
214 else if (t < NOTCHAR)
215 fprintf(stderr, "%c", t);
216 else
217 {
218 switch (t)
219 {
220 case EMPTY: s = "EMPTY"; break;
221 case BACKREF: s = "BACKREF"; break;
222 case BEGLINE: s = "BEGLINE"; break;
223 case ENDLINE: s = "ENDLINE"; break;
224 case BEGWORD: s = "BEGWORD"; break;
225 case ENDWORD: s = "ENDWORD"; break;
226 case LIMWORD: s = "LIMWORD"; break;
227 case NOTLIMWORD: s = "NOTLIMWORD"; break;
228 case QMARK: s = "QMARK"; break;
229 case STAR: s = "STAR"; break;
230 case PLUS: s = "PLUS"; break;
231 case CAT: s = "CAT"; break;
232 case OR: s = "OR"; break;
233 case ORTOP: s = "ORTOP"; break;
234 case LPAREN: s = "LPAREN"; break;
235 case RPAREN: s = "RPAREN"; break;
236 default: s = "CSET"; break;
237 }
238 fprintf(stderr, "%s", s);
239 }
240}
241#endif /* DEBUG */
242
243/* Stuff pertaining to charclasses. */
244
245static int
246tstbit (int b, charclass c)
247{
248 return c[b / INTBITS] & 1 << b % INTBITS;
249}
250
251static void
252setbit (int b, charclass c)
253{
254 c[b / INTBITS] |= 1 << b % INTBITS;
255}
256
257static void
258clrbit (int b, charclass c)
259{
260 c[b / INTBITS] &= ~(1 << b % INTBITS);
261}
262
263static void
264copyset (charclass src, charclass dst)
265{
266 int i;
267
268 for (i = 0; i < CHARCLASS_INTS; ++i)
269 dst[i] = src[i];
270}
271
272static void
273zeroset (charclass s)
274{
275 int i;
276
277 for (i = 0; i < CHARCLASS_INTS; ++i)
278 s[i] = 0;
279}
280
281static void
282notset (charclass s)
283{
284 int i;
285
286 for (i = 0; i < CHARCLASS_INTS; ++i)
287 s[i] = ~s[i];
288}
289
290static int
291equal (charclass s1, charclass s2)
292{
293 int i;
294
295 for (i = 0; i < CHARCLASS_INTS; ++i)
296 if (s1[i] != s2[i])
297 return 0;
298 return 1;
299}
300
301/* A pointer to the current dfa is kept here during parsing. */
302static struct dfa *dfa;
303
304/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
305static int
306charclass_index (charclass s)
307{
308 int i;
309
310 for (i = 0; i < dfa->cindex; ++i)
311 if (equal(s, dfa->charclasses[i]))
312 return i;
313 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
314 ++dfa->cindex;
315 copyset(s, dfa->charclasses[i]);
316 return i;
317}
318
319/* Syntax bits controlling the behavior of the lexical analyzer. */
320static reg_syntax_t syntax_bits, syntax_bits_set;
321
322/* Flag for case-folding letters into sets. */
323static int case_fold;
324
325/* End-of-line byte in data. */
326static unsigned char eolbyte;
327
328/* Entry point to set syntax options. */
329void
330dfasyntax (reg_syntax_t bits, int fold, int eol)
331{
332 syntax_bits_set = 1;
333 syntax_bits = bits;
334 case_fold = fold;
335 eolbyte = eol;
336}
337
338/* Lexical analyzer. All the dross that deals with the obnoxious
339 GNU Regex syntax bits is located here. The poor, suffering
340 reader is referred to the GNU Regex documentation for the
341 meaning of the @#%!@#%^!@ syntax bits. */
342
343static char *lexstart; /* Pointer to beginning of input string. */
344static char *lexptr; /* Pointer to next input character. */
345static int lexleft; /* Number of characters remaining. */
346static token lasttok; /* Previous token returned; initially END. */
347static int laststart; /* True if we're separated from beginning or (, |
348 only by zero-width characters. */
349static int parens; /* Count of outstanding left parens. */
350static int minrep, maxrep; /* Repeat counts for {m,n}. */
351
352/* Note that characters become unsigned here. */
353#define FETCH(c, eoferr) \
354 { \
355 if (! lexleft) \
356 { \
357 if (eoferr != 0) \
358 dfaerror (eoferr); \
359 else \
360 return lasttok = END; \
361 } \
362 (c) = (unsigned char) *lexptr++; \
363 --lexleft; \
364 }
365
366#ifdef __STDC__
367#define FUNC(F, P) static int F(int c) { return P(c); }
368#else
369#define FUNC(F, P) static int F(c) int c; { return P(c); }
370#endif
371
372FUNC(is_alpha, ISALPHA)
373FUNC(is_upper, ISUPPER)
374FUNC(is_lower, ISLOWER)
375FUNC(is_digit, ISDIGIT)
376FUNC(is_xdigit, ISXDIGIT)
377FUNC(is_space, ISSPACE)
378FUNC(is_punct, ISPUNCT)
379FUNC(is_alnum, ISALNUM)
380FUNC(is_print, ISPRINT)
381FUNC(is_graph, ISGRAPH)
382FUNC(is_cntrl, ISCNTRL)
383
384static int
385is_blank (int c)
386{
387 return (c == ' ' || c == '\t');
388}
389
390/* The following list maps the names of the Posix named character classes
391 to predicate functions that determine whether a given character is in
392 the class. The leading [ has already been eaten by the lexical analyzer. */
393static struct {
394 const char *name;
395 int (*pred) PARAMS ((int));
396} prednames[] = {
397 { ":alpha:]", is_alpha },
398 { ":upper:]", is_upper },
399 { ":lower:]", is_lower },
400 { ":digit:]", is_digit },
401 { ":xdigit:]", is_xdigit },
402 { ":space:]", is_space },
403 { ":punct:]", is_punct },
404 { ":alnum:]", is_alnum },
405 { ":print:]", is_print },
406 { ":graph:]", is_graph },
407 { ":cntrl:]", is_cntrl },
408 { ":blank:]", is_blank },
409 { 0 }
410};
411
412/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
413#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
414
415static int
416looking_at (char const *s)
417{
418 size_t len;
419
420 len = strlen(s);
421 if (lexleft < len)
422 return 0;
423 return strncmp(s, lexptr, len) == 0;
424}
425
426static token
427lex (void)
428{
429 token c, c1, c2;
430 int backslash = 0, invert;
431 charclass ccl;
432 int i;
433 char lo[2];
434 char hi[2];
435
436 /* Basic plan: We fetch a character. If it's a backslash,
437 we set the backslash flag and go through the loop again.
438 On the plus side, this avoids having a duplicate of the
439 main switch inside the backslash case. On the minus side,
440 it means that just about every case begins with
441 "if (backslash) ...". */
442 for (i = 0; i < 2; ++i)
443 {
444 FETCH(c, 0);
445 switch (c)
446 {
447 case '\\':
448 if (backslash)
449 goto normal_char;
450 if (lexleft == 0)
451 dfaerror(_("Unfinished \\ escape"));
452 backslash = 1;
453 break;
454
455 case '^':
456 if (backslash)
457 goto normal_char;
458 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
459 || lasttok == END
460 || lasttok == LPAREN
461 || lasttok == OR)
462 return lasttok = BEGLINE;
463 goto normal_char;
464
465 case '$':
466 if (backslash)
467 goto normal_char;
468 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
469 || lexleft == 0
470 || (syntax_bits & RE_NO_BK_PARENS
471 ? lexleft > 0 && *lexptr == ')'
472 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
473 || (syntax_bits & RE_NO_BK_VBAR
474 ? lexleft > 0 && *lexptr == '|'
475 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
476 || ((syntax_bits & RE_NEWLINE_ALT)
477 && lexleft > 0 && *lexptr == '\n'))
478 return lasttok = ENDLINE;
479 goto normal_char;
480
481 case '1':
482 case '2':
483 case '3':
484 case '4':
485 case '5':
486 case '6':
487 case '7':
488 case '8':
489 case '9':
490 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
491 {
492 laststart = 0;
493 return lasttok = BACKREF;
494 }
495 goto normal_char;
496
497 case '`':
498 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
499 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
500 goto normal_char;
501
502 case '\'':
503 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
504 return lasttok = ENDLINE; /* FIXME: should be end of string */
505 goto normal_char;
506
507 case '<':
508 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
509 return lasttok = BEGWORD;
510 goto normal_char;
511
512 case '>':
513 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
514 return lasttok = ENDWORD;
515 goto normal_char;
516
517 case 'b':
518 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
519 return lasttok = LIMWORD;
520 goto normal_char;
521
522 case 'B':
523 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
524 return lasttok = NOTLIMWORD;
525 goto normal_char;
526
527 case '?':
528 if (syntax_bits & RE_LIMITED_OPS)
529 goto normal_char;
530 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
531 goto normal_char;
532 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
533 goto normal_char;
534 return lasttok = QMARK;
535
536 case '*':
537 if (backslash)
538 goto normal_char;
539 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
540 goto normal_char;
541 return lasttok = STAR;
542
543 case '+':
544 if (syntax_bits & RE_LIMITED_OPS)
545 goto normal_char;
546 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
547 goto normal_char;
548 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
549 goto normal_char;
550 return lasttok = PLUS;
551
552 case '{':
553 if (!(syntax_bits & RE_INTERVALS))
554 goto normal_char;
555 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
556 goto normal_char;
557 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
558 goto normal_char;
559
560 if (syntax_bits & RE_NO_BK_BRACES)
561 {
562 /* Scan ahead for a valid interval; if it's not valid,
563 treat it as a literal '{'. */
564 int lo = -1, hi = -1;
565 char const *p = lexptr;
566 char const *lim = p + lexleft;
567 for (; p != lim && ISASCIIDIGIT (*p); p++)
568 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
569 if (p != lim && *p == ',')
570 while (++p != lim && ISASCIIDIGIT (*p))
571 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
572 else
573 hi = lo;
574 if (p == lim || *p != '}'
575 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
576 goto normal_char;
577 }
578
579 minrep = 0;
580 /* Cases:
581 {M} - exact count
582 {M,} - minimum count, maximum is infinity
583 {M,N} - M through N */
584 FETCH(c, _("unfinished repeat count"));
585 if (ISASCIIDIGIT (c))
586 {
587 minrep = c - '0';
588 for (;;)
589 {
590 FETCH(c, _("unfinished repeat count"));
591 if (! ISASCIIDIGIT (c))
592 break;
593 minrep = 10 * minrep + c - '0';
594 }
595 }
596 else
597 dfaerror(_("malformed repeat count"));
598 if (c == ',')
599 {
600 FETCH (c, _("unfinished repeat count"));
601 if (! ISASCIIDIGIT (c))
602 maxrep = -1;
603 else
604 {
605 maxrep = c - '0';
606 for (;;)
607 {
608 FETCH (c, _("unfinished repeat count"));
609 if (! ISASCIIDIGIT (c))
610 break;
611 maxrep = 10 * maxrep + c - '0';
612 }
613 if (0 <= maxrep && maxrep < minrep)
614 dfaerror (_("malformed repeat count"));
615 }
616 }
617 else
618 maxrep = minrep;
619 if (!(syntax_bits & RE_NO_BK_BRACES))
620 {
621 if (c != '\\')
622 dfaerror(_("malformed repeat count"));
623 FETCH(c, _("unfinished repeat count"));
624 }
625 if (c != '}')
626 dfaerror(_("malformed repeat count"));
627 laststart = 0;
628 return lasttok = REPMN;
629
630 case '|':
631 if (syntax_bits & RE_LIMITED_OPS)
632 goto normal_char;
633 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
634 goto normal_char;
635 laststart = 1;
636 return lasttok = OR;
637
638 case '\n':
639 if (syntax_bits & RE_LIMITED_OPS
640 || backslash
641 || !(syntax_bits & RE_NEWLINE_ALT))
642 goto normal_char;
643 laststart = 1;
644 return lasttok = OR;
645
646 case '(':
647 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
648 goto normal_char;
649 ++parens;
650 laststart = 1;
651 return lasttok = LPAREN;
652
653 case ')':
654 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
655 goto normal_char;
656 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
657 goto normal_char;
658 --parens;
659 laststart = 0;
660 return lasttok = RPAREN;
661
662 case '.':
663 if (backslash)
664 goto normal_char;
665 zeroset(ccl);
666 notset(ccl);
667 if (!(syntax_bits & RE_DOT_NEWLINE))
668 clrbit(eolbyte, ccl);
669 if (syntax_bits & RE_DOT_NOT_NULL)
670 clrbit('\0', ccl);
671 laststart = 0;
672 return lasttok = CSET + charclass_index(ccl);
673
674 case 'w':
675 case 'W':
676 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
677 goto normal_char;
678 zeroset(ccl);
679 for (c2 = 0; c2 < NOTCHAR; ++c2)
680 if (IS_WORD_CONSTITUENT(c2))
681 setbit(c2, ccl);
682 if (c == 'W')
683 notset(ccl);
684 laststart = 0;
685 return lasttok = CSET + charclass_index(ccl);
686
687 case '[':
688 if (backslash)
689 goto normal_char;
690 zeroset(ccl);
691 FETCH(c, _("Unbalanced ["));
692 if (c == '^')
693 {
694 FETCH(c, _("Unbalanced ["));
695 invert = 1;
696 }
697 else
698 invert = 0;
699 do
700 {
701 /* Nobody ever said this had to be fast. :-)
702 Note that if we're looking at some other [:...:]
703 construct, we just treat it as a bunch of ordinary
704 characters. We can do this because we assume
705 regex has checked for syntax errors before
706 dfa is ever called. */
707 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
708 for (c1 = 0; prednames[c1].name; ++c1)
709 if (looking_at(prednames[c1].name))
710 {
711 int (*pred)() = prednames[c1].pred;
712 if (case_fold
713 && (pred == is_upper || pred == is_lower))
714 pred = is_alpha;
715
716 for (c2 = 0; c2 < NOTCHAR; ++c2)
717 if ((*pred)(c2))
718 setbit(c2, ccl);
719 lexptr += strlen(prednames[c1].name);
720 lexleft -= strlen(prednames[c1].name);
721 FETCH(c1, _("Unbalanced ["));
722 goto skip;
723 }
724 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
725 FETCH(c, _("Unbalanced ["));
726 FETCH(c1, _("Unbalanced ["));
727 if (c1 == '-')
728 {
729 FETCH(c2, _("Unbalanced ["));
730 if (c2 == ']')
731 {
732 /* In the case [x-], the - is an ordinary hyphen,
733 which is left in c1, the lookahead character. */
734 --lexptr;
735 ++lexleft;
736 c2 = c;
737 }
738 else
739 {
740 if (c2 == '\\'
741 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
742 FETCH(c2, _("Unbalanced ["));
743 FETCH(c1, _("Unbalanced ["));
744 }
745 }
746 else
747 c2 = c;
748
749 lo[0] = c; lo[1] = '\0';
750 hi[0] = c2; hi[1] = '\0';
751 for (c = 0; c < NOTCHAR; c++)
752 {
753 char ch[2];
754 ch[0] = c; ch[1] = '\0';
755 if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0)
756 {
757 setbit (c, ccl);
758 if (case_fold)
759 {
760 if (ISUPPER (c))
761 setbit (tolower (c), ccl);
762 else if (ISLOWER (c))
763 setbit (toupper (c), ccl);
764 }
765 }
766 }
767
768 skip:
769 ;
770 }
771 while ((c = c1) != ']');
772 if (invert)
773 {
774 notset(ccl);
775 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
776 clrbit(eolbyte, ccl);
777 }
778 laststart = 0;
779 return lasttok = CSET + charclass_index(ccl);
780
781 default:
782 normal_char:
783 laststart = 0;
784 if (case_fold && ISALPHA(c))
785 {
786 zeroset(ccl);
787 setbit(c, ccl);
788 if (isupper(c))
789 setbit(tolower(c), ccl);
790 else
791 setbit(toupper(c), ccl);
792 return lasttok = CSET + charclass_index(ccl);
793 }
794 return c;
795 }
796 }
797
798 /* The above loop should consume at most a backslash
799 and some other character. */
800 abort();
801 return END; /* keeps pedantic compilers happy. */
802}
803
804/* Recursive descent parser for regular expressions. */
805
806static token tok; /* Lookahead token. */
807static int depth; /* Current depth of a hypothetical stack
808 holding deferred productions. This is
809 used to determine the depth that will be
810 required of the real stack later on in
811 dfaanalyze(). */
812
813/* Add the given token to the parse tree, maintaining the depth count and
814 updating the maximum depth if necessary. */
815static void
816addtok (token t)
817{
818 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
819 dfa->tokens[dfa->tindex++] = t;
820
821 switch (t)
822 {
823 case QMARK:
824 case STAR:
825 case PLUS:
826 break;
827
828 case CAT:
829 case OR:
830 case ORTOP:
831 --depth;
832 break;
833
834 default:
835 ++dfa->nleaves;
836 case EMPTY:
837 ++depth;
838 break;
839 }
840 if (depth > dfa->depth)
841 dfa->depth = depth;
842}
843
844/* The grammar understood by the parser is as follows.
845
846 regexp:
847 regexp OR branch
848 branch
849
850 branch:
851 branch closure
852 closure
853
854 closure:
855 closure QMARK
856 closure STAR
857 closure PLUS
858 atom
859
860 atom:
861 <normal character>
862 CSET
863 BACKREF
864 BEGLINE
865 ENDLINE
866 BEGWORD
867 ENDWORD
868 LIMWORD
869 NOTLIMWORD
870 <empty>
871
872 The parser builds a parse tree in postfix form in an array of tokens. */
873
874static void
875atom (void)
876{
877 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
878 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
879 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
880 {
881 addtok(tok);
882 tok = lex();
883 }
884 else if (tok == LPAREN)
885 {
886 tok = lex();
887 regexp(0);
888 if (tok != RPAREN)
889 dfaerror(_("Unbalanced ("));
890 tok = lex();
891 }
892 else
893 addtok(EMPTY);
894}
895
896/* Return the number of tokens in the given subexpression. */
897static int
898nsubtoks (int tindex)
899{
900 int ntoks1;
901
902 switch (dfa->tokens[tindex - 1])
903 {
904 default:
905 return 1;
906 case QMARK:
907 case STAR:
908 case PLUS:
909 return 1 + nsubtoks(tindex - 1);
910 case CAT:
911 case OR:
912 case ORTOP:
913 ntoks1 = nsubtoks(tindex - 1);
914 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
915 }
916}
917
918/* Copy the given subexpression to the top of the tree. */
919static void
920copytoks (int tindex, int ntokens)
921{
922 int i;
923
924 for (i = 0; i < ntokens; ++i)
925 addtok(dfa->tokens[tindex + i]);
926}
927
928static void
929closure (void)
930{
931 int tindex, ntokens, i;
932
933 atom();
934 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
935 if (tok == REPMN)
936 {
937 ntokens = nsubtoks(dfa->tindex);
938 tindex = dfa->tindex - ntokens;
939 if (maxrep < 0)
940 addtok(PLUS);
941 if (minrep == 0)
942 addtok(QMARK);
943 for (i = 1; i < minrep; ++i)
944 {
945 copytoks(tindex, ntokens);
946 addtok(CAT);
947 }
948 for (; i < maxrep; ++i)
949 {
950 copytoks(tindex, ntokens);
951 addtok(QMARK);
952 addtok(CAT);
953 }
954 tok = lex();
955 }
956 else
957 {
958 addtok(tok);
959 tok = lex();
960 }
961}
962
963static void
964branch (void)
965{
966 closure();
967 while (tok != RPAREN && tok != OR && tok >= 0)
968 {
969 closure();
970 addtok(CAT);
971 }
972}
973
974static void
975regexp (int toplevel)
976{
977 branch();
978 while (tok == OR)
979 {
980 tok = lex();
981 branch();
982 if (toplevel)
983 addtok(ORTOP);
984 else
985 addtok(OR);
986 }
987}
988
989/* Main entry point for the parser. S is a string to be parsed, len is the
990 length of the string, so s can include NUL characters. D is a pointer to
991 the struct dfa to parse into. */
992void
993dfaparse (char *s, size_t len, struct dfa *d)
994{
995 dfa = d;
996 lexstart = lexptr = s;
997 lexleft = len;
998 lasttok = END;
999 laststart = 1;
1000 parens = 0;
1001
1002 if (! syntax_bits_set)
1003 dfaerror(_("No syntax specified"));
1004
1005 tok = lex();
1006 depth = d->depth;
1007
1008 regexp(1);
1009
1010 if (tok != END)
1011 dfaerror(_("Unbalanced )"));
1012
1013 addtok(END - d->nregexps);
1014 addtok(CAT);
1015
1016 if (d->nregexps)
1017 addtok(ORTOP);
1018
1019 ++d->nregexps;
1020}
1021
1022/* Some primitives for operating on sets of positions. */
1023
1024/* Copy one set to another; the destination must be large enough. */
1025static void
1026copy (position_set *src, position_set *dst)
1027{
1028 int i;
1029
1030 for (i = 0; i < src->nelem; ++i)
1031 dst->elems[i] = src->elems[i];
1032 dst->nelem = src->nelem;
1033}
1034
1035/* Insert a position in a set. Position sets are maintained in sorted
1036 order according to index. If position already exists in the set with
1037 the same index then their constraints are logically or'd together.
1038 S->elems must point to an array large enough to hold the resulting set. */
1039static void
1040insert (position p, position_set *s)
1041{
1042 int i;
1043 position t1, t2;
1044
1045 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1046 continue;
1047 if (i < s->nelem && p.index == s->elems[i].index)
1048 s->elems[i].constraint |= p.constraint;
1049 else
1050 {
1051 t1 = p;
1052 ++s->nelem;
1053 while (i < s->nelem)
1054 {
1055 t2 = s->elems[i];
1056 s->elems[i++] = t1;
1057 t1 = t2;
1058 }
1059 }
1060}
1061
1062/* Merge two sets of positions into a third. The result is exactly as if
1063 the positions of both sets were inserted into an initially empty set. */
1064static void
1065merge (position_set *s1, position_set *s2, position_set *m)
1066{
1067 int i = 0, j = 0;
1068
1069 m->nelem = 0;
1070 while (i < s1->nelem && j < s2->nelem)
1071 if (s1->elems[i].index > s2->elems[j].index)
1072 m->elems[m->nelem++] = s1->elems[i++];
1073 else if (s1->elems[i].index < s2->elems[j].index)
1074 m->elems[m->nelem++] = s2->elems[j++];
1075 else
1076 {
1077 m->elems[m->nelem] = s1->elems[i++];
1078 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1079 }
1080 while (i < s1->nelem)
1081 m->elems[m->nelem++] = s1->elems[i++];
1082 while (j < s2->nelem)
1083 m->elems[m->nelem++] = s2->elems[j++];
1084}
1085
1086/* Delete a position from a set. */
1087static void
1088delete (position p, position_set *s)
1089{
1090 int i;
1091
1092 for (i = 0; i < s->nelem; ++i)
1093 if (p.index == s->elems[i].index)
1094 break;
1095 if (i < s->nelem)
1096 for (--s->nelem; i < s->nelem; ++i)
1097 s->elems[i] = s->elems[i + 1];
1098}
1099
1100/* Find the index of the state corresponding to the given position set with
1101 the given preceding context, or create a new state if there is no such
1102 state. Newline and letter tell whether we got here on a newline or
1103 letter, respectively. */
1104static int
1105state_index (struct dfa *d, position_set *s, int newline, int letter)
1106{
1107 int hash = 0;
1108 int constraint;
1109 int i, j;
1110
1111 newline = newline ? 1 : 0;
1112 letter = letter ? 1 : 0;
1113
1114 for (i = 0; i < s->nelem; ++i)
1115 hash ^= s->elems[i].index + s->elems[i].constraint;
1116
1117 /* Try to find a state that exactly matches the proposed one. */
1118 for (i = 0; i < d->sindex; ++i)
1119 {
1120 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1121 || newline != d->states[i].newline || letter != d->states[i].letter)
1122 continue;
1123 for (j = 0; j < s->nelem; ++j)
1124 if (s->elems[j].constraint
1125 != d->states[i].elems.elems[j].constraint
1126 || s->elems[j].index != d->states[i].elems.elems[j].index)
1127 break;
1128 if (j == s->nelem)
1129 return i;
1130 }
1131
1132 /* We'll have to create a new state. */
1133 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1134 d->states[i].hash = hash;
1135 MALLOC(d->states[i].elems.elems, position, s->nelem);
1136 copy(s, &d->states[i].elems);
1137 d->states[i].newline = newline;
1138 d->states[i].letter = letter;
1139 d->states[i].backref = 0;
1140 d->states[i].constraint = 0;
1141 d->states[i].first_end = 0;
1142 for (j = 0; j < s->nelem; ++j)
1143 if (d->tokens[s->elems[j].index] < 0)
1144 {
1145 constraint = s->elems[j].constraint;
1146 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1147 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1148 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1149 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1150 d->states[i].constraint |= constraint;
1151 if (! d->states[i].first_end)
1152 d->states[i].first_end = d->tokens[s->elems[j].index];
1153 }
1154 else if (d->tokens[s->elems[j].index] == BACKREF)
1155 {
1156 d->states[i].constraint = NO_CONSTRAINT;
1157 d->states[i].backref = 1;
1158 }
1159
1160 ++d->sindex;
1161
1162 return i;
1163}
1164
1165/* Find the epsilon closure of a set of positions. If any position of the set
1166 contains a symbol that matches the empty string in some context, replace
1167 that position with the elements of its follow labeled with an appropriate
1168 constraint. Repeat exhaustively until no funny positions are left.
1169 S->elems must be large enough to hold the result. */
1170static void
1171epsclosure (position_set *s, struct dfa *d)
1172{
1173 int i, j;
1174 int *visited;
1175 position p, old;
1176
1177 MALLOC(visited, int, d->tindex);
1178 for (i = 0; i < d->tindex; ++i)
1179 visited[i] = 0;
1180
1181 for (i = 0; i < s->nelem; ++i)
1182 if (d->tokens[s->elems[i].index] >= NOTCHAR
1183 && d->tokens[s->elems[i].index] != BACKREF
1184 && d->tokens[s->elems[i].index] < CSET)
1185 {
1186 old = s->elems[i];
1187 p.constraint = old.constraint;
1188 delete(s->elems[i], s);
1189 if (visited[old.index])
1190 {
1191 --i;
1192 continue;
1193 }
1194 visited[old.index] = 1;
1195 switch (d->tokens[old.index])
1196 {
1197 case BEGLINE:
1198 p.constraint &= BEGLINE_CONSTRAINT;
1199 break;
1200 case ENDLINE:
1201 p.constraint &= ENDLINE_CONSTRAINT;
1202 break;
1203 case BEGWORD:
1204 p.constraint &= BEGWORD_CONSTRAINT;
1205 break;
1206 case ENDWORD:
1207 p.constraint &= ENDWORD_CONSTRAINT;
1208 break;
1209 case LIMWORD:
1210 p.constraint &= LIMWORD_CONSTRAINT;
1211 break;
1212 case NOTLIMWORD:
1213 p.constraint &= NOTLIMWORD_CONSTRAINT;
1214 break;
1215 default:
1216 break;
1217 }
1218 for (j = 0; j < d->follows[old.index].nelem; ++j)
1219 {
1220 p.index = d->follows[old.index].elems[j].index;
1221 insert(p, s);
1222 }
1223 /* Force rescan to start at the beginning. */
1224 i = -1;
1225 }
1226
1227 free(visited);
1228}
1229
1230/* Perform bottom-up analysis on the parse tree, computing various functions.
1231 Note that at this point, we're pretending constructs like \< are real
1232 characters rather than constraints on what can follow them.
1233
1234 Nullable: A node is nullable if it is at the root of a regexp that can
1235 match the empty string.
1236 * EMPTY leaves are nullable.
1237 * No other leaf is nullable.
1238 * A QMARK or STAR node is nullable.
1239 * A PLUS node is nullable if its argument is nullable.
1240 * A CAT node is nullable if both its arguments are nullable.
1241 * An OR node is nullable if either argument is nullable.
1242
1243 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1244 that could correspond to the first character of a string matching the
1245 regexp rooted at the given node.
1246 * EMPTY leaves have empty firstpos.
1247 * The firstpos of a nonempty leaf is that leaf itself.
1248 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1249 argument.
1250 * The firstpos of a CAT node is the firstpos of the left argument, union
1251 the firstpos of the right if the left argument is nullable.
1252 * The firstpos of an OR node is the union of firstpos of each argument.
1253
1254 Lastpos: The lastpos of a node is the set of positions that could
1255 correspond to the last character of a string matching the regexp at
1256 the given node.
1257 * EMPTY leaves have empty lastpos.
1258 * The lastpos of a nonempty leaf is that leaf itself.
1259 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1260 argument.
1261 * The lastpos of a CAT node is the lastpos of its right argument, union
1262 the lastpos of the left if the right argument is nullable.
1263 * The lastpos of an OR node is the union of the lastpos of each argument.
1264
1265 Follow: The follow of a position is the set of positions that could
1266 correspond to the character following a character matching the node in
1267 a string matching the regexp. At this point we consider special symbols
1268 that match the empty string in some context to be just normal characters.
1269 Later, if we find that a special symbol is in a follow set, we will
1270 replace it with the elements of its follow, labeled with an appropriate
1271 constraint.
1272 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1273 the follow of every node in the lastpos.
1274 * Every node in the firstpos of the second argument of a CAT node is in
1275 the follow of every node in the lastpos of the first argument.
1276
1277 Because of the postfix representation of the parse tree, the depth-first
1278 analysis is conveniently done by a linear scan with the aid of a stack.
1279 Sets are stored as arrays of the elements, obeying a stack-like allocation
1280 scheme; the number of elements in each set deeper in the stack can be
1281 used to determine the address of a particular set's array. */
1282void
1283dfaanalyze (struct dfa *d, int searchflag)
1284{
1285 int *nullable; /* Nullable stack. */
1286 int *nfirstpos; /* Element count stack for firstpos sets. */
1287 position *firstpos; /* Array where firstpos elements are stored. */
1288 int *nlastpos; /* Element count stack for lastpos sets. */
1289 position *lastpos; /* Array where lastpos elements are stored. */
1290 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1291 position_set tmp; /* Temporary set for merging sets. */
1292 position_set merged; /* Result of merging sets. */
1293 int wants_newline; /* True if some position wants newline info. */
1294 int *o_nullable;
1295 int *o_nfirst, *o_nlast;
1296 position *o_firstpos, *o_lastpos;
1297 int i, j;
1298 position *pos;
1299
1300#ifdef DEBUG
1301 fprintf(stderr, "dfaanalyze:\n");
1302 for (i = 0; i < d->tindex; ++i)
1303 {
1304 fprintf(stderr, " %d:", i);
1305 prtok(d->tokens[i]);
1306 }
1307 putc('\n', stderr);
1308#endif
1309
1310 d->searchflag = searchflag;
1311
1312 MALLOC(nullable, int, d->depth);
1313 o_nullable = nullable;
1314 MALLOC(nfirstpos, int, d->depth);
1315 o_nfirst = nfirstpos;
1316 MALLOC(firstpos, position, d->nleaves);
1317 o_firstpos = firstpos, firstpos += d->nleaves;
1318 MALLOC(nlastpos, int, d->depth);
1319 o_nlast = nlastpos;
1320 MALLOC(lastpos, position, d->nleaves);
1321 o_lastpos = lastpos, lastpos += d->nleaves;
1322 MALLOC(nalloc, int, d->tindex);
1323 for (i = 0; i < d->tindex; ++i)
1324 nalloc[i] = 0;
1325 MALLOC(merged.elems, position, d->nleaves);
1326
1327 CALLOC(d->follows, position_set, d->tindex);
1328
1329 for (i = 0; i < d->tindex; ++i)
1330#ifdef DEBUG
1331 { /* Nonsyntactic #ifdef goo... */
1332#endif
1333 switch (d->tokens[i])
1334 {
1335 case EMPTY:
1336 /* The empty set is nullable. */
1337 *nullable++ = 1;
1338
1339 /* The firstpos and lastpos of the empty leaf are both empty. */
1340 *nfirstpos++ = *nlastpos++ = 0;
1341 break;
1342
1343 case STAR:
1344 case PLUS:
1345 /* Every element in the firstpos of the argument is in the follow
1346 of every element in the lastpos. */
1347 tmp.nelem = nfirstpos[-1];
1348 tmp.elems = firstpos;
1349 pos = lastpos;
1350 for (j = 0; j < nlastpos[-1]; ++j)
1351 {
1352 merge(&tmp, &d->follows[pos[j].index], &merged);
1353 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1354 nalloc[pos[j].index], merged.nelem - 1);
1355 copy(&merged, &d->follows[pos[j].index]);
1356 }
1357
1358 case QMARK:
1359 /* A QMARK or STAR node is automatically nullable. */
1360 if (d->tokens[i] != PLUS)
1361 nullable[-1] = 1;
1362 break;
1363
1364 case CAT:
1365 /* Every element in the firstpos of the second argument is in the
1366 follow of every element in the lastpos of the first argument. */
1367 tmp.nelem = nfirstpos[-1];
1368 tmp.elems = firstpos;
1369 pos = lastpos + nlastpos[-1];
1370 for (j = 0; j < nlastpos[-2]; ++j)
1371 {
1372 merge(&tmp, &d->follows[pos[j].index], &merged);
1373 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1374 nalloc[pos[j].index], merged.nelem - 1);
1375 copy(&merged, &d->follows[pos[j].index]);
1376 }
1377
1378 /* The firstpos of a CAT node is the firstpos of the first argument,
1379 union that of the second argument if the first is nullable. */
1380 if (nullable[-2])
1381 nfirstpos[-2] += nfirstpos[-1];
1382 else
1383 firstpos += nfirstpos[-1];
1384 --nfirstpos;
1385
1386 /* The lastpos of a CAT node is the lastpos of the second argument,
1387 union that of the first argument if the second is nullable. */
1388 if (nullable[-1])
1389 nlastpos[-2] += nlastpos[-1];
1390 else
1391 {
1392 pos = lastpos + nlastpos[-2];
1393 for (j = nlastpos[-1] - 1; j >= 0; --j)
1394 pos[j] = lastpos[j];
1395 lastpos += nlastpos[-2];
1396 nlastpos[-2] = nlastpos[-1];
1397 }
1398 --nlastpos;
1399
1400 /* A CAT node is nullable if both arguments are nullable. */
1401 nullable[-2] = nullable[-1] && nullable[-2];
1402 --nullable;
1403 break;
1404
1405 case OR:
1406 case ORTOP:
1407 /* The firstpos is the union of the firstpos of each argument. */
1408 nfirstpos[-2] += nfirstpos[-1];
1409 --nfirstpos;
1410
1411 /* The lastpos is the union of the lastpos of each argument. */
1412 nlastpos[-2] += nlastpos[-1];
1413 --nlastpos;
1414
1415 /* An OR node is nullable if either argument is nullable. */
1416 nullable[-2] = nullable[-1] || nullable[-2];
1417 --nullable;
1418 break;
1419
1420 default:
1421 /* Anything else is a nonempty position. (Note that special
1422 constructs like \< are treated as nonempty strings here;
1423 an "epsilon closure" effectively makes them nullable later.
1424 Backreferences have to get a real position so we can detect
1425 transitions on them later. But they are nullable. */
1426 *nullable++ = d->tokens[i] == BACKREF;
1427
1428 /* This position is in its own firstpos and lastpos. */
1429 *nfirstpos++ = *nlastpos++ = 1;
1430 --firstpos, --lastpos;
1431 firstpos->index = lastpos->index = i;
1432 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1433
1434 /* Allocate the follow set for this position. */
1435 nalloc[i] = 1;
1436 MALLOC(d->follows[i].elems, position, nalloc[i]);
1437 break;
1438 }
1439#ifdef DEBUG
1440 /* ... balance the above nonsyntactic #ifdef goo... */
1441 fprintf(stderr, "node %d:", i);
1442 prtok(d->tokens[i]);
1443 putc('\n', stderr);
1444 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1445 fprintf(stderr, " firstpos:");
1446 for (j = nfirstpos[-1] - 1; j >= 0; --j)
1447 {
1448 fprintf(stderr, " %d:", firstpos[j].index);
1449 prtok(d->tokens[firstpos[j].index]);
1450 }
1451 fprintf(stderr, "\n lastpos:");
1452 for (j = nlastpos[-1] - 1; j >= 0; --j)
1453 {
1454 fprintf(stderr, " %d:", lastpos[j].index);
1455 prtok(d->tokens[lastpos[j].index]);
1456 }
1457 putc('\n', stderr);
1458 }
1459#endif
1460
1461 /* For each follow set that is the follow set of a real position, replace
1462 it with its epsilon closure. */
1463 for (i = 0; i < d->tindex; ++i)
1464 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1465 || d->tokens[i] >= CSET)
1466 {
1467#ifdef DEBUG
1468 fprintf(stderr, "follows(%d:", i);
1469 prtok(d->tokens[i]);
1470 fprintf(stderr, "):");
1471 for (j = d->follows[i].nelem - 1; j >= 0; --j)
1472 {
1473 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1474 prtok(d->tokens[d->follows[i].elems[j].index]);
1475 }
1476 putc('\n', stderr);
1477#endif
1478 copy(&d->follows[i], &merged);
1479 epsclosure(&merged, d);
1480 if (d->follows[i].nelem < merged.nelem)
1481 REALLOC(d->follows[i].elems, position, merged.nelem);
1482 copy(&merged, &d->follows[i]);
1483 }
1484
1485 /* Get the epsilon closure of the firstpos of the regexp. The result will
1486 be the set of positions of state 0. */
1487 merged.nelem = 0;
1488 for (i = 0; i < nfirstpos[-1]; ++i)
1489 insert(firstpos[i], &merged);
1490 epsclosure(&merged, d);
1491
1492 /* Check if any of the positions of state 0 will want newline context. */
1493 wants_newline = 0;
1494 for (i = 0; i < merged.nelem; ++i)
1495 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1496 wants_newline = 1;
1497
1498 /* Build the initial state. */
1499 d->salloc = 1;
1500 d->sindex = 0;
1501 MALLOC(d->states, dfa_state, d->salloc);
1502 state_index(d, &merged, wants_newline, 0);
1503
1504 free(o_nullable);
1505 free(o_nfirst);
1506 free(o_firstpos);
1507 free(o_nlast);
1508 free(o_lastpos);
1509 free(nalloc);
1510 free(merged.elems);
1511}
1512
1513/* Find, for each character, the transition out of state s of d, and store
1514 it in the appropriate slot of trans.
1515
1516 We divide the positions of s into groups (positions can appear in more
1517 than one group). Each group is labeled with a set of characters that
1518 every position in the group matches (taking into account, if necessary,
1519 preceding context information of s). For each group, find the union
1520 of the its elements' follows. This set is the set of positions of the
1521 new state. For each character in the group's label, set the transition
1522 on this character to be to a state corresponding to the set's positions,
1523 and its associated backward context information, if necessary.
1524
1525 If we are building a searching matcher, we include the positions of state
1526 0 in every state.
1527
1528 The collection of groups is constructed by building an equivalence-class
1529 partition of the positions of s.
1530
1531 For each position, find the set of characters C that it matches. Eliminate
1532 any characters from C that fail on grounds of backward context.
1533
1534 Search through the groups, looking for a group whose label L has nonempty
1535 intersection with C. If L - C is nonempty, create a new group labeled
1536 L - C and having the same positions as the current group, and set L to
1537 the intersection of L and C. Insert the position in this group, set
1538 C = C - L, and resume scanning.
1539
1540 If after comparing with every group there are characters remaining in C,
1541 create a new group labeled with the characters of C and insert this
1542 position in that group. */
1543void
1544dfastate (int s, struct dfa *d, int trans[])
1545{
1546 position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1547 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1548 int ngrps = 0; /* Number of groups actually used. */
1549 position pos; /* Current position being considered. */
1550 charclass matches; /* Set of matching characters. */
1551 int matchesf; /* True if matches is nonempty. */
1552 charclass intersect; /* Intersection with some label set. */
1553 int intersectf; /* True if intersect is nonempty. */
1554 charclass leftovers; /* Stuff in the label that didn't match. */
1555 int leftoversf; /* True if leftovers is nonempty. */
1556 static charclass letters; /* Set of characters considered letters. */
1557 static charclass newline; /* Set of characters that aren't newline. */
1558 position_set follows; /* Union of the follows of some group. */
1559 position_set tmp; /* Temporary space for merging sets. */
1560 int state; /* New state. */
1561 int wants_newline; /* New state wants to know newline context. */
1562 int state_newline; /* New state on a newline transition. */
1563 int wants_letter; /* New state wants to know letter context. */
1564 int state_letter; /* New state on a letter transition. */
1565 static int initialized; /* Flag for static initialization. */
1566 int i, j, k;
1567
1568 /* Initialize the set of letters, if necessary. */
1569 if (! initialized)
1570 {
1571 initialized = 1;
1572 for (i = 0; i < NOTCHAR; ++i)
1573 if (IS_WORD_CONSTITUENT(i))
1574 setbit(i, letters);
1575 setbit(eolbyte, newline);
1576 }
1577
1578 zeroset(matches);
1579
1580 for (i = 0; i < d->states[s].elems.nelem; ++i)
1581 {
1582 pos = d->states[s].elems.elems[i];
1583 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1584 setbit(d->tokens[pos.index], matches);
1585 else if (d->tokens[pos.index] >= CSET)
1586 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1587 else
1588 continue;
1589
1590 /* Some characters may need to be eliminated from matches because
1591 they fail in the current context. */
1592 if (pos.constraint != 0xFF)
1593 {
1594 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1595 d->states[s].newline, 1))
1596 clrbit(eolbyte, matches);
1597 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1598 d->states[s].newline, 0))
1599 for (j = 0; j < CHARCLASS_INTS; ++j)
1600 matches[j] &= newline[j];
1601 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1602 d->states[s].letter, 1))
1603 for (j = 0; j < CHARCLASS_INTS; ++j)
1604 matches[j] &= ~letters[j];
1605 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1606 d->states[s].letter, 0))
1607 for (j = 0; j < CHARCLASS_INTS; ++j)
1608 matches[j] &= letters[j];
1609
1610 /* If there are no characters left, there's no point in going on. */
1611 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
1612 continue;
1613 if (j == CHARCLASS_INTS)
1614 continue;
1615 }
1616
1617 for (j = 0; j < ngrps; ++j)
1618 {
1619 /* If matches contains a single character only, and the current
1620 group's label doesn't contain that character, go on to the
1621 next group. */
1622 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
1623 && !tstbit(d->tokens[pos.index], labels[j]))
1624 continue;
1625
1626 /* Check if this group's label has a nonempty intersection with
1627 matches. */
1628 intersectf = 0;
1629 for (k = 0; k < CHARCLASS_INTS; ++k)
1630 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
1631 if (! intersectf)
1632 continue;
1633
1634 /* It does; now find the set differences both ways. */
1635 leftoversf = matchesf = 0;
1636 for (k = 0; k < CHARCLASS_INTS; ++k)
1637 {
1638 /* Even an optimizing compiler can't know this for sure. */
1639 int match = matches[k], label = labels[j][k];
1640
1641 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
1642 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
1643 }
1644
1645 /* If there were leftovers, create a new group labeled with them. */
1646 if (leftoversf)
1647 {
1648 copyset(leftovers, labels[ngrps]);
1649 copyset(intersect, labels[j]);
1650 MALLOC(grps[ngrps].elems, position, d->nleaves);
1651 copy(&grps[j], &grps[ngrps]);
1652 ++ngrps;
1653 }
1654
1655 /* Put the position in the current group. Note that there is no
1656 reason to call insert() here. */
1657 grps[j].elems[grps[j].nelem++] = pos;
1658
1659 /* If every character matching the current position has been
1660 accounted for, we're done. */
1661 if (! matchesf)
1662 break;
1663 }
1664
1665 /* If we've passed the last group, and there are still characters
1666 unaccounted for, then we'll have to create a new group. */
1667 if (j == ngrps)
1668 {
1669 copyset(matches, labels[ngrps]);
1670 zeroset(matches);
1671 MALLOC(grps[ngrps].elems, position, d->nleaves);
1672 grps[ngrps].nelem = 1;
1673 grps[ngrps].elems[0] = pos;
1674 ++ngrps;
1675 }
1676 }
1677
1678 MALLOC(follows.elems, position, d->nleaves);
1679 MALLOC(tmp.elems, position, d->nleaves);
1680
1681 /* If we are a searching matcher, the default transition is to a state
1682 containing the positions of state 0, otherwise the default transition
1683 is to fail miserably. */
1684 if (d->searchflag)
1685 {
1686 wants_newline = 0;
1687 wants_letter = 0;
1688 for (i = 0; i < d->states[0].elems.nelem; ++i)
1689 {
1690 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
1691 wants_newline = 1;
1692 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
1693 wants_letter = 1;
1694 }
1695 copy(&d->states[0].elems, &follows);
1696 state = state_index(d, &follows, 0, 0);
1697 if (wants_newline)
1698 state_newline = state_index(d, &follows, 1, 0);
1699 else
1700 state_newline = state;
1701 if (wants_letter)
1702 state_letter = state_index(d, &follows, 0, 1);
1703 else
1704 state_letter = state;
1705 for (i = 0; i < NOTCHAR; ++i)
1706 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
1707 trans[eolbyte] = state_newline;
1708 }
1709 else
1710 for (i = 0; i < NOTCHAR; ++i)
1711 trans[i] = -1;
1712
1713 for (i = 0; i < ngrps; ++i)
1714 {
1715 follows.nelem = 0;
1716
1717 /* Find the union of the follows of the positions of the group.
1718 This is a hideously inefficient loop. Fix it someday. */
1719 for (j = 0; j < grps[i].nelem; ++j)
1720 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
1721 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
1722
1723 /* If we are building a searching matcher, throw in the positions
1724 of state 0 as well. */
1725 if (d->searchflag)
1726 for (j = 0; j < d->states[0].elems.nelem; ++j)
1727 insert(d->states[0].elems.elems[j], &follows);
1728
1729 /* Find out if the new state will want any context information. */
1730 wants_newline = 0;
1731 if (tstbit(eolbyte, labels[i]))
1732 for (j = 0; j < follows.nelem; ++j)
1733 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1734 wants_newline = 1;
1735
1736 wants_letter = 0;
1737 for (j = 0; j < CHARCLASS_INTS; ++j)
1738 if (labels[i][j] & letters[j])
1739 break;
1740 if (j < CHARCLASS_INTS)
1741 for (j = 0; j < follows.nelem; ++j)
1742 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1743 wants_letter = 1;
1744
1745 /* Find the state(s) corresponding to the union of the follows. */
1746 state = state_index(d, &follows, 0, 0);
1747 if (wants_newline)
1748 state_newline = state_index(d, &follows, 1, 0);
1749 else
1750 state_newline = state;
1751 if (wants_letter)
1752 state_letter = state_index(d, &follows, 0, 1);
1753 else
1754 state_letter = state;
1755
1756 /* Set the transitions for each character in the current label. */
1757 for (j = 0; j < CHARCLASS_INTS; ++j)
1758 for (k = 0; k < INTBITS; ++k)
1759 if (labels[i][j] & 1 << k)
1760 {
1761 int c = j * INTBITS + k;
1762
1763 if (c == eolbyte)
1764 trans[c] = state_newline;
1765 else if (IS_WORD_CONSTITUENT(c))
1766 trans[c] = state_letter;
1767 else if (c < NOTCHAR)
1768 trans[c] = state;
1769 }
1770 }
1771
1772 for (i = 0; i < ngrps; ++i)
1773 free(grps[i].elems);
1774 free(follows.elems);
1775 free(tmp.elems);
1776}
1777
1778/* Some routines for manipulating a compiled dfa's transition tables.
1779 Each state may or may not have a transition table; if it does, and it
1780 is a non-accepting state, then d->trans[state] points to its table.
1781 If it is an accepting state then d->fails[state] points to its table.
1782 If it has no table at all, then d->trans[state] is NULL.
1783 TODO: Improve this comment, get rid of the unnecessary redundancy. */
1784
1785static void
1786build_state (int s, struct dfa *d)
1787{
1788 int *trans; /* The new transition table. */
1789 int i;
1790
1791 /* Set an upper limit on the number of transition tables that will ever
1792 exist at once. 1024 is arbitrary. The idea is that the frequently
1793 used transition tables will be quickly rebuilt, whereas the ones that
1794 were only needed once or twice will be cleared away. */
1795 if (d->trcount >= 1024)
1796 {
1797 for (i = 0; i < d->tralloc; ++i)
1798 if (d->trans[i])
1799 {
1800 free((ptr_t) d->trans[i]);
1801 d->trans[i] = NULL;
1802 }
1803 else if (d->fails[i])
1804 {
1805 free((ptr_t) d->fails[i]);
1806 d->fails[i] = NULL;
1807 }
1808 d->trcount = 0;
1809 }
1810
1811 ++d->trcount;
1812
1813 /* Set up the success bits for this state. */
1814 d->success[s] = 0;
1815 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
1816 s, *d))
1817 d->success[s] |= 4;
1818 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
1819 s, *d))
1820 d->success[s] |= 2;
1821 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
1822 s, *d))
1823 d->success[s] |= 1;
1824
1825 MALLOC(trans, int, NOTCHAR);
1826 dfastate(s, d, trans);
1827
1828 /* Now go through the new transition table, and make sure that the trans
1829 and fail arrays are allocated large enough to hold a pointer for the
1830 largest state mentioned in the table. */
1831 for (i = 0; i < NOTCHAR; ++i)
1832 if (trans[i] >= d->tralloc)
1833 {
1834 int oldalloc = d->tralloc;
1835
1836 while (trans[i] >= d->tralloc)
1837 d->tralloc *= 2;
1838 REALLOC(d->realtrans, int *, d->tralloc + 1);
1839 d->trans = d->realtrans + 1;
1840 REALLOC(d->fails, int *, d->tralloc);
1841 REALLOC(d->success, int, d->tralloc);
1842 REALLOC(d->newlines, int, d->tralloc);
1843 while (oldalloc < d->tralloc)
1844 {
1845 d->trans[oldalloc] = NULL;
1846 d->fails[oldalloc++] = NULL;
1847 }
1848 }
1849
1850 /* Keep the newline transition in a special place so we can use it as
1851 a sentinel. */
1852 d->newlines[s] = trans[eolbyte];
1853 trans[eolbyte] = -1;
1854
1855 if (ACCEPTING(s, *d))
1856 d->fails[s] = trans;
1857 else
1858 d->trans[s] = trans;
1859}
1860
1861static void
1862build_state_zero (struct dfa *d)
1863{
1864 d->tralloc = 1;
1865 d->trcount = 0;
1866 CALLOC(d->realtrans, int *, d->tralloc + 1);
1867 d->trans = d->realtrans + 1;
1868 CALLOC(d->fails, int *, d->tralloc);
1869 MALLOC(d->success, int, d->tralloc);
1870 MALLOC(d->newlines, int, d->tralloc);
1871 build_state(0, d);
1872}
1873
1874/* Search through a buffer looking for a match to the given struct dfa.
1875 Find the first occurrence of a string matching the regexp in the buffer,
1876 and the shortest possible version thereof. Return a pointer to the first
1877 character after the match, or NULL if none is found. Begin points to
1878 the beginning of the buffer, and end points to the first character after
1879 its end. We store a newline in *end to act as a sentinel, so end had
1880 better point somewhere valid. Newline is a flag indicating whether to
1881 allow newlines to be in the matching string. If count is non-
1882 NULL it points to a place we're supposed to increment every time we
1883 see a newline. Finally, if backref is non-NULL it points to a place
1884 where we're supposed to store a 1 if backreferencing happened and the
1885 match needs to be verified by a backtracking matcher. Otherwise
1886 we store a 0 in *backref. */
1887char *
1888dfaexec (struct dfa *d, char *begin, char *end,
1889 int newline, int *count, int *backref)
1890{
1891 register int s, s1, tmp; /* Current state. */
1892 register unsigned char *p; /* Current input character. */
1893 register int **trans, *t; /* Copy of d->trans so it can be optimized
1894 into a register. */
1895 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
1896 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
1897 static int sbit_init;
1898
1899 if (! sbit_init)
1900 {
1901 int i;
1902
1903 sbit_init = 1;
1904 for (i = 0; i < NOTCHAR; ++i)
1905 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
1906 sbit[eol] = 4;
1907 }
1908
1909 if (! d->tralloc)
1910 build_state_zero(d);
1911
1912 s = s1 = 0;
1913 p = (unsigned char *) begin;
1914 trans = d->trans;
1915 *end = eol;
1916
1917 for (;;)
1918 {
1919 while ((t = trans[s]) != 0) { /* hand-optimized loop */
1920 s1 = t[*p++];
1921 if ((t = trans[s1]) == 0) {
1922 tmp = s ; s = s1 ; s1 = tmp ; /* swap */
1923 break;
1924 }
1925 s = t[*p++];
1926 }
1927
1928 if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
1929 {
1930 if (d->success[s] & sbit[*p])
1931 {
1932 if (backref)
1933 *backref = (d->states[s].backref != 0);
1934 return (char *) p;
1935 }
1936
1937 s1 = s;
1938 s = d->fails[s][*p++];
1939 continue;
1940 }
1941
1942 /* If the previous character was a newline, count it. */
1943 if (count && (char *) p <= end && p[-1] == eol)
1944 ++*count;
1945
1946 /* Check if we've run off the end of the buffer. */
1947 if ((char *) p > end)
1948 return NULL;
1949
1950 if (s >= 0)
1951 {
1952 build_state(s, d);
1953 trans = d->trans;
1954 continue;
1955 }
1956
1957 if (p[-1] == eol && newline)
1958 {
1959 s = d->newlines[s1];
1960 continue;
1961 }
1962
1963 s = 0;
1964 }
1965}
1966
1967/* Initialize the components of a dfa that the other routines don't
1968 initialize for themselves. */
1969void
1970dfainit (struct dfa *d)
1971{
1972 d->calloc = 1;
1973 MALLOC(d->charclasses, charclass, d->calloc);
1974 d->cindex = 0;
1975
1976 d->talloc = 1;
1977 MALLOC(d->tokens, token, d->talloc);
1978 d->tindex = d->depth = d->nleaves = d->nregexps = 0;
1979
1980 d->searchflag = 0;
1981 d->tralloc = 0;
1982
1983 d->musts = 0;
1984}
1985
1986/* Parse and analyze a single string of the given length. */
1987void
1988dfacomp (char *s, size_t len, struct dfa *d, int searchflag)
1989{
1990 if (case_fold) /* dummy folding in service of dfamust() */
1991 {
1992 char *lcopy;
1993 int i;
1994
1995 lcopy = malloc(len);
1996 if (!lcopy)
1997 dfaerror(_("out of memory"));
1998
1999 /* This is a kludge. */
2000 case_fold = 0;
2001 for (i = 0; i < len; ++i)
2002 if (ISUPPER ((unsigned char) s[i]))
2003 lcopy[i] = tolower ((unsigned char) s[i]);
2004 else
2005 lcopy[i] = s[i];
2006
2007 dfainit(d);
2008 dfaparse(lcopy, len, d);
2009 free(lcopy);
2010 dfamust(d);
2011 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2012 case_fold = 1;
2013 dfaparse(s, len, d);
2014 dfaanalyze(d, searchflag);
2015 }
2016 else
2017 {
2018 dfainit(d);
2019 dfaparse(s, len, d);
2020 dfamust(d);
2021 dfaanalyze(d, searchflag);
2022 }
2023}
2024
2025/* Free the storage held by the components of a dfa. */
2026void
2027dfafree (struct dfa *d)
2028{
2029 int i;
2030 struct dfamust *dm, *ndm;
2031
2032 free((ptr_t) d->charclasses);
2033 free((ptr_t) d->tokens);
2034 for (i = 0; i < d->sindex; ++i)
2035 free((ptr_t) d->states[i].elems.elems);
2036 free((ptr_t) d->states);
2037 for (i = 0; i < d->tindex; ++i)
2038 if (d->follows[i].elems)
2039 free((ptr_t) d->follows[i].elems);
2040 free((ptr_t) d->follows);
2041 for (i = 0; i < d->tralloc; ++i)
2042 if (d->trans[i])
2043 free((ptr_t) d->trans[i]);
2044 else if (d->fails[i])
2045 free((ptr_t) d->fails[i]);
2046 if (d->realtrans) free((ptr_t) d->realtrans);
2047 if (d->fails) free((ptr_t) d->fails);
2048 if (d->newlines) free((ptr_t) d->newlines);
2049 if (d->success) free((ptr_t) d->success);
2050 for (dm = d->musts; dm; dm = ndm)
2051 {
2052 ndm = dm->next;
2053 free(dm->must);
2054 free((ptr_t) dm);
2055 }
2056}
2057
2058/* Having found the postfix representation of the regular expression,
2059 try to find a long sequence of characters that must appear in any line
2060 containing the r.e.
2061 Finding a "longest" sequence is beyond the scope here;
2062 we take an easy way out and hope for the best.
2063 (Take "(ab|a)b"--please.)
2064
2065 We do a bottom-up calculation of sequences of characters that must appear
2066 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
2067 representation:
2068 sequences that must appear at the left of the match ("left")
2069 sequences that must appear at the right of the match ("right")
2070 lists of sequences that must appear somewhere in the match ("in")
2071 sequences that must constitute the match ("is")
2072
2073 When we get to the root of the tree, we use one of the longest of its
2074 calculated "in" sequences as our answer. The sequence we find is returned in
2075 d->must (where "d" is the single argument passed to "dfamust");
2076 the length of the sequence is returned in d->mustn.
2077
2078 The sequences calculated for the various types of node (in pseudo ANSI c)
2079 are shown below. "p" is the operand of unary operators (and the left-hand
2080 operand of binary operators); "q" is the right-hand operand of binary
2081 operators.
2082
2083 "ZERO" means "a zero-length sequence" below.
2084
2085 Type left right is in
2086 ---- ---- ----- -- --
2087 char c # c # c # c # c
2088
2089 CSET ZERO ZERO ZERO ZERO
2090
2091 STAR ZERO ZERO ZERO ZERO
2092
2093 QMARK ZERO ZERO ZERO ZERO
2094
2095 PLUS p->left p->right ZERO p->in
2096
2097 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
2098 p->left : q->right : q->is!=ZERO) ? q->in plus
2099 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
2100 ZERO
2101
2102 OR longest common longest common (do p->is and substrings common to
2103 leading trailing q->is have same p->in and q->in
2104 (sub)sequence (sub)sequence length and
2105 of p->left of p->right content) ?
2106 and q->left and q->right p->is : NULL
2107
2108 If there's anything else we recognize in the tree, all four sequences get set
2109 to zero-length sequences. If there's something we don't recognize in the tree,
2110 we just return a zero-length sequence.
2111
2112 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
2113 'aaa')?
2114
2115 And. . .is it here or someplace that we might ponder "optimizations" such as
2116 egrep 'psi|epsilon' -> egrep 'psi'
2117 egrep 'pepsi|epsilon' -> egrep 'epsi'
2118 (Yes, we now find "epsi" as a "string
2119 that must occur", but we might also
2120 simplify the *entire* r.e. being sought)
2121 grep '[c]' -> grep 'c'
2122 grep '(ab|a)b' -> grep 'ab'
2123 grep 'ab*' -> grep 'a'
2124 grep 'a*b' -> grep 'b'
2125
2126 There are several issues:
2127
2128 Is optimization easy (enough)?
2129
2130 Does optimization actually accomplish anything,
2131 or is the automaton you get from "psi|epsilon" (for example)
2132 the same as the one you get from "psi" (for example)?
2133
2134 Are optimizable r.e.'s likely to be used in real-life situations
2135 (something like 'ab*' is probably unlikely; something like is
2136 'psi|epsilon' is likelier)? */
2137
2138static char *
2139icatalloc (char *old, char *new)
2140{
2141 char *result;
2142 size_t oldsize, newsize;
2143
2144 newsize = (new == NULL) ? 0 : strlen(new);
2145 if (old == NULL)
2146 oldsize = 0;
2147 else if (newsize == 0)
2148 return old;
2149 else oldsize = strlen(old);
2150 if (old == NULL)
2151 result = (char *) malloc(newsize + 1);
2152 else
2153 result = (char *) realloc((void *) old, oldsize + newsize + 1);
2154 if (result != NULL && new != NULL)
2155 (void) strcpy(result + oldsize, new);
2156 return result;
2157}
2158
2159static char *
2160icpyalloc (char *string)
2161{
2162 return icatalloc((char *) NULL, string);
2163}
2164
2165static char *
2166istrstr (char *lookin, char *lookfor)
2167{
2168 char *cp;
2169 size_t len;
2170
2171 len = strlen(lookfor);
2172 for (cp = lookin; *cp != '\0'; ++cp)
2173 if (strncmp(cp, lookfor, len) == 0)
2174 return cp;
2175 return NULL;
2176}
2177
2178static void
2179ifree (char *cp)
2180{
2181 if (cp != NULL)
2182 free(cp);
2183}
2184
2185static void
2186freelist (char **cpp)
2187{
2188 int i;
2189
2190 if (cpp == NULL)
2191 return;
2192 for (i = 0; cpp[i] != NULL; ++i)
2193 {
2194 free(cpp[i]);
2195 cpp[i] = NULL;
2196 }
2197}
2198
2199static char **
2200enlist (char **cpp, char *new, size_t len)
2201{
2202 int i, j;
2203
2204 if (cpp == NULL)
2205 return NULL;
2206 if ((new = icpyalloc(new)) == NULL)
2207 {
2208 freelist(cpp);
2209 return NULL;
2210 }
2211 new[len] = '\0';
2212 /* Is there already something in the list that's new (or longer)? */
2213 for (i = 0; cpp[i] != NULL; ++i)
2214 if (istrstr(cpp[i], new) != NULL)
2215 {
2216 free(new);
2217 return cpp;
2218 }
2219 /* Eliminate any obsoleted strings. */
2220 j = 0;
2221 while (cpp[j] != NULL)
2222 if (istrstr(new, cpp[j]) == NULL)
2223 ++j;
2224 else
2225 {
2226 free(cpp[j]);
2227 if (--i == j)
2228 break;
2229 cpp[j] = cpp[i];
2230 cpp[i] = NULL;
2231 }
2232 /* Add the new string. */
2233 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
2234 if (cpp == NULL)
2235 return NULL;
2236 cpp[i] = new;
2237 cpp[i + 1] = NULL;
2238 return cpp;
2239}
2240
2241/* Given pointers to two strings, return a pointer to an allocated
2242 list of their distinct common substrings. Return NULL if something
2243 seems wild. */
2244static char **
2245comsubs (char *left, char *right)
2246{
2247 char **cpp;
2248 char *lcp;
2249 char *rcp;
2250 size_t i, len;
2251
2252 if (left == NULL || right == NULL)
2253 return NULL;
2254 cpp = (char **) malloc(sizeof *cpp);
2255 if (cpp == NULL)
2256 return NULL;
2257 cpp[0] = NULL;
2258 for (lcp = left; *lcp != '\0'; ++lcp)
2259 {
2260 len = 0;
2261 rcp = index(right, *lcp);
2262 while (rcp != NULL)
2263 {
2264 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
2265 continue;
2266 if (i > len)
2267 len = i;
2268 rcp = index(rcp + 1, *lcp);
2269 }
2270 if (len == 0)
2271 continue;
2272 if ((cpp = enlist(cpp, lcp, len)) == NULL)
2273 break;
2274 }
2275 return cpp;
2276}
2277
2278static char **
2279addlists (char **old, char **new)
2280{
2281 int i;
2282
2283 if (old == NULL || new == NULL)
2284 return NULL;
2285 for (i = 0; new[i] != NULL; ++i)
2286 {
2287 old = enlist(old, new[i], strlen(new[i]));
2288 if (old == NULL)
2289 break;
2290 }
2291 return old;
2292}
2293
2294/* Given two lists of substrings, return a new list giving substrings
2295 common to both. */
2296static char **
2297inboth (char **left, char **right)
2298{
2299 char **both;
2300 char **temp;
2301 int lnum, rnum;
2302
2303 if (left == NULL || right == NULL)
2304 return NULL;
2305 both = (char **) malloc(sizeof *both);
2306 if (both == NULL)
2307 return NULL;
2308 both[0] = NULL;
2309 for (lnum = 0; left[lnum] != NULL; ++lnum)
2310 {
2311 for (rnum = 0; right[rnum] != NULL; ++rnum)
2312 {
2313 temp = comsubs(left[lnum], right[rnum]);
2314 if (temp == NULL)
2315 {
2316 freelist(both);
2317 return NULL;
2318 }
2319 both = addlists(both, temp);
2320 freelist(temp);
2321 free(temp);
2322 if (both == NULL)
2323 return NULL;
2324 }
2325 }
2326 return both;
2327}
2328
2329typedef struct
2330{
2331 char **in;
2332 char *left;
2333 char *right;
2334 char *is;
2335} must;
2336
2337static void
2338resetmust (must *mp)
2339{
2340 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2341 freelist(mp->in);
2342}
2343
2344static void
2345dfamust (struct dfa *dfa)
2346{
2347 must *musts;
2348 must *mp;
2349 char *result;
2350 int ri;
2351 int i;
2352 int exact;
2353 token t;
2354 static must must0;
2355 struct dfamust *dm;
2356 static char empty_string[] = "";
2357
2358 result = empty_string;
2359 exact = 0;
2360 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
2361 if (musts == NULL)
2362 return;
2363 mp = musts;
2364 for (i = 0; i <= dfa->tindex; ++i)
2365 mp[i] = must0;
2366 for (i = 0; i <= dfa->tindex; ++i)
2367 {
2368 mp[i].in = (char **) malloc(sizeof *mp[i].in);
2369 mp[i].left = malloc(2);
2370 mp[i].right = malloc(2);
2371 mp[i].is = malloc(2);
2372 if (mp[i].in == NULL || mp[i].left == NULL ||
2373 mp[i].right == NULL || mp[i].is == NULL)
2374 goto done;
2375 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2376 mp[i].in[0] = NULL;
2377 }
2378#ifdef DEBUG
2379 fprintf(stderr, "dfamust:\n");
2380 for (i = 0; i < dfa->tindex; ++i)
2381 {
2382 fprintf(stderr, " %d:", i);
2383 prtok(dfa->tokens[i]);
2384 }
2385 putc('\n', stderr);
2386#endif
2387 for (ri = 0; ri < dfa->tindex; ++ri)
2388 {
2389 switch (t = dfa->tokens[ri])
2390 {
2391 case LPAREN:
2392 case RPAREN:
2393 goto done; /* "cannot happen" */
2394 case EMPTY:
2395 case BEGLINE:
2396 case ENDLINE:
2397 case BEGWORD:
2398 case ENDWORD:
2399 case LIMWORD:
2400 case NOTLIMWORD:
2401 case BACKREF:
2402 resetmust(mp);
2403 break;
2404 case STAR:
2405 case QMARK:
2406 if (mp <= musts)
2407 goto done; /* "cannot happen" */
2408 --mp;
2409 resetmust(mp);
2410 break;
2411 case OR:
2412 case ORTOP:
2413 if (mp < &musts[2])
2414 goto done; /* "cannot happen" */
2415 {
2416 char **new;
2417 must *lmp;
2418 must *rmp;
2419 int j, ln, rn, n;
2420
2421 rmp = --mp;
2422 lmp = --mp;
2423 /* Guaranteed to be. Unlikely, but. . . */
2424 if (strcmp(lmp->is, rmp->is) != 0)
2425 lmp->is[0] = '\0';
2426 /* Left side--easy */
2427 i = 0;
2428 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
2429 ++i;
2430 lmp->left[i] = '\0';
2431 /* Right side */
2432 ln = strlen(lmp->right);
2433 rn = strlen(rmp->right);
2434 n = ln;
2435 if (n > rn)
2436 n = rn;
2437 for (i = 0; i < n; ++i)
2438 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
2439 break;
2440 for (j = 0; j < i; ++j)
2441 lmp->right[j] = lmp->right[(ln - i) + j];
2442 lmp->right[j] = '\0';
2443 new = inboth(lmp->in, rmp->in);
2444 if (new == NULL)
2445 goto done;
2446 freelist(lmp->in);
2447 free((char *) lmp->in);
2448 lmp->in = new;
2449 }
2450 break;
2451 case PLUS:
2452 if (mp <= musts)
2453 goto done; /* "cannot happen" */
2454 --mp;
2455 mp->is[0] = '\0';
2456 break;
2457 case END:
2458 if (mp != &musts[1])
2459 goto done; /* "cannot happen" */
2460 for (i = 0; musts[0].in[i] != NULL; ++i)
2461 if (strlen(musts[0].in[i]) > strlen(result))
2462 result = musts[0].in[i];
2463 if (strcmp(result, musts[0].is) == 0)
2464 exact = 1;
2465 goto done;
2466 case CAT:
2467 if (mp < &musts[2])
2468 goto done; /* "cannot happen" */
2469 {
2470 must *lmp;
2471 must *rmp;
2472
2473 rmp = --mp;
2474 lmp = --mp;
2475 /* In. Everything in left, plus everything in
2476 right, plus catenation of
2477 left's right and right's left. */
2478 lmp->in = addlists(lmp->in, rmp->in);
2479 if (lmp->in == NULL)
2480 goto done;
2481 if (lmp->right[0] != '\0' &&
2482 rmp->left[0] != '\0')
2483 {
2484 char *tp;
2485
2486 tp = icpyalloc(lmp->right);
2487 if (tp == NULL)
2488 goto done;
2489 tp = icatalloc(tp, rmp->left);
2490 if (tp == NULL)
2491 goto done;
2492 lmp->in = enlist(lmp->in, tp,
2493 strlen(tp));
2494 free(tp);
2495 if (lmp->in == NULL)
2496 goto done;
2497 }
2498 /* Left-hand */
2499 if (lmp->is[0] != '\0')
2500 {
2501 lmp->left = icatalloc(lmp->left,
2502 rmp->left);
2503 if (lmp->left == NULL)
2504 goto done;
2505 }
2506 /* Right-hand */
2507 if (rmp->is[0] == '\0')
2508 lmp->right[0] = '\0';
2509 lmp->right = icatalloc(lmp->right, rmp->right);
2510 if (lmp->right == NULL)
2511 goto done;
2512 /* Guaranteed to be */
2513 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
2514 {
2515 lmp->is = icatalloc(lmp->is, rmp->is);
2516 if (lmp->is == NULL)
2517 goto done;
2518 }
2519 else
2520 lmp->is[0] = '\0';
2521 }
2522 break;
2523 default:
2524 if (t < END)
2525 {
2526 /* "cannot happen" */
2527 goto done;
2528 }
2529 else if (t == '\0')
2530 {
2531 /* not on *my* shift */
2532 goto done;
2533 }
2534 else if (t >= CSET)
2535 {
2536 /* easy enough */
2537 resetmust(mp);
2538 }
2539 else
2540 {
2541 /* plain character */
2542 resetmust(mp);
2543 mp->is[0] = mp->left[0] = mp->right[0] = t;
2544 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2545 mp->in = enlist(mp->in, mp->is, (size_t)1);
2546 if (mp->in == NULL)
2547 goto done;
2548 }
2549 break;
2550 }
2551#ifdef DEBUG
2552 fprintf(stderr, " node: %d:", ri);
2553 prtok(dfa->tokens[ri]);
2554 fprintf(stderr, "\n in:");
2555 for (i = 0; mp->in[i]; ++i)
2556 fprintf(stderr, " \"%s\"", mp->in[i]);
2557 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
2558 fprintf(stderr, " left: \"%s\"\n", mp->left);
2559 fprintf(stderr, " right: \"%s\"\n", mp->right);
2560#endif
2561 ++mp;
2562 }
2563 done:
2564 if (strlen(result))
2565 {
2566 dm = (struct dfamust *) malloc(sizeof (struct dfamust));
2567 dm->exact = exact;
2568 dm->must = malloc(strlen(result) + 1);
2569 strcpy(dm->must, result);
2570 dm->next = dfa->musts;
2571 dfa->musts = dm;
2572 }
2573 mp = musts;
2574 for (i = 0; i <= dfa->tindex; ++i)
2575 {
2576 freelist(mp[i].in);
2577 ifree((char *) mp[i].in);
2578 ifree(mp[i].left);
2579 ifree(mp[i].right);
2580 ifree(mp[i].is);
2581 }
2582 free((char *) mp);
2583}