gitweb.dragonflybsd.org Git - dragonfly.git/blame

Commit	Line	Data
984263bc MD	1	/* dfa.c - deterministic extended regexp routines for GNU
	2	Copyright (C) 1988, 1998 Free Software Foundation, Inc.
	3
	4	This program is free software; you can redistribute it and/or modify
	5	it under the terms of the GNU General Public License as published by
	6	the Free Software Foundation; either version 2, or (at your option)
	7	any later version.
	8
	9	This program is distributed in the hope that it will be useful,
	10	but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	GNU General Public License for more details.
	13
	14	You should have received a copy of the GNU General Public License
	15	along with this program; if not, write to the Free Software
	16	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
	17
	18	/* Written June, 1988 by Mike Haertel
	19	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
	20
	21	/* $FreeBSD: src/gnu/usr.bin/grep/dfa.c,v 1.12 2000/01/31 13:28:08 ru Exp $ */
0e387216	22	/* $DragonFly: src/gnu/usr.bin/grep/dfa.c,v 1.3 2004/02/03 19:22:57 dillon Exp $ */
984263bc MD	23
	24	#ifdef HAVE_CONFIG_H
	25	#include <config.h>
	26	#endif
	27
	28	#include <assert.h>
	29	#include <ctype.h>
	30	#include <stdio.h>
	31
	32	#include <sys/types.h>
	33	#ifdef STDC_HEADERS
	34	#include <stdlib.h>
	35	#else
	36	extern char calloc(), malloc(), *realloc();
	37	extern void free();
	38	#endif
	39
	40	#if defined(HAVE_STRING_H) \|\| defined(STDC_HEADERS)
	41	#include <string.h>
	42	#undef index
	43	#define index strchr
	44	#else
	45	#include <strings.h>
	46	#endif
	47
	48	#ifndef DEBUG /* use the same approach as regex.c */
	49	#undef assert
	50	#define assert(e)
	51	#endif /* DEBUG */
	52
	53	#ifndef isgraph
	54	#define isgraph(C) (isprint(C) && !isspace(C))
	55	#endif
	56
	57	#if defined (STDC_HEADERS) \|\| (!defined (isascii) && !defined (HAVE_ISASCII))
	58	#define ISALPHA(C) isalpha(C)
	59	#define ISUPPER(C) isupper(C)
	60	#define ISLOWER(C) islower(C)
	61	#define ISDIGIT(C) isdigit(C)
	62	#define ISXDIGIT(C) isxdigit(C)
	63	#define ISSPACE(C) isspace(C)
	64	#define ISPUNCT(C) ispunct(C)
	65	#define ISALNUM(C) isalnum(C)
	66	#define ISPRINT(C) isprint(C)
	67	#define ISGRAPH(C) isgraph(C)
	68	#define ISCNTRL(C) iscntrl(C)
	69	#else
	70	#define ISALPHA(C) (isascii(C) && isalpha(C))
	71	#define ISUPPER(C) (isascii(C) && isupper(C))
	72	#define ISLOWER(C) (isascii(C) && islower(C))
	73	#define ISDIGIT(C) (isascii(C) && isdigit(C))
	74	#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
	75	#define ISSPACE(C) (isascii(C) && isspace(C))
	76	#define ISPUNCT(C) (isascii(C) && ispunct(C))
	77	#define ISALNUM(C) (isascii(C) && isalnum(C))
	78	#define ISPRINT(C) (isascii(C) && isprint(C))
	79	#define ISGRAPH(C) (isascii(C) && isgraph(C))
	80	#define ISCNTRL(C) (isascii(C) && iscntrl(C))
	81	#endif
	82
	83	/* ISASCIIDIGIT differs from ISDIGIT, as follows:
	84	- Its arg may be any int or unsigned int; it need not be an unsigned char.
	85	- It's guaranteed to evaluate its argument exactly once.
	86	- It's typically faster.
87	Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
88	only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
89	it's important to use the locale's definition of `digit' even when the
90	host does not conform to Posix. */
91	#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
92
93	/* If we (don't) have I18N. */
94	/* glibc defines _ */
95	#ifndef _
96	# ifdef HAVE_LIBINTL_H
97	# include <libintl.h>
98	# ifndef _
99	# define _(Str) gettext (Str)
100	# endif
101	# else
102	# define _(Str) (Str)
103	# endif
104	#endif
105
984263bc	106	#include <gnuregex.h>
984263bc MD	107	#include "dfa.h"
	108
	109	/* HPUX, define those as macros in sys/param.h */
	110	#ifdef setbit
	111	# undef setbit
	112	#endif
	113	#ifdef clrbit
	114	# undef clrbit
	115	#endif
	116
	117	static void dfamust PARAMS ((struct dfa *dfa));
	118
	119	static ptr_t xcalloc PARAMS ((size_t n, size_t s));
	120	static ptr_t xmalloc PARAMS ((size_t n));
	121	static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
	122	#ifdef DEBUG
	123	static void prtok PARAMS ((token t));
	124	#endif
	125	static int tstbit PARAMS ((int b, charclass c));
	126	static void setbit PARAMS ((int b, charclass c));
	127	static void clrbit PARAMS ((int b, charclass c));
	128	static void copyset PARAMS ((charclass src, charclass dst));
	129	static void zeroset PARAMS ((charclass s));
	130	static void notset PARAMS ((charclass s));
	131	static int equal PARAMS ((charclass s1, charclass s2));
	132	static int charclass_index PARAMS ((charclass s));
	133	static int looking_at PARAMS ((const char *s));
	134	static token lex PARAMS ((void));
	135	static void addtok PARAMS ((token t));
	136	static void atom PARAMS ((void));
	137	static int nsubtoks PARAMS ((int tindex));
	138	static void copytoks PARAMS ((int tindex, int ntokens));
	139	static void closure PARAMS ((void));
	140	static void branch PARAMS ((void));
	141	static void regexp PARAMS ((int toplevel));
	142	static void copy PARAMS ((position_set src, position_set dst));
	143	static void insert PARAMS ((position p, position_set *s));
	144	static void merge PARAMS ((position_set s1, position_set s2, position_set *m));
	145	static void delete PARAMS ((position p, position_set *s));
	146	static int state_index PARAMS ((struct dfa d, position_set s,
	147	int newline, int letter));
	148	static void build_state PARAMS ((int s, struct dfa *d));
	149	static void build_state_zero PARAMS ((struct dfa *d));
	150	static char icatalloc PARAMS ((char old, char *new));
	151	static char icpyalloc PARAMS ((char string));
	152	static char istrstr PARAMS ((char lookin, char *lookfor));
	153	static void ifree PARAMS ((char *cp));
	154	static void freelist PARAMS ((char **cpp));
	155	static char enlist PARAMS ((char cpp, char *new, size_t len));
	156	static char *comsubs PARAMS ((char left, char *right));
	157	static char addlists PARAMS ((char old, char **new));
	158	static char inboth PARAMS ((char left, char **right));
	159
	160	static ptr_t
	161	xcalloc (size_t n, size_t s)
	162	{
	163	ptr_t r = calloc(n, s);
	164
	165	if (!r)
	166	dfaerror(_("Memory exhausted"));
	167	return r;
	168	}
	169
	170	static ptr_t
171	xmalloc (size_t n)
172	{
173	ptr_t r = malloc(n);
174
175	assert(n != 0);
176	if (!r)
177	dfaerror(_("Memory exhausted"));
178	return r;
179	}
180
181	static ptr_t
182	xrealloc (ptr_t p, size_t n)
183	{
184	ptr_t r = realloc(p, n);
185
186	assert(n != 0);
187	if (!r)
188	dfaerror(_("Memory exhausted"));
189	return r;
190	}
191
192	#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
193	#define MALLOC(p, t, n) ((p) = (t ) xmalloc((n) sizeof (t)))
194	#define REALLOC(p, t, n) ((p) = (t ) xrealloc((ptr_t) (p), (n) sizeof (t)))
195
196	/* Reallocate an array of type t if nalloc is too small for index. */
197	#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
198	if ((index) >= (nalloc)) \
199	{ \
200	while ((index) >= (nalloc)) \
201	(nalloc) *= 2; \
202	REALLOC(p, t, nalloc); \
203	}
204
205	#ifdef DEBUG
206
207	static void
208	prtok (token t)
209	{
210	char *s;
211
212	if (t < 0)
213	fprintf(stderr, "END");
214	else if (t < NOTCHAR)
215	fprintf(stderr, "%c", t);
216	else
217	{
218	switch (t)
219	{
220	case EMPTY: s = "EMPTY"; break;
221	case BACKREF: s = "BACKREF"; break;
222	case BEGLINE: s = "BEGLINE"; break;
223	case ENDLINE: s = "ENDLINE"; break;
224	case BEGWORD: s = "BEGWORD"; break;
225	case ENDWORD: s = "ENDWORD"; break;
226	case LIMWORD: s = "LIMWORD"; break;
227	case NOTLIMWORD: s = "NOTLIMWORD"; break;
228	case QMARK: s = "QMARK"; break;
229	case STAR: s = "STAR"; break;
230	case PLUS: s = "PLUS"; break;
231	case CAT: s = "CAT"; break;
232	case OR: s = "OR"; break;
233	case ORTOP: s = "ORTOP"; break;
234	case LPAREN: s = "LPAREN"; break;
235	case RPAREN: s = "RPAREN"; break;
236	default: s = "CSET"; break;
237	}
238	fprintf(stderr, "%s", s);
239	}
240	}
241	#endif /* DEBUG */
242
243	/* Stuff pertaining to charclasses. */
244
245	static int
246	tstbit (int b, charclass c)
247	{
248	return c[b / INTBITS] & 1 << b % INTBITS;
249	}
250
251	static void
252	setbit (int b, charclass c)
253	{
254	c[b / INTBITS] \|= 1 << b % INTBITS;
255	}
256
257	static void
258	clrbit (int b, charclass c)
259	{
260	c[b / INTBITS] &= ~(1 << b % INTBITS);
261	}
262
263	static void
264	copyset (charclass src, charclass dst)
265	{
266	int i;
267
268	for (i = 0; i < CHARCLASS_INTS; ++i)
269	dst[i] = src[i];
270	}
271
272	static void
273	zeroset (charclass s)
274	{
275	int i;
276
277	for (i = 0; i < CHARCLASS_INTS; ++i)
278	s[i] = 0;
279	}
280
281	static void
282	notset (charclass s)
283	{
284	int i;
285
286	for (i = 0; i < CHARCLASS_INTS; ++i)
287	s[i] = ~s[i];
288	}
289
290	static int
291	equal (charclass s1, charclass s2)
292	{
293	int i;
294
295	for (i = 0; i < CHARCLASS_INTS; ++i)
296	if (s1[i] != s2[i])
297	return 0;
298	return 1;
299	}
300
301	/* A pointer to the current dfa is kept here during parsing. */
302	static struct dfa *dfa;
303
304	/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
305	static int
306	charclass_index (charclass s)
307	{
308	int i;
309
310	for (i = 0; i < dfa->cindex; ++i)
311	if (equal(s, dfa->charclasses[i]))
312	return i;
313	REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
314	++dfa->cindex;
315	copyset(s, dfa->charclasses[i]);
316	return i;
317	}
318
319	/* Syntax bits controlling the behavior of the lexical analyzer. */
320	static reg_syntax_t syntax_bits, syntax_bits_set;
321
322	/* Flag for case-folding letters into sets. */
323	static int case_fold;
324
325	/* End-of-line byte in data. */
326	static unsigned char eolbyte;
327
328	/* Entry point to set syntax options. */
329	void
330	dfasyntax (reg_syntax_t bits, int fold, int eol)
331	{
332	syntax_bits_set = 1;
333	syntax_bits = bits;
334	case_fold = fold;
335	eolbyte = eol;
336	}
337
338	/* Lexical analyzer. All the dross that deals with the obnoxious
339	GNU Regex syntax bits is located here. The poor, suffering
340	reader is referred to the GNU Regex documentation for the
341	meaning of the @#%!@#%^!@ syntax bits. */
342
343	static char lexstart; / Pointer to beginning of input string. */
344	static char lexptr; / Pointer to next input character. */
345	static int lexleft; /* Number of characters remaining. */
346	static token lasttok; /* Previous token returned; initially END. */
347	static int laststart; /* True if we're separated from beginning or (, \|
348	only by zero-width characters. */
349	static int parens; /* Count of outstanding left parens. */
350	static int minrep, maxrep; /* Repeat counts for {m,n}. */
351
352	/* Note that characters become unsigned here. */
353	#define FETCH(c, eoferr) \
354	{ \
355	if (! lexleft) \
356	{ \
357	if (eoferr != 0) \
358	dfaerror (eoferr); \
359	else \
360	return lasttok = END; \
361	} \
362	(c) = (unsigned char) *lexptr++; \
363	--lexleft; \
364	}
365
366	#ifdef __STDC__
367	#define FUNC(F, P) static int F(int c) { return P(c); }
368	#else
369	#define FUNC(F, P) static int F(c) int c; { return P(c); }
370	#endif
371
372	FUNC(is_alpha, ISALPHA)
373	FUNC(is_upper, ISUPPER)
374	FUNC(is_lower, ISLOWER)
375	FUNC(is_digit, ISDIGIT)
376	FUNC(is_xdigit, ISXDIGIT)
377	FUNC(is_space, ISSPACE)
378	FUNC(is_punct, ISPUNCT)
379	FUNC(is_alnum, ISALNUM)
380	FUNC(is_print, ISPRINT)
381	FUNC(is_graph, ISGRAPH)
382	FUNC(is_cntrl, ISCNTRL)
383
384	static int
385	is_blank (int c)
386	{
387	return (c == ' ' \|\| c == '\t');
388	}
389
390	/* The following list maps the names of the Posix named character classes
391	to predicate functions that determine whether a given character is in
392	the class. The leading [ has already been eaten by the lexical analyzer. */
393	static struct {
394	const char *name;
395	int (*pred) PARAMS ((int));
396	} prednames[] = {
397	{ ":alpha:]", is_alpha },
398	{ ":upper:]", is_upper },
399	{ ":lower:]", is_lower },
400	{ ":digit:]", is_digit },
401	{ ":xdigit:]", is_xdigit },
402	{ ":space:]", is_space },
403	{ ":punct:]", is_punct },
404	{ ":alnum:]", is_alnum },
405	{ ":print:]", is_print },
406	{ ":graph:]", is_graph },
407	{ ":cntrl:]", is_cntrl },
408	{ ":blank:]", is_blank },
409	{ 0 }
410	};
411
412	/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
413	#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) \|\| (C) == '_')
414
415	static int
416	looking_at (char const *s)
417	{
418	size_t len;
419
420	len = strlen(s);
421	if (lexleft < len)
422	return 0;
423	return strncmp(s, lexptr, len) == 0;
424	}
425
426	static token
427	lex (void)
428	{
429	token c, c1, c2;
430	int backslash = 0, invert;
431	charclass ccl;
432	int i;
433	char lo[2];
434	char hi[2];
435
436	/* Basic plan: We fetch a character. If it's a backslash,
437	we set the backslash flag and go through the loop again.
438	On the plus side, this avoids having a duplicate of the
439	main switch inside the backslash case. On the minus side,
440	it means that just about every case begins with
441	"if (backslash) ...". */
442	for (i = 0; i < 2; ++i)
443	{
444	FETCH(c, 0);
445	switch (c)
446	{
447	case '\\':
448	if (backslash)
449	goto normal_char;
450	if (lexleft == 0)
451	dfaerror(_("Unfinished \\ escape"));
452	backslash = 1;
453	break;
454
455	case '^':
456	if (backslash)
457	goto normal_char;
458	if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
459	\|\| lasttok == END
460	\|\| lasttok == LPAREN
461	\|\| lasttok == OR)
462	return lasttok = BEGLINE;
463	goto normal_char;
464
465	case '$':
466	if (backslash)
467	goto normal_char;
468	if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
469	\|\| lexleft == 0
470	\|\| (syntax_bits & RE_NO_BK_PARENS
471	? lexleft > 0 && *lexptr == ')'
472	: lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
473	\|\| (syntax_bits & RE_NO_BK_VBAR
474	? lexleft > 0 && *lexptr == '\|'
475	: lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '\|')
476	\|\| ((syntax_bits & RE_NEWLINE_ALT)
477	&& lexleft > 0 && *lexptr == '\n'))
478	return lasttok = ENDLINE;
479	goto normal_char;
480
481	case '1':
482	case '2':
483	case '3':
484	case '4':
485	case '5':
486	case '6':
487	case '7':
488	case '8':
489	case '9':
490	if (backslash && !(syntax_bits & RE_NO_BK_REFS))
491	{
492	laststart = 0;
493	return lasttok = BACKREF;
494	}
495	goto normal_char;
496
497	case '`':
498	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
499	return lasttok = BEGLINE; /* FIXME: should be beginning of string */
500	goto normal_char;
501
502	case '\'':
503	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
504	return lasttok = ENDLINE; /* FIXME: should be end of string */
505	goto normal_char;
506
507	case '<':
508	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
509	return lasttok = BEGWORD;
510	goto normal_char;
511
512	case '>':
513	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
514	return lasttok = ENDWORD;
515	goto normal_char;
516
517	case 'b':
518	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
519	return lasttok = LIMWORD;
520	goto normal_char;
521
522	case 'B':
523	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
524	return lasttok = NOTLIMWORD;
525	goto normal_char;
526
527	case '?':
528	if (syntax_bits & RE_LIMITED_OPS)
529	goto normal_char;
530	if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
531	goto normal_char;
532	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
533	goto normal_char;
534	return lasttok = QMARK;
535
536	case '*':
537	if (backslash)
538	goto normal_char;
539	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
540	goto normal_char;
541	return lasttok = STAR;
542
543	case '+':
544	if (syntax_bits & RE_LIMITED_OPS)
545	goto normal_char;
546	if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
547	goto normal_char;
548	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
549	goto normal_char;
550	return lasttok = PLUS;
551
552	case '{':
553	if (!(syntax_bits & RE_INTERVALS))
554	goto normal_char;
555	if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
556	goto normal_char;
557	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
558	goto normal_char;
559
560	if (syntax_bits & RE_NO_BK_BRACES)
561	{
562	/* Scan ahead for a valid interval; if it's not valid,
563	treat it as a literal '{'. */
564	int lo = -1, hi = -1;
565	char const *p = lexptr;
566	char const *lim = p + lexleft;
567	for (; p != lim && ISASCIIDIGIT (*p); p++)
568	lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
569	if (p != lim && *p == ',')
570	while (++p != lim && ISASCIIDIGIT (*p))
571	hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
572	else
573	hi = lo;
574	if (p == lim \|\| *p != '}'
575	\|\| lo < 0 \|\| RE_DUP_MAX < hi \|\| (0 <= hi && hi < lo))
576	goto normal_char;
577	}
578
579	minrep = 0;
580	/* Cases:
581	{M} - exact count
582	{M,} - minimum count, maximum is infinity
583	{M,N} - M through N */
584	FETCH(c, _("unfinished repeat count"));
585	if (ISASCIIDIGIT (c))
586	{
587	minrep = c - '0';
588	for (;;)
589	{
590	FETCH(c, _("unfinished repeat count"));
591	if (! ISASCIIDIGIT (c))
592	break;
593	minrep = 10 * minrep + c - '0';
594	}
595	}
596	else
597	dfaerror(_("malformed repeat count"));
598	if (c == ',')
599	{
600	FETCH (c, _("unfinished repeat count"));
601	if (! ISASCIIDIGIT (c))
602	maxrep = -1;
603	else
604	{
605	maxrep = c - '0';
606	for (;;)
607	{
608	FETCH (c, _("unfinished repeat count"));
609	if (! ISASCIIDIGIT (c))
610	break;
611	maxrep = 10 * maxrep + c - '0';
612	}
613	if (0 <= maxrep && maxrep < minrep)
614	dfaerror (_("malformed repeat count"));
615	}
616	}
617	else
618	maxrep = minrep;
619	if (!(syntax_bits & RE_NO_BK_BRACES))
620	{
621	if (c != '\\')
622	dfaerror(_("malformed repeat count"));
623	FETCH(c, _("unfinished repeat count"));
624	}
625	if (c != '}')
626	dfaerror(_("malformed repeat count"));
627	laststart = 0;
628	return lasttok = REPMN;
629
630	case '\|':
631	if (syntax_bits & RE_LIMITED_OPS)
632	goto normal_char;
633	if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
634	goto normal_char;
635	laststart = 1;
636	return lasttok = OR;
637
638	case '\n':
639	if (syntax_bits & RE_LIMITED_OPS
640	\|\| backslash
641	\|\| !(syntax_bits & RE_NEWLINE_ALT))
642	goto normal_char;
643	laststart = 1;
644	return lasttok = OR;
645
646	case '(':
647	if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
648	goto normal_char;
649	++parens;
650	laststart = 1;
651	return lasttok = LPAREN;
652
653	case ')':
654	if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
655	goto normal_char;
656	if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
657	goto normal_char;
658	--parens;
659	laststart = 0;
660	return lasttok = RPAREN;
661
662	case '.':
663	if (backslash)
664	goto normal_char;
665	zeroset(ccl);
666	notset(ccl);
667	if (!(syntax_bits & RE_DOT_NEWLINE))
668	clrbit(eolbyte, ccl);
669	if (syntax_bits & RE_DOT_NOT_NULL)
670	clrbit('\0', ccl);
671	laststart = 0;
672	return lasttok = CSET + charclass_index(ccl);
673
674	case 'w':
675	case 'W':
676	if (!backslash \|\| (syntax_bits & RE_NO_GNU_OPS))
677	goto normal_char;
678	zeroset(ccl);
679	for (c2 = 0; c2 < NOTCHAR; ++c2)
680	if (IS_WORD_CONSTITUENT(c2))
681	setbit(c2, ccl);
682	if (c == 'W')
683	notset(ccl);
684	laststart = 0;
685	return lasttok = CSET + charclass_index(ccl);
686
687	case '[':
688	if (backslash)
689	goto normal_char;
690	zeroset(ccl);
691	FETCH(c, _("Unbalanced ["));
692	if (c == '^')
693	{
694	FETCH(c, _("Unbalanced ["));
695	invert = 1;
696	}
697	else
698	invert = 0;
699	do
700	{
701	/* Nobody ever said this had to be fast. :-)
702	Note that if we're looking at some other [:...:]
703	construct, we just treat it as a bunch of ordinary
704	characters. We can do this because we assume
705	regex has checked for syntax errors before
706	dfa is ever called. */
707	if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
708	for (c1 = 0; prednames[c1].name; ++c1)
709	if (looking_at(prednames[c1].name))
710	{
711	int (*pred)() = prednames[c1].pred;
712	if (case_fold
713	&& (pred == is_upper \|\| pred == is_lower))
714	pred = is_alpha;
715
716	for (c2 = 0; c2 < NOTCHAR; ++c2)
717	if ((*pred)(c2))
718	setbit(c2, ccl);
719	lexptr += strlen(prednames[c1].name);
720	lexleft -= strlen(prednames[c1].name);
721	FETCH(c1, _("Unbalanced ["));
722	goto skip;
723	}
724	if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
725	FETCH(c, _("Unbalanced ["));
726	FETCH(c1, _("Unbalanced ["));
727	if (c1 == '-')
728	{
729	FETCH(c2, _("Unbalanced ["));
730	if (c2 == ']')
731	{
732	/* In the case [x-], the - is an ordinary hyphen,
733	which is left in c1, the lookahead character. */
734	--lexptr;
735	++lexleft;
736	c2 = c;
737	}
738	else
739	{
740	if (c2 == '\\'
741	&& (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
742	FETCH(c2, _("Unbalanced ["));
743	FETCH(c1, _("Unbalanced ["));
744	}
745	}
746	else
747	c2 = c;
748
749	lo[0] = c; lo[1] = '\0';
750	hi[0] = c2; hi[1] = '\0';
751	for (c = 0; c < NOTCHAR; c++)
752	{
753	char ch[2];
754	ch[0] = c; ch[1] = '\0';
755	if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0)
756	{
757	setbit (c, ccl);
758	if (case_fold)
759	{
760	if (ISUPPER (c))
761	setbit (tolower (c), ccl);
762	else if (ISLOWER (c))
763	setbit (toupper (c), ccl);
764	}
765	}
766	}
767
768	skip:
769	;
770	}
771	while ((c = c1) != ']');
772	if (invert)
773	{
774	notset(ccl);
775	if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
776	clrbit(eolbyte, ccl);
777	}
778	laststart = 0;
779	return lasttok = CSET + charclass_index(ccl);
780
781	default:
782	normal_char:
783	laststart = 0;
784	if (case_fold && ISALPHA(c))
785	{
786	zeroset(ccl);
787	setbit(c, ccl);
788	if (isupper(c))
789	setbit(tolower(c), ccl);
790	else
791	setbit(toupper(c), ccl);
792	return lasttok = CSET + charclass_index(ccl);
793	}
794	return c;
795	}
796	}
797
798	/* The above loop should consume at most a backslash
799	and some other character. */
800	abort();
801	return END; /* keeps pedantic compilers happy. */
802	}
803
804	/* Recursive descent parser for regular expressions. */
805
806	static token tok; /* Lookahead token. */
807	static int depth; /* Current depth of a hypothetical stack
808	holding deferred productions. This is
809	used to determine the depth that will be
810	required of the real stack later on in
811	dfaanalyze(). */
812
813	/* Add the given token to the parse tree, maintaining the depth count and
814	updating the maximum depth if necessary. */
815	static void
816	addtok (token t)
817	{
818	REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
819	dfa->tokens[dfa->tindex++] = t;
820
821	switch (t)
822	{
823	case QMARK:
824	case STAR:
825	case PLUS:
826	break;
827
828	case CAT:
829	case OR:
830	case ORTOP:
831	--depth;
832	break;
833
834	default:
835	++dfa->nleaves;
836	case EMPTY:
837	++depth;
838	break;
839	}
840	if (depth > dfa->depth)
841	dfa->depth = depth;
842	}
843
844	/* The grammar understood by the parser is as follows.
845
846	regexp:
847	regexp OR branch
848	branch
849
850	branch:
851	branch closure
852	closure
853
854	closure:
855	closure QMARK
856	closure STAR
857	closure PLUS
858	atom
859
860	atom:
861	<normal character>
862	CSET
863	BACKREF
864	BEGLINE
865	ENDLINE
866	BEGWORD
867	ENDWORD
868	LIMWORD
869	NOTLIMWORD
870	<empty>
871
872	The parser builds a parse tree in postfix form in an array of tokens. */
873
874	static void
875	atom (void)
876	{
877	if ((tok >= 0 && tok < NOTCHAR) \|\| tok >= CSET \|\| tok == BACKREF
878	\|\| tok == BEGLINE \|\| tok == ENDLINE \|\| tok == BEGWORD
879	\|\| tok == ENDWORD \|\| tok == LIMWORD \|\| tok == NOTLIMWORD)
880	{
881	addtok(tok);
882	tok = lex();
883	}
884	else if (tok == LPAREN)
885	{
886	tok = lex();
887	regexp(0);
888	if (tok != RPAREN)
889	dfaerror(_("Unbalanced ("));
890	tok = lex();
891	}
892	else
893	addtok(EMPTY);
894	}
895
896	/* Return the number of tokens in the given subexpression. */
897	static int
898	nsubtoks (int tindex)
899	{
900	int ntoks1;
901
902	switch (dfa->tokens[tindex - 1])
903	{
904	default:
905	return 1;
906	case QMARK:
907	case STAR:
908	case PLUS:
909	return 1 + nsubtoks(tindex - 1);
910	case CAT:
911	case OR:
912	case ORTOP:
913	ntoks1 = nsubtoks(tindex - 1);
914	return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
915	}
916	}
917
918	/* Copy the given subexpression to the top of the tree. */
919	static void
920	copytoks (int tindex, int ntokens)
921	{
922	int i;
923
924	for (i = 0; i < ntokens; ++i)
925	addtok(dfa->tokens[tindex + i]);
926	}
927
928	static void
929	closure (void)
930	{
931	int tindex, ntokens, i;
932
933	atom();
934	while (tok == QMARK \|\| tok == STAR \|\| tok == PLUS \|\| tok == REPMN)
935	if (tok == REPMN)
936	{
937	ntokens = nsubtoks(dfa->tindex);
938	tindex = dfa->tindex - ntokens;
939	if (maxrep < 0)
940	addtok(PLUS);
941	if (minrep == 0)
942	addtok(QMARK);
943	for (i = 1; i < minrep; ++i)
944	{
945	copytoks(tindex, ntokens);
946	addtok(CAT);
947	}
948	for (; i < maxrep; ++i)
949	{
950	copytoks(tindex, ntokens);
951	addtok(QMARK);
952	addtok(CAT);
953	}
954	tok = lex();
955	}
956	else
957	{
958	addtok(tok);
959	tok = lex();
960	}
961	}
962
963	static void
964	branch (void)
965	{
966	closure();
967	while (tok != RPAREN && tok != OR && tok >= 0)
968	{
969	closure();
970	addtok(CAT);
971	}
972	}
973
974	static void
975	regexp (int toplevel)
976	{
977	branch();
978	while (tok == OR)
979	{
980	tok = lex();
981	branch();
982	if (toplevel)
983	addtok(ORTOP);
984	else
985	addtok(OR);
986	}
987	}
988
989	/* Main entry point for the parser. S is a string to be parsed, len is the
990	length of the string, so s can include NUL characters. D is a pointer to
991	the struct dfa to parse into. */
992	void
993	dfaparse (char s, size_t len, struct dfa d)
994	{
995	dfa = d;
996	lexstart = lexptr = s;
997	lexleft = len;
998	lasttok = END;
999	laststart = 1;
1000	parens = 0;
1001
1002	if (! syntax_bits_set)
1003	dfaerror(_("No syntax specified"));
1004
1005	tok = lex();
1006	depth = d->depth;
1007
1008	regexp(1);
1009
1010	if (tok != END)
1011	dfaerror(_("Unbalanced )"));
1012
1013	addtok(END - d->nregexps);
1014	addtok(CAT);
1015
1016	if (d->nregexps)
1017	addtok(ORTOP);
1018
1019	++d->nregexps;
1020	}
1021
1022	/* Some primitives for operating on sets of positions. */
1023
1024	/* Copy one set to another; the destination must be large enough. */
1025	static void
1026	copy (position_set src, position_set dst)
1027	{
1028	int i;
1029
1030	for (i = 0; i < src->nelem; ++i)
1031	dst->elems[i] = src->elems[i];
1032	dst->nelem = src->nelem;
1033	}
1034
1035	/* Insert a position in a set. Position sets are maintained in sorted
1036	order according to index. If position already exists in the set with
1037	the same index then their constraints are logically or'd together.
1038	S->elems must point to an array large enough to hold the resulting set. */
1039	static void
1040	insert (position p, position_set *s)
1041	{
1042	int i;
1043	position t1, t2;
1044
1045	for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1046	continue;
1047	if (i < s->nelem && p.index == s->elems[i].index)
1048	s->elems[i].constraint \|= p.constraint;
1049	else
1050	{
1051	t1 = p;
1052	++s->nelem;
1053	while (i < s->nelem)
1054	{
1055	t2 = s->elems[i];
1056	s->elems[i++] = t1;
1057	t1 = t2;
1058	}
1059	}
1060	}
1061
1062	/* Merge two sets of positions into a third. The result is exactly as if
1063	the positions of both sets were inserted into an initially empty set. */
1064	static void
1065	merge (position_set s1, position_set s2, position_set *m)
1066	{
1067	int i = 0, j = 0;
1068
1069	m->nelem = 0;
1070	while (i < s1->nelem && j < s2->nelem)
1071	if (s1->elems[i].index > s2->elems[j].index)
1072	m->elems[m->nelem++] = s1->elems[i++];
1073	else if (s1->elems[i].index < s2->elems[j].index)
1074	m->elems[m->nelem++] = s2->elems[j++];
1075	else
1076	{
1077	m->elems[m->nelem] = s1->elems[i++];
1078	m->elems[m->nelem++].constraint \|= s2->elems[j++].constraint;
1079	}
1080	while (i < s1->nelem)
1081	m->elems[m->nelem++] = s1->elems[i++];
1082	while (j < s2->nelem)
1083	m->elems[m->nelem++] = s2->elems[j++];
1084	}
1085
1086	/* Delete a position from a set. */
1087	static void
1088	delete (position p, position_set *s)
1089	{
1090	int i;
1091
1092	for (i = 0; i < s->nelem; ++i)
1093	if (p.index == s->elems[i].index)
1094	break;
1095	if (i < s->nelem)
1096	for (--s->nelem; i < s->nelem; ++i)
1097	s->elems[i] = s->elems[i + 1];
1098	}
1099
1100	/* Find the index of the state corresponding to the given position set with
1101	the given preceding context, or create a new state if there is no such
1102	state. Newline and letter tell whether we got here on a newline or
1103	letter, respectively. */
1104	static int
1105	state_index (struct dfa d, position_set s, int newline, int letter)
1106	{
1107	int hash = 0;
1108	int constraint;
1109	int i, j;
1110
1111	newline = newline ? 1 : 0;
1112	letter = letter ? 1 : 0;
1113
1114	for (i = 0; i < s->nelem; ++i)
1115	hash ^= s->elems[i].index + s->elems[i].constraint;
1116
1117	/* Try to find a state that exactly matches the proposed one. */
1118	for (i = 0; i < d->sindex; ++i)
1119	{
1120	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
1121	\|\| newline != d->states[i].newline \|\| letter != d->states[i].letter)
1122	continue;
1123	for (j = 0; j < s->nelem; ++j)
1124	if (s->elems[j].constraint
1125	!= d->states[i].elems.elems[j].constraint
1126	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
1127	break;
1128	if (j == s->nelem)
1129	return i;
1130	}
1131
1132	/* We'll have to create a new state. */
1133	REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1134	d->states[i].hash = hash;
1135	MALLOC(d->states[i].elems.elems, position, s->nelem);
1136	copy(s, &d->states[i].elems);
1137	d->states[i].newline = newline;
1138	d->states[i].letter = letter;
1139	d->states[i].backref = 0;
1140	d->states[i].constraint = 0;
1141	d->states[i].first_end = 0;
1142	for (j = 0; j < s->nelem; ++j)
1143	if (d->tokens[s->elems[j].index] < 0)
1144	{
1145	constraint = s->elems[j].constraint;
1146	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1147	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1148	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1149	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1150	d->states[i].constraint \|= constraint;
1151	if (! d->states[i].first_end)
1152	d->states[i].first_end = d->tokens[s->elems[j].index];
1153	}
1154	else if (d->tokens[s->elems[j].index] == BACKREF)
1155	{
1156	d->states[i].constraint = NO_CONSTRAINT;
1157	d->states[i].backref = 1;
1158	}
1159
1160	++d->sindex;
1161
1162	return i;
1163	}
1164
1165	/* Find the epsilon closure of a set of positions. If any position of the set
1166	contains a symbol that matches the empty string in some context, replace
1167	that position with the elements of its follow labeled with an appropriate
1168	constraint. Repeat exhaustively until no funny positions are left.
1169	S->elems must be large enough to hold the result. */
1170	static void
1171	epsclosure (position_set s, struct dfa d)
1172	{
1173	int i, j;
1174	int *visited;
1175	position p, old;
1176
1177	MALLOC(visited, int, d->tindex);
1178	for (i = 0; i < d->tindex; ++i)
1179	visited[i] = 0;
1180
1181	for (i = 0; i < s->nelem; ++i)
1182	if (d->tokens[s->elems[i].index] >= NOTCHAR
1183	&& d->tokens[s->elems[i].index] != BACKREF
1184	&& d->tokens[s->elems[i].index] < CSET)
1185	{
1186	old = s->elems[i];
1187	p.constraint = old.constraint;
1188	delete(s->elems[i], s);
1189	if (visited[old.index])
1190	{
1191	--i;
1192	continue;
1193	}
1194	visited[old.index] = 1;
1195	switch (d->tokens[old.index])
1196	{
1197	case BEGLINE:
1198	p.constraint &= BEGLINE_CONSTRAINT;
1199	break;
1200	case ENDLINE:
1201	p.constraint &= ENDLINE_CONSTRAINT;
1202	break;
1203	case BEGWORD:
1204	p.constraint &= BEGWORD_CONSTRAINT;
1205	break;
1206	case ENDWORD:
1207	p.constraint &= ENDWORD_CONSTRAINT;
1208	break;
1209	case LIMWORD:
1210	p.constraint &= LIMWORD_CONSTRAINT;
1211	break;
1212	case NOTLIMWORD:
1213	p.constraint &= NOTLIMWORD_CONSTRAINT;
1214	break;
1215	default:
1216	break;
1217	}
1218	for (j = 0; j < d->follows[old.index].nelem; ++j)
1219	{
1220	p.index = d->follows[old.index].elems[j].index;
1221	insert(p, s);
1222	}
1223	/* Force rescan to start at the beginning. */
1224	i = -1;
1225	}
1226
1227	free(visited);
1228	}
1229
1230	/* Perform bottom-up analysis on the parse tree, computing various functions.
1231	Note that at this point, we're pretending constructs like \< are real
1232	characters rather than constraints on what can follow them.
1233
1234	Nullable: A node is nullable if it is at the root of a regexp that can
1235	match the empty string.
1236	* EMPTY leaves are nullable.
1237	* No other leaf is nullable.
1238	* A QMARK or STAR node is nullable.
1239	* A PLUS node is nullable if its argument is nullable.
1240	* A CAT node is nullable if both its arguments are nullable.
1241	* An OR node is nullable if either argument is nullable.
1242
1243	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1244	that could correspond to the first character of a string matching the
1245	regexp rooted at the given node.
1246	* EMPTY leaves have empty firstpos.
1247	* The firstpos of a nonempty leaf is that leaf itself.
1248	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1249	argument.
1250	* The firstpos of a CAT node is the firstpos of the left argument, union
1251	the firstpos of the right if the left argument is nullable.
1252	* The firstpos of an OR node is the union of firstpos of each argument.
1253
1254	Lastpos: The lastpos of a node is the set of positions that could
1255	correspond to the last character of a string matching the regexp at
1256	the given node.
1257	* EMPTY leaves have empty lastpos.
1258	* The lastpos of a nonempty leaf is that leaf itself.
1259	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1260	argument.
1261	* The lastpos of a CAT node is the lastpos of its right argument, union
1262	the lastpos of the left if the right argument is nullable.
1263	* The lastpos of an OR node is the union of the lastpos of each argument.
1264
1265	Follow: The follow of a position is the set of positions that could
1266	correspond to the character following a character matching the node in
1267	a string matching the regexp. At this point we consider special symbols
1268	that match the empty string in some context to be just normal characters.
1269	Later, if we find that a special symbol is in a follow set, we will
1270	replace it with the elements of its follow, labeled with an appropriate
1271	constraint.
1272	* Every node in the firstpos of the argument of a STAR or PLUS node is in
1273	the follow of every node in the lastpos.
1274	* Every node in the firstpos of the second argument of a CAT node is in
1275	the follow of every node in the lastpos of the first argument.
1276
1277	Because of the postfix representation of the parse tree, the depth-first
1278	analysis is conveniently done by a linear scan with the aid of a stack.
1279	Sets are stored as arrays of the elements, obeying a stack-like allocation
1280	scheme; the number of elements in each set deeper in the stack can be
1281	used to determine the address of a particular set's array. */
1282	void
1283	dfaanalyze (struct dfa *d, int searchflag)
1284	{
1285	int nullable; / Nullable stack. */
1286	int nfirstpos; / Element count stack for firstpos sets. */
1287	position firstpos; / Array where firstpos elements are stored. */
1288	int nlastpos; / Element count stack for lastpos sets. */
1289	position lastpos; / Array where lastpos elements are stored. */
1290	int nalloc; / Sizes of arrays allocated to follow sets. */
1291	position_set tmp; /* Temporary set for merging sets. */
1292	position_set merged; /* Result of merging sets. */
1293	int wants_newline; /* True if some position wants newline info. */
1294	int *o_nullable;
1295	int o_nfirst, o_nlast;
1296	position o_firstpos, o_lastpos;
1297	int i, j;
1298	position *pos;
1299
1300	#ifdef DEBUG
1301	fprintf(stderr, "dfaanalyze:\n");
1302	for (i = 0; i < d->tindex; ++i)
1303	{
1304	fprintf(stderr, " %d:", i);
1305	prtok(d->tokens[i]);
1306	}
1307	putc('\n', stderr);
1308	#endif
1309
1310	d->searchflag = searchflag;
1311
1312	MALLOC(nullable, int, d->depth);
1313	o_nullable = nullable;
1314	MALLOC(nfirstpos, int, d->depth);
1315	o_nfirst = nfirstpos;
1316	MALLOC(firstpos, position, d->nleaves);
1317	o_firstpos = firstpos, firstpos += d->nleaves;
1318	MALLOC(nlastpos, int, d->depth);
1319	o_nlast = nlastpos;
1320	MALLOC(lastpos, position, d->nleaves);
1321	o_lastpos = lastpos, lastpos += d->nleaves;
1322	MALLOC(nalloc, int, d->tindex);
1323	for (i = 0; i < d->tindex; ++i)
1324	nalloc[i] = 0;
1325	MALLOC(merged.elems, position, d->nleaves);
1326
1327	CALLOC(d->follows, position_set, d->tindex);
1328
1329	for (i = 0; i < d->tindex; ++i)
1330	#ifdef DEBUG
1331	{ /* Nonsyntactic #ifdef goo... */
1332	#endif
1333	switch (d->tokens[i])
1334	{
1335	case EMPTY:
1336	/* The empty set is nullable. */
1337	*nullable++ = 1;
1338
1339	/* The firstpos and lastpos of the empty leaf are both empty. */
1340	nfirstpos++ = nlastpos++ = 0;
1341	break;
1342
1343	case STAR:
1344	case PLUS:
1345	/* Every element in the firstpos of the argument is in the follow
1346	of every element in the lastpos. */
1347	tmp.nelem = nfirstpos[-1];
1348	tmp.elems = firstpos;
1349	pos = lastpos;
1350	for (j = 0; j < nlastpos[-1]; ++j)
1351	{
1352	merge(&tmp, &d->follows[pos[j].index], &merged);
1353	REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1354	nalloc[pos[j].index], merged.nelem - 1);
1355	copy(&merged, &d->follows[pos[j].index]);
1356	}
1357
1358	case QMARK:
1359	/* A QMARK or STAR node is automatically nullable. */
1360	if (d->tokens[i] != PLUS)
1361	nullable[-1] = 1;
1362	break;
1363
1364	case CAT:
1365	/* Every element in the firstpos of the second argument is in the
1366	follow of every element in the lastpos of the first argument. */
1367	tmp.nelem = nfirstpos[-1];
1368	tmp.elems = firstpos;
1369	pos = lastpos + nlastpos[-1];
1370	for (j = 0; j < nlastpos[-2]; ++j)
1371	{
1372	merge(&tmp, &d->follows[pos[j].index], &merged);
1373	REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1374	nalloc[pos[j].index], merged.nelem - 1);
1375	copy(&merged, &d->follows[pos[j].index]);
1376	}
1377
1378	/* The firstpos of a CAT node is the firstpos of the first argument,
1379	union that of the second argument if the first is nullable. */
1380	if (nullable[-2])
1381	nfirstpos[-2] += nfirstpos[-1];
1382	else
1383	firstpos += nfirstpos[-1];
1384	--nfirstpos;
1385
1386	/* The lastpos of a CAT node is the lastpos of the second argument,
1387	union that of the first argument if the second is nullable. */
1388	if (nullable[-1])
1389	nlastpos[-2] += nlastpos[-1];
1390	else
1391	{
1392	pos = lastpos + nlastpos[-2];
1393	for (j = nlastpos[-1] - 1; j >= 0; --j)
1394	pos[j] = lastpos[j];
1395	lastpos += nlastpos[-2];
1396	nlastpos[-2] = nlastpos[-1];
1397	}
1398	--nlastpos;
1399
1400	/* A CAT node is nullable if both arguments are nullable. */
1401	nullable[-2] = nullable[-1] && nullable[-2];
1402	--nullable;
1403	break;
1404
1405	case OR:
1406	case ORTOP:
1407	/* The firstpos is the union of the firstpos of each argument. */
1408	nfirstpos[-2] += nfirstpos[-1];
1409	--nfirstpos;
1410
1411	/* The lastpos is the union of the lastpos of each argument. */
1412	nlastpos[-2] += nlastpos[-1];
1413	--nlastpos;
1414
1415	/* An OR node is nullable if either argument is nullable. */
1416	nullable[-2] = nullable[-1] \|\| nullable[-2];
1417	--nullable;
1418	break;
1419
1420	default:
1421	/* Anything else is a nonempty position. (Note that special
1422	constructs like \< are treated as nonempty strings here;
1423	an "epsilon closure" effectively makes them nullable later.
1424	Backreferences have to get a real position so we can detect
1425	transitions on them later. But they are nullable. */
1426	*nullable++ = d->tokens[i] == BACKREF;
1427
1428	/* This position is in its own firstpos and lastpos. */
1429	nfirstpos++ = nlastpos++ = 1;
1430	--firstpos, --lastpos;
1431	firstpos->index = lastpos->index = i;
1432	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1433
1434	/* Allocate the follow set for this position. */
1435	nalloc[i] = 1;
1436	MALLOC(d->follows[i].elems, position, nalloc[i]);
1437	break;
1438	}
1439	#ifdef DEBUG
1440	/* ... balance the above nonsyntactic #ifdef goo... */
1441	fprintf(stderr, "node %d:", i);
1442	prtok(d->tokens[i]);
1443	putc('\n', stderr);
1444	fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1445	fprintf(stderr, " firstpos:");
1446	for (j = nfirstpos[-1] - 1; j >= 0; --j)
1447	{
1448	fprintf(stderr, " %d:", firstpos[j].index);
1449	prtok(d->tokens[firstpos[j].index]);
1450	}
1451	fprintf(stderr, "\n lastpos:");
1452	for (j = nlastpos[-1] - 1; j >= 0; --j)
1453	{
1454	fprintf(stderr, " %d:", lastpos[j].index);
1455	prtok(d->tokens[lastpos[j].index]);
1456	}
1457	putc('\n', stderr);
1458	}
1459	#endif
1460
1461	/* For each follow set that is the follow set of a real position, replace
1462	it with its epsilon closure. */
1463	for (i = 0; i < d->tindex; ++i)
1464	if (d->tokens[i] < NOTCHAR \|\| d->tokens[i] == BACKREF
1465	\|\| d->tokens[i] >= CSET)
1466	{
1467	#ifdef DEBUG
1468	fprintf(stderr, "follows(%d:", i);
1469	prtok(d->tokens[i]);
1470	fprintf(stderr, "):");
1471	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1472	{
1473	fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1474	prtok(d->tokens[d->follows[i].elems[j].index]);
1475	}
1476	putc('\n', stderr);
1477	#endif
1478	copy(&d->follows[i], &merged);
1479	epsclosure(&merged, d);
1480	if (d->follows[i].nelem < merged.nelem)
1481	REALLOC(d->follows[i].elems, position, merged.nelem);
1482	copy(&merged, &d->follows[i]);
1483	}
1484
1485	/* Get the epsilon closure of the firstpos of the regexp. The result will
1486	be the set of positions of state 0. */
1487	merged.nelem = 0;
1488	for (i = 0; i < nfirstpos[-1]; ++i)
1489	insert(firstpos[i], &merged);
1490	epsclosure(&merged, d);
1491
1492	/* Check if any of the positions of state 0 will want newline context. */
1493	wants_newline = 0;
1494	for (i = 0; i < merged.nelem; ++i)
1495	if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1496	wants_newline = 1;
1497
1498	/* Build the initial state. */
1499	d->salloc = 1;
1500	d->sindex = 0;
1501	MALLOC(d->states, dfa_state, d->salloc);
1502	state_index(d, &merged, wants_newline, 0);
1503
1504	free(o_nullable);
1505	free(o_nfirst);
1506	free(o_firstpos);
1507	free(o_nlast);
1508	free(o_lastpos);
1509	free(nalloc);
1510	free(merged.elems);
1511	}
1512
1513	/* Find, for each character, the transition out of state s of d, and store
1514	it in the appropriate slot of trans.
1515
1516	We divide the positions of s into groups (positions can appear in more
1517	than one group). Each group is labeled with a set of characters that
1518	every position in the group matches (taking into account, if necessary,
1519	preceding context information of s). For each group, find the union
1520	of the its elements' follows. This set is the set of positions of the
1521	new state. For each character in the group's label, set the transition
1522	on this character to be to a state corresponding to the set's positions,
1523	and its associated backward context information, if necessary.
1524
1525	If we are building a searching matcher, we include the positions of state
1526	0 in every state.
1527
1528	The collection of groups is constructed by building an equivalence-class
1529	partition of the positions of s.
1530
1531	For each position, find the set of characters C that it matches. Eliminate
1532	any characters from C that fail on grounds of backward context.
1533
1534	Search through the groups, looking for a group whose label L has nonempty
1535	intersection with C. If L - C is nonempty, create a new group labeled
1536	L - C and having the same positions as the current group, and set L to
1537	the intersection of L and C. Insert the position in this group, set
1538	C = C - L, and resume scanning.
1539
1540	If after comparing with every group there are characters remaining in C,
1541	create a new group labeled with the characters of C and insert this
1542	position in that group. */
1543	void
1544	dfastate (int s, struct dfa *d, int trans[])
1545	{
1546	position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1547	charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1548	int ngrps = 0; /* Number of groups actually used. */
1549	position pos; /* Current position being considered. */
1550	charclass matches; /* Set of matching characters. */
1551	int matchesf; /* True if matches is nonempty. */
1552	charclass intersect; /* Intersection with some label set. */
1553	int intersectf; /* True if intersect is nonempty. */
1554	charclass leftovers; /* Stuff in the label that didn't match. */
1555	int leftoversf; /* True if leftovers is nonempty. */
1556	static charclass letters; /* Set of characters considered letters. */
1557	static charclass newline; /* Set of characters that aren't newline. */
1558	position_set follows; /* Union of the follows of some group. */
1559	position_set tmp; /* Temporary space for merging sets. */
1560	int state; /* New state. */
1561	int wants_newline; /* New state wants to know newline context. */
1562	int state_newline; /* New state on a newline transition. */
1563	int wants_letter; /* New state wants to know letter context. */
1564	int state_letter; /* New state on a letter transition. */
1565	static int initialized; /* Flag for static initialization. */
1566	int i, j, k;
1567
1568	/* Initialize the set of letters, if necessary. */
1569	if (! initialized)
1570	{
1571	initialized = 1;
1572	for (i = 0; i < NOTCHAR; ++i)
1573	if (IS_WORD_CONSTITUENT(i))
1574	setbit(i, letters);
1575	setbit(eolbyte, newline);
1576	}
1577
1578	zeroset(matches);
1579
1580	for (i = 0; i < d->states[s].elems.nelem; ++i)
1581	{
1582	pos = d->states[s].elems.elems[i];
1583	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1584	setbit(d->tokens[pos.index], matches);
1585	else if (d->tokens[pos.index] >= CSET)
1586	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1587	else
1588	continue;
1589
1590	/* Some characters may need to be eliminated from matches because
1591	they fail in the current context. */
1592	if (pos.constraint != 0xFF)
1593	{
1594	if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1595	d->states[s].newline, 1))
1596	clrbit(eolbyte, matches);
1597	if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1598	d->states[s].newline, 0))
1599	for (j = 0; j < CHARCLASS_INTS; ++j)
1600	matches[j] &= newline[j];
1601	if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1602	d->states[s].letter, 1))
1603	for (j = 0; j < CHARCLASS_INTS; ++j)
1604	matches[j] &= ~letters[j];
1605	if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1606	d->states[s].letter, 0))
1607	for (j = 0; j < CHARCLASS_INTS; ++j)
1608	matches[j] &= letters[j];
1609
1610	/* If there are no characters left, there's no point in going on. */
1611	for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
1612	continue;
1613	if (j == CHARCLASS_INTS)
1614	continue;
1615	}
1616
1617	for (j = 0; j < ngrps; ++j)
1618	{
1619	/* If matches contains a single character only, and the current
1620	group's label doesn't contain that character, go on to the
1621	next group. */
1622	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
1623	&& !tstbit(d->tokens[pos.index], labels[j]))
1624	continue;
1625
1626	/* Check if this group's label has a nonempty intersection with
1627	matches. */
1628	intersectf = 0;
1629	for (k = 0; k < CHARCLASS_INTS; ++k)
1630	(intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
1631	if (! intersectf)
1632	continue;
1633
1634	/* It does; now find the set differences both ways. */
1635	leftoversf = matchesf = 0;
1636	for (k = 0; k < CHARCLASS_INTS; ++k)
1637	{
1638	/* Even an optimizing compiler can't know this for sure. */
1639	int match = matches[k], label = labels[j][k];
1640
1641	(leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
1642	(matches[k] = match & ~label) ? (matchesf = 1) : 0;
1643	}
1644
1645	/* If there were leftovers, create a new group labeled with them. */
1646	if (leftoversf)
1647	{
1648	copyset(leftovers, labels[ngrps]);
1649	copyset(intersect, labels[j]);
1650	MALLOC(grps[ngrps].elems, position, d->nleaves);
1651	copy(&grps[j], &grps[ngrps]);
1652	++ngrps;
1653	}
1654
1655	/* Put the position in the current group. Note that there is no
1656	reason to call insert() here. */
1657	grps[j].elems[grps[j].nelem++] = pos;
1658
1659	/* If every character matching the current position has been
1660	accounted for, we're done. */
1661	if (! matchesf)
1662	break;
1663	}
1664
1665	/* If we've passed the last group, and there are still characters
1666	unaccounted for, then we'll have to create a new group. */
1667	if (j == ngrps)
1668	{
1669	copyset(matches, labels[ngrps]);
1670	zeroset(matches);
1671	MALLOC(grps[ngrps].elems, position, d->nleaves);
1672	grps[ngrps].nelem = 1;
1673	grps[ngrps].elems[0] = pos;
1674	++ngrps;
1675	}
1676	}
1677
1678	MALLOC(follows.elems, position, d->nleaves);
1679	MALLOC(tmp.elems, position, d->nleaves);
1680
1681	/* If we are a searching matcher, the default transition is to a state
1682	containing the positions of state 0, otherwise the default transition
1683	is to fail miserably. */
1684	if (d->searchflag)
1685	{
1686	wants_newline = 0;
1687	wants_letter = 0;
1688	for (i = 0; i < d->states[0].elems.nelem; ++i)
1689	{
1690	if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
1691	wants_newline = 1;
1692	if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
1693	wants_letter = 1;
1694	}
1695	copy(&d->states[0].elems, &follows);
1696	state = state_index(d, &follows, 0, 0);
1697	if (wants_newline)
1698	state_newline = state_index(d, &follows, 1, 0);
1699	else
1700	state_newline = state;
1701	if (wants_letter)
1702	state_letter = state_index(d, &follows, 0, 1);
1703	else
1704	state_letter = state;
1705	for (i = 0; i < NOTCHAR; ++i)
1706	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
1707	trans[eolbyte] = state_newline;
1708	}
1709	else
1710	for (i = 0; i < NOTCHAR; ++i)
1711	trans[i] = -1;
1712
1713	for (i = 0; i < ngrps; ++i)
1714	{
1715	follows.nelem = 0;
1716
1717	/* Find the union of the follows of the positions of the group.
1718	This is a hideously inefficient loop. Fix it someday. */
1719	for (j = 0; j < grps[i].nelem; ++j)
1720	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
1721	insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
1722
1723	/* If we are building a searching matcher, throw in the positions
1724	of state 0 as well. */
1725	if (d->searchflag)
1726	for (j = 0; j < d->states[0].elems.nelem; ++j)
1727	insert(d->states[0].elems.elems[j], &follows);
1728
1729	/* Find out if the new state will want any context information. */
1730	wants_newline = 0;
1731	if (tstbit(eolbyte, labels[i]))
1732	for (j = 0; j < follows.nelem; ++j)
1733	if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1734	wants_newline = 1;
1735
1736	wants_letter = 0;
1737	for (j = 0; j < CHARCLASS_INTS; ++j)
1738	if (labels[i][j] & letters[j])
1739	break;
1740	if (j < CHARCLASS_INTS)
1741	for (j = 0; j < follows.nelem; ++j)
1742	if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1743	wants_letter = 1;
1744
1745	/* Find the state(s) corresponding to the union of the follows. */
1746	state = state_index(d, &follows, 0, 0);
1747	if (wants_newline)
1748	state_newline = state_index(d, &follows, 1, 0);
1749	else
1750	state_newline = state;
1751	if (wants_letter)
1752	state_letter = state_index(d, &follows, 0, 1);
1753	else
1754	state_letter = state;
1755
1756	/* Set the transitions for each character in the current label. */
1757	for (j = 0; j < CHARCLASS_INTS; ++j)
1758	for (k = 0; k < INTBITS; ++k)
1759	if (labels[i][j] & 1 << k)
1760	{
1761	int c = j * INTBITS + k;
1762
1763	if (c == eolbyte)
1764	trans[c] = state_newline;
1765	else if (IS_WORD_CONSTITUENT(c))
1766	trans[c] = state_letter;
1767	else if (c < NOTCHAR)
1768	trans[c] = state;
1769	}
1770	}
1771
1772	for (i = 0; i < ngrps; ++i)
1773	free(grps[i].elems);
1774	free(follows.elems);
1775	free(tmp.elems);
1776	}
1777
1778	/* Some routines for manipulating a compiled dfa's transition tables.
1779	Each state may or may not have a transition table; if it does, and it
1780	is a non-accepting state, then d->trans[state] points to its table.
1781	If it is an accepting state then d->fails[state] points to its table.
1782	If it has no table at all, then d->trans[state] is NULL.
1783	TODO: Improve this comment, get rid of the unnecessary redundancy. */
1784
1785	static void
1786	build_state (int s, struct dfa *d)
1787	{
1788	int trans; / The new transition table. */
1789	int i;
1790
1791	/* Set an upper limit on the number of transition tables that will ever
1792	exist at once. 1024 is arbitrary. The idea is that the frequently
1793	used transition tables will be quickly rebuilt, whereas the ones that
1794	were only needed once or twice will be cleared away. */
1795	if (d->trcount >= 1024)
1796	{
1797	for (i = 0; i < d->tralloc; ++i)
1798	if (d->trans[i])
1799	{
1800	free((ptr_t) d->trans[i]);
1801	d->trans[i] = NULL;
1802	}
1803	else if (d->fails[i])
1804	{
1805	free((ptr_t) d->fails[i]);
1806	d->fails[i] = NULL;
1807	}
1808	d->trcount = 0;
1809	}
1810
1811	++d->trcount;
1812
1813	/* Set up the success bits for this state. */
1814	d->success[s] = 0;
1815	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
1816	s, *d))
1817	d->success[s] \|= 4;
1818	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
1819	s, *d))
1820	d->success[s] \|= 2;
1821	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
1822	s, *d))
1823	d->success[s] \|= 1;
1824
1825	MALLOC(trans, int, NOTCHAR);
1826	dfastate(s, d, trans);
1827
1828	/* Now go through the new transition table, and make sure that the trans
1829	and fail arrays are allocated large enough to hold a pointer for the
1830	largest state mentioned in the table. */
1831	for (i = 0; i < NOTCHAR; ++i)
1832	if (trans[i] >= d->tralloc)
1833	{
1834	int oldalloc = d->tralloc;
1835
1836	while (trans[i] >= d->tralloc)
1837	d->tralloc *= 2;
1838	REALLOC(d->realtrans, int *, d->tralloc + 1);
1839	d->trans = d->realtrans + 1;
1840	REALLOC(d->fails, int *, d->tralloc);
1841	REALLOC(d->success, int, d->tralloc);
1842	REALLOC(d->newlines, int, d->tralloc);
1843	while (oldalloc < d->tralloc)
1844	{
1845	d->trans[oldalloc] = NULL;
1846	d->fails[oldalloc++] = NULL;
1847	}
1848	}
1849
1850	/* Keep the newline transition in a special place so we can use it as
1851	a sentinel. */
1852	d->newlines[s] = trans[eolbyte];
1853	trans[eolbyte] = -1;
1854
1855	if (ACCEPTING(s, *d))
1856	d->fails[s] = trans;
1857	else
1858	d->trans[s] = trans;
1859	}
1860
1861	static void
1862	build_state_zero (struct dfa *d)
1863	{
1864	d->tralloc = 1;
1865	d->trcount = 0;
1866	CALLOC(d->realtrans, int *, d->tralloc + 1);
1867	d->trans = d->realtrans + 1;
1868	CALLOC(d->fails, int *, d->tralloc);
1869	MALLOC(d->success, int, d->tralloc);
1870	MALLOC(d->newlines, int, d->tralloc);
1871	build_state(0, d);
1872	}
1873
1874	/* Search through a buffer looking for a match to the given struct dfa.
1875	Find the first occurrence of a string matching the regexp in the buffer,
1876	and the shortest possible version thereof. Return a pointer to the first
1877	character after the match, or NULL if none is found. Begin points to
1878	the beginning of the buffer, and end points to the first character after
1879	its end. We store a newline in *end to act as a sentinel, so end had
1880	better point somewhere valid. Newline is a flag indicating whether to
1881	allow newlines to be in the matching string. If count is non-
1882	NULL it points to a place we're supposed to increment every time we
1883	see a newline. Finally, if backref is non-NULL it points to a place
1884	where we're supposed to store a 1 if backreferencing happened and the
1885	match needs to be verified by a backtracking matcher. Otherwise
1886	we store a 0 in backref. /
1887	char *
1888	dfaexec (struct dfa d, char begin, char *end,
1889	int newline, int count, int backref)
1890	{
1891	register int s, s1, tmp; /* Current state. */
1892	register unsigned char p; / Current input character. */
1893	register int *trans, t; /* Copy of d->trans so it can be optimized
1894	into a register. */
1895	register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
1896	static int sbit[NOTCHAR]; /* Table for anding with d->success. */
1897	static int sbit_init;
1898
1899	if (! sbit_init)
1900	{
1901	int i;
1902
1903	sbit_init = 1;
1904	for (i = 0; i < NOTCHAR; ++i)
1905	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
1906	sbit[eol] = 4;
1907	}
1908
1909	if (! d->tralloc)
1910	build_state_zero(d);
1911
1912	s = s1 = 0;
1913	p = (unsigned char *) begin;
1914	trans = d->trans;
1915	*end = eol;
1916
1917	for (;;)
1918	{
1919	while ((t = trans[s]) != 0) { /* hand-optimized loop */
1920	s1 = t[*p++];
1921	if ((t = trans[s1]) == 0) {
1922	tmp = s ; s = s1 ; s1 = tmp ; /* swap */
1923	break;
1924	}
1925	s = t[*p++];
1926	}
1927
1928	if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
1929	{
1930	if (d->success[s] & sbit[*p])
1931	{
1932	if (backref)
1933	*backref = (d->states[s].backref != 0);
1934	return (char *) p;
1935	}
1936
1937	s1 = s;
1938	s = d->fails[s][*p++];
1939	continue;
1940	}
1941
1942	/* If the previous character was a newline, count it. */
1943	if (count && (char *) p <= end && p[-1] == eol)
1944	++*count;
1945
1946	/* Check if we've run off the end of the buffer. */
1947	if ((char *) p > end)
1948	return NULL;
1949
1950	if (s >= 0)
1951	{
1952	build_state(s, d);
1953	trans = d->trans;
1954	continue;
1955	}
1956
1957	if (p[-1] == eol && newline)
1958	{
1959	s = d->newlines[s1];
1960	continue;
1961	}
1962
1963	s = 0;
1964	}
1965	}
1966
1967	/* Initialize the components of a dfa that the other routines don't
1968	initialize for themselves. */
1969	void
1970	dfainit (struct dfa *d)
1971	{
1972	d->calloc = 1;
1973	MALLOC(d->charclasses, charclass, d->calloc);
1974	d->cindex = 0;
1975
1976	d->talloc = 1;
1977	MALLOC(d->tokens, token, d->talloc);
1978	d->tindex = d->depth = d->nleaves = d->nregexps = 0;
1979
1980	d->searchflag = 0;
1981	d->tralloc = 0;
1982
1983	d->musts = 0;
1984	}
1985
1986	/* Parse and analyze a single string of the given length. */
1987	void
1988	dfacomp (char s, size_t len, struct dfa d, int searchflag)
1989	{
1990	if (case_fold) /* dummy folding in service of dfamust() */
1991	{
1992	char *lcopy;
1993	int i;
1994
1995	lcopy = malloc(len);
1996	if (!lcopy)
1997	dfaerror(_("out of memory"));
1998
1999	/* This is a kludge. */
2000	case_fold = 0;
2001	for (i = 0; i < len; ++i)
2002	if (ISUPPER ((unsigned char) s[i]))
2003	lcopy[i] = tolower ((unsigned char) s[i]);
2004	else
2005	lcopy[i] = s[i];
2006
2007	dfainit(d);
2008	dfaparse(lcopy, len, d);
2009	free(lcopy);
2010	dfamust(d);
2011	d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2012	case_fold = 1;
2013	dfaparse(s, len, d);
2014	dfaanalyze(d, searchflag);
2015	}
2016	else
2017	{
2018	dfainit(d);
2019	dfaparse(s, len, d);
2020	dfamust(d);
2021	dfaanalyze(d, searchflag);
2022	}
2023	}
2024
2025	/* Free the storage held by the components of a dfa. */
2026	void
2027	dfafree (struct dfa *d)
2028	{
2029	int i;
2030	struct dfamust dm, ndm;
2031
2032	free((ptr_t) d->charclasses);
2033	free((ptr_t) d->tokens);
2034	for (i = 0; i < d->sindex; ++i)
2035	free((ptr_t) d->states[i].elems.elems);
2036	free((ptr_t) d->states);
2037	for (i = 0; i < d->tindex; ++i)
2038	if (d->follows[i].elems)
2039	free((ptr_t) d->follows[i].elems);
2040	free((ptr_t) d->follows);
2041	for (i = 0; i < d->tralloc; ++i)
2042	if (d->trans[i])
2043	free((ptr_t) d->trans[i]);
2044	else if (d->fails[i])
2045	free((ptr_t) d->fails[i]);
2046	if (d->realtrans) free((ptr_t) d->realtrans);
2047	if (d->fails) free((ptr_t) d->fails);
2048	if (d->newlines) free((ptr_t) d->newlines);
2049	if (d->success) free((ptr_t) d->success);
2050	for (dm = d->musts; dm; dm = ndm)
2051	{
2052	ndm = dm->next;
2053	free(dm->must);
2054	free((ptr_t) dm);
2055	}
2056	}
2057
2058	/* Having found the postfix representation of the regular expression,
2059	try to find a long sequence of characters that must appear in any line
2060	containing the r.e.
2061	Finding a "longest" sequence is beyond the scope here;
2062	we take an easy way out and hope for the best.
2063	(Take "(ab\|a)b"--please.)
2064
2065	We do a bottom-up calculation of sequences of characters that must appear
2066	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
2067	representation:
2068	sequences that must appear at the left of the match ("left")
2069	sequences that must appear at the right of the match ("right")
2070	lists of sequences that must appear somewhere in the match ("in")
2071	sequences that must constitute the match ("is")
2072
2073	When we get to the root of the tree, we use one of the longest of its
2074	calculated "in" sequences as our answer. The sequence we find is returned in
2075	d->must (where "d" is the single argument passed to "dfamust");
2076	the length of the sequence is returned in d->mustn.
2077
2078	The sequences calculated for the various types of node (in pseudo ANSI c)
2079	are shown below. "p" is the operand of unary operators (and the left-hand
2080	operand of binary operators); "q" is the right-hand operand of binary
2081	operators.
2082
2083	"ZERO" means "a zero-length sequence" below.
2084
2085	Type left right is in
2086	---- ---- ----- -- --
2087	char c # c # c # c # c
2088
2089	CSET ZERO ZERO ZERO ZERO
2090
2091	STAR ZERO ZERO ZERO ZERO
2092
2093	QMARK ZERO ZERO ZERO ZERO
2094
2095	PLUS p->left p->right ZERO p->in
2096
2097	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
2098	p->left : q->right : q->is!=ZERO) ? q->in plus
2099	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
2100	ZERO
2101
2102	OR longest common longest common (do p->is and substrings common to
2103	leading trailing q->is have same p->in and q->in
2104	(sub)sequence (sub)sequence length and
2105	of p->left of p->right content) ?
2106	and q->left and q->right p->is : NULL
2107
2108	If there's anything else we recognize in the tree, all four sequences get set
2109	to zero-length sequences. If there's something we don't recognize in the tree,
2110	we just return a zero-length sequence.
2111
2112	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
2113	'aaa')?
2114
2115	And. . .is it here or someplace that we might ponder "optimizations" such as
2116	egrep 'psi\|epsilon' -> egrep 'psi'
2117	egrep 'pepsi\|epsilon' -> egrep 'epsi'
2118	(Yes, we now find "epsi" as a "string
2119	that must occur", but we might also
2120	simplify the entire r.e. being sought)
2121	grep '[c]' -> grep 'c'
2122	grep '(ab\|a)b' -> grep 'ab'
2123	grep 'ab*' -> grep 'a'
2124	grep 'a*b' -> grep 'b'
2125
2126	There are several issues:
2127
2128	Is optimization easy (enough)?
2129
2130	Does optimization actually accomplish anything,
2131	or is the automaton you get from "psi\|epsilon" (for example)
2132	the same as the one you get from "psi" (for example)?
2133
2134	Are optimizable r.e.'s likely to be used in real-life situations
2135	(something like 'ab*' is probably unlikely; something like is
2136	'psi\|epsilon' is likelier)? */
2137
2138	static char *
2139	icatalloc (char old, char new)
2140	{
2141	char *result;
2142	size_t oldsize, newsize;
2143
2144	newsize = (new == NULL) ? 0 : strlen(new);
2145	if (old == NULL)
2146	oldsize = 0;
2147	else if (newsize == 0)
2148	return old;
2149	else oldsize = strlen(old);
2150	if (old == NULL)
2151	result = (char *) malloc(newsize + 1);
2152	else
2153	result = (char ) realloc((void ) old, oldsize + newsize + 1);
2154	if (result != NULL && new != NULL)
2155	(void) strcpy(result + oldsize, new);
2156	return result;
2157	}
2158
2159	static char *
2160	icpyalloc (char *string)
2161	{
2162	return icatalloc((char *) NULL, string);
2163	}
2164
2165	static char *
2166	istrstr (char lookin, char lookfor)
2167	{
2168	char *cp;
2169	size_t len;
2170
2171	len = strlen(lookfor);
2172	for (cp = lookin; *cp != '\0'; ++cp)
2173	if (strncmp(cp, lookfor, len) == 0)
2174	return cp;
2175	return NULL;
2176	}
2177
2178	static void
2179	ifree (char *cp)
2180	{
2181	if (cp != NULL)
2182	free(cp);
2183	}
2184
2185	static void
2186	freelist (char **cpp)
2187	{
2188	int i;
2189
2190	if (cpp == NULL)
2191	return;
2192	for (i = 0; cpp[i] != NULL; ++i)
2193	{
2194	free(cpp[i]);
2195	cpp[i] = NULL;
2196	}
2197	}
2198
2199	static char **
2200	enlist (char *cpp, char new, size_t len)
2201	{
2202	int i, j;
2203
2204	if (cpp == NULL)
2205	return NULL;
2206	if ((new = icpyalloc(new)) == NULL)
2207	{
2208	freelist(cpp);
2209	return NULL;
2210	}
2211	new[len] = '\0';
2212	/* Is there already something in the list that's new (or longer)? */
2213	for (i = 0; cpp[i] != NULL; ++i)
2214	if (istrstr(cpp[i], new) != NULL)
2215	{
2216	free(new);
2217	return cpp;
2218	}
2219	/* Eliminate any obsoleted strings. */
2220	j = 0;
2221	while (cpp[j] != NULL)
2222	if (istrstr(new, cpp[j]) == NULL)
2223	++j;
2224	else
2225	{
2226	free(cpp[j]);
2227	if (--i == j)
2228	break;
2229	cpp[j] = cpp[i];
2230	cpp[i] = NULL;
2231	}
2232	/* Add the new string. */
2233	cpp = (char *) realloc((char ) cpp, (i + 2) * sizeof *cpp);
2234	if (cpp == NULL)
2235	return NULL;
2236	cpp[i] = new;
2237	cpp[i + 1] = NULL;
2238	return cpp;
2239	}
2240
2241	/* Given pointers to two strings, return a pointer to an allocated
2242	list of their distinct common substrings. Return NULL if something
2243	seems wild. */
2244	static char **
2245	comsubs (char left, char right)
2246	{
2247	char **cpp;
2248	char *lcp;
2249	char *rcp;
2250	size_t i, len;
2251
2252	if (left == NULL \|\| right == NULL)
2253	return NULL;
2254	cpp = (char *) malloc(sizeof cpp);
2255	if (cpp == NULL)
2256	return NULL;
2257	cpp[0] = NULL;
2258	for (lcp = left; *lcp != '\0'; ++lcp)
2259	{
2260	len = 0;
2261	rcp = index(right, *lcp);
2262	while (rcp != NULL)
2263	{
2264	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
2265	continue;
2266	if (i > len)
2267	len = i;
2268	rcp = index(rcp + 1, *lcp);
2269	}
2270	if (len == 0)
2271	continue;
2272	if ((cpp = enlist(cpp, lcp, len)) == NULL)
2273	break;
2274	}
2275	return cpp;
2276	}
2277
2278	static char **
2279	addlists (char old, char new)
2280	{
2281	int i;
2282
2283	if (old == NULL \|\| new == NULL)
2284	return NULL;
2285	for (i = 0; new[i] != NULL; ++i)
2286	{
2287	old = enlist(old, new[i], strlen(new[i]));
2288	if (old == NULL)
2289	break;
2290	}
2291	return old;
2292	}
2293
2294	/* Given two lists of substrings, return a new list giving substrings
2295	common to both. */
2296	static char **
2297	inboth (char left, char right)
2298	{
2299	char **both;
2300	char **temp;
2301	int lnum, rnum;
2302
2303	if (left == NULL \|\| right == NULL)
2304	return NULL;
2305	both = (char *) malloc(sizeof both);
2306	if (both == NULL)
2307	return NULL;
2308	both[0] = NULL;
2309	for (lnum = 0; left[lnum] != NULL; ++lnum)
2310	{
2311	for (rnum = 0; right[rnum] != NULL; ++rnum)
2312	{
2313	temp = comsubs(left[lnum], right[rnum]);
2314	if (temp == NULL)
2315	{
2316	freelist(both);
2317	return NULL;
2318	}
2319	both = addlists(both, temp);
2320	freelist(temp);
2321	free(temp);
2322	if (both == NULL)
2323	return NULL;
2324	}
2325	}
2326	return both;
2327	}
2328
2329	typedef struct
2330	{
2331	char **in;
2332	char *left;
2333	char *right;
2334	char *is;
2335	} must;
2336
2337	static void
2338	resetmust (must *mp)
2339	{
2340	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2341	freelist(mp->in);
2342	}
2343
2344	static void
2345	dfamust (struct dfa *dfa)
2346	{
2347	must *musts;
2348	must *mp;
2349	char *result;
2350	int ri;
2351	int i;
2352	int exact;
2353	token t;
2354	static must must0;
2355	struct dfamust *dm;
2356	static char empty_string[] = "";
2357
2358	result = empty_string;
2359	exact = 0;
2360	musts = (must ) malloc((dfa->tindex + 1) sizeof *musts);
2361	if (musts == NULL)
2362	return;
2363	mp = musts;
2364	for (i = 0; i <= dfa->tindex; ++i)
2365	mp[i] = must0;
2366	for (i = 0; i <= dfa->tindex; ++i)
2367	{
2368	mp[i].in = (char *) malloc(sizeof mp[i].in);
2369	mp[i].left = malloc(2);
2370	mp[i].right = malloc(2);
2371	mp[i].is = malloc(2);
2372	if (mp[i].in == NULL \|\| mp[i].left == NULL \|\|
2373	mp[i].right == NULL \|\| mp[i].is == NULL)
2374	goto done;
2375	mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2376	mp[i].in[0] = NULL;
2377	}
2378	#ifdef DEBUG
2379	fprintf(stderr, "dfamust:\n");
2380	for (i = 0; i < dfa->tindex; ++i)
2381	{
2382	fprintf(stderr, " %d:", i);
2383	prtok(dfa->tokens[i]);
2384	}
2385	putc('\n', stderr);
2386	#endif
2387	for (ri = 0; ri < dfa->tindex; ++ri)
2388	{
2389	switch (t = dfa->tokens[ri])
2390	{
2391	case LPAREN:
2392	case RPAREN:
2393	goto done; /* "cannot happen" */
2394	case EMPTY:
2395	case BEGLINE:
2396	case ENDLINE:
2397	case BEGWORD:
2398	case ENDWORD:
2399	case LIMWORD:
2400	case NOTLIMWORD:
2401	case BACKREF:
2402	resetmust(mp);
2403	break;
2404	case STAR:
2405	case QMARK:
2406	if (mp <= musts)
2407	goto done; /* "cannot happen" */
2408	--mp;
2409	resetmust(mp);
2410	break;
2411	case OR:
2412	case ORTOP:
2413	if (mp < &musts[2])
2414	goto done; /* "cannot happen" */
2415	{
2416	char **new;
2417	must *lmp;
2418	must *rmp;
2419	int j, ln, rn, n;
2420
2421	rmp = --mp;
2422	lmp = --mp;
2423	/* Guaranteed to be. Unlikely, but. . . */
2424	if (strcmp(lmp->is, rmp->is) != 0)
2425	lmp->is[0] = '\0';
2426	/* Left side--easy */
2427	i = 0;
2428	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
2429	++i;
2430	lmp->left[i] = '\0';
2431	/* Right side */
2432	ln = strlen(lmp->right);
2433	rn = strlen(rmp->right);
2434	n = ln;
2435	if (n > rn)
2436	n = rn;
2437	for (i = 0; i < n; ++i)
2438	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
2439	break;
2440	for (j = 0; j < i; ++j)
2441	lmp->right[j] = lmp->right[(ln - i) + j];
2442	lmp->right[j] = '\0';
2443	new = inboth(lmp->in, rmp->in);
2444	if (new == NULL)
2445	goto done;
2446	freelist(lmp->in);
2447	free((char *) lmp->in);
2448	lmp->in = new;
2449	}
2450	break;
2451	case PLUS:
2452	if (mp <= musts)
2453	goto done; /* "cannot happen" */
2454	--mp;
2455	mp->is[0] = '\0';
2456	break;
2457	case END:
2458	if (mp != &musts[1])
2459	goto done; /* "cannot happen" */
2460	for (i = 0; musts[0].in[i] != NULL; ++i)
2461	if (strlen(musts[0].in[i]) > strlen(result))
2462	result = musts[0].in[i];
2463	if (strcmp(result, musts[0].is) == 0)
2464	exact = 1;
2465	goto done;
2466	case CAT:
2467	if (mp < &musts[2])
2468	goto done; /* "cannot happen" */
2469	{
2470	must *lmp;
2471	must *rmp;
2472
2473	rmp = --mp;
2474	lmp = --mp;
2475	/* In. Everything in left, plus everything in
2476	right, plus catenation of
2477	left's right and right's left. */
2478	lmp->in = addlists(lmp->in, rmp->in);
2479	if (lmp->in == NULL)
2480	goto done;
2481	if (lmp->right[0] != '\0' &&
2482	rmp->left[0] != '\0')
2483	{
2484	char *tp;
2485
2486	tp = icpyalloc(lmp->right);
2487	if (tp == NULL)
2488	goto done;
2489	tp = icatalloc(tp, rmp->left);
2490	if (tp == NULL)
2491	goto done;
2492	lmp->in = enlist(lmp->in, tp,
2493	strlen(tp));
2494	free(tp);
2495	if (lmp->in == NULL)
2496	goto done;
2497	}
2498	/* Left-hand */
2499	if (lmp->is[0] != '\0')
2500	{
2501	lmp->left = icatalloc(lmp->left,
2502	rmp->left);
2503	if (lmp->left == NULL)
2504	goto done;
2505	}
2506	/* Right-hand */
2507	if (rmp->is[0] == '\0')
2508	lmp->right[0] = '\0';
2509	lmp->right = icatalloc(lmp->right, rmp->right);
2510	if (lmp->right == NULL)
2511	goto done;
2512	/* Guaranteed to be */
2513	if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
2514	{
2515	lmp->is = icatalloc(lmp->is, rmp->is);
2516	if (lmp->is == NULL)
2517	goto done;
2518	}
2519	else
2520	lmp->is[0] = '\0';
2521	}
2522	break;
2523	default:
2524	if (t < END)
2525	{
2526	/* "cannot happen" */
2527	goto done;
2528	}
2529	else if (t == '\0')
2530	{
2531	/* not on my shift */
2532	goto done;
2533	}
2534	else if (t >= CSET)
2535	{
2536	/* easy enough */
2537	resetmust(mp);
2538	}
2539	else
2540	{
2541	/* plain character */
2542	resetmust(mp);
2543	mp->is[0] = mp->left[0] = mp->right[0] = t;
2544	mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2545	mp->in = enlist(mp->in, mp->is, (size_t)1);
2546	if (mp->in == NULL)
2547	goto done;
2548	}
2549	break;
2550	}
2551	#ifdef DEBUG
2552	fprintf(stderr, " node: %d:", ri);
2553	prtok(dfa->tokens[ri]);
2554	fprintf(stderr, "\n in:");
2555	for (i = 0; mp->in[i]; ++i)
2556	fprintf(stderr, " \"%s\"", mp->in[i]);
2557	fprintf(stderr, "\n is: \"%s\"\n", mp->is);
2558	fprintf(stderr, " left: \"%s\"\n", mp->left);
2559	fprintf(stderr, " right: \"%s\"\n", mp->right);
2560	#endif
2561	++mp;
2562	}
2563	done:
2564	if (strlen(result))
2565	{
2566	dm = (struct dfamust *) malloc(sizeof (struct dfamust));
2567	dm->exact = exact;
2568	dm->must = malloc(strlen(result) + 1);
2569	strcpy(dm->must, result);
2570	dm->next = dfa->musts;
2571	dfa->musts = dm;
2572	}
2573	mp = musts;
2574	for (i = 0; i <= dfa->tindex; ++i)
2575	{
2576	freelist(mp[i].in);
2577	ifree((char *) mp[i].in);
2578	ifree(mp[i].left);
2579	ifree(mp[i].right);
2580	ifree(mp[i].is);
2581	}
2582	free((char *) mp);
2583	}