Commit | Line | Data |
---|---|---|
984263bc MD |
1 | /* dfa.c - deterministic extended regexp routines for GNU |
2 | Copyright (C) 1988, 1998 Free Software Foundation, Inc. | |
3 | ||
4 | This program is free software; you can redistribute it and/or modify | |
5 | it under the terms of the GNU General Public License as published by | |
6 | the Free Software Foundation; either version 2, or (at your option) | |
7 | any later version. | |
8 | ||
9 | This program is distributed in the hope that it will be useful, | |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | GNU General Public License for more details. | |
13 | ||
14 | You should have received a copy of the GNU General Public License | |
15 | along with this program; if not, write to the Free Software | |
16 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ | |
17 | ||
18 | /* Written June, 1988 by Mike Haertel | |
19 | Modified July, 1988 by Arthur David Olson to assist BMG speedups */ | |
20 | ||
21 | /* $FreeBSD: src/gnu/usr.bin/grep/dfa.c,v 1.12 2000/01/31 13:28:08 ru Exp $ */ | |
0e387216 | 22 | /* $DragonFly: src/gnu/usr.bin/grep/dfa.c,v 1.3 2004/02/03 19:22:57 dillon Exp $ */ |
984263bc MD |
23 | |
24 | #ifdef HAVE_CONFIG_H | |
25 | #include <config.h> | |
26 | #endif | |
27 | ||
28 | #include <assert.h> | |
29 | #include <ctype.h> | |
30 | #include <stdio.h> | |
31 | ||
32 | #include <sys/types.h> | |
33 | #ifdef STDC_HEADERS | |
34 | #include <stdlib.h> | |
35 | #else | |
36 | extern char *calloc(), *malloc(), *realloc(); | |
37 | extern void free(); | |
38 | #endif | |
39 | ||
40 | #if defined(HAVE_STRING_H) || defined(STDC_HEADERS) | |
41 | #include <string.h> | |
42 | #undef index | |
43 | #define index strchr | |
44 | #else | |
45 | #include <strings.h> | |
46 | #endif | |
47 | ||
48 | #ifndef DEBUG /* use the same approach as regex.c */ | |
49 | #undef assert | |
50 | #define assert(e) | |
51 | #endif /* DEBUG */ | |
52 | ||
53 | #ifndef isgraph | |
54 | #define isgraph(C) (isprint(C) && !isspace(C)) | |
55 | #endif | |
56 | ||
57 | #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) | |
58 | #define ISALPHA(C) isalpha(C) | |
59 | #define ISUPPER(C) isupper(C) | |
60 | #define ISLOWER(C) islower(C) | |
61 | #define ISDIGIT(C) isdigit(C) | |
62 | #define ISXDIGIT(C) isxdigit(C) | |
63 | #define ISSPACE(C) isspace(C) | |
64 | #define ISPUNCT(C) ispunct(C) | |
65 | #define ISALNUM(C) isalnum(C) | |
66 | #define ISPRINT(C) isprint(C) | |
67 | #define ISGRAPH(C) isgraph(C) | |
68 | #define ISCNTRL(C) iscntrl(C) | |
69 | #else | |
70 | #define ISALPHA(C) (isascii(C) && isalpha(C)) | |
71 | #define ISUPPER(C) (isascii(C) && isupper(C)) | |
72 | #define ISLOWER(C) (isascii(C) && islower(C)) | |
73 | #define ISDIGIT(C) (isascii(C) && isdigit(C)) | |
74 | #define ISXDIGIT(C) (isascii(C) && isxdigit(C)) | |
75 | #define ISSPACE(C) (isascii(C) && isspace(C)) | |
76 | #define ISPUNCT(C) (isascii(C) && ispunct(C)) | |
77 | #define ISALNUM(C) (isascii(C) && isalnum(C)) | |
78 | #define ISPRINT(C) (isascii(C) && isprint(C)) | |
79 | #define ISGRAPH(C) (isascii(C) && isgraph(C)) | |
80 | #define ISCNTRL(C) (isascii(C) && iscntrl(C)) | |
81 | #endif | |
82 | ||
83 | /* ISASCIIDIGIT differs from ISDIGIT, as follows: | |
84 | - Its arg may be any int or unsigned int; it need not be an unsigned char. | |
85 | - It's guaranteed to evaluate its argument exactly once. | |
86 | - It's typically faster. | |
87 | Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that | |
88 | only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless | |
89 | it's important to use the locale's definition of `digit' even when the | |
90 | host does not conform to Posix. */ | |
91 | #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) | |
92 | ||
93 | /* If we (don't) have I18N. */ | |
94 | /* glibc defines _ */ | |
95 | #ifndef _ | |
96 | # ifdef HAVE_LIBINTL_H | |
97 | # include <libintl.h> | |
98 | # ifndef _ | |
99 | # define _(Str) gettext (Str) | |
100 | # endif | |
101 | # else | |
102 | # define _(Str) (Str) | |
103 | # endif | |
104 | #endif | |
105 | ||
984263bc | 106 | #include <gnuregex.h> |
984263bc MD |
107 | #include "dfa.h" |
108 | ||
109 | /* HPUX, define those as macros in sys/param.h */ | |
110 | #ifdef setbit | |
111 | # undef setbit | |
112 | #endif | |
113 | #ifdef clrbit | |
114 | # undef clrbit | |
115 | #endif | |
116 | ||
117 | static void dfamust PARAMS ((struct dfa *dfa)); | |
118 | ||
119 | static ptr_t xcalloc PARAMS ((size_t n, size_t s)); | |
120 | static ptr_t xmalloc PARAMS ((size_t n)); | |
121 | static ptr_t xrealloc PARAMS ((ptr_t p, size_t n)); | |
122 | #ifdef DEBUG | |
123 | static void prtok PARAMS ((token t)); | |
124 | #endif | |
125 | static int tstbit PARAMS ((int b, charclass c)); | |
126 | static void setbit PARAMS ((int b, charclass c)); | |
127 | static void clrbit PARAMS ((int b, charclass c)); | |
128 | static void copyset PARAMS ((charclass src, charclass dst)); | |
129 | static void zeroset PARAMS ((charclass s)); | |
130 | static void notset PARAMS ((charclass s)); | |
131 | static int equal PARAMS ((charclass s1, charclass s2)); | |
132 | static int charclass_index PARAMS ((charclass s)); | |
133 | static int looking_at PARAMS ((const char *s)); | |
134 | static token lex PARAMS ((void)); | |
135 | static void addtok PARAMS ((token t)); | |
136 | static void atom PARAMS ((void)); | |
137 | static int nsubtoks PARAMS ((int tindex)); | |
138 | static void copytoks PARAMS ((int tindex, int ntokens)); | |
139 | static void closure PARAMS ((void)); | |
140 | static void branch PARAMS ((void)); | |
141 | static void regexp PARAMS ((int toplevel)); | |
142 | static void copy PARAMS ((position_set *src, position_set *dst)); | |
143 | static void insert PARAMS ((position p, position_set *s)); | |
144 | static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m)); | |
145 | static void delete PARAMS ((position p, position_set *s)); | |
146 | static int state_index PARAMS ((struct dfa *d, position_set *s, | |
147 | int newline, int letter)); | |
148 | static void build_state PARAMS ((int s, struct dfa *d)); | |
149 | static void build_state_zero PARAMS ((struct dfa *d)); | |
150 | static char *icatalloc PARAMS ((char *old, char *new)); | |
151 | static char *icpyalloc PARAMS ((char *string)); | |
152 | static char *istrstr PARAMS ((char *lookin, char *lookfor)); | |
153 | static void ifree PARAMS ((char *cp)); | |
154 | static void freelist PARAMS ((char **cpp)); | |
155 | static char **enlist PARAMS ((char **cpp, char *new, size_t len)); | |
156 | static char **comsubs PARAMS ((char *left, char *right)); | |
157 | static char **addlists PARAMS ((char **old, char **new)); | |
158 | static char **inboth PARAMS ((char **left, char **right)); | |
159 | ||
160 | static ptr_t | |
161 | xcalloc (size_t n, size_t s) | |
162 | { | |
163 | ptr_t r = calloc(n, s); | |
164 | ||
165 | if (!r) | |
166 | dfaerror(_("Memory exhausted")); | |
167 | return r; | |
168 | } | |
169 | ||
170 | static ptr_t | |
171 | xmalloc (size_t n) | |
172 | { | |
173 | ptr_t r = malloc(n); | |
174 | ||
175 | assert(n != 0); | |
176 | if (!r) | |
177 | dfaerror(_("Memory exhausted")); | |
178 | return r; | |
179 | } | |
180 | ||
181 | static ptr_t | |
182 | xrealloc (ptr_t p, size_t n) | |
183 | { | |
184 | ptr_t r = realloc(p, n); | |
185 | ||
186 | assert(n != 0); | |
187 | if (!r) | |
188 | dfaerror(_("Memory exhausted")); | |
189 | return r; | |
190 | } | |
191 | ||
192 | #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) | |
193 | #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) | |
194 | #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) | |
195 | ||
196 | /* Reallocate an array of type t if nalloc is too small for index. */ | |
197 | #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ | |
198 | if ((index) >= (nalloc)) \ | |
199 | { \ | |
200 | while ((index) >= (nalloc)) \ | |
201 | (nalloc) *= 2; \ | |
202 | REALLOC(p, t, nalloc); \ | |
203 | } | |
204 | ||
205 | #ifdef DEBUG | |
206 | ||
207 | static void | |
208 | prtok (token t) | |
209 | { | |
210 | char *s; | |
211 | ||
212 | if (t < 0) | |
213 | fprintf(stderr, "END"); | |
214 | else if (t < NOTCHAR) | |
215 | fprintf(stderr, "%c", t); | |
216 | else | |
217 | { | |
218 | switch (t) | |
219 | { | |
220 | case EMPTY: s = "EMPTY"; break; | |
221 | case BACKREF: s = "BACKREF"; break; | |
222 | case BEGLINE: s = "BEGLINE"; break; | |
223 | case ENDLINE: s = "ENDLINE"; break; | |
224 | case BEGWORD: s = "BEGWORD"; break; | |
225 | case ENDWORD: s = "ENDWORD"; break; | |
226 | case LIMWORD: s = "LIMWORD"; break; | |
227 | case NOTLIMWORD: s = "NOTLIMWORD"; break; | |
228 | case QMARK: s = "QMARK"; break; | |
229 | case STAR: s = "STAR"; break; | |
230 | case PLUS: s = "PLUS"; break; | |
231 | case CAT: s = "CAT"; break; | |
232 | case OR: s = "OR"; break; | |
233 | case ORTOP: s = "ORTOP"; break; | |
234 | case LPAREN: s = "LPAREN"; break; | |
235 | case RPAREN: s = "RPAREN"; break; | |
236 | default: s = "CSET"; break; | |
237 | } | |
238 | fprintf(stderr, "%s", s); | |
239 | } | |
240 | } | |
241 | #endif /* DEBUG */ | |
242 | ||
243 | /* Stuff pertaining to charclasses. */ | |
244 | ||
245 | static int | |
246 | tstbit (int b, charclass c) | |
247 | { | |
248 | return c[b / INTBITS] & 1 << b % INTBITS; | |
249 | } | |
250 | ||
251 | static void | |
252 | setbit (int b, charclass c) | |
253 | { | |
254 | c[b / INTBITS] |= 1 << b % INTBITS; | |
255 | } | |
256 | ||
257 | static void | |
258 | clrbit (int b, charclass c) | |
259 | { | |
260 | c[b / INTBITS] &= ~(1 << b % INTBITS); | |
261 | } | |
262 | ||
263 | static void | |
264 | copyset (charclass src, charclass dst) | |
265 | { | |
266 | int i; | |
267 | ||
268 | for (i = 0; i < CHARCLASS_INTS; ++i) | |
269 | dst[i] = src[i]; | |
270 | } | |
271 | ||
272 | static void | |
273 | zeroset (charclass s) | |
274 | { | |
275 | int i; | |
276 | ||
277 | for (i = 0; i < CHARCLASS_INTS; ++i) | |
278 | s[i] = 0; | |
279 | } | |
280 | ||
281 | static void | |
282 | notset (charclass s) | |
283 | { | |
284 | int i; | |
285 | ||
286 | for (i = 0; i < CHARCLASS_INTS; ++i) | |
287 | s[i] = ~s[i]; | |
288 | } | |
289 | ||
290 | static int | |
291 | equal (charclass s1, charclass s2) | |
292 | { | |
293 | int i; | |
294 | ||
295 | for (i = 0; i < CHARCLASS_INTS; ++i) | |
296 | if (s1[i] != s2[i]) | |
297 | return 0; | |
298 | return 1; | |
299 | } | |
300 | ||
301 | /* A pointer to the current dfa is kept here during parsing. */ | |
302 | static struct dfa *dfa; | |
303 | ||
304 | /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ | |
305 | static int | |
306 | charclass_index (charclass s) | |
307 | { | |
308 | int i; | |
309 | ||
310 | for (i = 0; i < dfa->cindex; ++i) | |
311 | if (equal(s, dfa->charclasses[i])) | |
312 | return i; | |
313 | REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); | |
314 | ++dfa->cindex; | |
315 | copyset(s, dfa->charclasses[i]); | |
316 | return i; | |
317 | } | |
318 | ||
319 | /* Syntax bits controlling the behavior of the lexical analyzer. */ | |
320 | static reg_syntax_t syntax_bits, syntax_bits_set; | |
321 | ||
322 | /* Flag for case-folding letters into sets. */ | |
323 | static int case_fold; | |
324 | ||
325 | /* End-of-line byte in data. */ | |
326 | static unsigned char eolbyte; | |
327 | ||
328 | /* Entry point to set syntax options. */ | |
329 | void | |
330 | dfasyntax (reg_syntax_t bits, int fold, int eol) | |
331 | { | |
332 | syntax_bits_set = 1; | |
333 | syntax_bits = bits; | |
334 | case_fold = fold; | |
335 | eolbyte = eol; | |
336 | } | |
337 | ||
338 | /* Lexical analyzer. All the dross that deals with the obnoxious | |
339 | GNU Regex syntax bits is located here. The poor, suffering | |
340 | reader is referred to the GNU Regex documentation for the | |
341 | meaning of the @#%!@#%^!@ syntax bits. */ | |
342 | ||
343 | static char *lexstart; /* Pointer to beginning of input string. */ | |
344 | static char *lexptr; /* Pointer to next input character. */ | |
345 | static int lexleft; /* Number of characters remaining. */ | |
346 | static token lasttok; /* Previous token returned; initially END. */ | |
347 | static int laststart; /* True if we're separated from beginning or (, | | |
348 | only by zero-width characters. */ | |
349 | static int parens; /* Count of outstanding left parens. */ | |
350 | static int minrep, maxrep; /* Repeat counts for {m,n}. */ | |
351 | ||
352 | /* Note that characters become unsigned here. */ | |
353 | #define FETCH(c, eoferr) \ | |
354 | { \ | |
355 | if (! lexleft) \ | |
356 | { \ | |
357 | if (eoferr != 0) \ | |
358 | dfaerror (eoferr); \ | |
359 | else \ | |
360 | return lasttok = END; \ | |
361 | } \ | |
362 | (c) = (unsigned char) *lexptr++; \ | |
363 | --lexleft; \ | |
364 | } | |
365 | ||
366 | #ifdef __STDC__ | |
367 | #define FUNC(F, P) static int F(int c) { return P(c); } | |
368 | #else | |
369 | #define FUNC(F, P) static int F(c) int c; { return P(c); } | |
370 | #endif | |
371 | ||
372 | FUNC(is_alpha, ISALPHA) | |
373 | FUNC(is_upper, ISUPPER) | |
374 | FUNC(is_lower, ISLOWER) | |
375 | FUNC(is_digit, ISDIGIT) | |
376 | FUNC(is_xdigit, ISXDIGIT) | |
377 | FUNC(is_space, ISSPACE) | |
378 | FUNC(is_punct, ISPUNCT) | |
379 | FUNC(is_alnum, ISALNUM) | |
380 | FUNC(is_print, ISPRINT) | |
381 | FUNC(is_graph, ISGRAPH) | |
382 | FUNC(is_cntrl, ISCNTRL) | |
383 | ||
384 | static int | |
385 | is_blank (int c) | |
386 | { | |
387 | return (c == ' ' || c == '\t'); | |
388 | } | |
389 | ||
390 | /* The following list maps the names of the Posix named character classes | |
391 | to predicate functions that determine whether a given character is in | |
392 | the class. The leading [ has already been eaten by the lexical analyzer. */ | |
393 | static struct { | |
394 | const char *name; | |
395 | int (*pred) PARAMS ((int)); | |
396 | } prednames[] = { | |
397 | { ":alpha:]", is_alpha }, | |
398 | { ":upper:]", is_upper }, | |
399 | { ":lower:]", is_lower }, | |
400 | { ":digit:]", is_digit }, | |
401 | { ":xdigit:]", is_xdigit }, | |
402 | { ":space:]", is_space }, | |
403 | { ":punct:]", is_punct }, | |
404 | { ":alnum:]", is_alnum }, | |
405 | { ":print:]", is_print }, | |
406 | { ":graph:]", is_graph }, | |
407 | { ":cntrl:]", is_cntrl }, | |
408 | { ":blank:]", is_blank }, | |
409 | { 0 } | |
410 | }; | |
411 | ||
412 | /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ | |
413 | #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') | |
414 | ||
415 | static int | |
416 | looking_at (char const *s) | |
417 | { | |
418 | size_t len; | |
419 | ||
420 | len = strlen(s); | |
421 | if (lexleft < len) | |
422 | return 0; | |
423 | return strncmp(s, lexptr, len) == 0; | |
424 | } | |
425 | ||
426 | static token | |
427 | lex (void) | |
428 | { | |
429 | token c, c1, c2; | |
430 | int backslash = 0, invert; | |
431 | charclass ccl; | |
432 | int i; | |
433 | char lo[2]; | |
434 | char hi[2]; | |
435 | ||
436 | /* Basic plan: We fetch a character. If it's a backslash, | |
437 | we set the backslash flag and go through the loop again. | |
438 | On the plus side, this avoids having a duplicate of the | |
439 | main switch inside the backslash case. On the minus side, | |
440 | it means that just about every case begins with | |
441 | "if (backslash) ...". */ | |
442 | for (i = 0; i < 2; ++i) | |
443 | { | |
444 | FETCH(c, 0); | |
445 | switch (c) | |
446 | { | |
447 | case '\\': | |
448 | if (backslash) | |
449 | goto normal_char; | |
450 | if (lexleft == 0) | |
451 | dfaerror(_("Unfinished \\ escape")); | |
452 | backslash = 1; | |
453 | break; | |
454 | ||
455 | case '^': | |
456 | if (backslash) | |
457 | goto normal_char; | |
458 | if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS | |
459 | || lasttok == END | |
460 | || lasttok == LPAREN | |
461 | || lasttok == OR) | |
462 | return lasttok = BEGLINE; | |
463 | goto normal_char; | |
464 | ||
465 | case '$': | |
466 | if (backslash) | |
467 | goto normal_char; | |
468 | if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS | |
469 | || lexleft == 0 | |
470 | || (syntax_bits & RE_NO_BK_PARENS | |
471 | ? lexleft > 0 && *lexptr == ')' | |
472 | : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') | |
473 | || (syntax_bits & RE_NO_BK_VBAR | |
474 | ? lexleft > 0 && *lexptr == '|' | |
475 | : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') | |
476 | || ((syntax_bits & RE_NEWLINE_ALT) | |
477 | && lexleft > 0 && *lexptr == '\n')) | |
478 | return lasttok = ENDLINE; | |
479 | goto normal_char; | |
480 | ||
481 | case '1': | |
482 | case '2': | |
483 | case '3': | |
484 | case '4': | |
485 | case '5': | |
486 | case '6': | |
487 | case '7': | |
488 | case '8': | |
489 | case '9': | |
490 | if (backslash && !(syntax_bits & RE_NO_BK_REFS)) | |
491 | { | |
492 | laststart = 0; | |
493 | return lasttok = BACKREF; | |
494 | } | |
495 | goto normal_char; | |
496 | ||
497 | case '`': | |
498 | if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) | |
499 | return lasttok = BEGLINE; /* FIXME: should be beginning of string */ | |
500 | goto normal_char; | |
501 | ||
502 | case '\'': | |
503 | if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) | |
504 | return lasttok = ENDLINE; /* FIXME: should be end of string */ | |
505 | goto normal_char; | |
506 | ||
507 | case '<': | |
508 | if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) | |
509 | return lasttok = BEGWORD; | |
510 | goto normal_char; | |
511 | ||
512 | case '>': | |
513 | if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) | |
514 | return lasttok = ENDWORD; | |
515 | goto normal_char; | |
516 | ||
517 | case 'b': | |
518 | if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) | |
519 | return lasttok = LIMWORD; | |
520 | goto normal_char; | |
521 | ||
522 | case 'B': | |
523 | if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) | |
524 | return lasttok = NOTLIMWORD; | |
525 | goto normal_char; | |
526 | ||
527 | case '?': | |
528 | if (syntax_bits & RE_LIMITED_OPS) | |
529 | goto normal_char; | |
530 | if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) | |
531 | goto normal_char; | |
532 | if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) | |
533 | goto normal_char; | |
534 | return lasttok = QMARK; | |
535 | ||
536 | case '*': | |
537 | if (backslash) | |
538 | goto normal_char; | |
539 | if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) | |
540 | goto normal_char; | |
541 | return lasttok = STAR; | |
542 | ||
543 | case '+': | |
544 | if (syntax_bits & RE_LIMITED_OPS) | |
545 | goto normal_char; | |
546 | if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) | |
547 | goto normal_char; | |
548 | if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) | |
549 | goto normal_char; | |
550 | return lasttok = PLUS; | |
551 | ||
552 | case '{': | |
553 | if (!(syntax_bits & RE_INTERVALS)) | |
554 | goto normal_char; | |
555 | if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) | |
556 | goto normal_char; | |
557 | if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) | |
558 | goto normal_char; | |
559 | ||
560 | if (syntax_bits & RE_NO_BK_BRACES) | |
561 | { | |
562 | /* Scan ahead for a valid interval; if it's not valid, | |
563 | treat it as a literal '{'. */ | |
564 | int lo = -1, hi = -1; | |
565 | char const *p = lexptr; | |
566 | char const *lim = p + lexleft; | |
567 | for (; p != lim && ISASCIIDIGIT (*p); p++) | |
568 | lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; | |
569 | if (p != lim && *p == ',') | |
570 | while (++p != lim && ISASCIIDIGIT (*p)) | |
571 | hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; | |
572 | else | |
573 | hi = lo; | |
574 | if (p == lim || *p != '}' | |
575 | || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) | |
576 | goto normal_char; | |
577 | } | |
578 | ||
579 | minrep = 0; | |
580 | /* Cases: | |
581 | {M} - exact count | |
582 | {M,} - minimum count, maximum is infinity | |
583 | {M,N} - M through N */ | |
584 | FETCH(c, _("unfinished repeat count")); | |
585 | if (ISASCIIDIGIT (c)) | |
586 | { | |
587 | minrep = c - '0'; | |
588 | for (;;) | |
589 | { | |
590 | FETCH(c, _("unfinished repeat count")); | |
591 | if (! ISASCIIDIGIT (c)) | |
592 | break; | |
593 | minrep = 10 * minrep + c - '0'; | |
594 | } | |
595 | } | |
596 | else | |
597 | dfaerror(_("malformed repeat count")); | |
598 | if (c == ',') | |
599 | { | |
600 | FETCH (c, _("unfinished repeat count")); | |
601 | if (! ISASCIIDIGIT (c)) | |
602 | maxrep = -1; | |
603 | else | |
604 | { | |
605 | maxrep = c - '0'; | |
606 | for (;;) | |
607 | { | |
608 | FETCH (c, _("unfinished repeat count")); | |
609 | if (! ISASCIIDIGIT (c)) | |
610 | break; | |
611 | maxrep = 10 * maxrep + c - '0'; | |
612 | } | |
613 | if (0 <= maxrep && maxrep < minrep) | |
614 | dfaerror (_("malformed repeat count")); | |
615 | } | |
616 | } | |
617 | else | |
618 | maxrep = minrep; | |
619 | if (!(syntax_bits & RE_NO_BK_BRACES)) | |
620 | { | |
621 | if (c != '\\') | |
622 | dfaerror(_("malformed repeat count")); | |
623 | FETCH(c, _("unfinished repeat count")); | |
624 | } | |
625 | if (c != '}') | |
626 | dfaerror(_("malformed repeat count")); | |
627 | laststart = 0; | |
628 | return lasttok = REPMN; | |
629 | ||
630 | case '|': | |
631 | if (syntax_bits & RE_LIMITED_OPS) | |
632 | goto normal_char; | |
633 | if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) | |
634 | goto normal_char; | |
635 | laststart = 1; | |
636 | return lasttok = OR; | |
637 | ||
638 | case '\n': | |
639 | if (syntax_bits & RE_LIMITED_OPS | |
640 | || backslash | |
641 | || !(syntax_bits & RE_NEWLINE_ALT)) | |
642 | goto normal_char; | |
643 | laststart = 1; | |
644 | return lasttok = OR; | |
645 | ||
646 | case '(': | |
647 | if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) | |
648 | goto normal_char; | |
649 | ++parens; | |
650 | laststart = 1; | |
651 | return lasttok = LPAREN; | |
652 | ||
653 | case ')': | |
654 | if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) | |
655 | goto normal_char; | |
656 | if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
657 | goto normal_char; | |
658 | --parens; | |
659 | laststart = 0; | |
660 | return lasttok = RPAREN; | |
661 | ||
662 | case '.': | |
663 | if (backslash) | |
664 | goto normal_char; | |
665 | zeroset(ccl); | |
666 | notset(ccl); | |
667 | if (!(syntax_bits & RE_DOT_NEWLINE)) | |
668 | clrbit(eolbyte, ccl); | |
669 | if (syntax_bits & RE_DOT_NOT_NULL) | |
670 | clrbit('\0', ccl); | |
671 | laststart = 0; | |
672 | return lasttok = CSET + charclass_index(ccl); | |
673 | ||
674 | case 'w': | |
675 | case 'W': | |
676 | if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) | |
677 | goto normal_char; | |
678 | zeroset(ccl); | |
679 | for (c2 = 0; c2 < NOTCHAR; ++c2) | |
680 | if (IS_WORD_CONSTITUENT(c2)) | |
681 | setbit(c2, ccl); | |
682 | if (c == 'W') | |
683 | notset(ccl); | |
684 | laststart = 0; | |
685 | return lasttok = CSET + charclass_index(ccl); | |
686 | ||
687 | case '[': | |
688 | if (backslash) | |
689 | goto normal_char; | |
690 | zeroset(ccl); | |
691 | FETCH(c, _("Unbalanced [")); | |
692 | if (c == '^') | |
693 | { | |
694 | FETCH(c, _("Unbalanced [")); | |
695 | invert = 1; | |
696 | } | |
697 | else | |
698 | invert = 0; | |
699 | do | |
700 | { | |
701 | /* Nobody ever said this had to be fast. :-) | |
702 | Note that if we're looking at some other [:...:] | |
703 | construct, we just treat it as a bunch of ordinary | |
704 | characters. We can do this because we assume | |
705 | regex has checked for syntax errors before | |
706 | dfa is ever called. */ | |
707 | if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) | |
708 | for (c1 = 0; prednames[c1].name; ++c1) | |
709 | if (looking_at(prednames[c1].name)) | |
710 | { | |
711 | int (*pred)() = prednames[c1].pred; | |
712 | if (case_fold | |
713 | && (pred == is_upper || pred == is_lower)) | |
714 | pred = is_alpha; | |
715 | ||
716 | for (c2 = 0; c2 < NOTCHAR; ++c2) | |
717 | if ((*pred)(c2)) | |
718 | setbit(c2, ccl); | |
719 | lexptr += strlen(prednames[c1].name); | |
720 | lexleft -= strlen(prednames[c1].name); | |
721 | FETCH(c1, _("Unbalanced [")); | |
722 | goto skip; | |
723 | } | |
724 | if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) | |
725 | FETCH(c, _("Unbalanced [")); | |
726 | FETCH(c1, _("Unbalanced [")); | |
727 | if (c1 == '-') | |
728 | { | |
729 | FETCH(c2, _("Unbalanced [")); | |
730 | if (c2 == ']') | |
731 | { | |
732 | /* In the case [x-], the - is an ordinary hyphen, | |
733 | which is left in c1, the lookahead character. */ | |
734 | --lexptr; | |
735 | ++lexleft; | |
736 | c2 = c; | |
737 | } | |
738 | else | |
739 | { | |
740 | if (c2 == '\\' | |
741 | && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) | |
742 | FETCH(c2, _("Unbalanced [")); | |
743 | FETCH(c1, _("Unbalanced [")); | |
744 | } | |
745 | } | |
746 | else | |
747 | c2 = c; | |
748 | ||
749 | lo[0] = c; lo[1] = '\0'; | |
750 | hi[0] = c2; hi[1] = '\0'; | |
751 | for (c = 0; c < NOTCHAR; c++) | |
752 | { | |
753 | char ch[2]; | |
754 | ch[0] = c; ch[1] = '\0'; | |
755 | if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0) | |
756 | { | |
757 | setbit (c, ccl); | |
758 | if (case_fold) | |
759 | { | |
760 | if (ISUPPER (c)) | |
761 | setbit (tolower (c), ccl); | |
762 | else if (ISLOWER (c)) | |
763 | setbit (toupper (c), ccl); | |
764 | } | |
765 | } | |
766 | } | |
767 | ||
768 | skip: | |
769 | ; | |
770 | } | |
771 | while ((c = c1) != ']'); | |
772 | if (invert) | |
773 | { | |
774 | notset(ccl); | |
775 | if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) | |
776 | clrbit(eolbyte, ccl); | |
777 | } | |
778 | laststart = 0; | |
779 | return lasttok = CSET + charclass_index(ccl); | |
780 | ||
781 | default: | |
782 | normal_char: | |
783 | laststart = 0; | |
784 | if (case_fold && ISALPHA(c)) | |
785 | { | |
786 | zeroset(ccl); | |
787 | setbit(c, ccl); | |
788 | if (isupper(c)) | |
789 | setbit(tolower(c), ccl); | |
790 | else | |
791 | setbit(toupper(c), ccl); | |
792 | return lasttok = CSET + charclass_index(ccl); | |
793 | } | |
794 | return c; | |
795 | } | |
796 | } | |
797 | ||
798 | /* The above loop should consume at most a backslash | |
799 | and some other character. */ | |
800 | abort(); | |
801 | return END; /* keeps pedantic compilers happy. */ | |
802 | } | |
803 | ||
804 | /* Recursive descent parser for regular expressions. */ | |
805 | ||
806 | static token tok; /* Lookahead token. */ | |
807 | static int depth; /* Current depth of a hypothetical stack | |
808 | holding deferred productions. This is | |
809 | used to determine the depth that will be | |
810 | required of the real stack later on in | |
811 | dfaanalyze(). */ | |
812 | ||
813 | /* Add the given token to the parse tree, maintaining the depth count and | |
814 | updating the maximum depth if necessary. */ | |
815 | static void | |
816 | addtok (token t) | |
817 | { | |
818 | REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); | |
819 | dfa->tokens[dfa->tindex++] = t; | |
820 | ||
821 | switch (t) | |
822 | { | |
823 | case QMARK: | |
824 | case STAR: | |
825 | case PLUS: | |
826 | break; | |
827 | ||
828 | case CAT: | |
829 | case OR: | |
830 | case ORTOP: | |
831 | --depth; | |
832 | break; | |
833 | ||
834 | default: | |
835 | ++dfa->nleaves; | |
836 | case EMPTY: | |
837 | ++depth; | |
838 | break; | |
839 | } | |
840 | if (depth > dfa->depth) | |
841 | dfa->depth = depth; | |
842 | } | |
843 | ||
844 | /* The grammar understood by the parser is as follows. | |
845 | ||
846 | regexp: | |
847 | regexp OR branch | |
848 | branch | |
849 | ||
850 | branch: | |
851 | branch closure | |
852 | closure | |
853 | ||
854 | closure: | |
855 | closure QMARK | |
856 | closure STAR | |
857 | closure PLUS | |
858 | atom | |
859 | ||
860 | atom: | |
861 | <normal character> | |
862 | CSET | |
863 | BACKREF | |
864 | BEGLINE | |
865 | ENDLINE | |
866 | BEGWORD | |
867 | ENDWORD | |
868 | LIMWORD | |
869 | NOTLIMWORD | |
870 | <empty> | |
871 | ||
872 | The parser builds a parse tree in postfix form in an array of tokens. */ | |
873 | ||
874 | static void | |
875 | atom (void) | |
876 | { | |
877 | if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF | |
878 | || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD | |
879 | || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) | |
880 | { | |
881 | addtok(tok); | |
882 | tok = lex(); | |
883 | } | |
884 | else if (tok == LPAREN) | |
885 | { | |
886 | tok = lex(); | |
887 | regexp(0); | |
888 | if (tok != RPAREN) | |
889 | dfaerror(_("Unbalanced (")); | |
890 | tok = lex(); | |
891 | } | |
892 | else | |
893 | addtok(EMPTY); | |
894 | } | |
895 | ||
896 | /* Return the number of tokens in the given subexpression. */ | |
897 | static int | |
898 | nsubtoks (int tindex) | |
899 | { | |
900 | int ntoks1; | |
901 | ||
902 | switch (dfa->tokens[tindex - 1]) | |
903 | { | |
904 | default: | |
905 | return 1; | |
906 | case QMARK: | |
907 | case STAR: | |
908 | case PLUS: | |
909 | return 1 + nsubtoks(tindex - 1); | |
910 | case CAT: | |
911 | case OR: | |
912 | case ORTOP: | |
913 | ntoks1 = nsubtoks(tindex - 1); | |
914 | return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); | |
915 | } | |
916 | } | |
917 | ||
918 | /* Copy the given subexpression to the top of the tree. */ | |
919 | static void | |
920 | copytoks (int tindex, int ntokens) | |
921 | { | |
922 | int i; | |
923 | ||
924 | for (i = 0; i < ntokens; ++i) | |
925 | addtok(dfa->tokens[tindex + i]); | |
926 | } | |
927 | ||
928 | static void | |
929 | closure (void) | |
930 | { | |
931 | int tindex, ntokens, i; | |
932 | ||
933 | atom(); | |
934 | while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) | |
935 | if (tok == REPMN) | |
936 | { | |
937 | ntokens = nsubtoks(dfa->tindex); | |
938 | tindex = dfa->tindex - ntokens; | |
939 | if (maxrep < 0) | |
940 | addtok(PLUS); | |
941 | if (minrep == 0) | |
942 | addtok(QMARK); | |
943 | for (i = 1; i < minrep; ++i) | |
944 | { | |
945 | copytoks(tindex, ntokens); | |
946 | addtok(CAT); | |
947 | } | |
948 | for (; i < maxrep; ++i) | |
949 | { | |
950 | copytoks(tindex, ntokens); | |
951 | addtok(QMARK); | |
952 | addtok(CAT); | |
953 | } | |
954 | tok = lex(); | |
955 | } | |
956 | else | |
957 | { | |
958 | addtok(tok); | |
959 | tok = lex(); | |
960 | } | |
961 | } | |
962 | ||
963 | static void | |
964 | branch (void) | |
965 | { | |
966 | closure(); | |
967 | while (tok != RPAREN && tok != OR && tok >= 0) | |
968 | { | |
969 | closure(); | |
970 | addtok(CAT); | |
971 | } | |
972 | } | |
973 | ||
974 | static void | |
975 | regexp (int toplevel) | |
976 | { | |
977 | branch(); | |
978 | while (tok == OR) | |
979 | { | |
980 | tok = lex(); | |
981 | branch(); | |
982 | if (toplevel) | |
983 | addtok(ORTOP); | |
984 | else | |
985 | addtok(OR); | |
986 | } | |
987 | } | |
988 | ||
989 | /* Main entry point for the parser. S is a string to be parsed, len is the | |
990 | length of the string, so s can include NUL characters. D is a pointer to | |
991 | the struct dfa to parse into. */ | |
992 | void | |
993 | dfaparse (char *s, size_t len, struct dfa *d) | |
994 | { | |
995 | dfa = d; | |
996 | lexstart = lexptr = s; | |
997 | lexleft = len; | |
998 | lasttok = END; | |
999 | laststart = 1; | |
1000 | parens = 0; | |
1001 | ||
1002 | if (! syntax_bits_set) | |
1003 | dfaerror(_("No syntax specified")); | |
1004 | ||
1005 | tok = lex(); | |
1006 | depth = d->depth; | |
1007 | ||
1008 | regexp(1); | |
1009 | ||
1010 | if (tok != END) | |
1011 | dfaerror(_("Unbalanced )")); | |
1012 | ||
1013 | addtok(END - d->nregexps); | |
1014 | addtok(CAT); | |
1015 | ||
1016 | if (d->nregexps) | |
1017 | addtok(ORTOP); | |
1018 | ||
1019 | ++d->nregexps; | |
1020 | } | |
1021 | ||
1022 | /* Some primitives for operating on sets of positions. */ | |
1023 | ||
1024 | /* Copy one set to another; the destination must be large enough. */ | |
1025 | static void | |
1026 | copy (position_set *src, position_set *dst) | |
1027 | { | |
1028 | int i; | |
1029 | ||
1030 | for (i = 0; i < src->nelem; ++i) | |
1031 | dst->elems[i] = src->elems[i]; | |
1032 | dst->nelem = src->nelem; | |
1033 | } | |
1034 | ||
1035 | /* Insert a position in a set. Position sets are maintained in sorted | |
1036 | order according to index. If position already exists in the set with | |
1037 | the same index then their constraints are logically or'd together. | |
1038 | S->elems must point to an array large enough to hold the resulting set. */ | |
1039 | static void | |
1040 | insert (position p, position_set *s) | |
1041 | { | |
1042 | int i; | |
1043 | position t1, t2; | |
1044 | ||
1045 | for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) | |
1046 | continue; | |
1047 | if (i < s->nelem && p.index == s->elems[i].index) | |
1048 | s->elems[i].constraint |= p.constraint; | |
1049 | else | |
1050 | { | |
1051 | t1 = p; | |
1052 | ++s->nelem; | |
1053 | while (i < s->nelem) | |
1054 | { | |
1055 | t2 = s->elems[i]; | |
1056 | s->elems[i++] = t1; | |
1057 | t1 = t2; | |
1058 | } | |
1059 | } | |
1060 | } | |
1061 | ||
1062 | /* Merge two sets of positions into a third. The result is exactly as if | |
1063 | the positions of both sets were inserted into an initially empty set. */ | |
1064 | static void | |
1065 | merge (position_set *s1, position_set *s2, position_set *m) | |
1066 | { | |
1067 | int i = 0, j = 0; | |
1068 | ||
1069 | m->nelem = 0; | |
1070 | while (i < s1->nelem && j < s2->nelem) | |
1071 | if (s1->elems[i].index > s2->elems[j].index) | |
1072 | m->elems[m->nelem++] = s1->elems[i++]; | |
1073 | else if (s1->elems[i].index < s2->elems[j].index) | |
1074 | m->elems[m->nelem++] = s2->elems[j++]; | |
1075 | else | |
1076 | { | |
1077 | m->elems[m->nelem] = s1->elems[i++]; | |
1078 | m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; | |
1079 | } | |
1080 | while (i < s1->nelem) | |
1081 | m->elems[m->nelem++] = s1->elems[i++]; | |
1082 | while (j < s2->nelem) | |
1083 | m->elems[m->nelem++] = s2->elems[j++]; | |
1084 | } | |
1085 | ||
1086 | /* Delete a position from a set. */ | |
1087 | static void | |
1088 | delete (position p, position_set *s) | |
1089 | { | |
1090 | int i; | |
1091 | ||
1092 | for (i = 0; i < s->nelem; ++i) | |
1093 | if (p.index == s->elems[i].index) | |
1094 | break; | |
1095 | if (i < s->nelem) | |
1096 | for (--s->nelem; i < s->nelem; ++i) | |
1097 | s->elems[i] = s->elems[i + 1]; | |
1098 | } | |
1099 | ||
1100 | /* Find the index of the state corresponding to the given position set with | |
1101 | the given preceding context, or create a new state if there is no such | |
1102 | state. Newline and letter tell whether we got here on a newline or | |
1103 | letter, respectively. */ | |
1104 | static int | |
1105 | state_index (struct dfa *d, position_set *s, int newline, int letter) | |
1106 | { | |
1107 | int hash = 0; | |
1108 | int constraint; | |
1109 | int i, j; | |
1110 | ||
1111 | newline = newline ? 1 : 0; | |
1112 | letter = letter ? 1 : 0; | |
1113 | ||
1114 | for (i = 0; i < s->nelem; ++i) | |
1115 | hash ^= s->elems[i].index + s->elems[i].constraint; | |
1116 | ||
1117 | /* Try to find a state that exactly matches the proposed one. */ | |
1118 | for (i = 0; i < d->sindex; ++i) | |
1119 | { | |
1120 | if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem | |
1121 | || newline != d->states[i].newline || letter != d->states[i].letter) | |
1122 | continue; | |
1123 | for (j = 0; j < s->nelem; ++j) | |
1124 | if (s->elems[j].constraint | |
1125 | != d->states[i].elems.elems[j].constraint | |
1126 | || s->elems[j].index != d->states[i].elems.elems[j].index) | |
1127 | break; | |
1128 | if (j == s->nelem) | |
1129 | return i; | |
1130 | } | |
1131 | ||
1132 | /* We'll have to create a new state. */ | |
1133 | REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); | |
1134 | d->states[i].hash = hash; | |
1135 | MALLOC(d->states[i].elems.elems, position, s->nelem); | |
1136 | copy(s, &d->states[i].elems); | |
1137 | d->states[i].newline = newline; | |
1138 | d->states[i].letter = letter; | |
1139 | d->states[i].backref = 0; | |
1140 | d->states[i].constraint = 0; | |
1141 | d->states[i].first_end = 0; | |
1142 | for (j = 0; j < s->nelem; ++j) | |
1143 | if (d->tokens[s->elems[j].index] < 0) | |
1144 | { | |
1145 | constraint = s->elems[j].constraint; | |
1146 | if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) | |
1147 | || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) | |
1148 | || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) | |
1149 | || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) | |
1150 | d->states[i].constraint |= constraint; | |
1151 | if (! d->states[i].first_end) | |
1152 | d->states[i].first_end = d->tokens[s->elems[j].index]; | |
1153 | } | |
1154 | else if (d->tokens[s->elems[j].index] == BACKREF) | |
1155 | { | |
1156 | d->states[i].constraint = NO_CONSTRAINT; | |
1157 | d->states[i].backref = 1; | |
1158 | } | |
1159 | ||
1160 | ++d->sindex; | |
1161 | ||
1162 | return i; | |
1163 | } | |
1164 | ||
1165 | /* Find the epsilon closure of a set of positions. If any position of the set | |
1166 | contains a symbol that matches the empty string in some context, replace | |
1167 | that position with the elements of its follow labeled with an appropriate | |
1168 | constraint. Repeat exhaustively until no funny positions are left. | |
1169 | S->elems must be large enough to hold the result. */ | |
1170 | static void | |
1171 | epsclosure (position_set *s, struct dfa *d) | |
1172 | { | |
1173 | int i, j; | |
1174 | int *visited; | |
1175 | position p, old; | |
1176 | ||
1177 | MALLOC(visited, int, d->tindex); | |
1178 | for (i = 0; i < d->tindex; ++i) | |
1179 | visited[i] = 0; | |
1180 | ||
1181 | for (i = 0; i < s->nelem; ++i) | |
1182 | if (d->tokens[s->elems[i].index] >= NOTCHAR | |
1183 | && d->tokens[s->elems[i].index] != BACKREF | |
1184 | && d->tokens[s->elems[i].index] < CSET) | |
1185 | { | |
1186 | old = s->elems[i]; | |
1187 | p.constraint = old.constraint; | |
1188 | delete(s->elems[i], s); | |
1189 | if (visited[old.index]) | |
1190 | { | |
1191 | --i; | |
1192 | continue; | |
1193 | } | |
1194 | visited[old.index] = 1; | |
1195 | switch (d->tokens[old.index]) | |
1196 | { | |
1197 | case BEGLINE: | |
1198 | p.constraint &= BEGLINE_CONSTRAINT; | |
1199 | break; | |
1200 | case ENDLINE: | |
1201 | p.constraint &= ENDLINE_CONSTRAINT; | |
1202 | break; | |
1203 | case BEGWORD: | |
1204 | p.constraint &= BEGWORD_CONSTRAINT; | |
1205 | break; | |
1206 | case ENDWORD: | |
1207 | p.constraint &= ENDWORD_CONSTRAINT; | |
1208 | break; | |
1209 | case LIMWORD: | |
1210 | p.constraint &= LIMWORD_CONSTRAINT; | |
1211 | break; | |
1212 | case NOTLIMWORD: | |
1213 | p.constraint &= NOTLIMWORD_CONSTRAINT; | |
1214 | break; | |
1215 | default: | |
1216 | break; | |
1217 | } | |
1218 | for (j = 0; j < d->follows[old.index].nelem; ++j) | |
1219 | { | |
1220 | p.index = d->follows[old.index].elems[j].index; | |
1221 | insert(p, s); | |
1222 | } | |
1223 | /* Force rescan to start at the beginning. */ | |
1224 | i = -1; | |
1225 | } | |
1226 | ||
1227 | free(visited); | |
1228 | } | |
1229 | ||
1230 | /* Perform bottom-up analysis on the parse tree, computing various functions. | |
1231 | Note that at this point, we're pretending constructs like \< are real | |
1232 | characters rather than constraints on what can follow them. | |
1233 | ||
1234 | Nullable: A node is nullable if it is at the root of a regexp that can | |
1235 | match the empty string. | |
1236 | * EMPTY leaves are nullable. | |
1237 | * No other leaf is nullable. | |
1238 | * A QMARK or STAR node is nullable. | |
1239 | * A PLUS node is nullable if its argument is nullable. | |
1240 | * A CAT node is nullable if both its arguments are nullable. | |
1241 | * An OR node is nullable if either argument is nullable. | |
1242 | ||
1243 | Firstpos: The firstpos of a node is the set of positions (nonempty leaves) | |
1244 | that could correspond to the first character of a string matching the | |
1245 | regexp rooted at the given node. | |
1246 | * EMPTY leaves have empty firstpos. | |
1247 | * The firstpos of a nonempty leaf is that leaf itself. | |
1248 | * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its | |
1249 | argument. | |
1250 | * The firstpos of a CAT node is the firstpos of the left argument, union | |
1251 | the firstpos of the right if the left argument is nullable. | |
1252 | * The firstpos of an OR node is the union of firstpos of each argument. | |
1253 | ||
1254 | Lastpos: The lastpos of a node is the set of positions that could | |
1255 | correspond to the last character of a string matching the regexp at | |
1256 | the given node. | |
1257 | * EMPTY leaves have empty lastpos. | |
1258 | * The lastpos of a nonempty leaf is that leaf itself. | |
1259 | * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its | |
1260 | argument. | |
1261 | * The lastpos of a CAT node is the lastpos of its right argument, union | |
1262 | the lastpos of the left if the right argument is nullable. | |
1263 | * The lastpos of an OR node is the union of the lastpos of each argument. | |
1264 | ||
1265 | Follow: The follow of a position is the set of positions that could | |
1266 | correspond to the character following a character matching the node in | |
1267 | a string matching the regexp. At this point we consider special symbols | |
1268 | that match the empty string in some context to be just normal characters. | |
1269 | Later, if we find that a special symbol is in a follow set, we will | |
1270 | replace it with the elements of its follow, labeled with an appropriate | |
1271 | constraint. | |
1272 | * Every node in the firstpos of the argument of a STAR or PLUS node is in | |
1273 | the follow of every node in the lastpos. | |
1274 | * Every node in the firstpos of the second argument of a CAT node is in | |
1275 | the follow of every node in the lastpos of the first argument. | |
1276 | ||
1277 | Because of the postfix representation of the parse tree, the depth-first | |
1278 | analysis is conveniently done by a linear scan with the aid of a stack. | |
1279 | Sets are stored as arrays of the elements, obeying a stack-like allocation | |
1280 | scheme; the number of elements in each set deeper in the stack can be | |
1281 | used to determine the address of a particular set's array. */ | |
1282 | void | |
1283 | dfaanalyze (struct dfa *d, int searchflag) | |
1284 | { | |
1285 | int *nullable; /* Nullable stack. */ | |
1286 | int *nfirstpos; /* Element count stack for firstpos sets. */ | |
1287 | position *firstpos; /* Array where firstpos elements are stored. */ | |
1288 | int *nlastpos; /* Element count stack for lastpos sets. */ | |
1289 | position *lastpos; /* Array where lastpos elements are stored. */ | |
1290 | int *nalloc; /* Sizes of arrays allocated to follow sets. */ | |
1291 | position_set tmp; /* Temporary set for merging sets. */ | |
1292 | position_set merged; /* Result of merging sets. */ | |
1293 | int wants_newline; /* True if some position wants newline info. */ | |
1294 | int *o_nullable; | |
1295 | int *o_nfirst, *o_nlast; | |
1296 | position *o_firstpos, *o_lastpos; | |
1297 | int i, j; | |
1298 | position *pos; | |
1299 | ||
1300 | #ifdef DEBUG | |
1301 | fprintf(stderr, "dfaanalyze:\n"); | |
1302 | for (i = 0; i < d->tindex; ++i) | |
1303 | { | |
1304 | fprintf(stderr, " %d:", i); | |
1305 | prtok(d->tokens[i]); | |
1306 | } | |
1307 | putc('\n', stderr); | |
1308 | #endif | |
1309 | ||
1310 | d->searchflag = searchflag; | |
1311 | ||
1312 | MALLOC(nullable, int, d->depth); | |
1313 | o_nullable = nullable; | |
1314 | MALLOC(nfirstpos, int, d->depth); | |
1315 | o_nfirst = nfirstpos; | |
1316 | MALLOC(firstpos, position, d->nleaves); | |
1317 | o_firstpos = firstpos, firstpos += d->nleaves; | |
1318 | MALLOC(nlastpos, int, d->depth); | |
1319 | o_nlast = nlastpos; | |
1320 | MALLOC(lastpos, position, d->nleaves); | |
1321 | o_lastpos = lastpos, lastpos += d->nleaves; | |
1322 | MALLOC(nalloc, int, d->tindex); | |
1323 | for (i = 0; i < d->tindex; ++i) | |
1324 | nalloc[i] = 0; | |
1325 | MALLOC(merged.elems, position, d->nleaves); | |
1326 | ||
1327 | CALLOC(d->follows, position_set, d->tindex); | |
1328 | ||
1329 | for (i = 0; i < d->tindex; ++i) | |
1330 | #ifdef DEBUG | |
1331 | { /* Nonsyntactic #ifdef goo... */ | |
1332 | #endif | |
1333 | switch (d->tokens[i]) | |
1334 | { | |
1335 | case EMPTY: | |
1336 | /* The empty set is nullable. */ | |
1337 | *nullable++ = 1; | |
1338 | ||
1339 | /* The firstpos and lastpos of the empty leaf are both empty. */ | |
1340 | *nfirstpos++ = *nlastpos++ = 0; | |
1341 | break; | |
1342 | ||
1343 | case STAR: | |
1344 | case PLUS: | |
1345 | /* Every element in the firstpos of the argument is in the follow | |
1346 | of every element in the lastpos. */ | |
1347 | tmp.nelem = nfirstpos[-1]; | |
1348 | tmp.elems = firstpos; | |
1349 | pos = lastpos; | |
1350 | for (j = 0; j < nlastpos[-1]; ++j) | |
1351 | { | |
1352 | merge(&tmp, &d->follows[pos[j].index], &merged); | |
1353 | REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, | |
1354 | nalloc[pos[j].index], merged.nelem - 1); | |
1355 | copy(&merged, &d->follows[pos[j].index]); | |
1356 | } | |
1357 | ||
1358 | case QMARK: | |
1359 | /* A QMARK or STAR node is automatically nullable. */ | |
1360 | if (d->tokens[i] != PLUS) | |
1361 | nullable[-1] = 1; | |
1362 | break; | |
1363 | ||
1364 | case CAT: | |
1365 | /* Every element in the firstpos of the second argument is in the | |
1366 | follow of every element in the lastpos of the first argument. */ | |
1367 | tmp.nelem = nfirstpos[-1]; | |
1368 | tmp.elems = firstpos; | |
1369 | pos = lastpos + nlastpos[-1]; | |
1370 | for (j = 0; j < nlastpos[-2]; ++j) | |
1371 | { | |
1372 | merge(&tmp, &d->follows[pos[j].index], &merged); | |
1373 | REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, | |
1374 | nalloc[pos[j].index], merged.nelem - 1); | |
1375 | copy(&merged, &d->follows[pos[j].index]); | |
1376 | } | |
1377 | ||
1378 | /* The firstpos of a CAT node is the firstpos of the first argument, | |
1379 | union that of the second argument if the first is nullable. */ | |
1380 | if (nullable[-2]) | |
1381 | nfirstpos[-2] += nfirstpos[-1]; | |
1382 | else | |
1383 | firstpos += nfirstpos[-1]; | |
1384 | --nfirstpos; | |
1385 | ||
1386 | /* The lastpos of a CAT node is the lastpos of the second argument, | |
1387 | union that of the first argument if the second is nullable. */ | |
1388 | if (nullable[-1]) | |
1389 | nlastpos[-2] += nlastpos[-1]; | |
1390 | else | |
1391 | { | |
1392 | pos = lastpos + nlastpos[-2]; | |
1393 | for (j = nlastpos[-1] - 1; j >= 0; --j) | |
1394 | pos[j] = lastpos[j]; | |
1395 | lastpos += nlastpos[-2]; | |
1396 | nlastpos[-2] = nlastpos[-1]; | |
1397 | } | |
1398 | --nlastpos; | |
1399 | ||
1400 | /* A CAT node is nullable if both arguments are nullable. */ | |
1401 | nullable[-2] = nullable[-1] && nullable[-2]; | |
1402 | --nullable; | |
1403 | break; | |
1404 | ||
1405 | case OR: | |
1406 | case ORTOP: | |
1407 | /* The firstpos is the union of the firstpos of each argument. */ | |
1408 | nfirstpos[-2] += nfirstpos[-1]; | |
1409 | --nfirstpos; | |
1410 | ||
1411 | /* The lastpos is the union of the lastpos of each argument. */ | |
1412 | nlastpos[-2] += nlastpos[-1]; | |
1413 | --nlastpos; | |
1414 | ||
1415 | /* An OR node is nullable if either argument is nullable. */ | |
1416 | nullable[-2] = nullable[-1] || nullable[-2]; | |
1417 | --nullable; | |
1418 | break; | |
1419 | ||
1420 | default: | |
1421 | /* Anything else is a nonempty position. (Note that special | |
1422 | constructs like \< are treated as nonempty strings here; | |
1423 | an "epsilon closure" effectively makes them nullable later. | |
1424 | Backreferences have to get a real position so we can detect | |
1425 | transitions on them later. But they are nullable. */ | |
1426 | *nullable++ = d->tokens[i] == BACKREF; | |
1427 | ||
1428 | /* This position is in its own firstpos and lastpos. */ | |
1429 | *nfirstpos++ = *nlastpos++ = 1; | |
1430 | --firstpos, --lastpos; | |
1431 | firstpos->index = lastpos->index = i; | |
1432 | firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; | |
1433 | ||
1434 | /* Allocate the follow set for this position. */ | |
1435 | nalloc[i] = 1; | |
1436 | MALLOC(d->follows[i].elems, position, nalloc[i]); | |
1437 | break; | |
1438 | } | |
1439 | #ifdef DEBUG | |
1440 | /* ... balance the above nonsyntactic #ifdef goo... */ | |
1441 | fprintf(stderr, "node %d:", i); | |
1442 | prtok(d->tokens[i]); | |
1443 | putc('\n', stderr); | |
1444 | fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); | |
1445 | fprintf(stderr, " firstpos:"); | |
1446 | for (j = nfirstpos[-1] - 1; j >= 0; --j) | |
1447 | { | |
1448 | fprintf(stderr, " %d:", firstpos[j].index); | |
1449 | prtok(d->tokens[firstpos[j].index]); | |
1450 | } | |
1451 | fprintf(stderr, "\n lastpos:"); | |
1452 | for (j = nlastpos[-1] - 1; j >= 0; --j) | |
1453 | { | |
1454 | fprintf(stderr, " %d:", lastpos[j].index); | |
1455 | prtok(d->tokens[lastpos[j].index]); | |
1456 | } | |
1457 | putc('\n', stderr); | |
1458 | } | |
1459 | #endif | |
1460 | ||
1461 | /* For each follow set that is the follow set of a real position, replace | |
1462 | it with its epsilon closure. */ | |
1463 | for (i = 0; i < d->tindex; ++i) | |
1464 | if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF | |
1465 | || d->tokens[i] >= CSET) | |
1466 | { | |
1467 | #ifdef DEBUG | |
1468 | fprintf(stderr, "follows(%d:", i); | |
1469 | prtok(d->tokens[i]); | |
1470 | fprintf(stderr, "):"); | |
1471 | for (j = d->follows[i].nelem - 1; j >= 0; --j) | |
1472 | { | |
1473 | fprintf(stderr, " %d:", d->follows[i].elems[j].index); | |
1474 | prtok(d->tokens[d->follows[i].elems[j].index]); | |
1475 | } | |
1476 | putc('\n', stderr); | |
1477 | #endif | |
1478 | copy(&d->follows[i], &merged); | |
1479 | epsclosure(&merged, d); | |
1480 | if (d->follows[i].nelem < merged.nelem) | |
1481 | REALLOC(d->follows[i].elems, position, merged.nelem); | |
1482 | copy(&merged, &d->follows[i]); | |
1483 | } | |
1484 | ||
1485 | /* Get the epsilon closure of the firstpos of the regexp. The result will | |
1486 | be the set of positions of state 0. */ | |
1487 | merged.nelem = 0; | |
1488 | for (i = 0; i < nfirstpos[-1]; ++i) | |
1489 | insert(firstpos[i], &merged); | |
1490 | epsclosure(&merged, d); | |
1491 | ||
1492 | /* Check if any of the positions of state 0 will want newline context. */ | |
1493 | wants_newline = 0; | |
1494 | for (i = 0; i < merged.nelem; ++i) | |
1495 | if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) | |
1496 | wants_newline = 1; | |
1497 | ||
1498 | /* Build the initial state. */ | |
1499 | d->salloc = 1; | |
1500 | d->sindex = 0; | |
1501 | MALLOC(d->states, dfa_state, d->salloc); | |
1502 | state_index(d, &merged, wants_newline, 0); | |
1503 | ||
1504 | free(o_nullable); | |
1505 | free(o_nfirst); | |
1506 | free(o_firstpos); | |
1507 | free(o_nlast); | |
1508 | free(o_lastpos); | |
1509 | free(nalloc); | |
1510 | free(merged.elems); | |
1511 | } | |
1512 | ||
1513 | /* Find, for each character, the transition out of state s of d, and store | |
1514 | it in the appropriate slot of trans. | |
1515 | ||
1516 | We divide the positions of s into groups (positions can appear in more | |
1517 | than one group). Each group is labeled with a set of characters that | |
1518 | every position in the group matches (taking into account, if necessary, | |
1519 | preceding context information of s). For each group, find the union | |
1520 | of the its elements' follows. This set is the set of positions of the | |
1521 | new state. For each character in the group's label, set the transition | |
1522 | on this character to be to a state corresponding to the set's positions, | |
1523 | and its associated backward context information, if necessary. | |
1524 | ||
1525 | If we are building a searching matcher, we include the positions of state | |
1526 | 0 in every state. | |
1527 | ||
1528 | The collection of groups is constructed by building an equivalence-class | |
1529 | partition of the positions of s. | |
1530 | ||
1531 | For each position, find the set of characters C that it matches. Eliminate | |
1532 | any characters from C that fail on grounds of backward context. | |
1533 | ||
1534 | Search through the groups, looking for a group whose label L has nonempty | |
1535 | intersection with C. If L - C is nonempty, create a new group labeled | |
1536 | L - C and having the same positions as the current group, and set L to | |
1537 | the intersection of L and C. Insert the position in this group, set | |
1538 | C = C - L, and resume scanning. | |
1539 | ||
1540 | If after comparing with every group there are characters remaining in C, | |
1541 | create a new group labeled with the characters of C and insert this | |
1542 | position in that group. */ | |
1543 | void | |
1544 | dfastate (int s, struct dfa *d, int trans[]) | |
1545 | { | |
1546 | position_set grps[NOTCHAR]; /* As many as will ever be needed. */ | |
1547 | charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ | |
1548 | int ngrps = 0; /* Number of groups actually used. */ | |
1549 | position pos; /* Current position being considered. */ | |
1550 | charclass matches; /* Set of matching characters. */ | |
1551 | int matchesf; /* True if matches is nonempty. */ | |
1552 | charclass intersect; /* Intersection with some label set. */ | |
1553 | int intersectf; /* True if intersect is nonempty. */ | |
1554 | charclass leftovers; /* Stuff in the label that didn't match. */ | |
1555 | int leftoversf; /* True if leftovers is nonempty. */ | |
1556 | static charclass letters; /* Set of characters considered letters. */ | |
1557 | static charclass newline; /* Set of characters that aren't newline. */ | |
1558 | position_set follows; /* Union of the follows of some group. */ | |
1559 | position_set tmp; /* Temporary space for merging sets. */ | |
1560 | int state; /* New state. */ | |
1561 | int wants_newline; /* New state wants to know newline context. */ | |
1562 | int state_newline; /* New state on a newline transition. */ | |
1563 | int wants_letter; /* New state wants to know letter context. */ | |
1564 | int state_letter; /* New state on a letter transition. */ | |
1565 | static int initialized; /* Flag for static initialization. */ | |
1566 | int i, j, k; | |
1567 | ||
1568 | /* Initialize the set of letters, if necessary. */ | |
1569 | if (! initialized) | |
1570 | { | |
1571 | initialized = 1; | |
1572 | for (i = 0; i < NOTCHAR; ++i) | |
1573 | if (IS_WORD_CONSTITUENT(i)) | |
1574 | setbit(i, letters); | |
1575 | setbit(eolbyte, newline); | |
1576 | } | |
1577 | ||
1578 | zeroset(matches); | |
1579 | ||
1580 | for (i = 0; i < d->states[s].elems.nelem; ++i) | |
1581 | { | |
1582 | pos = d->states[s].elems.elems[i]; | |
1583 | if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) | |
1584 | setbit(d->tokens[pos.index], matches); | |
1585 | else if (d->tokens[pos.index] >= CSET) | |
1586 | copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); | |
1587 | else | |
1588 | continue; | |
1589 | ||
1590 | /* Some characters may need to be eliminated from matches because | |
1591 | they fail in the current context. */ | |
1592 | if (pos.constraint != 0xFF) | |
1593 | { | |
1594 | if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, | |
1595 | d->states[s].newline, 1)) | |
1596 | clrbit(eolbyte, matches); | |
1597 | if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, | |
1598 | d->states[s].newline, 0)) | |
1599 | for (j = 0; j < CHARCLASS_INTS; ++j) | |
1600 | matches[j] &= newline[j]; | |
1601 | if (! MATCHES_LETTER_CONTEXT(pos.constraint, | |
1602 | d->states[s].letter, 1)) | |
1603 | for (j = 0; j < CHARCLASS_INTS; ++j) | |
1604 | matches[j] &= ~letters[j]; | |
1605 | if (! MATCHES_LETTER_CONTEXT(pos.constraint, | |
1606 | d->states[s].letter, 0)) | |
1607 | for (j = 0; j < CHARCLASS_INTS; ++j) | |
1608 | matches[j] &= letters[j]; | |
1609 | ||
1610 | /* If there are no characters left, there's no point in going on. */ | |
1611 | for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) | |
1612 | continue; | |
1613 | if (j == CHARCLASS_INTS) | |
1614 | continue; | |
1615 | } | |
1616 | ||
1617 | for (j = 0; j < ngrps; ++j) | |
1618 | { | |
1619 | /* If matches contains a single character only, and the current | |
1620 | group's label doesn't contain that character, go on to the | |
1621 | next group. */ | |
1622 | if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR | |
1623 | && !tstbit(d->tokens[pos.index], labels[j])) | |
1624 | continue; | |
1625 | ||
1626 | /* Check if this group's label has a nonempty intersection with | |
1627 | matches. */ | |
1628 | intersectf = 0; | |
1629 | for (k = 0; k < CHARCLASS_INTS; ++k) | |
1630 | (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; | |
1631 | if (! intersectf) | |
1632 | continue; | |
1633 | ||
1634 | /* It does; now find the set differences both ways. */ | |
1635 | leftoversf = matchesf = 0; | |
1636 | for (k = 0; k < CHARCLASS_INTS; ++k) | |
1637 | { | |
1638 | /* Even an optimizing compiler can't know this for sure. */ | |
1639 | int match = matches[k], label = labels[j][k]; | |
1640 | ||
1641 | (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; | |
1642 | (matches[k] = match & ~label) ? (matchesf = 1) : 0; | |
1643 | } | |
1644 | ||
1645 | /* If there were leftovers, create a new group labeled with them. */ | |
1646 | if (leftoversf) | |
1647 | { | |
1648 | copyset(leftovers, labels[ngrps]); | |
1649 | copyset(intersect, labels[j]); | |
1650 | MALLOC(grps[ngrps].elems, position, d->nleaves); | |
1651 | copy(&grps[j], &grps[ngrps]); | |
1652 | ++ngrps; | |
1653 | } | |
1654 | ||
1655 | /* Put the position in the current group. Note that there is no | |
1656 | reason to call insert() here. */ | |
1657 | grps[j].elems[grps[j].nelem++] = pos; | |
1658 | ||
1659 | /* If every character matching the current position has been | |
1660 | accounted for, we're done. */ | |
1661 | if (! matchesf) | |
1662 | break; | |
1663 | } | |
1664 | ||
1665 | /* If we've passed the last group, and there are still characters | |
1666 | unaccounted for, then we'll have to create a new group. */ | |
1667 | if (j == ngrps) | |
1668 | { | |
1669 | copyset(matches, labels[ngrps]); | |
1670 | zeroset(matches); | |
1671 | MALLOC(grps[ngrps].elems, position, d->nleaves); | |
1672 | grps[ngrps].nelem = 1; | |
1673 | grps[ngrps].elems[0] = pos; | |
1674 | ++ngrps; | |
1675 | } | |
1676 | } | |
1677 | ||
1678 | MALLOC(follows.elems, position, d->nleaves); | |
1679 | MALLOC(tmp.elems, position, d->nleaves); | |
1680 | ||
1681 | /* If we are a searching matcher, the default transition is to a state | |
1682 | containing the positions of state 0, otherwise the default transition | |
1683 | is to fail miserably. */ | |
1684 | if (d->searchflag) | |
1685 | { | |
1686 | wants_newline = 0; | |
1687 | wants_letter = 0; | |
1688 | for (i = 0; i < d->states[0].elems.nelem; ++i) | |
1689 | { | |
1690 | if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) | |
1691 | wants_newline = 1; | |
1692 | if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) | |
1693 | wants_letter = 1; | |
1694 | } | |
1695 | copy(&d->states[0].elems, &follows); | |
1696 | state = state_index(d, &follows, 0, 0); | |
1697 | if (wants_newline) | |
1698 | state_newline = state_index(d, &follows, 1, 0); | |
1699 | else | |
1700 | state_newline = state; | |
1701 | if (wants_letter) | |
1702 | state_letter = state_index(d, &follows, 0, 1); | |
1703 | else | |
1704 | state_letter = state; | |
1705 | for (i = 0; i < NOTCHAR; ++i) | |
1706 | trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; | |
1707 | trans[eolbyte] = state_newline; | |
1708 | } | |
1709 | else | |
1710 | for (i = 0; i < NOTCHAR; ++i) | |
1711 | trans[i] = -1; | |
1712 | ||
1713 | for (i = 0; i < ngrps; ++i) | |
1714 | { | |
1715 | follows.nelem = 0; | |
1716 | ||
1717 | /* Find the union of the follows of the positions of the group. | |
1718 | This is a hideously inefficient loop. Fix it someday. */ | |
1719 | for (j = 0; j < grps[i].nelem; ++j) | |
1720 | for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) | |
1721 | insert(d->follows[grps[i].elems[j].index].elems[k], &follows); | |
1722 | ||
1723 | /* If we are building a searching matcher, throw in the positions | |
1724 | of state 0 as well. */ | |
1725 | if (d->searchflag) | |
1726 | for (j = 0; j < d->states[0].elems.nelem; ++j) | |
1727 | insert(d->states[0].elems.elems[j], &follows); | |
1728 | ||
1729 | /* Find out if the new state will want any context information. */ | |
1730 | wants_newline = 0; | |
1731 | if (tstbit(eolbyte, labels[i])) | |
1732 | for (j = 0; j < follows.nelem; ++j) | |
1733 | if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) | |
1734 | wants_newline = 1; | |
1735 | ||
1736 | wants_letter = 0; | |
1737 | for (j = 0; j < CHARCLASS_INTS; ++j) | |
1738 | if (labels[i][j] & letters[j]) | |
1739 | break; | |
1740 | if (j < CHARCLASS_INTS) | |
1741 | for (j = 0; j < follows.nelem; ++j) | |
1742 | if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) | |
1743 | wants_letter = 1; | |
1744 | ||
1745 | /* Find the state(s) corresponding to the union of the follows. */ | |
1746 | state = state_index(d, &follows, 0, 0); | |
1747 | if (wants_newline) | |
1748 | state_newline = state_index(d, &follows, 1, 0); | |
1749 | else | |
1750 | state_newline = state; | |
1751 | if (wants_letter) | |
1752 | state_letter = state_index(d, &follows, 0, 1); | |
1753 | else | |
1754 | state_letter = state; | |
1755 | ||
1756 | /* Set the transitions for each character in the current label. */ | |
1757 | for (j = 0; j < CHARCLASS_INTS; ++j) | |
1758 | for (k = 0; k < INTBITS; ++k) | |
1759 | if (labels[i][j] & 1 << k) | |
1760 | { | |
1761 | int c = j * INTBITS + k; | |
1762 | ||
1763 | if (c == eolbyte) | |
1764 | trans[c] = state_newline; | |
1765 | else if (IS_WORD_CONSTITUENT(c)) | |
1766 | trans[c] = state_letter; | |
1767 | else if (c < NOTCHAR) | |
1768 | trans[c] = state; | |
1769 | } | |
1770 | } | |
1771 | ||
1772 | for (i = 0; i < ngrps; ++i) | |
1773 | free(grps[i].elems); | |
1774 | free(follows.elems); | |
1775 | free(tmp.elems); | |
1776 | } | |
1777 | ||
1778 | /* Some routines for manipulating a compiled dfa's transition tables. | |
1779 | Each state may or may not have a transition table; if it does, and it | |
1780 | is a non-accepting state, then d->trans[state] points to its table. | |
1781 | If it is an accepting state then d->fails[state] points to its table. | |
1782 | If it has no table at all, then d->trans[state] is NULL. | |
1783 | TODO: Improve this comment, get rid of the unnecessary redundancy. */ | |
1784 | ||
1785 | static void | |
1786 | build_state (int s, struct dfa *d) | |
1787 | { | |
1788 | int *trans; /* The new transition table. */ | |
1789 | int i; | |
1790 | ||
1791 | /* Set an upper limit on the number of transition tables that will ever | |
1792 | exist at once. 1024 is arbitrary. The idea is that the frequently | |
1793 | used transition tables will be quickly rebuilt, whereas the ones that | |
1794 | were only needed once or twice will be cleared away. */ | |
1795 | if (d->trcount >= 1024) | |
1796 | { | |
1797 | for (i = 0; i < d->tralloc; ++i) | |
1798 | if (d->trans[i]) | |
1799 | { | |
1800 | free((ptr_t) d->trans[i]); | |
1801 | d->trans[i] = NULL; | |
1802 | } | |
1803 | else if (d->fails[i]) | |
1804 | { | |
1805 | free((ptr_t) d->fails[i]); | |
1806 | d->fails[i] = NULL; | |
1807 | } | |
1808 | d->trcount = 0; | |
1809 | } | |
1810 | ||
1811 | ++d->trcount; | |
1812 | ||
1813 | /* Set up the success bits for this state. */ | |
1814 | d->success[s] = 0; | |
1815 | if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, | |
1816 | s, *d)) | |
1817 | d->success[s] |= 4; | |
1818 | if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, | |
1819 | s, *d)) | |
1820 | d->success[s] |= 2; | |
1821 | if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, | |
1822 | s, *d)) | |
1823 | d->success[s] |= 1; | |
1824 | ||
1825 | MALLOC(trans, int, NOTCHAR); | |
1826 | dfastate(s, d, trans); | |
1827 | ||
1828 | /* Now go through the new transition table, and make sure that the trans | |
1829 | and fail arrays are allocated large enough to hold a pointer for the | |
1830 | largest state mentioned in the table. */ | |
1831 | for (i = 0; i < NOTCHAR; ++i) | |
1832 | if (trans[i] >= d->tralloc) | |
1833 | { | |
1834 | int oldalloc = d->tralloc; | |
1835 | ||
1836 | while (trans[i] >= d->tralloc) | |
1837 | d->tralloc *= 2; | |
1838 | REALLOC(d->realtrans, int *, d->tralloc + 1); | |
1839 | d->trans = d->realtrans + 1; | |
1840 | REALLOC(d->fails, int *, d->tralloc); | |
1841 | REALLOC(d->success, int, d->tralloc); | |
1842 | REALLOC(d->newlines, int, d->tralloc); | |
1843 | while (oldalloc < d->tralloc) | |
1844 | { | |
1845 | d->trans[oldalloc] = NULL; | |
1846 | d->fails[oldalloc++] = NULL; | |
1847 | } | |
1848 | } | |
1849 | ||
1850 | /* Keep the newline transition in a special place so we can use it as | |
1851 | a sentinel. */ | |
1852 | d->newlines[s] = trans[eolbyte]; | |
1853 | trans[eolbyte] = -1; | |
1854 | ||
1855 | if (ACCEPTING(s, *d)) | |
1856 | d->fails[s] = trans; | |
1857 | else | |
1858 | d->trans[s] = trans; | |
1859 | } | |
1860 | ||
1861 | static void | |
1862 | build_state_zero (struct dfa *d) | |
1863 | { | |
1864 | d->tralloc = 1; | |
1865 | d->trcount = 0; | |
1866 | CALLOC(d->realtrans, int *, d->tralloc + 1); | |
1867 | d->trans = d->realtrans + 1; | |
1868 | CALLOC(d->fails, int *, d->tralloc); | |
1869 | MALLOC(d->success, int, d->tralloc); | |
1870 | MALLOC(d->newlines, int, d->tralloc); | |
1871 | build_state(0, d); | |
1872 | } | |
1873 | ||
1874 | /* Search through a buffer looking for a match to the given struct dfa. | |
1875 | Find the first occurrence of a string matching the regexp in the buffer, | |
1876 | and the shortest possible version thereof. Return a pointer to the first | |
1877 | character after the match, or NULL if none is found. Begin points to | |
1878 | the beginning of the buffer, and end points to the first character after | |
1879 | its end. We store a newline in *end to act as a sentinel, so end had | |
1880 | better point somewhere valid. Newline is a flag indicating whether to | |
1881 | allow newlines to be in the matching string. If count is non- | |
1882 | NULL it points to a place we're supposed to increment every time we | |
1883 | see a newline. Finally, if backref is non-NULL it points to a place | |
1884 | where we're supposed to store a 1 if backreferencing happened and the | |
1885 | match needs to be verified by a backtracking matcher. Otherwise | |
1886 | we store a 0 in *backref. */ | |
1887 | char * | |
1888 | dfaexec (struct dfa *d, char *begin, char *end, | |
1889 | int newline, int *count, int *backref) | |
1890 | { | |
1891 | register int s, s1, tmp; /* Current state. */ | |
1892 | register unsigned char *p; /* Current input character. */ | |
1893 | register int **trans, *t; /* Copy of d->trans so it can be optimized | |
1894 | into a register. */ | |
1895 | register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ | |
1896 | static int sbit[NOTCHAR]; /* Table for anding with d->success. */ | |
1897 | static int sbit_init; | |
1898 | ||
1899 | if (! sbit_init) | |
1900 | { | |
1901 | int i; | |
1902 | ||
1903 | sbit_init = 1; | |
1904 | for (i = 0; i < NOTCHAR; ++i) | |
1905 | sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; | |
1906 | sbit[eol] = 4; | |
1907 | } | |
1908 | ||
1909 | if (! d->tralloc) | |
1910 | build_state_zero(d); | |
1911 | ||
1912 | s = s1 = 0; | |
1913 | p = (unsigned char *) begin; | |
1914 | trans = d->trans; | |
1915 | *end = eol; | |
1916 | ||
1917 | for (;;) | |
1918 | { | |
1919 | while ((t = trans[s]) != 0) { /* hand-optimized loop */ | |
1920 | s1 = t[*p++]; | |
1921 | if ((t = trans[s1]) == 0) { | |
1922 | tmp = s ; s = s1 ; s1 = tmp ; /* swap */ | |
1923 | break; | |
1924 | } | |
1925 | s = t[*p++]; | |
1926 | } | |
1927 | ||
1928 | if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) | |
1929 | { | |
1930 | if (d->success[s] & sbit[*p]) | |
1931 | { | |
1932 | if (backref) | |
1933 | *backref = (d->states[s].backref != 0); | |
1934 | return (char *) p; | |
1935 | } | |
1936 | ||
1937 | s1 = s; | |
1938 | s = d->fails[s][*p++]; | |
1939 | continue; | |
1940 | } | |
1941 | ||
1942 | /* If the previous character was a newline, count it. */ | |
1943 | if (count && (char *) p <= end && p[-1] == eol) | |
1944 | ++*count; | |
1945 | ||
1946 | /* Check if we've run off the end of the buffer. */ | |
1947 | if ((char *) p > end) | |
1948 | return NULL; | |
1949 | ||
1950 | if (s >= 0) | |
1951 | { | |
1952 | build_state(s, d); | |
1953 | trans = d->trans; | |
1954 | continue; | |
1955 | } | |
1956 | ||
1957 | if (p[-1] == eol && newline) | |
1958 | { | |
1959 | s = d->newlines[s1]; | |
1960 | continue; | |
1961 | } | |
1962 | ||
1963 | s = 0; | |
1964 | } | |
1965 | } | |
1966 | ||
1967 | /* Initialize the components of a dfa that the other routines don't | |
1968 | initialize for themselves. */ | |
1969 | void | |
1970 | dfainit (struct dfa *d) | |
1971 | { | |
1972 | d->calloc = 1; | |
1973 | MALLOC(d->charclasses, charclass, d->calloc); | |
1974 | d->cindex = 0; | |
1975 | ||
1976 | d->talloc = 1; | |
1977 | MALLOC(d->tokens, token, d->talloc); | |
1978 | d->tindex = d->depth = d->nleaves = d->nregexps = 0; | |
1979 | ||
1980 | d->searchflag = 0; | |
1981 | d->tralloc = 0; | |
1982 | ||
1983 | d->musts = 0; | |
1984 | } | |
1985 | ||
1986 | /* Parse and analyze a single string of the given length. */ | |
1987 | void | |
1988 | dfacomp (char *s, size_t len, struct dfa *d, int searchflag) | |
1989 | { | |
1990 | if (case_fold) /* dummy folding in service of dfamust() */ | |
1991 | { | |
1992 | char *lcopy; | |
1993 | int i; | |
1994 | ||
1995 | lcopy = malloc(len); | |
1996 | if (!lcopy) | |
1997 | dfaerror(_("out of memory")); | |
1998 | ||
1999 | /* This is a kludge. */ | |
2000 | case_fold = 0; | |
2001 | for (i = 0; i < len; ++i) | |
2002 | if (ISUPPER ((unsigned char) s[i])) | |
2003 | lcopy[i] = tolower ((unsigned char) s[i]); | |
2004 | else | |
2005 | lcopy[i] = s[i]; | |
2006 | ||
2007 | dfainit(d); | |
2008 | dfaparse(lcopy, len, d); | |
2009 | free(lcopy); | |
2010 | dfamust(d); | |
2011 | d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; | |
2012 | case_fold = 1; | |
2013 | dfaparse(s, len, d); | |
2014 | dfaanalyze(d, searchflag); | |
2015 | } | |
2016 | else | |
2017 | { | |
2018 | dfainit(d); | |
2019 | dfaparse(s, len, d); | |
2020 | dfamust(d); | |
2021 | dfaanalyze(d, searchflag); | |
2022 | } | |
2023 | } | |
2024 | ||
2025 | /* Free the storage held by the components of a dfa. */ | |
2026 | void | |
2027 | dfafree (struct dfa *d) | |
2028 | { | |
2029 | int i; | |
2030 | struct dfamust *dm, *ndm; | |
2031 | ||
2032 | free((ptr_t) d->charclasses); | |
2033 | free((ptr_t) d->tokens); | |
2034 | for (i = 0; i < d->sindex; ++i) | |
2035 | free((ptr_t) d->states[i].elems.elems); | |
2036 | free((ptr_t) d->states); | |
2037 | for (i = 0; i < d->tindex; ++i) | |
2038 | if (d->follows[i].elems) | |
2039 | free((ptr_t) d->follows[i].elems); | |
2040 | free((ptr_t) d->follows); | |
2041 | for (i = 0; i < d->tralloc; ++i) | |
2042 | if (d->trans[i]) | |
2043 | free((ptr_t) d->trans[i]); | |
2044 | else if (d->fails[i]) | |
2045 | free((ptr_t) d->fails[i]); | |
2046 | if (d->realtrans) free((ptr_t) d->realtrans); | |
2047 | if (d->fails) free((ptr_t) d->fails); | |
2048 | if (d->newlines) free((ptr_t) d->newlines); | |
2049 | if (d->success) free((ptr_t) d->success); | |
2050 | for (dm = d->musts; dm; dm = ndm) | |
2051 | { | |
2052 | ndm = dm->next; | |
2053 | free(dm->must); | |
2054 | free((ptr_t) dm); | |
2055 | } | |
2056 | } | |
2057 | ||
2058 | /* Having found the postfix representation of the regular expression, | |
2059 | try to find a long sequence of characters that must appear in any line | |
2060 | containing the r.e. | |
2061 | Finding a "longest" sequence is beyond the scope here; | |
2062 | we take an easy way out and hope for the best. | |
2063 | (Take "(ab|a)b"--please.) | |
2064 | ||
2065 | We do a bottom-up calculation of sequences of characters that must appear | |
2066 | in matches of r.e.'s represented by trees rooted at the nodes of the postfix | |
2067 | representation: | |
2068 | sequences that must appear at the left of the match ("left") | |
2069 | sequences that must appear at the right of the match ("right") | |
2070 | lists of sequences that must appear somewhere in the match ("in") | |
2071 | sequences that must constitute the match ("is") | |
2072 | ||
2073 | When we get to the root of the tree, we use one of the longest of its | |
2074 | calculated "in" sequences as our answer. The sequence we find is returned in | |
2075 | d->must (where "d" is the single argument passed to "dfamust"); | |
2076 | the length of the sequence is returned in d->mustn. | |
2077 | ||
2078 | The sequences calculated for the various types of node (in pseudo ANSI c) | |
2079 | are shown below. "p" is the operand of unary operators (and the left-hand | |
2080 | operand of binary operators); "q" is the right-hand operand of binary | |
2081 | operators. | |
2082 | ||
2083 | "ZERO" means "a zero-length sequence" below. | |
2084 | ||
2085 | Type left right is in | |
2086 | ---- ---- ----- -- -- | |
2087 | char c # c # c # c # c | |
2088 | ||
2089 | CSET ZERO ZERO ZERO ZERO | |
2090 | ||
2091 | STAR ZERO ZERO ZERO ZERO | |
2092 | ||
2093 | QMARK ZERO ZERO ZERO ZERO | |
2094 | ||
2095 | PLUS p->left p->right ZERO p->in | |
2096 | ||
2097 | CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus | |
2098 | p->left : q->right : q->is!=ZERO) ? q->in plus | |
2099 | p->is##q->left p->right##q->is p->is##q->is : p->right##q->left | |
2100 | ZERO | |
2101 | ||
2102 | OR longest common longest common (do p->is and substrings common to | |
2103 | leading trailing q->is have same p->in and q->in | |
2104 | (sub)sequence (sub)sequence length and | |
2105 | of p->left of p->right content) ? | |
2106 | and q->left and q->right p->is : NULL | |
2107 | ||
2108 | If there's anything else we recognize in the tree, all four sequences get set | |
2109 | to zero-length sequences. If there's something we don't recognize in the tree, | |
2110 | we just return a zero-length sequence. | |
2111 | ||
2112 | Break ties in favor of infrequent letters (choosing 'zzz' in preference to | |
2113 | 'aaa')? | |
2114 | ||
2115 | And. . .is it here or someplace that we might ponder "optimizations" such as | |
2116 | egrep 'psi|epsilon' -> egrep 'psi' | |
2117 | egrep 'pepsi|epsilon' -> egrep 'epsi' | |
2118 | (Yes, we now find "epsi" as a "string | |
2119 | that must occur", but we might also | |
2120 | simplify the *entire* r.e. being sought) | |
2121 | grep '[c]' -> grep 'c' | |
2122 | grep '(ab|a)b' -> grep 'ab' | |
2123 | grep 'ab*' -> grep 'a' | |
2124 | grep 'a*b' -> grep 'b' | |
2125 | ||
2126 | There are several issues: | |
2127 | ||
2128 | Is optimization easy (enough)? | |
2129 | ||
2130 | Does optimization actually accomplish anything, | |
2131 | or is the automaton you get from "psi|epsilon" (for example) | |
2132 | the same as the one you get from "psi" (for example)? | |
2133 | ||
2134 | Are optimizable r.e.'s likely to be used in real-life situations | |
2135 | (something like 'ab*' is probably unlikely; something like is | |
2136 | 'psi|epsilon' is likelier)? */ | |
2137 | ||
2138 | static char * | |
2139 | icatalloc (char *old, char *new) | |
2140 | { | |
2141 | char *result; | |
2142 | size_t oldsize, newsize; | |
2143 | ||
2144 | newsize = (new == NULL) ? 0 : strlen(new); | |
2145 | if (old == NULL) | |
2146 | oldsize = 0; | |
2147 | else if (newsize == 0) | |
2148 | return old; | |
2149 | else oldsize = strlen(old); | |
2150 | if (old == NULL) | |
2151 | result = (char *) malloc(newsize + 1); | |
2152 | else | |
2153 | result = (char *) realloc((void *) old, oldsize + newsize + 1); | |
2154 | if (result != NULL && new != NULL) | |
2155 | (void) strcpy(result + oldsize, new); | |
2156 | return result; | |
2157 | } | |
2158 | ||
2159 | static char * | |
2160 | icpyalloc (char *string) | |
2161 | { | |
2162 | return icatalloc((char *) NULL, string); | |
2163 | } | |
2164 | ||
2165 | static char * | |
2166 | istrstr (char *lookin, char *lookfor) | |
2167 | { | |
2168 | char *cp; | |
2169 | size_t len; | |
2170 | ||
2171 | len = strlen(lookfor); | |
2172 | for (cp = lookin; *cp != '\0'; ++cp) | |
2173 | if (strncmp(cp, lookfor, len) == 0) | |
2174 | return cp; | |
2175 | return NULL; | |
2176 | } | |
2177 | ||
2178 | static void | |
2179 | ifree (char *cp) | |
2180 | { | |
2181 | if (cp != NULL) | |
2182 | free(cp); | |
2183 | } | |
2184 | ||
2185 | static void | |
2186 | freelist (char **cpp) | |
2187 | { | |
2188 | int i; | |
2189 | ||
2190 | if (cpp == NULL) | |
2191 | return; | |
2192 | for (i = 0; cpp[i] != NULL; ++i) | |
2193 | { | |
2194 | free(cpp[i]); | |
2195 | cpp[i] = NULL; | |
2196 | } | |
2197 | } | |
2198 | ||
2199 | static char ** | |
2200 | enlist (char **cpp, char *new, size_t len) | |
2201 | { | |
2202 | int i, j; | |
2203 | ||
2204 | if (cpp == NULL) | |
2205 | return NULL; | |
2206 | if ((new = icpyalloc(new)) == NULL) | |
2207 | { | |
2208 | freelist(cpp); | |
2209 | return NULL; | |
2210 | } | |
2211 | new[len] = '\0'; | |
2212 | /* Is there already something in the list that's new (or longer)? */ | |
2213 | for (i = 0; cpp[i] != NULL; ++i) | |
2214 | if (istrstr(cpp[i], new) != NULL) | |
2215 | { | |
2216 | free(new); | |
2217 | return cpp; | |
2218 | } | |
2219 | /* Eliminate any obsoleted strings. */ | |
2220 | j = 0; | |
2221 | while (cpp[j] != NULL) | |
2222 | if (istrstr(new, cpp[j]) == NULL) | |
2223 | ++j; | |
2224 | else | |
2225 | { | |
2226 | free(cpp[j]); | |
2227 | if (--i == j) | |
2228 | break; | |
2229 | cpp[j] = cpp[i]; | |
2230 | cpp[i] = NULL; | |
2231 | } | |
2232 | /* Add the new string. */ | |
2233 | cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); | |
2234 | if (cpp == NULL) | |
2235 | return NULL; | |
2236 | cpp[i] = new; | |
2237 | cpp[i + 1] = NULL; | |
2238 | return cpp; | |
2239 | } | |
2240 | ||
2241 | /* Given pointers to two strings, return a pointer to an allocated | |
2242 | list of their distinct common substrings. Return NULL if something | |
2243 | seems wild. */ | |
2244 | static char ** | |
2245 | comsubs (char *left, char *right) | |
2246 | { | |
2247 | char **cpp; | |
2248 | char *lcp; | |
2249 | char *rcp; | |
2250 | size_t i, len; | |
2251 | ||
2252 | if (left == NULL || right == NULL) | |
2253 | return NULL; | |
2254 | cpp = (char **) malloc(sizeof *cpp); | |
2255 | if (cpp == NULL) | |
2256 | return NULL; | |
2257 | cpp[0] = NULL; | |
2258 | for (lcp = left; *lcp != '\0'; ++lcp) | |
2259 | { | |
2260 | len = 0; | |
2261 | rcp = index(right, *lcp); | |
2262 | while (rcp != NULL) | |
2263 | { | |
2264 | for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) | |
2265 | continue; | |
2266 | if (i > len) | |
2267 | len = i; | |
2268 | rcp = index(rcp + 1, *lcp); | |
2269 | } | |
2270 | if (len == 0) | |
2271 | continue; | |
2272 | if ((cpp = enlist(cpp, lcp, len)) == NULL) | |
2273 | break; | |
2274 | } | |
2275 | return cpp; | |
2276 | } | |
2277 | ||
2278 | static char ** | |
2279 | addlists (char **old, char **new) | |
2280 | { | |
2281 | int i; | |
2282 | ||
2283 | if (old == NULL || new == NULL) | |
2284 | return NULL; | |
2285 | for (i = 0; new[i] != NULL; ++i) | |
2286 | { | |
2287 | old = enlist(old, new[i], strlen(new[i])); | |
2288 | if (old == NULL) | |
2289 | break; | |
2290 | } | |
2291 | return old; | |
2292 | } | |
2293 | ||
2294 | /* Given two lists of substrings, return a new list giving substrings | |
2295 | common to both. */ | |
2296 | static char ** | |
2297 | inboth (char **left, char **right) | |
2298 | { | |
2299 | char **both; | |
2300 | char **temp; | |
2301 | int lnum, rnum; | |
2302 | ||
2303 | if (left == NULL || right == NULL) | |
2304 | return NULL; | |
2305 | both = (char **) malloc(sizeof *both); | |
2306 | if (both == NULL) | |
2307 | return NULL; | |
2308 | both[0] = NULL; | |
2309 | for (lnum = 0; left[lnum] != NULL; ++lnum) | |
2310 | { | |
2311 | for (rnum = 0; right[rnum] != NULL; ++rnum) | |
2312 | { | |
2313 | temp = comsubs(left[lnum], right[rnum]); | |
2314 | if (temp == NULL) | |
2315 | { | |
2316 | freelist(both); | |
2317 | return NULL; | |
2318 | } | |
2319 | both = addlists(both, temp); | |
2320 | freelist(temp); | |
2321 | free(temp); | |
2322 | if (both == NULL) | |
2323 | return NULL; | |
2324 | } | |
2325 | } | |
2326 | return both; | |
2327 | } | |
2328 | ||
2329 | typedef struct | |
2330 | { | |
2331 | char **in; | |
2332 | char *left; | |
2333 | char *right; | |
2334 | char *is; | |
2335 | } must; | |
2336 | ||
2337 | static void | |
2338 | resetmust (must *mp) | |
2339 | { | |
2340 | mp->left[0] = mp->right[0] = mp->is[0] = '\0'; | |
2341 | freelist(mp->in); | |
2342 | } | |
2343 | ||
2344 | static void | |
2345 | dfamust (struct dfa *dfa) | |
2346 | { | |
2347 | must *musts; | |
2348 | must *mp; | |
2349 | char *result; | |
2350 | int ri; | |
2351 | int i; | |
2352 | int exact; | |
2353 | token t; | |
2354 | static must must0; | |
2355 | struct dfamust *dm; | |
2356 | static char empty_string[] = ""; | |
2357 | ||
2358 | result = empty_string; | |
2359 | exact = 0; | |
2360 | musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); | |
2361 | if (musts == NULL) | |
2362 | return; | |
2363 | mp = musts; | |
2364 | for (i = 0; i <= dfa->tindex; ++i) | |
2365 | mp[i] = must0; | |
2366 | for (i = 0; i <= dfa->tindex; ++i) | |
2367 | { | |
2368 | mp[i].in = (char **) malloc(sizeof *mp[i].in); | |
2369 | mp[i].left = malloc(2); | |
2370 | mp[i].right = malloc(2); | |
2371 | mp[i].is = malloc(2); | |
2372 | if (mp[i].in == NULL || mp[i].left == NULL || | |
2373 | mp[i].right == NULL || mp[i].is == NULL) | |
2374 | goto done; | |
2375 | mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; | |
2376 | mp[i].in[0] = NULL; | |
2377 | } | |
2378 | #ifdef DEBUG | |
2379 | fprintf(stderr, "dfamust:\n"); | |
2380 | for (i = 0; i < dfa->tindex; ++i) | |
2381 | { | |
2382 | fprintf(stderr, " %d:", i); | |
2383 | prtok(dfa->tokens[i]); | |
2384 | } | |
2385 | putc('\n', stderr); | |
2386 | #endif | |
2387 | for (ri = 0; ri < dfa->tindex; ++ri) | |
2388 | { | |
2389 | switch (t = dfa->tokens[ri]) | |
2390 | { | |
2391 | case LPAREN: | |
2392 | case RPAREN: | |
2393 | goto done; /* "cannot happen" */ | |
2394 | case EMPTY: | |
2395 | case BEGLINE: | |
2396 | case ENDLINE: | |
2397 | case BEGWORD: | |
2398 | case ENDWORD: | |
2399 | case LIMWORD: | |
2400 | case NOTLIMWORD: | |
2401 | case BACKREF: | |
2402 | resetmust(mp); | |
2403 | break; | |
2404 | case STAR: | |
2405 | case QMARK: | |
2406 | if (mp <= musts) | |
2407 | goto done; /* "cannot happen" */ | |
2408 | --mp; | |
2409 | resetmust(mp); | |
2410 | break; | |
2411 | case OR: | |
2412 | case ORTOP: | |
2413 | if (mp < &musts[2]) | |
2414 | goto done; /* "cannot happen" */ | |
2415 | { | |
2416 | char **new; | |
2417 | must *lmp; | |
2418 | must *rmp; | |
2419 | int j, ln, rn, n; | |
2420 | ||
2421 | rmp = --mp; | |
2422 | lmp = --mp; | |
2423 | /* Guaranteed to be. Unlikely, but. . . */ | |
2424 | if (strcmp(lmp->is, rmp->is) != 0) | |
2425 | lmp->is[0] = '\0'; | |
2426 | /* Left side--easy */ | |
2427 | i = 0; | |
2428 | while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) | |
2429 | ++i; | |
2430 | lmp->left[i] = '\0'; | |
2431 | /* Right side */ | |
2432 | ln = strlen(lmp->right); | |
2433 | rn = strlen(rmp->right); | |
2434 | n = ln; | |
2435 | if (n > rn) | |
2436 | n = rn; | |
2437 | for (i = 0; i < n; ++i) | |
2438 | if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) | |
2439 | break; | |
2440 | for (j = 0; j < i; ++j) | |
2441 | lmp->right[j] = lmp->right[(ln - i) + j]; | |
2442 | lmp->right[j] = '\0'; | |
2443 | new = inboth(lmp->in, rmp->in); | |
2444 | if (new == NULL) | |
2445 | goto done; | |
2446 | freelist(lmp->in); | |
2447 | free((char *) lmp->in); | |
2448 | lmp->in = new; | |
2449 | } | |
2450 | break; | |
2451 | case PLUS: | |
2452 | if (mp <= musts) | |
2453 | goto done; /* "cannot happen" */ | |
2454 | --mp; | |
2455 | mp->is[0] = '\0'; | |
2456 | break; | |
2457 | case END: | |
2458 | if (mp != &musts[1]) | |
2459 | goto done; /* "cannot happen" */ | |
2460 | for (i = 0; musts[0].in[i] != NULL; ++i) | |
2461 | if (strlen(musts[0].in[i]) > strlen(result)) | |
2462 | result = musts[0].in[i]; | |
2463 | if (strcmp(result, musts[0].is) == 0) | |
2464 | exact = 1; | |
2465 | goto done; | |
2466 | case CAT: | |
2467 | if (mp < &musts[2]) | |
2468 | goto done; /* "cannot happen" */ | |
2469 | { | |
2470 | must *lmp; | |
2471 | must *rmp; | |
2472 | ||
2473 | rmp = --mp; | |
2474 | lmp = --mp; | |
2475 | /* In. Everything in left, plus everything in | |
2476 | right, plus catenation of | |
2477 | left's right and right's left. */ | |
2478 | lmp->in = addlists(lmp->in, rmp->in); | |
2479 | if (lmp->in == NULL) | |
2480 | goto done; | |
2481 | if (lmp->right[0] != '\0' && | |
2482 | rmp->left[0] != '\0') | |
2483 | { | |
2484 | char *tp; | |
2485 | ||
2486 | tp = icpyalloc(lmp->right); | |
2487 | if (tp == NULL) | |
2488 | goto done; | |
2489 | tp = icatalloc(tp, rmp->left); | |
2490 | if (tp == NULL) | |
2491 | goto done; | |
2492 | lmp->in = enlist(lmp->in, tp, | |
2493 | strlen(tp)); | |
2494 | free(tp); | |
2495 | if (lmp->in == NULL) | |
2496 | goto done; | |
2497 | } | |
2498 | /* Left-hand */ | |
2499 | if (lmp->is[0] != '\0') | |
2500 | { | |
2501 | lmp->left = icatalloc(lmp->left, | |
2502 | rmp->left); | |
2503 | if (lmp->left == NULL) | |
2504 | goto done; | |
2505 | } | |
2506 | /* Right-hand */ | |
2507 | if (rmp->is[0] == '\0') | |
2508 | lmp->right[0] = '\0'; | |
2509 | lmp->right = icatalloc(lmp->right, rmp->right); | |
2510 | if (lmp->right == NULL) | |
2511 | goto done; | |
2512 | /* Guaranteed to be */ | |
2513 | if (lmp->is[0] != '\0' && rmp->is[0] != '\0') | |
2514 | { | |
2515 | lmp->is = icatalloc(lmp->is, rmp->is); | |
2516 | if (lmp->is == NULL) | |
2517 | goto done; | |
2518 | } | |
2519 | else | |
2520 | lmp->is[0] = '\0'; | |
2521 | } | |
2522 | break; | |
2523 | default: | |
2524 | if (t < END) | |
2525 | { | |
2526 | /* "cannot happen" */ | |
2527 | goto done; | |
2528 | } | |
2529 | else if (t == '\0') | |
2530 | { | |
2531 | /* not on *my* shift */ | |
2532 | goto done; | |
2533 | } | |
2534 | else if (t >= CSET) | |
2535 | { | |
2536 | /* easy enough */ | |
2537 | resetmust(mp); | |
2538 | } | |
2539 | else | |
2540 | { | |
2541 | /* plain character */ | |
2542 | resetmust(mp); | |
2543 | mp->is[0] = mp->left[0] = mp->right[0] = t; | |
2544 | mp->is[1] = mp->left[1] = mp->right[1] = '\0'; | |
2545 | mp->in = enlist(mp->in, mp->is, (size_t)1); | |
2546 | if (mp->in == NULL) | |
2547 | goto done; | |
2548 | } | |
2549 | break; | |
2550 | } | |
2551 | #ifdef DEBUG | |
2552 | fprintf(stderr, " node: %d:", ri); | |
2553 | prtok(dfa->tokens[ri]); | |
2554 | fprintf(stderr, "\n in:"); | |
2555 | for (i = 0; mp->in[i]; ++i) | |
2556 | fprintf(stderr, " \"%s\"", mp->in[i]); | |
2557 | fprintf(stderr, "\n is: \"%s\"\n", mp->is); | |
2558 | fprintf(stderr, " left: \"%s\"\n", mp->left); | |
2559 | fprintf(stderr, " right: \"%s\"\n", mp->right); | |
2560 | #endif | |
2561 | ++mp; | |
2562 | } | |
2563 | done: | |
2564 | if (strlen(result)) | |
2565 | { | |
2566 | dm = (struct dfamust *) malloc(sizeof (struct dfamust)); | |
2567 | dm->exact = exact; | |
2568 | dm->must = malloc(strlen(result) + 1); | |
2569 | strcpy(dm->must, result); | |
2570 | dm->next = dfa->musts; | |
2571 | dfa->musts = dm; | |
2572 | } | |
2573 | mp = musts; | |
2574 | for (i = 0; i <= dfa->tindex; ++i) | |
2575 | { | |
2576 | freelist(mp[i].in); | |
2577 | ifree((char *) mp[i].in); | |
2578 | ifree(mp[i].left); | |
2579 | ifree(mp[i].right); | |
2580 | ifree(mp[i].is); | |
2581 | } | |
2582 | free((char *) mp); | |
2583 | } |