1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
29 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
30 a list of strings, at least one of which is known to occur in
31 any string matching the regexp. */
34 /* DFA compiled regexp. */
37 /* Regex compiled regexps. */
38 struct re_pattern_buffer *patterns;
40 struct re_registers regs;
42 /* Number of compiled fixed strings known to exactly match the regexp.
43 If kwsexec returns < kwset_exact_matches, then we don't need to
44 call the regexp matcher at all. */
45 ptrdiff_t kwset_exact_matches;
51 dfaerror (char const *mesg)
53 die (EXIT_TROUBLE, 0, "%s", mesg);
56 /* For now, the sole dfawarn-eliciting condition (use of a regexp
57 like '[:lower:]') is unequivocally an error, so treat it as such,
60 dfawarn (char const *mesg)
62 if (!getenv ("POSIXLY_CORRECT"))
66 /* If the DFA turns out to have some set of fixed strings one of
67 which must occur in the match, then we build a kwset matcher
68 to find those strings, and thus quickly filter out impossible
71 kwsmusts (struct dfa_comp *dc)
73 struct dfamust *dm = dfamust (dc->dfa);
76 dc->kwset = kwsinit (false);
79 /* Prepare a substring whose presence implies a match.
80 The kwset matcher will return the index of the matching
81 string that it chooses. */
82 ++dc->kwset_exact_matches;
83 ptrdiff_t old_len = strlen (dm->must);
84 ptrdiff_t new_len = old_len + dm->begline + dm->endline;
85 char *must = xmalloc (new_len);
89 dc->begline |= dm->begline;
90 memcpy (mp, dm->must, old_len);
92 mp[old_len] = eolbyte;
93 kwsincr (dc->kwset, must, new_len);
98 /* Otherwise, filtering with this substring should help reduce the
99 search space, but we'll still have to use the regexp matcher. */
100 kwsincr (dc->kwset, dm->must, strlen (dm->must));
106 /* Return true if KEYS, of length LEN, might contain a back-reference.
107 Return false if KEYS cannot contain a back-reference.
108 BS_SAFE is true of encodings where a backslash cannot appear as the
109 last byte of a multibyte character. */
110 static bool _GL_ATTRIBUTE_PURE
111 possible_backrefs_in_pattern (char const *keys, ptrdiff_t len, bool bs_safe)
113 /* Normally a backslash, but in an unsafe encoding this is a non-char
114 value so that the comparison below always fails, because if there
115 are two adjacent '\' bytes, the first might be the last byte of a
116 multibyte character. */
117 int second_backslash = bs_safe ? '\\' : CHAR_MAX + 1;
119 /* This code can return true even if KEYS lacks a back-reference, for
120 patterns like [\2], or for encodings where '\' appears as the last
121 byte of a multibyte character. However, false alarms should be
122 rare and do not affect correctness. */
124 /* Do not look for a backslash in the pattern's last byte, since it
125 can't be part of a back-reference and this streamlines the code. */
130 char const *lim = keys + len;
131 for (char const *p = keys; (p = memchr (p, '\\', lim - p)); p++)
133 if ('1' <= p[1] && p[1] <= '9')
135 if (p[1] == second_backslash)
147 regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
148 ptrdiff_t pcount, ptrdiff_t lineno, bool syntax_only)
150 struct re_pattern_buffer pat0;
151 struct re_pattern_buffer *pat = syntax_only ? &pat0 : &dc->patterns[pcount];
155 /* Do not use a fastmap with -i, to work around glibc Bug#20381. */
156 pat->fastmap = syntax_only | match_icase ? NULL : xmalloc (UCHAR_MAX + 1);
158 pat->translate = NULL;
160 char const *err = re_compile_pattern (p, len, pat);
164 /* Emit a filename:lineno: prefix for patterns taken from files. */
165 size_t pat_lineno = lineno;
166 char const *pat_filename
167 = lineno < 0 ? "\0" : pattern_file_name (lineno + 1, &pat_lineno);
169 if (*pat_filename == '\0')
170 error (0, 0, "%s", err);
172 error (0, 0, "%s:%zu: %s", pat_filename, pat_lineno, err);
178 GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
181 struct dfa_comp *dc = xcalloc (1, sizeof (*dc));
183 dc->dfa = dfaalloc ();
186 syntax_bits |= RE_ICASE;
187 re_set_syntax (syntax_bits);
188 int dfaopts = eolbyte ? 0 : DFA_EOL_NUL;
189 dfasyntax (dc->dfa, &localeinfo, syntax_bits, dfaopts);
190 bool bs_safe = !localeinfo.multibyte | localeinfo.using_utf8;
192 /* For GNU regex, pass the patterns separately to detect errors like
193 "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
194 this should be a syntax error. The same for backref, where the
195 backref should be local to each pattern. */
196 char const *p = pattern;
197 char const *patlim = pattern + size;
198 bool compilation_failed = false;
200 dc->patterns = xmalloc (sizeof *dc->patterns);
205 char const *prev = pattern;
207 /* Buffer containing back-reference-free patterns. */
209 ptrdiff_t buflen = 0;
212 ptrdiff_t lineno = 0;
217 char const *sep = memchr (p, '\n', patlim - p);
226 bool backref = possible_backrefs_in_pattern (p, len, bs_safe);
228 if (backref && prev < p)
230 ptrdiff_t prevlen = p - prev;
231 while (bufalloc < buflen + prevlen)
232 buf = x2realloc (buf, &bufalloc);
233 memcpy (buf + buflen, prev, prevlen);
237 /* Ensure room for at least two more patterns. The extra one is
238 for the regex_compile that may be executed after this loop
239 exits, and its (unused) slot is patterns[-1] until then. */
240 while (palloc <= dc->pcount + 1)
242 dc->patterns = x2nrealloc (dc->patterns - 1, &palloc,
243 sizeof *dc->patterns);
247 if (!regex_compile (dc, p, len, dc->pcount, lineno, !backref))
248 compilation_failed = true;
261 if (compilation_failed)
268 ptrdiff_t prevlen = patlim - prev;
269 buf = xrealloc (buf, buflen + prevlen);
270 memcpy (buf + buflen, prev, prevlen);
285 if (!regex_compile (dc, buf, buflen, 0, -1, false))
292 /* In the match_words and match_lines cases, we use a different pattern
293 for the DFA matcher that will quickly throw out cases that won't work.
294 Then if DFA succeeds we do some hairy stuff using the regex matcher
295 to decide whether the match should really count. */
296 if (match_words || match_lines)
298 static char const line_beg_no_bk[] = "^(";
299 static char const line_end_no_bk[] = ")$";
300 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
301 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
302 static char const line_beg_bk[] = "^\\(";
303 static char const line_end_bk[] = "\\)$";
304 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
305 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
306 int bk = !(syntax_bits & RE_NO_BK_PARENS);
307 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
309 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
310 : (bk ? word_beg_bk : word_beg_no_bk));
311 size_t total = strlen (n);
312 memcpy (n + total, pattern, size);
314 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
315 : (bk ? word_end_bk : word_end_no_bk));
316 total += strlen (n + total);
323 dfaparse (pattern, size, dc->dfa);
325 dfacomp (NULL, 0, dc->dfa, 1);
333 EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size,
334 char const *start_ptr)
336 char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
339 size_t len, best_len;
340 struct kwsmatch kwsm;
342 struct dfa_comp *dc = vdc;
343 struct dfa *superset = dfasuperset (dc->dfa);
344 bool dfafast = dfaisfast (dc->dfa);
349 for (beg = end = buf; end < buflim; beg = end)
355 char const *next_beg, *dfa_beg = beg;
357 bool exact_kwset_match = false;
358 bool backref = false;
360 /* Try matching with KWset, if it's defined. */
363 char const *prev_beg;
365 /* Find a possible match using the KWset matcher. */
366 ptrdiff_t offset = kwsexec (dc->kwset, beg - dc->begline,
367 buflim - beg + dc->begline,
371 match = beg + offset;
374 /* Narrow down to the line containing the possible match. */
375 beg = memrchr (buf, eol, match - buf);
376 beg = beg ? beg + 1 : buf;
379 /* Determine the end pointer to give the DFA next. Typically
380 this is after the first newline after MATCH; but if the KWset
381 match is not exact, the DFA is fast, and the offset from
382 PREV_BEG is less than 64 or (MATCH - PREV_BEG), this is the
383 greater of the latter two values; this temporarily prefers
385 exact_kwset_match = kwsm.index < dc->kwset_exact_matches;
386 end = ((exact_kwset_match || !dfafast
387 || MAX (16, match - beg) < (match - prev_beg) >> 2)
389 : MAX (16, match - beg) < (buflim - prev_beg) >> 2
390 ? prev_beg + 4 * MAX (16, match - beg)
392 end = memchr (end, eol, buflim - end);
393 end = end ? end + 1 : buflim;
395 if (exact_kwset_match)
397 if (!localeinfo.multibyte | localeinfo.using_utf8)
401 if (mb_goback (&mb_start, NULL, match, buflim) == 0)
403 /* The matched line starts in the middle of a multibyte
404 character. Perform the DFA search starting from the
405 beginning of the next character. */
410 /* Try matching with the superset of DFA, if it's defined. */
411 if (superset && !exact_kwset_match)
413 /* Keep using the superset while it reports multiline
414 potential matches; this is more likely to be fast
415 than falling back to KWset would be. */
416 next_beg = dfaexec (superset, dfa_beg, (char *) end, 0,
418 if (next_beg == NULL || next_beg == end)
421 /* Narrow down to the line we've found. */
424 beg = memrchr (buf, eol, next_beg - buf);
428 end = memchr (next_beg, eol, buflim - next_beg);
429 end = end ? end + 1 : buflim;
434 /* Try matching with DFA. */
435 next_beg = dfaexec (dc->dfa, dfa_beg, (char *) end, 0, &count,
438 /* If there's no match, or if we've matched the sentinel,
440 if (next_beg == NULL || next_beg == end)
443 /* Narrow down to the line we've found. */
446 beg = memrchr (buf, eol, next_beg - buf);
449 end = memchr (next_beg, eol, buflim - next_beg);
450 end = end ? end + 1 : buflim;
452 /* Successful, no back-references encountered! */
459 /* We are looking for the leftmost (then longest) exact match.
460 We will go through the outer loop only once. */
464 /* If the "line" is longer than the maximum regexp offset,
465 die as if we've run out of memory. */
466 if (TYPE_MAXIMUM (regoff_t) < end - beg - 1)
469 /* Run the possible match through Regex. */
472 for (i = 0; i < dc->pcount; i++)
474 dc->patterns[i].not_eol = 0;
475 dc->patterns[i].newline_anchor = eolbyte == '\n';
476 start = re_search (&dc->patterns[i], beg, end - beg - 1,
477 ptr - beg, end - ptr - 1, &dc->regs);
482 len = dc->regs.end[0] - start;
484 if (match > best_match)
486 if (start_ptr && !match_words)
487 goto assess_pattern_match;
488 if ((!match_lines && !match_words)
489 || (match_lines && len == end - ptr - 1))
493 goto assess_pattern_match;
495 /* If -w and not -x, check whether the match aligns with
496 word boundaries. Do this iteratively because:
497 (a) the line may contain more than one occurrence of the
499 (b) Several alternatives in the pattern might be valid at a
500 given point, and we may need to consider a shorter one to
501 find a word boundary. */
502 if (!match_lines && match_words)
503 while (match <= best_match)
505 regoff_t shorter_len = 0;
506 if (! wordchar_next (match + len, end - 1)
507 && ! wordchar_prev (beg, match, end - 1))
508 goto assess_pattern_match;
511 /* Try a shorter length anchored at the same place. */
513 dc->patterns[i].not_eol = 1;
514 shorter_len = re_match (&dc->patterns[i], beg,
515 match + len - ptr, match - beg,
517 if (shorter_len < -1)
524 /* Try looking further on. */
525 if (match == end - 1)
528 dc->patterns[i].not_eol = 0;
529 start = re_search (&dc->patterns[i], beg, end - beg - 1,
530 match - beg, end - match - 1,
538 len = dc->regs.end[0] - start;
541 } /* while (match <= best_match) */
543 assess_pattern_match:
546 /* Good enough for a non-exact match.
547 No need to look at further patterns, if any. */
550 if (match < best_match || (match == best_match && len > best_len))
552 /* Best exact match: leftmost, then longest. */
556 } /* if re_search >= 0 */
557 } /* for Regex patterns. */
558 if (best_match < end)
560 /* We have found an exact match. We were just
561 waiting for the best one (leftmost then longest). */
566 } /* for (beg = end ..) */
574 size_t off = beg - buf;