1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2012 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
26 /* For -w, we also consider _ to be word constituent. */
27 #define WCHAR(C) (isalnum (C) || (C) == '_')
29 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
30 a list of strings, at least one of which is known to occur in
31 any string matching the regexp. */
34 /* DFA compiled regexp. */
35 static struct dfa *dfa;
37 /* The Regex compiled patterns. */
38 static struct patterns
40 /* Regex compiled regexp. */
41 struct re_pattern_buffer regexbuf;
42 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
43 Q@#%!# library interface in regex.c. */
46 static struct patterns *patterns;
50 dfaerror (char const *mesg)
52 error (EXIT_TROUBLE, 0, "%s", mesg);
55 /* Tell static analyzers that this function does not return. */
59 /* For now, the sole dfawarn-eliciting condition (use of a regexp
60 like '[:lower:]') is unequivocally an error, so treat it as such,
63 dfawarn (char const *mesg)
65 static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode;
67 mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU);
72 /* Number of compiled fixed strings known to exactly match the regexp.
73 If kwsexec returns < kwset_exact_matches, then we don't need to
74 call the regexp matcher at all. */
75 static size_t kwset_exact_matches;
78 kwsincr_case (const char *must)
80 size_t n = strlen (must);
81 const char *buf = (match_icase && MB_CUR_MAX > 1
82 ? mbtolower (must, &n)
84 return kwsincr (kwset, buf, n);
87 /* If the DFA turns out to have some set of fixed strings one of
88 which must occur in the match, then we build a kwset matcher
89 to find those strings, and thus quickly filter out impossible
94 struct dfamust const *dm;
101 /* First, we compile in the substrings known to be exact
102 matches. The kwset matcher will return the index
103 of the matching string that it chooses. */
104 for (; dm; dm = dm->next)
108 ++kwset_exact_matches;
109 if ((err = kwsincr_case (dm->must)) != NULL)
110 error (EXIT_TROUBLE, 0, "%s", err);
112 /* Now, we compile the substrings that will require
113 the use of the regexp matcher. */
114 for (dm = dfamusts (dfa); dm; dm = dm->next)
118 if ((err = kwsincr_case (dm->must)) != NULL)
119 error (EXIT_TROUBLE, 0, "%s", err);
121 if ((err = kwsprep (kwset)) != NULL)
122 error (EXIT_TROUBLE, 0, "%s", err);
127 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
135 syntax_bits |= RE_ICASE;
136 re_set_syntax (syntax_bits);
137 dfasyntax (syntax_bits, match_icase, eolbyte);
139 /* For GNU regex compiler we have to pass the patterns separately to detect
140 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
141 GNU regex should have raise a syntax error. The same for backref, where
142 the backref should have been local to each pattern. */
147 sep = memchr (p, '\n', total);
160 patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
161 patterns[pcount] = patterns0;
163 if ((err = re_compile_pattern (p, len,
164 &(patterns[pcount].regexbuf))) != NULL)
165 error (EXIT_TROUBLE, 0, "%s", err);
169 } while (sep && total != 0);
171 /* In the match_words and match_lines cases, we use a different pattern
172 for the DFA matcher that will quickly throw out cases that won't work.
173 Then if DFA succeeds we do some hairy stuff using the regex matcher
174 to decide whether the match should really count. */
175 if (match_words || match_lines)
177 static char const line_beg_no_bk[] = "^(";
178 static char const line_end_no_bk[] = ")$";
179 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
180 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
181 static char const line_beg_bk[] = "^\\(";
182 static char const line_end_bk[] = "\\)$";
183 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
184 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
185 int bk = !(syntax_bits & RE_NO_BK_PARENS);
186 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
188 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
189 : (bk ? word_beg_bk : word_beg_no_bk));
191 memcpy (n + total, pattern, size);
193 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
194 : (bk ? word_end_bk : word_end_no_bk));
195 total += strlen (n + total);
203 dfacomp (pattern, size, dfa, 1);
210 EGexecute (char const *buf, size_t size, size_t *match_size,
211 char const *start_ptr)
213 char const *buflim, *beg, *end, *match, *best_match, *mb_start;
217 ptrdiff_t len, best_len;
218 struct kwsmatch kwsm;
224 /* mbtolower adds a NUL byte at the end. That will provide
225 space for the sentinel byte dfaexec may add. */
226 char *case_buf = mbtolower (buf, &size);
228 start_ptr = case_buf + (start_ptr - buf);
236 for (beg = end = buf; end < buflim; beg = end)
240 /* We don't care about an exact match. */
243 /* Find a possible match using the KWset matcher. */
244 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
245 if (offset == (size_t) -1)
248 /* Narrow down to the line containing the candidate, and
249 run it through DFA. */
250 if ((end = memchr(beg, eol, buflim - beg)) != NULL)
255 while (beg > buf && beg[-1] != eol)
257 if (kwsm.index < kwset_exact_matches)
265 || !is_mb_middle (&mb_start, match, buflim,
269 if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
274 /* No good fixed strings; start with DFA. */
275 char const *next_beg = dfaexec (dfa, beg, (char *) buflim,
277 if (next_beg == NULL)
279 /* Narrow down to the line we've found. */
281 if ((end = memchr(beg, eol, buflim - beg)) != NULL)
285 while (beg > buf && beg[-1] != eol)
288 /* Successful, no backreferences encountered! */
294 /* We are looking for the leftmost (then longest) exact match.
295 We will go through the outer loop only once. */
300 /* If the "line" is longer than the maximum regexp offset,
301 die as if we've run out of memory. */
302 if (TYPE_MAXIMUM (regoff_t) < end - buf - 1)
305 /* If we've made it to this point, this means DFA has seen
306 a probable match, and we need to run it through Regex. */
309 for (i = 0; i < pcount; i++)
311 patterns[i].regexbuf.not_eol = 0;
312 start = re_search (&(patterns[i].regexbuf),
314 beg - buf, end - beg - 1,
315 &(patterns[i].regs));
320 len = patterns[i].regs.end[0] - start;
322 if (match > best_match)
324 if (start_ptr && !match_words)
325 goto assess_pattern_match;
326 if ((!match_lines && !match_words)
327 || (match_lines && len == end - beg - 1))
331 goto assess_pattern_match;
333 /* If -w, check if the match aligns with word boundaries.
334 We do this iteratively because:
335 (a) the line may contain more than one occurrence of the
337 (b) Several alternatives in the pattern might be valid at a
338 given point, and we may need to consider a shorter one to
339 find a word boundary. */
341 while (match <= best_match)
343 if ((match == buf || !WCHAR ((unsigned char) match[-1]))
344 && (start + len == end - buf - 1
345 || !WCHAR ((unsigned char) match[len])))
346 goto assess_pattern_match;
349 /* Try a shorter length anchored at the same place. */
351 patterns[i].regexbuf.not_eol = 1;
352 len = re_match (&(patterns[i].regexbuf),
353 buf, match + len - beg, match - buf,
354 &(patterns[i].regs));
360 /* Try looking further on. */
361 if (match == end - 1)
364 patterns[i].regexbuf.not_eol = 0;
365 start = re_search (&(patterns[i].regexbuf),
367 match - buf, end - match - 1,
368 &(patterns[i].regs));
375 len = patterns[i].regs.end[0] - start;
378 } /* while (match <= best_match) */
380 assess_pattern_match:
383 /* Good enough for a non-exact match.
384 No need to look at further patterns, if any. */
387 if (match < best_match || (match == best_match && len > best_len))
389 /* Best exact match: leftmost, then longest. */
393 } /* if re_search >= 0 */
394 } /* for Regex patterns. */
395 if (best_match < end)
397 /* We have found an exact match. We were just
398 waiting for the best one (leftmost then longest). */
403 } /* for (beg = end ..) */