1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
25 /* For -w, we also consider _ to be word constituent. */
26 #define WCHAR(C) (isalnum (C) || (C) == '_')
28 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
29 a list of strings, at least one of which is known to occur in
30 any string matching the regexp. */
33 /* DFA compiled regexp. */
34 static struct dfa *dfa;
36 /* The Regex compiled patterns. */
37 static struct patterns
39 /* Regex compiled regexp. */
40 struct re_pattern_buffer regexbuf;
41 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
42 Q@#%!# library interface in regex.c. */
45 static struct patterns *patterns;
49 dfaerror (char const *mesg)
51 error (EXIT_TROUBLE, 0, "%s", mesg);
54 /* Tell static analyzers that this function does not return. */
58 /* For now, the sole dfawarn-eliciting condition (use of a regexp
59 like '[:lower:]') is unequivocally an error, so treat it as such,
62 dfawarn (char const *mesg)
64 static enum { NONE = 0, POSIX, GNU } mode;
66 mode = (getenv ("POSIXLY_CORRECT") ? POSIX : GNU);
71 /* Number of compiled fixed strings known to exactly match the regexp.
72 If kwsexec returns < kwset_exact_matches, then we don't need to
73 call the regexp matcher at all. */
74 static int kwset_exact_matches;
77 kwsincr_case (const char *must)
84 if (match_icase && MB_CUR_MAX > 1)
85 buf = mbtolower (must, &n);
89 return kwsincr (kwset, buf, n);
92 /* If the DFA turns out to have some set of fixed strings one of
93 which must occur in the match, then we build a kwset matcher
94 to find those strings, and thus quickly filter out impossible
99 struct dfamust const *dm;
106 /* First, we compile in the substrings known to be exact
107 matches. The kwset matcher will return the index
108 of the matching string that it chooses. */
109 for (; dm; dm = dm->next)
113 ++kwset_exact_matches;
114 if ((err = kwsincr_case (dm->must)) != NULL)
115 error (EXIT_TROUBLE, 0, "%s", err);
117 /* Now, we compile the substrings that will require
118 the use of the regexp matcher. */
119 for (dm = dfamusts (dfa); dm; dm = dm->next)
123 if ((err = kwsincr_case (dm->must)) != NULL)
124 error (EXIT_TROUBLE, 0, "%s", err);
126 if ((err = kwsprep (kwset)) != NULL)
127 error (EXIT_TROUBLE, 0, "%s", err);
132 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
140 syntax_bits |= RE_ICASE;
141 re_set_syntax (syntax_bits);
142 dfasyntax (syntax_bits, match_icase, eolbyte);
144 /* For GNU regex compiler we have to pass the patterns separately to detect
145 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
146 GNU regex should have raise a syntax error. The same for backref, where
147 the backref should have been local to each pattern. */
152 sep = memchr (p, '\n', total);
165 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
166 if (patterns == NULL)
167 error (EXIT_TROUBLE, errno, _("memory exhausted"));
168 patterns[pcount] = patterns0;
170 if ((err = re_compile_pattern (p, len,
171 &(patterns[pcount].regexbuf))) != NULL)
172 error (EXIT_TROUBLE, 0, "%s", err);
176 } while (sep && total != 0);
178 /* In the match_words and match_lines cases, we use a different pattern
179 for the DFA matcher that will quickly throw out cases that won't work.
180 Then if DFA succeeds we do some hairy stuff using the regex matcher
181 to decide whether the match should really count. */
182 if (match_words || match_lines)
184 static char const line_beg_no_bk[] = "^(";
185 static char const line_end_no_bk[] = ")$";
186 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
187 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
188 static char const line_beg_bk[] = "^\\(";
189 static char const line_end_bk[] = "\\)$";
190 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
191 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
192 int bk = !(syntax_bits & RE_NO_BK_PARENS);
193 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
195 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
196 : (bk ? word_beg_bk : word_beg_no_bk));
198 memcpy (n + total, pattern, size);
200 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
201 : (bk ? word_end_bk : word_end_no_bk));
202 total += strlen (n + total);
210 dfacomp (pattern, size, dfa, 1);
217 EGexecute (char const *buf, size_t size, size_t *match_size,
218 char const *start_ptr)
220 char const *buflim, *beg, *end, *match, *best_match, *mb_start;
222 int backref, start, len, best_len;
223 struct kwsmatch kwsm;
230 /* mbtolower adds a NUL byte at the end. That will provide
231 space for the sentinel byte dfaexec may add. */
232 char *case_buf = mbtolower (buf, &size);
234 start_ptr = case_buf + (start_ptr - buf);
238 #endif /* MBS_SUPPORT */
243 for (beg = end = buf; end < buflim; beg = end)
247 /* We don't care about an exact match. */
250 /* Find a possible match using the KWset matcher. */
251 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
252 if (offset == (size_t) -1)
255 /* Narrow down to the line containing the candidate, and
256 run it through DFA. */
257 if ((end = memchr(beg, eol, buflim - beg)) != NULL)
262 while (beg > buf && beg[-1] != eol)
264 if (kwsm.index < kwset_exact_matches)
270 || !is_mb_middle (&mb_start, match, buflim,
275 if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
280 /* No good fixed strings; start with DFA. */
281 char const *next_beg = dfaexec (dfa, beg, (char *) buflim,
283 if (next_beg == NULL)
285 /* Narrow down to the line we've found. */
287 if ((end = memchr(beg, eol, buflim - beg)) != NULL)
291 while (beg > buf && beg[-1] != eol)
294 /* Successful, no backreferences encountered! */
300 /* We are looking for the leftmost (then longest) exact match.
301 We will go through the outer loop only once. */
306 /* If we've made it to this point, this means DFA has seen
307 a probable match, and we need to run it through Regex. */
310 for (i = 0; i < pcount; i++)
312 patterns[i].regexbuf.not_eol = 0;
313 if (0 <= (start = re_search (&(patterns[i].regexbuf),
315 beg - buf, end - beg - 1,
316 &(patterns[i].regs))))
318 len = patterns[i].regs.end[0] - start;
320 if (match > best_match)
322 if (start_ptr && !match_words)
323 goto assess_pattern_match;
324 if ((!match_lines && !match_words)
325 || (match_lines && len == end - beg - 1))
329 goto assess_pattern_match;
331 /* If -w, check if the match aligns with word boundaries.
332 We do this iteratively because:
333 (a) the line may contain more than one occurence of the
335 (b) Several alternatives in the pattern might be valid at a
336 given point, and we may need to consider a shorter one to
337 find a word boundary. */
339 while (match <= best_match)
341 if ((match == buf || !WCHAR ((unsigned char) match[-1]))
342 && (start + len == end - buf - 1
343 || !WCHAR ((unsigned char) match[len])))
344 goto assess_pattern_match;
347 /* Try a shorter length anchored at the same place. */
349 patterns[i].regexbuf.not_eol = 1;
350 len = re_match (&(patterns[i].regexbuf),
351 buf, match + len - beg, match - buf,
352 &(patterns[i].regs));
356 /* Try looking further on. */
357 if (match == end - 1)
360 patterns[i].regexbuf.not_eol = 0;
361 start = re_search (&(patterns[i].regexbuf),
363 match - buf, end - match - 1,
364 &(patterns[i].regs));
367 len = patterns[i].regs.end[0] - start;
370 } /* while (match <= best_match) */
372 assess_pattern_match:
375 /* Good enough for a non-exact match.
376 No need to look at further patterns, if any. */
379 if (match < best_match || (match == best_match && len > best_len))
381 /* Best exact match: leftmost, then longest. */
385 } /* if re_search >= 0 */
386 } /* for Regex patterns. */
387 if (best_match < end)
389 /* We have found an exact match. We were just
390 waiting for the best one (leftmost then longest). */
395 } /* for (beg = end ..) */