1 /* dfasearch.c - searching subroutines using dfa and regex for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2011 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
25 /* For -w, we also consider _ to be word constituent. */
26 #define WCHAR(C) (isalnum (C) || (C) == '_')
28 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
29 a list of strings, at least one of which is known to occur in
30 any string matching the regexp. */
33 /* DFA compiled regexp. */
34 static struct dfa *dfa;
36 /* The Regex compiled patterns. */
37 static struct patterns
39 /* Regex compiled regexp. */
40 struct re_pattern_buffer regexbuf;
41 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
42 Q@#%!# library interface in regex.c. */
45 static struct patterns *patterns;
49 dfaerror (char const *mesg)
51 error (EXIT_TROUBLE, 0, "%s", mesg);
54 /* Tell static analyzers that this function does not return. */
58 /* For now, the sole dfawarn-eliciting condition (use of a regexp
59 like '[:lower:]') is unequivocally an error, so treat it as such,
62 dfawarn (char const *mesg)
64 static enum { DW_NONE = 0, DW_POSIX, DW_GNU } mode;
66 mode = (getenv ("POSIXLY_CORRECT") ? DW_POSIX : DW_GNU);
71 /* Number of compiled fixed strings known to exactly match the regexp.
72 If kwsexec returns < kwset_exact_matches, then we don't need to
73 call the regexp matcher at all. */
74 static int kwset_exact_matches;
77 kwsincr_case (const char *must)
84 if (match_icase && MB_CUR_MAX > 1)
85 buf = mbtolower (must, &n);
89 return kwsincr (kwset, buf, n);
92 /* If the DFA turns out to have some set of fixed strings one of
93 which must occur in the match, then we build a kwset matcher
94 to find those strings, and thus quickly filter out impossible
99 struct dfamust const *dm;
106 /* First, we compile in the substrings known to be exact
107 matches. The kwset matcher will return the index
108 of the matching string that it chooses. */
109 for (; dm; dm = dm->next)
113 ++kwset_exact_matches;
114 if ((err = kwsincr_case (dm->must)) != NULL)
115 error (EXIT_TROUBLE, 0, "%s", err);
117 /* Now, we compile the substrings that will require
118 the use of the regexp matcher. */
119 for (dm = dfamusts (dfa); dm; dm = dm->next)
123 if ((err = kwsincr_case (dm->must)) != NULL)
124 error (EXIT_TROUBLE, 0, "%s", err);
126 if ((err = kwsprep (kwset)) != NULL)
127 error (EXIT_TROUBLE, 0, "%s", err);
132 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
140 syntax_bits |= RE_ICASE;
141 re_set_syntax (syntax_bits);
142 dfasyntax (syntax_bits, match_icase, eolbyte);
144 /* For GNU regex compiler we have to pass the patterns separately to detect
145 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
146 GNU regex should have raise a syntax error. The same for backref, where
147 the backref should have been local to each pattern. */
152 sep = memchr (p, '\n', total);
165 patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
166 patterns[pcount] = patterns0;
168 if ((err = re_compile_pattern (p, len,
169 &(patterns[pcount].regexbuf))) != NULL)
170 error (EXIT_TROUBLE, 0, "%s", err);
174 } while (sep && total != 0);
176 /* In the match_words and match_lines cases, we use a different pattern
177 for the DFA matcher that will quickly throw out cases that won't work.
178 Then if DFA succeeds we do some hairy stuff using the regex matcher
179 to decide whether the match should really count. */
180 if (match_words || match_lines)
182 static char const line_beg_no_bk[] = "^(";
183 static char const line_end_no_bk[] = ")$";
184 static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
185 static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
186 static char const line_beg_bk[] = "^\\(";
187 static char const line_end_bk[] = "\\)$";
188 static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
189 static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
190 int bk = !(syntax_bits & RE_NO_BK_PARENS);
191 char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
193 strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
194 : (bk ? word_beg_bk : word_beg_no_bk));
196 memcpy (n + total, pattern, size);
198 strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
199 : (bk ? word_end_bk : word_end_no_bk));
200 total += strlen (n + total);
208 dfacomp (pattern, size, dfa, 1);
215 EGexecute (char const *buf, size_t size, size_t *match_size,
216 char const *start_ptr)
218 char const *buflim, *beg, *end, *match, *best_match, *mb_start;
220 int backref, start, len, best_len;
221 struct kwsmatch kwsm;
228 /* mbtolower adds a NUL byte at the end. That will provide
229 space for the sentinel byte dfaexec may add. */
230 char *case_buf = mbtolower (buf, &size);
232 start_ptr = case_buf + (start_ptr - buf);
236 #endif /* MBS_SUPPORT */
241 for (beg = end = buf; end < buflim; beg = end)
245 /* We don't care about an exact match. */
248 /* Find a possible match using the KWset matcher. */
249 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
250 if (offset == (size_t) -1)
253 /* Narrow down to the line containing the candidate, and
254 run it through DFA. */
255 if ((end = memchr(beg, eol, buflim - beg)) != NULL)
260 while (beg > buf && beg[-1] != eol)
262 if (kwsm.index < kwset_exact_matches)
268 || !is_mb_middle (&mb_start, match, buflim,
273 if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
278 /* No good fixed strings; start with DFA. */
279 char const *next_beg = dfaexec (dfa, beg, (char *) buflim,
281 if (next_beg == NULL)
283 /* Narrow down to the line we've found. */
285 if ((end = memchr(beg, eol, buflim - beg)) != NULL)
289 while (beg > buf && beg[-1] != eol)
292 /* Successful, no backreferences encountered! */
298 /* We are looking for the leftmost (then longest) exact match.
299 We will go through the outer loop only once. */
304 /* If we've made it to this point, this means DFA has seen
305 a probable match, and we need to run it through Regex. */
308 for (i = 0; i < pcount; i++)
310 patterns[i].regexbuf.not_eol = 0;
311 if (0 <= (start = re_search (&(patterns[i].regexbuf),
313 beg - buf, end - beg - 1,
314 &(patterns[i].regs))))
316 len = patterns[i].regs.end[0] - start;
318 if (match > best_match)
320 if (start_ptr && !match_words)
321 goto assess_pattern_match;
322 if ((!match_lines && !match_words)
323 || (match_lines && len == end - beg - 1))
327 goto assess_pattern_match;
329 /* If -w, check if the match aligns with word boundaries.
330 We do this iteratively because:
331 (a) the line may contain more than one occurence of the
333 (b) Several alternatives in the pattern might be valid at a
334 given point, and we may need to consider a shorter one to
335 find a word boundary. */
337 while (match <= best_match)
339 if ((match == buf || !WCHAR ((unsigned char) match[-1]))
340 && (start + len == end - buf - 1
341 || !WCHAR ((unsigned char) match[len])))
342 goto assess_pattern_match;
345 /* Try a shorter length anchored at the same place. */
347 patterns[i].regexbuf.not_eol = 1;
348 len = re_match (&(patterns[i].regexbuf),
349 buf, match + len - beg, match - buf,
350 &(patterns[i].regs));
354 /* Try looking further on. */
355 if (match == end - 1)
358 patterns[i].regexbuf.not_eol = 0;
359 start = re_search (&(patterns[i].regexbuf),
361 match - buf, end - match - 1,
362 &(patterns[i].regs));
365 len = patterns[i].regs.end[0] - start;
368 } /* while (match <= best_match) */
370 assess_pattern_match:
373 /* Good enough for a non-exact match.
374 No need to look at further patterns, if any. */
377 if (match < best_match || (match == best_match && len > best_len))
379 /* Best exact match: leftmost, then longest. */
383 } /* if re_search >= 0 */
384 } /* for Regex patterns. */
385 if (best_match < end)
387 /* We have found an exact match. We were just
388 waiting for the best one (leftmost then longest). */
393 } /* for (beg = end ..) */