1 /* search.c - searching subroutines using dfa, kwset and regex for grep.
2 Copyright (C) 1992, 1998 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
21 /* $FreeBSD: src/gnu/usr.bin/grep/search.c,v 1.10 2000/01/31 13:28:57 ru Exp $ */
22 /* $DragonFly: src/gnu/usr.bin/grep/search.c,v 1.2 2003/06/17 04:25:45 dillon Exp $ */
27 #include <sys/types.h>
38 #define NCHAR (UCHAR_MAX + 1)
40 static void Gcompile PARAMS((char *, size_t));
41 static void Ecompile PARAMS((char *, size_t));
42 static char *EGexecute PARAMS((char *, size_t, char **));
43 static void Fcompile PARAMS((char *, size_t));
44 static char *Fexecute PARAMS((char *, size_t, char **));
45 static void kwsinit PARAMS((void));
47 /* Here is the matchers vector for the main program. */
48 struct matcher matchers[] = {
49 { "default", Gcompile, EGexecute },
50 { "grep", Gcompile, EGexecute },
51 { "egrep", Ecompile, EGexecute },
52 { "awk", Ecompile, EGexecute },
53 { "fgrep", Fcompile, Fexecute },
57 /* For -w, we also consider _ to be word constituent. */
58 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
60 /* DFA compiled regexp. */
61 static struct dfa dfa;
63 /* Regex compiled regexp. */
64 static struct re_pattern_buffer regexbuf;
66 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
67 a list of strings, at least one of which is known to occur in
68 any string matching the regexp. */
71 /* Last compiled fixed string known to exactly match the regexp.
72 If kwsexec() returns < lastexact, then we don't need to
73 call the regexp matcher at all. */
77 dfaerror (char const *mesg)
85 static char trans[NCHAR];
89 for (i = 0; i < NCHAR; ++i)
90 trans[i] = TOLOWER(i);
92 if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0)))
93 fatal("memory exhausted", 0);
96 /* If the DFA turns out to have some set of fixed strings one of
97 which must occur in the match, then we build a kwset matcher
98 to find those strings, and thus quickly filter out impossible
109 /* First, we compile in the substrings known to be exact
110 matches. The kwset matcher will return the index
111 of the matching string that it chooses. */
112 for (dm = dfa.musts; dm; dm = dm->next)
117 if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
120 /* Now, we compile the substrings that will require
121 the use of the regexp matcher. */
122 for (dm = dfa.musts; dm; dm = dm->next)
126 if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
129 if ((err = kwsprep(kwset)) != 0)
135 Gcompile (char *pattern, size_t size)
139 re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
140 dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
142 if ((err = re_compile_pattern(pattern, size, ®exbuf)) != 0)
145 /* In the match_words and match_lines cases, we use a different pattern
146 for the DFA matcher that will quickly throw out cases that won't work.
147 Then if DFA succeeds we do some hairy stuff using the regex matcher
148 to decide whether the match should really count. */
149 if (match_words || match_lines)
151 /* In the whole-word case, we use the pattern:
152 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
153 In the whole-line case, we use the pattern:
155 BUG: Using [A-Za-z_] is locale-dependent!
156 So will use [:alnum:] */
158 char *n = malloc(size + 50);
166 strcpy(n, "\\(^\\|[^[:alnum:]_]\\)\\(");
169 memcpy(n + i, pattern, size);
173 strcpy(n + i, "\\)\\([^[:alnum:]_]\\|$\\)");
175 strcpy(n + i, "\\)$");
178 dfacomp(n, i, &dfa, 1);
181 dfacomp(pattern, size, &dfa, 1);
187 Ecompile (char *pattern, size_t size)
191 if (strcmp(matcher, "awk") == 0)
193 re_set_syntax(RE_SYNTAX_AWK);
194 dfasyntax(RE_SYNTAX_AWK, match_icase, eolbyte);
198 re_set_syntax (RE_SYNTAX_POSIX_EGREP);
199 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
202 if ((err = re_compile_pattern(pattern, size, ®exbuf)) != 0)
205 /* In the match_words and match_lines cases, we use a different pattern
206 for the DFA matcher that will quickly throw out cases that won't work.
207 Then if DFA succeeds we do some hairy stuff using the regex matcher
208 to decide whether the match should really count. */
209 if (match_words || match_lines)
211 /* In the whole-word case, we use the pattern:
212 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
213 In the whole-line case, we use the pattern:
215 BUG: Using [A-Za-z_] is locale-dependent!
216 so will use the char class */
218 char *n = malloc(size + 50);
226 strcpy(n, "(^|[^[:alnum:]_])(");
229 memcpy(n + i, pattern, size);
233 strcpy(n + i, ")([^[:alnum:]_]|$)");
238 dfacomp(n, i, &dfa, 1);
241 dfacomp(pattern, size, &dfa, 1);
247 EGexecute (char *buf, size_t size, char **endp)
249 register char *buflim, *beg, *end, save;
251 int backref, start, len;
252 struct kwsmatch kwsm;
253 static struct re_registers regs; /* This is static on account of a BRAIN-DEAD
254 Q@#%!# library interface in regex.c. */
258 for (beg = end = buf; end < buflim; beg = end + 1)
262 /* Find a possible match using the KWset matcher. */
263 beg = kwsexec(kwset, beg, buflim - beg, &kwsm);
266 /* Narrow down to the line containing the candidate, and
267 run it through DFA. */
268 end = memchr(beg, eol, buflim - beg);
271 while (beg > buf && beg[-1] != eol)
274 if (kwsm.index < lastexact)
276 if (!dfaexec(&dfa, beg, end, 0, (int *) 0, &backref))
282 /* Successful, no backreferences encountered. */
288 /* No good fixed strings; start with DFA. */
290 beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref);
294 /* Narrow down to the line we've found. */
295 end = memchr(beg, eol, buflim - beg);
298 while (beg > buf && beg[-1] != eol)
300 /* Successful, no backreferences encountered! */
304 /* If we've made it to this point, this means DFA has seen
305 a probable match, and we need to run it through Regex. */
306 regexbuf.not_eol = 0;
307 if ((start = re_search(®exbuf, beg, end - beg, 0, end - beg, ®s)) >= 0)
309 len = regs.end[0] - start;
310 if ((!match_lines && !match_words)
311 || (match_lines && len == end - beg))
313 /* If -w, check if the match aligns with word boundaries.
314 We do this iteratively because:
315 (a) the line may contain more than one occurence of the pattern, and
316 (b) Several alternatives in the pattern might be valid at a given
317 point, and we may need to consider a shorter one to find a word
322 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
324 || !WCHAR ((unsigned char) beg[start + len])))
328 /* Try a shorter length anchored at the same place. */
330 regexbuf.not_eol = 1;
331 len = re_match(®exbuf, beg, start + len, start, ®s);
335 /* Try looking further on. */
336 if (start == end - beg)
339 regexbuf.not_eol = 0;
340 start = re_search(®exbuf, beg, end - beg,
341 start, end - beg - start, ®s);
342 len = regs.end[0] - start;
352 *endp = end < buflim ? end + 1 : end;
357 Fcompile (char *pattern, size_t size)
359 char *beg, *lim, *err;
365 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
367 if ((err = kwsincr(kwset, beg, lim - beg)) != 0)
369 if (lim < pattern + size)
373 while (beg < pattern + size);
375 if ((err = kwsprep(kwset)) != 0)
380 Fexecute (char *buf, size_t size, char **endp)
382 register char *beg, *try, *end;
385 struct kwsmatch kwsmatch;
387 for (beg = buf; beg <= buf + size; ++beg)
389 if (!(beg = kwsexec(kwset, beg, buf + size - beg, &kwsmatch)))
391 len = kwsmatch.size[0];
394 if (beg > buf && beg[-1] != eol)
396 if (beg + len < buf + size && beg[len] != eol)
400 else if (match_words)
401 for (try = beg; len && try;)
403 if (try > buf && WCHAR((unsigned char) try[-1]))
405 if (try + len < buf + size && WCHAR((unsigned char) try[len]))
407 try = kwsexec(kwset, beg, --len, &kwsmatch);
408 len = kwsmatch.size[0];
420 if ((end = memchr(beg + len, eol, (buf + size) - (beg + len))) != 0)
425 while (beg > buf && beg[-1] != '\n')