1 /* search.c - searching subroutines using dfa, kwset and regex for grep.
2 Copyright (C) 1992, 1998 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
21 /* $FreeBSD: src/gnu/usr.bin/grep/search.c,v 1.10 2000/01/31 13:28:57 ru Exp $ */
26 #include <sys/types.h>
37 #define NCHAR (UCHAR_MAX + 1)
39 static void Gcompile PARAMS((char *, size_t));
40 static void Ecompile PARAMS((char *, size_t));
41 static char *EGexecute PARAMS((char *, size_t, char **));
42 static void Fcompile PARAMS((char *, size_t));
43 static char *Fexecute PARAMS((char *, size_t, char **));
44 static void kwsinit PARAMS((void));
46 /* Here is the matchers vector for the main program. */
47 struct matcher matchers[] = {
48 { "default", Gcompile, EGexecute },
49 { "grep", Gcompile, EGexecute },
50 { "egrep", Ecompile, EGexecute },
51 { "awk", Ecompile, EGexecute },
52 { "fgrep", Fcompile, Fexecute },
56 /* For -w, we also consider _ to be word constituent. */
57 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
59 /* DFA compiled regexp. */
60 static struct dfa dfa;
62 /* Regex compiled regexp. */
63 static struct re_pattern_buffer regexbuf;
65 /* KWset compiled pattern. For Ecompile and Gcompile, we compile
66 a list of strings, at least one of which is known to occur in
67 any string matching the regexp. */
70 /* Last compiled fixed string known to exactly match the regexp.
71 If kwsexec() returns < lastexact, then we don't need to
72 call the regexp matcher at all. */
76 dfaerror (char const *mesg)
84 static char trans[NCHAR];
88 for (i = 0; i < NCHAR; ++i)
89 trans[i] = TOLOWER(i);
91 if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0)))
92 fatal("memory exhausted", 0);
95 /* If the DFA turns out to have some set of fixed strings one of
96 which must occur in the match, then we build a kwset matcher
97 to find those strings, and thus quickly filter out impossible
108 /* First, we compile in the substrings known to be exact
109 matches. The kwset matcher will return the index
110 of the matching string that it chooses. */
111 for (dm = dfa.musts; dm; dm = dm->next)
116 if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
119 /* Now, we compile the substrings that will require
120 the use of the regexp matcher. */
121 for (dm = dfa.musts; dm; dm = dm->next)
125 if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
128 if ((err = kwsprep(kwset)) != 0)
134 Gcompile (char *pattern, size_t size)
138 re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
139 dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
141 if ((err = re_compile_pattern(pattern, size, ®exbuf)) != 0)
144 /* In the match_words and match_lines cases, we use a different pattern
145 for the DFA matcher that will quickly throw out cases that won't work.
146 Then if DFA succeeds we do some hairy stuff using the regex matcher
147 to decide whether the match should really count. */
148 if (match_words || match_lines)
150 /* In the whole-word case, we use the pattern:
151 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
152 In the whole-line case, we use the pattern:
154 BUG: Using [A-Za-z_] is locale-dependent!
155 So will use [:alnum:] */
157 char *n = malloc(size + 50);
165 strcpy(n, "\\(^\\|[^[:alnum:]_]\\)\\(");
168 memcpy(n + i, pattern, size);
172 strcpy(n + i, "\\)\\([^[:alnum:]_]\\|$\\)");
174 strcpy(n + i, "\\)$");
177 dfacomp(n, i, &dfa, 1);
180 dfacomp(pattern, size, &dfa, 1);
186 Ecompile (char *pattern, size_t size)
190 if (strcmp(matcher, "awk") == 0)
192 re_set_syntax(RE_SYNTAX_AWK);
193 dfasyntax(RE_SYNTAX_AWK, match_icase, eolbyte);
197 re_set_syntax (RE_SYNTAX_POSIX_EGREP);
198 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
201 if ((err = re_compile_pattern(pattern, size, ®exbuf)) != 0)
204 /* In the match_words and match_lines cases, we use a different pattern
205 for the DFA matcher that will quickly throw out cases that won't work.
206 Then if DFA succeeds we do some hairy stuff using the regex matcher
207 to decide whether the match should really count. */
208 if (match_words || match_lines)
210 /* In the whole-word case, we use the pattern:
211 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
212 In the whole-line case, we use the pattern:
214 BUG: Using [A-Za-z_] is locale-dependent!
215 so will use the char class */
217 char *n = malloc(size + 50);
225 strcpy(n, "(^|[^[:alnum:]_])(");
228 memcpy(n + i, pattern, size);
232 strcpy(n + i, ")([^[:alnum:]_]|$)");
237 dfacomp(n, i, &dfa, 1);
240 dfacomp(pattern, size, &dfa, 1);
246 EGexecute (char *buf, size_t size, char **endp)
248 register char *buflim, *beg, *end, save;
250 int backref, start, len;
251 struct kwsmatch kwsm;
252 static struct re_registers regs; /* This is static on account of a BRAIN-DEAD
253 Q@#%!# library interface in regex.c. */
257 for (beg = end = buf; end < buflim; beg = end + 1)
261 /* Find a possible match using the KWset matcher. */
262 beg = kwsexec(kwset, beg, buflim - beg, &kwsm);
265 /* Narrow down to the line containing the candidate, and
266 run it through DFA. */
267 end = memchr(beg, eol, buflim - beg);
270 while (beg > buf && beg[-1] != eol)
273 if (kwsm.index < lastexact)
275 if (!dfaexec(&dfa, beg, end, 0, (int *) 0, &backref))
281 /* Successful, no backreferences encountered. */
287 /* No good fixed strings; start with DFA. */
289 beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref);
293 /* Narrow down to the line we've found. */
294 end = memchr(beg, eol, buflim - beg);
297 while (beg > buf && beg[-1] != eol)
299 /* Successful, no backreferences encountered! */
303 /* If we've made it to this point, this means DFA has seen
304 a probable match, and we need to run it through Regex. */
305 regexbuf.not_eol = 0;
306 if ((start = re_search(®exbuf, beg, end - beg, 0, end - beg, ®s)) >= 0)
308 len = regs.end[0] - start;
309 if ((!match_lines && !match_words)
310 || (match_lines && len == end - beg))
312 /* If -w, check if the match aligns with word boundaries.
313 We do this iteratively because:
314 (a) the line may contain more than one occurence of the pattern, and
315 (b) Several alternatives in the pattern might be valid at a given
316 point, and we may need to consider a shorter one to find a word
321 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
323 || !WCHAR ((unsigned char) beg[start + len])))
327 /* Try a shorter length anchored at the same place. */
329 regexbuf.not_eol = 1;
330 len = re_match(®exbuf, beg, start + len, start, ®s);
334 /* Try looking further on. */
335 if (start == end - beg)
338 regexbuf.not_eol = 0;
339 start = re_search(®exbuf, beg, end - beg,
340 start, end - beg - start, ®s);
341 len = regs.end[0] - start;
351 *endp = end < buflim ? end + 1 : end;
356 Fcompile (char *pattern, size_t size)
358 char *beg, *lim, *err;
364 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
366 if ((err = kwsincr(kwset, beg, lim - beg)) != 0)
368 if (lim < pattern + size)
372 while (beg < pattern + size);
374 if ((err = kwsprep(kwset)) != 0)
379 Fexecute (char *buf, size_t size, char **endp)
381 register char *beg, *try, *end;
384 struct kwsmatch kwsmatch;
386 for (beg = buf; beg <= buf + size; ++beg)
388 if (!(beg = kwsexec(kwset, beg, buf + size - beg, &kwsmatch)))
390 len = kwsmatch.size[0];
393 if (beg > buf && beg[-1] != eol)
395 if (beg + len < buf + size && beg[len] != eol)
399 else if (match_words)
400 for (try = beg; len && try;)
402 if (try > buf && WCHAR((unsigned char) try[-1]))
404 if (try + len < buf + size && WCHAR((unsigned char) try[len]))
406 try = kwsexec(kwset, beg, --len, &kwsmatch);
407 len = kwsmatch.size[0];
419 if ((end = memchr(beg + len, eol, (buf + size) - (beg + len))) != 0)
424 while (beg > buf && beg[-1] != '\n')