1 /* kwsearch.c - searching subroutines using kwset for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
19 /* Written August 1992 by Mike Haertel. */
24 /* A compiled -F pattern list. */
28 /* The kwset for this pattern list. */
31 /* The number of user-specified patterns. This is less than
32 'kwswords (kwset)' when some extra one-character words have been
33 appended, one for each troublesome character that will require a
37 /* The user's pattern and its size in bytes. */
41 /* The user's pattern compiled as a regular expression,
42 or null if it has not been compiled. */
46 /* Compile the -F style PATTERN, containing SIZE bytes. Return a
47 description of the compiled pattern. */
50 Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
53 ptrdiff_t total = size;
57 kwset = kwsinit (true);
59 char const *p = pattern;
63 char const *sep = memchr (p, '\n', total);
78 if (eolbyte == '\n' && pattern < p && sep)
82 if (bufalloc < len + 2)
86 buf = x2realloc (NULL, &bufalloc);
89 memcpy (buf + 1, p, len);
90 buf[len + 1] = eolbyte;
95 kwsincr (kwset, p, len);
102 ptrdiff_t words = kwswords (kwset);
106 /* For each pattern character C that has a case folded
107 counterpart F that is multibyte and so cannot easily be
108 implemented via translating a single byte, append a pattern
109 containing just F. That way, if the data contains F, the
110 matcher can fall back on DFA. For example, if C is 'i' and
111 the locale is en_US.utf8, append a pattern containing just
112 the character U+0131 (LATIN SMALL LETTER DOTLESS I), so that
113 Fexecute will use a DFA if the data contain U+0131. */
114 mbstate_t mbs = { 0 };
115 char checked[NCHAR] = {0,};
116 for (p = pattern; p < pattern + size; p++)
118 unsigned char c = *p;
123 wint_t wc = localeinfo.sbctowc[c];
124 wchar_t folded[CASE_FOLDED_BUFSIZE];
126 for (int i = case_folded_counterparts (wc, folded); 0 <= --i; )
129 int nbytes = wcrtomb (s, folded[i], &mbs);
131 kwsincr (kwset, s, nbytes);
138 struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch);
139 kwsearch->kwset = kwset;
140 kwsearch->words = words;
141 kwsearch->pattern = pattern;
142 kwsearch->size = size;
147 /* Use the compiled pattern VCP to search the buffer BUF of size SIZE.
148 If found, return the offset of the first match and store its
149 size into *MATCH_SIZE. If not found, return SIZE_MAX.
150 If START_PTR is nonnull, start searching there. */
152 Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
153 char const *start_ptr)
155 char const *beg, *end, *mb_start;
158 struct kwsmatch kwsmatch;
162 struct kwsearch *kwsearch = vcp;
163 kwset_t kwset = kwsearch->kwset;
167 mb_check = longest = false;
170 mb_check = localeinfo.multibyte & !localeinfo.using_utf8;
171 longest = mb_check | !!start_ptr | match_words;
174 for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
176 ptrdiff_t offset = kwsexec (kwset, beg - match_lines,
177 buf + size - beg + match_lines, &kwsmatch,
181 len = kwsmatch.size[0] - 2 * match_lines;
183 if (kwsearch->words <= kwsmatch.index)
185 /* The data contain a multibyte character that matches
186 some pattern character that is a case folded counterpart.
187 Since the kwset code cannot handle this case, fall back
188 on the DFA code, which can. */
191 fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
192 kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size,
195 return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
200 && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0)
202 /* We have matched a single byte that is not at the beginning of a
203 multibyte character. mb_goback has advanced MB_START past that
204 multibyte character. Now, we want to position BEG so that the
205 next kwsexec search starts there. Thus, to compensate for the
206 for-loop's BEG++, above, subtract one here. This code is
207 unusually hard to reach, and exceptionally, let's show how to
210 printf '\203AA\n'|LC_ALL=ja_JP.SHIFT_JIS src/grep -F A
212 That assumes the named locale is installed.
213 Note that your system's shift-JIS locale may have a different
214 name, possibly including "sjis". */
219 if (!!start_ptr & !match_words)
220 goto success_in_beg_and_len;
223 len += start_ptr == NULL;
224 goto success_in_beg_and_len;
229 /* We need a preceding mb_start pointer. Use the beginning of line
230 if there is a preceding newline. */
233 char const *nl = memrchr (mb_start, eol, beg - mb_start);
238 /* Succeed if neither the preceding nor the following character is a
239 word constituent. If the preceding is not, yet the following
240 character IS a word constituent, keep trying with shorter matches. */
242 ? ! wordchar_next (beg - mbclen, buf + size)
243 : ! wordchar_prev (mb_start, beg, buf + size))
246 if (! wordchar_next (beg + len, buf + size))
249 goto success_in_beg_and_len;
253 if (!start_ptr && !localeinfo.multibyte)
257 fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
258 kwsearch->re = GEAcompile (kwsearch->pattern,
262 end = memchr (beg + len, eol, (buf + size) - (beg + len));
263 end = end ? end + 1 : buf + size;
264 if (EGexecute (kwsearch->re, beg, end - beg, match_size, NULL)
266 goto success_match_words;
272 offset = kwsexec (kwset, beg, --len, &kwsmatch, true);
275 len = kwsmatch.size[0];
278 /* No word match was found at BEG. Skip past word constituents,
279 since they cannot precede the next match and not skipping
280 them could make things much slower. */
281 beg += wordchars_size (beg, buf + size);
283 } /* for (beg in buf) */
288 end = memchr (beg + len, eol, (buf + size) - (beg + len));
289 end = end ? end + 1 : buf + size;
291 beg = memrchr (buf, eol, beg - buf);
292 beg = beg ? beg + 1 : buf;
294 success_in_beg_and_len:;
295 size_t off = beg - buf;