1 /* search.c -- searching large bodies of text.
2 $Id: search.c,v 1.9 2008/06/11 09:55:42 gray Exp $
4 Copyright (C) 1993, 1997, 1998, 2002, 2004, 2007, 2008
5 Free Software Foundation, Inc.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 Originally written by Brian Fox (bfox@ai.mit.edu). */
28 /* The search functions take two arguments:
30 1) a string to search for, and
32 2) a pointer to a SEARCH_BINDING which contains the buffer, start,
33 and end of the search.
35 They return a long, which is the offset from the start of the buffer
36 at which the match was found. An offset of -1 indicates failure. */
38 /* A function which makes a binding with buffer and bounds. */
40 make_binding (char *buffer, long int start, long int end)
42 SEARCH_BINDING *binding;
44 binding = xmalloc (sizeof (SEARCH_BINDING));
45 binding->buffer = buffer;
46 binding->start = start;
53 /* Make a copy of BINDING without duplicating the data. */
55 copy_binding (SEARCH_BINDING *binding)
59 copy = make_binding (binding->buffer, binding->start, binding->end);
60 copy->flags = binding->flags;
65 /* **************************************************************** */
67 /* The Actual Searching Functions */
69 /* **************************************************************** */
71 /* Search forwards or backwards for the text delimited by BINDING.
72 The search is forwards if BINDING->start is greater than BINDING->end. */
74 search (char *string, SEARCH_BINDING *binding)
78 /* If the search is backwards, then search backwards, otherwise forwards. */
79 if (binding->start > binding->end)
80 result = search_backward (string, binding);
82 result = search_forward (string, binding);
87 /* Search forwards or backwards for anything matching the regexp in the text
88 delimited by BINDING. The search is forwards if BINDING->start is greater
91 If PRET is specified, it receives a copy of BINDING at the end of a
92 succeded search. Its START and END fields contain bounds of the found
96 regexp_search (char *regexp, SEARCH_BINDING *binding, long length,
99 static char *previous_regexp = NULL;
100 static char *previous_content = NULL;
101 static int was_insensitive = 0;
103 static regmatch_t *matches;
104 static int match_alloc = 0;
105 static int match_count = 0;
108 if (previous_regexp == NULL
109 || ((binding->flags & S_FoldCase) != was_insensitive)
110 || (strcmp (previous_regexp, regexp) != 0))
112 /* need to compile a new regexp */
114 char *unescaped_regexp;
117 previous_content = NULL;
119 if (previous_regexp != NULL)
121 free (previous_regexp);
122 previous_regexp = NULL;
126 was_insensitive = binding->flags & S_FoldCase;
128 /* expand the \n and \t in regexp */
129 unescaped_regexp = xmalloc (1 + strlen (regexp));
130 for (p = regexp, q = unescaped_regexp; *p != '\0'; p++, q++)
155 result = regcomp (&preg, unescaped_regexp,
158 (was_insensitive ? REG_ICASE : 0));
159 free (unescaped_regexp);
163 int size = regerror (result, &preg, NULL, 0);
164 char *buf = xmalloc (size);
165 regerror (result, &preg, buf, size);
166 info_error (_("regexp error: %s"), buf, NULL);
170 previous_regexp = xstrdup(regexp);
173 if (previous_content != binding->buffer)
175 /* new buffer to search in, let's scan it */
179 previous_content = binding->buffer;
180 saved_char = previous_content[length-1];
181 previous_content[length-1] = '\0';
183 for (match_count = 0; start < length; )
186 if (match_count >= match_alloc)
188 /* match list full. Initially allocate 256 entries, then double
189 every time we fill it */
190 match_alloc = (match_alloc > 0 ? match_alloc * 2 : 256);
191 matches = xrealloc (matches,
192 match_alloc * sizeof(regmatch_t));
195 result = regexec (&preg, &previous_content[start],
196 1, &matches[match_count], 0);
199 if (matches[match_count].rm_eo == 0)
201 /* ignore empty matches */
206 matches[match_count].rm_so += start;
207 matches[match_count].rm_eo += start;
208 start = matches[match_count++].rm_eo;
216 previous_content[length-1] = saved_char;
219 pos = binding->start;
220 if (pos > binding->end)
222 /* searching backward */
224 for (i = match_count - 1; i >= 0; i--)
226 if (matches[i].rm_so <= pos)
230 pret->buffer = binding->buffer;
231 pret->flags = binding->flags;
232 pret->start = matches[i].rm_so;
233 pret->end = matches[i].rm_eo;
235 return matches[i].rm_so;
241 /* searching forward */
243 for (i = 0; i < match_count; i++)
245 if (matches[i].rm_so >= pos)
249 pret->buffer = binding->buffer;
250 pret->flags = binding->flags;
251 pret->start = matches[i].rm_so;
252 pret->end = matches[i].rm_eo;
254 if (binding->flags & S_SkipDest)
255 return matches[i].rm_eo;
257 return matches[i].rm_so;
266 /* Search forwards for STRING through the text delimited in BINDING. */
268 search_forward (char *string, SEARCH_BINDING *binding)
270 register int c, i, len;
271 register char *buff, *end;
272 char *alternate = NULL;
274 len = strlen (string);
276 /* We match characters in the search buffer against STRING and ALTERNATE.
277 ALTERNATE is a case reversed version of STRING; this is cheaper than
278 case folding each character before comparison. Alternate is only
279 used if the case folding bit is turned on in the passed BINDING. */
281 if (binding->flags & S_FoldCase)
283 alternate = xstrdup (string);
285 for (i = 0; i < len; i++)
287 if (islower (alternate[i]))
288 alternate[i] = toupper (alternate[i]);
289 else if (isupper (alternate[i]))
290 alternate[i] = tolower (alternate[i]);
294 buff = binding->buffer + binding->start;
295 end = binding->buffer + binding->end + 1;
297 while (buff < (end - len))
299 for (i = 0; i < len; i++)
303 if ((c != string[i]) && (!alternate || c != alternate[i]))
311 if (binding->flags & S_SkipDest)
313 return buff - binding->buffer;
325 /* Search for STRING backwards through the text delimited in BINDING. */
327 search_backward (char *input_string, SEARCH_BINDING *binding)
329 register int c, i, len;
330 register char *buff, *end;
332 char *alternate = NULL;
334 len = strlen (input_string);
336 /* Reverse the characters in the search string. */
337 string = xmalloc (1 + len);
338 for (c = 0, i = len - 1; input_string[c]; c++, i--)
339 string[i] = input_string[c];
343 /* We match characters in the search buffer against STRING and ALTERNATE.
344 ALTERNATE is a case reversed version of STRING; this is cheaper than
345 case folding each character before comparison. ALTERNATE is only
346 used if the case folding bit is turned on in the passed BINDING. */
348 if (binding->flags & S_FoldCase)
350 alternate = xstrdup (string);
352 for (i = 0; i < len; i++)
354 if (islower (alternate[i]))
355 alternate[i] = toupper (alternate[i]);
356 else if (isupper (alternate[i]))
357 alternate[i] = tolower (alternate[i]);
361 buff = binding->buffer + binding->start - 1;
362 end = binding->buffer + binding->end;
364 while (buff > (end + len))
366 for (i = 0; i < len; i++)
370 if (c != string[i] && (!alternate || c != alternate[i]))
380 if (binding->flags & S_SkipDest)
382 return 1 + buff - binding->buffer;
395 /* Find STRING in LINE, returning the offset of the end of the string.
396 Return an offset of -1 if STRING does not appear in LINE. The search
397 is bound by the end of the line (i.e., either NEWLINE or 0). */
399 string_in_line (char *string, char *line)
402 SEARCH_BINDING binding;
404 /* Find the end of the line. */
405 for (end = 0; line[end] && line[end] != '\n'; end++);
407 /* Search for STRING within these confines. */
408 binding.buffer = line;
411 binding.flags = S_FoldCase | S_SkipDest;
413 return search_forward (string, &binding);
416 /* Return non-zero if STRING is the first text to appear at BINDING. */
418 looking_at (char *string, SEARCH_BINDING *binding)
422 search_end = search (string, binding);
424 /* If the string was not found, SEARCH_END is -1. If the string was found,
425 but not right away, SEARCH_END is != binding->start. Otherwise, the
426 string was found at binding->start. */
427 return search_end == binding->start;
430 /* **************************************************************** */
432 /* Small String Searches */
434 /* **************************************************************** */
436 /* Function names that start with "skip" are passed a string, and return
437 an offset from the start of that string. Function names that start
438 with "find" are passed a SEARCH_BINDING, and return an absolute position
439 marker of the item being searched for. "Find" functions return a value
440 of -1 if the item being looked for couldn't be found. */
442 /* Return the index of the first non-whitespace character in STRING. */
444 skip_whitespace (char *string)
448 for (i = 0; string && whitespace (string[i]); i++);
452 /* Return the index of the first non-whitespace or newline character in
455 skip_whitespace_and_newlines (char *string)
459 for (i = 0; string && whitespace_or_newline (string[i]); i++);
463 /* Return the index of the first whitespace character in STRING. */
465 skip_non_whitespace (char *string)
469 for (i = 0; string && string[i] && !whitespace (string[i]); i++);
473 /* Return the index of the first non-node character in STRING. Note that
474 this function contains quite a bit of hair to ignore periods in some
475 special cases. This is because we here at GNU ship some info files which
476 contain nodenames that contain periods. No such nodename can start with
477 a period, or continue with whitespace, newline, or ')' immediately following
478 the period. If second argument NEWLINES_OKAY is non-zero, newlines should
479 be skipped while parsing out the nodename specification. */
481 skip_node_characters (char *string, int newlines_okay)
483 register int c, i = 0;
487 /* Handle special case. This is when another function has parsed out the
488 filename component of the node name, and we just want to parse out the
489 nodename proper. In that case, a period at the start of the nodename
490 indicates an empty nodename. */
491 if (string && *string == '.')
494 if (string && *string == '(')
501 for (; string && (c = string[i]); i++)
513 /* If the character following the close paren is a space or period,
514 then this node name has no more characters associated with it. */
518 ((!newlines_okay) && (c == '\n')) ||
519 ((paren_seen && string[i - 1] == ')') &&
520 (c == ' ' || c == '.')) ||
524 /* This test causes a node name ending in a period, like `This.', not to
525 be found. The trailing . is stripped. This occurs in the jargon
526 file (`I see no X here.' is a node name). */
529 (whitespace_or_newline (string[i + 1])) ||
530 (string[i + 1] == ')'))))
537 /* **************************************************************** */
539 /* Searching FILE_BUFFER's */
541 /* **************************************************************** */
543 /* Return the absolute position of the first occurence of a node separator in
544 BINDING-buffer. The search starts at BINDING->start. Return -1 if no node
545 separator was found. */
547 find_node_separator (SEARCH_BINDING *binding)
552 body = binding->buffer;
554 /* A node is started by [^L]^_[^L]\n. That is to say, the C-l's are
555 optional, but the DELETE and NEWLINE are not. This separator holds
556 true for all separated elements in an Info file, including the tags
557 table (if present) and the indirect tags table (if present). */
558 for (i = binding->start; i < binding->end - 1; i++)
559 if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
560 (body[i + 2] == '\n' ||
561 (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
562 ((body[i] == INFO_COOKIE) &&
563 (body[i + 1] == '\n' ||
564 (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
569 /* Return the length of the node separator characters that BODY is
570 currently pointing at. */
572 skip_node_separator (char *body)
578 if (body[i] == INFO_FF)
581 if (body[i++] != INFO_COOKIE)
584 if (body[i] == INFO_FF)
587 if (body[i++] != '\n')
593 /* Return the number of characters from STRING to the start of
596 skip_line (char *string)
600 for (i = 0; string && string[i] && string[i] != '\n'; i++);
602 if (string[i] == '\n')
608 /* Return the absolute position of the beginning of a tags table in this
609 binding starting the search at binding->start. */
611 find_tags_table (SEARCH_BINDING *binding)
613 SEARCH_BINDING tmp_search;
616 tmp_search.buffer = binding->buffer;
617 tmp_search.start = binding->start;
618 tmp_search.end = binding->end;
619 tmp_search.flags = S_FoldCase;
621 while ((position = find_node_separator (&tmp_search)) != -1 )
623 tmp_search.start = position;
624 tmp_search.start += skip_node_separator (tmp_search.buffer
627 if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search))
633 /* Return the absolute position of the node named NODENAME in BINDING.
634 This is a brute force search, and we wish to avoid it when possible.
635 This function is called when a tag (indirect or otherwise) doesn't
636 really point to the right node. It returns the absolute position of
637 the separator preceding the node. */
639 find_node_in_binding (char *nodename, SEARCH_BINDING *binding)
643 SEARCH_BINDING tmp_search;
645 namelen = strlen (nodename);
647 tmp_search.buffer = binding->buffer;
648 tmp_search.start = binding->start;
649 tmp_search.end = binding->end;
650 tmp_search.flags = 0;
652 while ((position = find_node_separator (&tmp_search)) != -1)
654 tmp_search.start = position;
655 tmp_search.start += skip_node_separator
656 (tmp_search.buffer + tmp_search.start);
658 offset = string_in_line
659 (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start);
664 tmp_search.start += offset;
665 tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start);
666 offset = skip_node_characters
667 (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES);
669 /* Notice that this is an exact match. You cannot grovel through
670 the buffer with this function looking for random nodes. */
671 if ((offset == namelen) &&
672 (tmp_search.buffer[tmp_search.start] == nodename[0]) &&
673 (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0))